1/**************************************************************
2  Copyright (c) 2019 Huawei Technologies Co., Ltd.
3
4  Redistribution and use in source and binary forms, with or without
5  modification, are permitted provided that the following conditions
6  are met:
7    * Redistributions of source code must retain the above copyright
8      notice, this list of conditions and the following disclaimer.
9    * Redistributions in binary form must reproduce the above copyright
10      notice, this list of conditions and the following disclaimer in
11      the documentation and/or other materials provided with the
12      distribution.
13    * Neither the name of Huawei Corporation nor the names of its
14      contributors may be used to endorse or promote products derived
15      from this software without specific prior written permission.
16
17  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28**********************************************************************/
.text

/*
 * gf_2vect_mad_neon: GF(2^8) multiply-and-add of one source buffer into
 * two destination buffers, using NEON tbl lookups on 4-bit nibbles.
 * Comment syntax note: GNU AArch64 as treats ';' as a statement
 * separator, so only C-style comments are used in this file.
 */
.global gf_2vect_mad_neon
.type gf_2vect_mad_neon, %function


/* arguments */
x_len		.req	x0	/* byte length; must be >= 16 */
x_vec		.req	x1	/* source-vector count; scales the stride to the 2nd table */
x_vec_i		.req	x2	/* index of this source vector in the table array */
x_tbl		.req	x3	/* GF multiply tables, 32 bytes per entry */
x_src		.req	x4	/* source buffer */
x_dest		.req	x5	/* array of two destination pointers */

/* returns */
w_ret		.req	w0	/* 0 = pass, 1 = fail (len < 16) */

/* local variables */
x_src_end	.req	x6	/* end-of-input sentinel (adjusted per loop phase) */
x_dest1		.req	x7	/* dest[0] */
x_dest2		.req	x8	/* dest[1] */
x_tmp		.req	x9	/* tail residue: x_src - x_src_end at the end */
x_tbl1		.req	x10	/* table for dest1: x_tbl + vec_i*32 */
x_tbl2		.req	x11	/* table for dest2: x_tbl1 + vec*32 */
x_const		.req	x12	/* address into const_tbl mask table */

/* vectors */
v_mask0f	.req	v0	/* 0x0f splat, isolates low nibbles */
v_tmp_lo	.req	v1	/* tbl result for low nibbles */
v_tmp_hi	.req	v2	/* tbl result for high nibbles */
v_tmp		.req	v3	/* tail byte-mask */
q_tmp		.req	q3

v_gft1_lo	.req	v4	/* dest1 table: products for low nibble */
v_gft1_hi	.req	v5	/* dest1 table: products for high nibble */
v_gft2_lo	.req	v6	/* dest2 table: products for low nibble */
v_gft2_hi	.req	v7	/* dest2 table: products for high nibble */
q_gft1_lo	.req	q4
q_gft1_hi	.req	q5
q_gft2_lo	.req	q6
q_gft2_hi	.req	q7

/* v8-v15 are callee-saved (low 64 bits) under AAPCS64;
 * the 128-byte main loop saves/restores d8-d15 around its use. */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
v_data_4	.req	v12
v_data_5	.req	v13
v_data_6	.req	v14
v_data_7	.req	v15
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11
q_data_4	.req	q12
q_data_5	.req	q13
q_data_6	.req	q14
q_data_7	.req	q15

v_data_0_lo	.req	v16
v_data_1_lo	.req	v17
v_data_2_lo	.req	v18
v_data_3_lo	.req	v19
v_data_4_lo	.req	v20
v_data_5_lo	.req	v21
v_data_6_lo	.req	v22
v_data_7_lo	.req	v23
/* the _hi views alias the original data registers: the ushr that
 * extracts the high nibbles overwrites the raw data in place */
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3
v_data_4_hi	.req	v_data_4
v_data_5_hi	.req	v_data_5
v_data_6_hi	.req	v_data_6
v_data_7_hi	.req	v_data_7

/* destination accumulators */
v_d0		.req	v24
v_d1		.req	v25
v_d2		.req	v26
v_d3		.req	v27
v_d4		.req	v28
v_d5		.req	v29
v_d6		.req	v30
v_d7		.req	v31
q_d0		.req	q24
q_d1		.req	q25
q_d2		.req	q26
q_d3		.req	q27
q_d4		.req	q28
q_d5		.req	q29
q_d6		.req	q30
q_d7		.req	q31

/* aliases reused by the 16-byte and tail paths */
v_data		.req	v16
q_data		.req	q16
v_data_lo	.req	v17
v_data_hi	.req	v18
127
/*
 * int gf_2vect_mad_neon(len, vec, vec_i, tbl, src, dest)
 *
 * GF(2^8) multiply-and-add: for each byte s of src, XORs tbl1[s] into
 * dest[0] and tbl2[s] into dest[1], where each 32-byte table encodes a
 * constant GF multiply as two 16-entry nibble lookups (low/high).
 *
 * In:  x0 = len (bytes, must be >= 16), x1 = vec, x2 = vec_i,
 *      x3 = tbl, x4 = src, x5 = dest (array of 2 pointers)
 * Out: w0 = 0 on success, 1 if len < 16
 *
 * Processes 128 bytes per main-loop iteration, then 16-byte chunks,
 * then a masked, overlapping 16-byte tail for the final 1..15 bytes.
 */
gf_2vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f
	lsl	x_vec_i, x_vec_i, #5		/* each table entry is 32 bytes */
	lsl	x_vec, x_vec, #5
	add	x_tbl1, x_tbl, x_vec_i		/* table for dest1 */
	add	x_tbl2, x_tbl1, x_vec		/* table for dest2 */
	add	x_src_end, x_src, x_len

	ldr	x_dest1, [x_dest]
	ldr	x_dest2, [x_dest, #8]
	ldr	q_gft1_lo, [x_tbl1]
	ldr	q_gft1_hi, [x_tbl1, #16]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]

.Lloop128_init:
	/* less than 128 bytes, goto Lloop16_init */
	cmp	x_len, #128
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack (callee-saved low halves, AAPCS64) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	sub	x_src_end, x_src_end, #128	/* loop while a full 128B block remains */

.Lloop128:
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	ldr	q_data_4, [x_src, #16*4]
	ldr	q_data_5, [x_src, #16*5]
	ldr	q_data_6, [x_src, #16*6]
	ldr	q_data_7, [x_src, #16*7]

	ldr	q_d0, [x_dest1, #16*0]
	ldr	q_d1, [x_dest1, #16*1]
	ldr	q_d2, [x_dest1, #16*2]
	ldr	q_d3, [x_dest1, #16*3]
	ldr	q_d4, [x_dest1, #16*4]
	ldr	q_d5, [x_dest1, #16*5]
	ldr	q_d6, [x_dest1, #16*6]
	ldr	q_d7, [x_dest1, #16*7]

	/* split each data byte into low nibble (and'ed) ... */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
	and	v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
	and	v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
	and	v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
	and	v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

	/* ... and high nibble (shifted in place; _hi aliases the data regs) */
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4
	ushr	v_data_4_hi.16b, v_data_4.16b, #4
	ushr	v_data_5_hi.16b, v_data_5.16b, #4
	ushr	v_data_6_hi.16b, v_data_6.16b, #4
	ushr	v_data_7_hi.16b, v_data_7.16b, #4

	/* dest1 += gf_mul(src): tbl-lookup both nibbles, XOR into accumulator */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	eor	v_d0.16b, v_tmp_lo.16b, v_d0.16b
	eor	v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	eor	v_d1.16b, v_tmp_lo.16b, v_d1.16b
	eor	v_d1.16b, v_d1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	eor	v_d2.16b, v_tmp_lo.16b, v_d2.16b
	eor	v_d2.16b, v_d2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	eor	v_d3.16b, v_tmp_lo.16b, v_d3.16b
	eor	v_d3.16b, v_d3.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
	eor	v_d4.16b, v_tmp_lo.16b, v_d4.16b
	eor	v_d4.16b, v_d4.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
	eor	v_d5.16b, v_tmp_lo.16b, v_d5.16b
	eor	v_d5.16b, v_d5.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
	eor	v_d6.16b, v_tmp_lo.16b, v_d6.16b
	eor	v_d6.16b, v_d6.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b
	eor	v_d7.16b, v_tmp_lo.16b, v_d7.16b
	eor	v_d7.16b, v_d7.16b, v_tmp_hi.16b

	str	q_d0, [x_dest1, #16*0]
	str	q_d1, [x_dest1, #16*1]
	str	q_d2, [x_dest1, #16*2]
	str	q_d3, [x_dest1, #16*3]
	str	q_d4, [x_dest1, #16*4]
	str	q_d5, [x_dest1, #16*5]
	str	q_d6, [x_dest1, #16*6]
	str	q_d7, [x_dest1, #16*7]

	/* same 128 bytes again for dest2 with the second table */
	ldr	q_d0, [x_dest2, #16*0]
	ldr	q_d1, [x_dest2, #16*1]
	ldr	q_d2, [x_dest2, #16*2]
	ldr	q_d3, [x_dest2, #16*3]
	ldr	q_d4, [x_dest2, #16*4]
	ldr	q_d5, [x_dest2, #16*5]
	ldr	q_d6, [x_dest2, #16*6]
	ldr	q_d7, [x_dest2, #16*7]

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
	eor	v_d0.16b, v_tmp_lo.16b, v_d0.16b
	eor	v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
	eor	v_d1.16b, v_tmp_lo.16b, v_d1.16b
	eor	v_d1.16b, v_d1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
	eor	v_d2.16b, v_tmp_lo.16b, v_d2.16b
	eor	v_d2.16b, v_d2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
	eor	v_d3.16b, v_tmp_lo.16b, v_d3.16b
	eor	v_d3.16b, v_d3.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_4_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_4_hi.16b
	eor	v_d4.16b, v_tmp_lo.16b, v_d4.16b
	eor	v_d4.16b, v_d4.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_5_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_5_hi.16b
	eor	v_d5.16b, v_tmp_lo.16b, v_d5.16b
	eor	v_d5.16b, v_d5.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_6_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_6_hi.16b
	eor	v_d6.16b, v_tmp_lo.16b, v_d6.16b
	eor	v_d6.16b, v_d6.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_7_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_7_hi.16b
	eor	v_d7.16b, v_tmp_lo.16b, v_d7.16b
	eor	v_d7.16b, v_d7.16b, v_tmp_hi.16b

	str	q_d0, [x_dest2, #16*0]
	str	q_d1, [x_dest2, #16*1]
	str	q_d2, [x_dest2, #16*2]
	str	q_d3, [x_dest2, #16*3]
	str	q_d4, [x_dest2, #16*4]
	str	q_d5, [x_dest2, #16*5]
	str	q_d6, [x_dest2, #16*6]
	str	q_d7, [x_dest2, #16*7]

	add	x_src, x_src, #128
	add	x_dest1, x_dest1, #128
	add	x_dest2, x_dest2, #128
	cmp	x_src, x_src_end
	bls	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	add	x_src_end, x_src_end, #128	/* undo the loop-sentinel bias */

.Lloop16_init:
	sub	x_src_end, x_src_end, #16	/* loop while a full 16B block remains */
	cmp	x_src, x_src_end
	bhi	.lessthan16_init

.Lloop16:
	ldr	q_data, [x_src]

	ldr	q_d0, [x_dest1]
	ldr	q_d1, [x_dest2]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_d0.16b, v_tmp_lo.16b, v_d0.16b
	eor	v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_d1.16b, v_tmp_lo.16b, v_d1.16b
	eor	v_d1.16b, v_d1.16b, v_tmp_hi.16b

	str	q_d0, [x_dest1]
	str	q_d1, [x_dest2]

	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_src, x_src, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.lessthan16_init:
	/* x_tmp = bytes already past the last full-block boundary; 16 means
	 * the input ended exactly on a 16-byte boundary and we are done */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.return_pass

.lessthan16:
	/* handle the final 1..15 bytes with an overlapping 16-byte window:
	 * step back so [x_src, x_src+16) ends exactly at the buffer end,
	 * and mask off the leading x_tmp bytes that were already processed */
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp

	/* PC-relative address of const_tbl; the former
	 * "ldr x_const, =const_tbl" literal-pool load needed an absolute
	 * relocation and broke position-independent builds */
	adrp	x_const, const_tbl
	add	x_const, x_const, :lo12:const_tbl
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]	/* x_tmp zero bytes then 0xff bytes */

	ldr	q_data, [x_src]
	ldr	q_d0, [x_dest1]
	ldr	q_d1, [x_dest2]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b	/* zero the re-seen bytes */
	eor	v_d0.16b, v_d0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d1.16b, v_d1.16b, v_tmp_hi.16b

	str	q_d0, [x_dest1]
	str	q_d1, [x_dest2]

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret

.size gf_2vect_mad_neon, . - gf_2vect_mad_neon
396
/* Sliding byte-mask table for the 1..15-byte tail: reading 16 bytes at
 * const_tbl + 16 - x_tmp yields x_tmp leading 0x00 bytes followed by
 * 0xff bytes, masking out bytes the 16-byte loop already handled.
 * Read-only constants belong in .rodata, not writable .data. */
.section .rodata
.balign 8
const_tbl:
	.dword 0x0000000000000000, 0x0000000000000000
	.dword 0xffffffffffffffff, 0xffffffffffffffff
402