/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
29
/*
 * Target: AArch64, GNU as syntax. Register usage follows AAPCS64
 * (arguments in x0-x4, result in w0).
 */
.text

/* Export the routine and mark it as a function symbol (ELF). */
.global gf_5vect_dot_prod_neon
.type gf_5vect_dot_prod_neon, %function
34
35
/* arguments */
x_len		.req	x0	/* number of bytes to process in each buffer */
x_vec		.req	x1	/* number of source buffers; scaled by 8 on entry */
x_tbl		.req	x2	/* lookup tables, 32 bytes per (destination, source) pair */
x_src		.req	x3	/* array of source buffer pointers */
x_dest		.req	x4	/* array of five destination buffer pointers */

/* returns */
w_ret		.req	w0	/* 0 = pass, 1 = fail (len < 16) */

/* local variables */
x_vec_i		.req	x5	/* byte offset of the current entry in the x_src array */
x_ptr		.req	x6	/* working data pointer (source read / destination write) */
x_pos		.req	x7	/* byte offset of the block currently being processed */
x_tmp		.req	x8	/* table pointer / scratch */
x_dest1		.req	x9	/* destination pointers loaded from x_dest[0..4] */
x_dest2		.req	x10
x_dest3		.req	x11
x_dest4		.req	x12
x_dest5		.req	x13
56
/* vectors */

/*
 * Scratch registers. v_mask0f shares v0 with v_tmp_lo, so in the 64-byte
 * path the 0x0f mask has to be rebuilt after v_tmp_lo has been written.
 */
v_tmp1		.req	v0
q_tmp1		.req	q0
v_tmp2		.req	v1
q_tmp2		.req	q1

v_mask0f	.req	v_tmp1
q_mask0f	.req	q_tmp1
v_tmp_lo	.req	v_tmp1
v_tmp_hi	.req	v_tmp2

/* 64-byte path: 16-entry tables indexed by the low / high nibble */
v_gft_lo	.req	v2
v_gft_hi	.req	v3
q_gft_lo	.req	q2
q_gft_hi	.req	q3

/*
 * Accumulators. v_pK_J collects the output for destination K, bytes
 * 16*J .. 16*J+15 of the current block. The 16-byte path only uses the
 * _0 accumulators.
 */
v_p1_0		.req	v4
v_p2_0		.req	v5
v_p3_0		.req	v6
v_p4_0		.req	v7

q_p1_0		.req	q4
q_p2_0		.req	q5
q_p3_0		.req	q6
q_p4_0		.req	q7

/*
 * 64-byte path source data. v8-v15 are callee-saved (low 64 bits) under
 * AAPCS64, which is why the function spills d8-d15 before using them.
 */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

/* low nibbles get their own registers ... */
v_data_0_lo	.req	v12
v_data_1_lo	.req	v13
v_data_2_lo	.req	v14
v_data_3_lo	.req	v15
/* ... while the high nibbles overwrite the raw data in place */
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3

v_p5_0		.req	v16
v_p1_1		.req	v17
v_p2_1		.req	v18
v_p3_1		.req	v19
v_p4_1		.req	v20
v_p5_1		.req	v21
v_p1_2		.req	v22
v_p2_2		.req	v23
v_p3_2		.req	v24
v_p4_2		.req	v25
v_p5_2		.req	v26
v_p1_3		.req	v27
v_p2_3		.req	v28
v_p3_3		.req	v29
v_p4_3		.req	v30
v_p5_3		.req	v31

q_p5_0		.req	q16
q_p1_1		.req	q17
q_p2_1		.req	q18
q_p3_1		.req	q19
q_p4_1		.req	q20
q_p5_1		.req	q21
q_p1_2		.req	q22
q_p2_2		.req	q23
q_p3_2		.req	q24
q_p4_2		.req	q25
q_p5_2		.req	q26
q_p1_3		.req	q27
q_p2_3		.req	q28
q_p3_3		.req	q29
q_p4_3		.req	q30
q_p5_3		.req	q31

/*
 * 16-byte path. It needs only the _0 accumulators, so the registers that
 * hold the _1/_2/_3 accumulators in the 64-byte path are reused here for
 * the source data and for the five table pairs.
 */
v_data		.req	v_p1_1
q_data		.req	q_p1_1
v_data_lo	.req	v_p2_1
v_data_hi	.req	v_p3_1

v_gft1_lo	.req	v_p4_1
v_gft1_hi	.req	v_p5_1
v_gft2_lo	.req	v_p1_2
v_gft2_hi	.req	v_p2_2
v_gft3_lo	.req	v_p3_2
v_gft3_hi	.req	v_p4_2
v_gft4_lo	.req	v_p5_2
v_gft4_hi	.req	v_p1_3
v_gft5_lo	.req	v_p2_3
v_gft5_hi	.req	v_p3_3
q_gft1_lo	.req	q_p4_1
q_gft1_hi	.req	q_p5_1
q_gft2_lo	.req	q_p1_2
q_gft2_hi	.req	q_p2_2
q_gft3_lo	.req	q_p3_2
q_gft3_hi	.req	q_p4_2
q_gft4_lo	.req	q_p5_2
q_gft4_hi	.req	q_p1_3
q_gft5_lo	.req	q_p2_3
q_gft5_hi	.req	q_p3_3
160
161
/*
 * gf_5vect_dot_prod_neon
 *
 * For every byte position, looks up each source byte's low and high
 * nibble in a pair of 16-entry tables and XORs the results together
 * across all source buffers, producing one output byte for each of five
 * destination buffers. (The tables are presumably GF(2^8) product
 * tables, going by the function name -- confirm against the caller.)
 *
 * In:   x0 (x_len)   number of bytes to process; must be >= 16
 *       x1 (x_vec)   number of source buffers; expected to be >= 1
 *       x2 (x_tbl)   tables, 32 bytes per (destination, source) pair:
 *                    16-byte low-nibble table followed by the 16-byte
 *                    high-nibble table. The tables for destination k
 *                    start at x_tbl + k * vec * 32.
 *       x3 (x_src)   array of x_vec source pointers
 *       x4 (x_dest)  array of five destination pointers
 * Out:  w0 = 0 on success, 1 if len < 16 (nothing is written)
 *
 * Clobbers: x0, x1, x5-x13, v0-v7, v16-v31, flags. The 64-byte path also
 * uses v8-v15; d8-d15 are saved on the stack and restored around it.
 *
 * Strategy: 64-byte blocks while at least 64 bytes remain, then 16-byte
 * blocks. If len is not a multiple of 16, the final 16 bytes are
 * recomputed at offset len - 16, overlapping output already written.
 */
gf_5vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.Lreturn_fail

	mov	x_pos, #0
	/* x_vec = vec * 8: byte size of the src pointer array, used as the
	 * bound for x_vec_i; (x_vec << 2) is the per-destination table stride */
	lsl	x_vec, x_vec, #3
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]
	ldr	x_dest5, [x_dest, #8*4]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias len so that "x_pos <= x_len" means a full 64-byte block remains */
	sub	x_len, x_len, #64

.Lloop64:
	/* clear all 5 x 4 accumulators for this 64-byte block */
	movi	v_p1_0.16b, #0
	movi	v_p1_1.16b, #0
	movi	v_p1_2.16b, #0
	movi	v_p1_3.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p2_1.16b, #0
	movi	v_p2_2.16b, #0
	movi	v_p2_3.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p3_1.16b, #0
	movi	v_p3_2.16b, #0
	movi	v_p3_3.16b, #0
	movi	v_p4_0.16b, #0
	movi	v_p4_1.16b, #0
	movi	v_p4_2.16b, #0
	movi	v_p4_3.16b, #0
	movi	v_p5_0.16b, #0
	movi	v_p5_1.16b, #0
	movi	v_p5_2.16b, #0
	movi	v_p5_3.16b, #0
	mov	x_vec_i, #0

.Lloop64_vects:
	/* x_ptr = src[i] + pos */
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_ptr, x_ptr, x_pos

	ldr	q_data_0, [x_ptr], #16
	ldr	q_data_1, [x_ptr], #16
	ldr	q_data_2, [x_ptr], #16
	ldr	q_data_3, [x_ptr], #16
	prfm	pldl2keep, [x_ptr]

	/* the mask shares v0 with v_tmp_lo, so it is rebuilt for every source */
	movi	v_mask0f.16b, #0x0f
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* v_p1_x */
	/* x_vec_i is i * 8, so (x_vec_i << 2) is i * 32: this source's table pair */
	add	x_tmp, x_tbl, x_vec_i, lsl #2
	add	x_vec_i, x_vec_i, #8
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]
	/* step to the same source's tables for the next destination */
	add	x_tmp, x_tmp, x_vec, lsl #2

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p1_0.16b, v_tmp_lo.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p1_1.16b, v_tmp_lo.16b, v_p1_1.16b
	eor	v_p1_1.16b, v_p1_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p1_2.16b, v_tmp_lo.16b, v_p1_2.16b
	eor	v_p1_2.16b, v_p1_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p1_3.16b, v_tmp_lo.16b, v_p1_3.16b
	eor	v_p1_3.16b, v_p1_3.16b, v_tmp_hi.16b

	/* v_p2_x */
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]
	add	x_tmp, x_tmp, x_vec, lsl #2

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p2_0.16b, v_tmp_lo.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p2_1.16b, v_tmp_lo.16b, v_p2_1.16b
	eor	v_p2_1.16b, v_p2_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p2_2.16b, v_tmp_lo.16b, v_p2_2.16b
	eor	v_p2_2.16b, v_p2_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p2_3.16b, v_tmp_lo.16b, v_p2_3.16b
	eor	v_p2_3.16b, v_p2_3.16b, v_tmp_hi.16b

	/* v_p3_x */
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]
	add	x_tmp, x_tmp, x_vec, lsl #2

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p3_0.16b, v_tmp_lo.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p3_1.16b, v_tmp_lo.16b, v_p3_1.16b
	eor	v_p3_1.16b, v_p3_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p3_2.16b, v_tmp_lo.16b, v_p3_2.16b
	eor	v_p3_2.16b, v_p3_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p3_3.16b, v_tmp_lo.16b, v_p3_3.16b
	eor	v_p3_3.16b, v_p3_3.16b, v_tmp_hi.16b

	/* v_p4_x */
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]
	add	x_tmp, x_tmp, x_vec, lsl #2

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p4_0.16b, v_tmp_lo.16b, v_p4_0.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p4_1.16b, v_tmp_lo.16b, v_p4_1.16b
	eor	v_p4_1.16b, v_p4_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p4_2.16b, v_tmp_lo.16b, v_p4_2.16b
	eor	v_p4_2.16b, v_p4_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p4_3.16b, v_tmp_lo.16b, v_p4_3.16b
	eor	v_p4_3.16b, v_p4_3.16b, v_tmp_hi.16b

	/* v_p5_x */
	ldp	q_gft_lo, q_gft_hi, [x_tmp]
	prfm	pldl3keep, [x_tmp, #32]

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_0_hi.16b
	eor	v_p5_0.16b, v_tmp_lo.16b, v_p5_0.16b
	eor	v_p5_0.16b, v_p5_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_1_hi.16b
	eor	v_p5_1.16b, v_tmp_lo.16b, v_p5_1.16b
	eor	v_p5_1.16b, v_p5_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_2_hi.16b
	eor	v_p5_2.16b, v_tmp_lo.16b, v_p5_2.16b
	eor	v_p5_2.16b, v_p5_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft_hi.16b}, v_data_3_hi.16b
	eor	v_p5_3.16b, v_tmp_lo.16b, v_p5_3.16b
	eor	v_p5_3.16b, v_p5_3.16b, v_tmp_hi.16b

	/* next source buffer, until x_vec_i reaches vec * 8 */
	cmp	x_vec_i, x_vec
	blt	.Lloop64_vects

.Lloop64_vects_end:
	/* write the 64 accumulated bytes to each of the five destinations */
	add	x_ptr, x_dest1, x_pos
	stp	q_p1_0, q_p1_1, [x_ptr], #32
	stp	q_p1_2, q_p1_3, [x_ptr]

	add	x_ptr, x_dest2, x_pos
	stp	q_p2_0, q_p2_1, [x_ptr], #32
	stp	q_p2_2, q_p2_3, [x_ptr]

	add	x_ptr, x_dest3, x_pos
	stp	q_p3_0, q_p3_1, [x_ptr], #32
	stp	q_p3_2, q_p3_3, [x_ptr]

	add	x_ptr, x_dest4, x_pos
	stp	q_p4_0, q_p4_1, [x_ptr], #32
	stp	q_p4_2, q_p4_3, [x_ptr]

	add	x_ptr, x_dest5, x_pos
	stp	q_p5_0, q_p5_1, [x_ptr], #32
	stp	q_p5_2, q_p5_3, [x_ptr]

	add	x_pos, x_pos, #64
	cmp	x_pos, x_len
	ble	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* undo the bias; done if the 64-byte blocks covered everything */
	add	x_len, x_len, #64
	cmp	x_pos, x_len
	beq	.Lreturn_pass

.Lloop16_init:
	/* bias len so that "x_pos <= x_len" means a full 16-byte block remains */
	sub	x_len, x_len, #16
	cmp	x_pos, x_len
	bgt	.Llessthan16_init

.Lloop16:
	/* nothing in this path writes v0, so one mask per block is enough */
	movi	v_mask0f.16b, #0x0f
	movi	v_p1_0.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p4_0.16b, #0
	movi	v_p5_0.16b, #0
	mov	x_vec_i, #0

.Lloop16_vects:
	ldr	x_ptr, [x_src, x_vec_i]
	ldr	q_data, [x_ptr, x_pos]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	/* load this source's table pair for each of the five destinations */
	add	x_tmp, x_tbl, x_vec_i, lsl #2
	add	x_vec_i, x_vec_i, #8
	ldp	q_gft1_lo, q_gft1_hi, [x_tmp]
	add	x_tmp, x_tmp, x_vec, lsl #2
	ldp	q_gft2_lo, q_gft2_hi, [x_tmp]
	add	x_tmp, x_tmp, x_vec, lsl #2
	ldp	q_gft3_lo, q_gft3_hi, [x_tmp]
	add	x_tmp, x_tmp, x_vec, lsl #2
	ldp	q_gft4_lo, q_gft4_hi, [x_tmp]
	add	x_tmp, x_tmp, x_vec, lsl #2
	ldp	q_gft5_lo, q_gft5_hi, [x_tmp]

	/* the lookups overwrite the table registers with their results */
	tbl	v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl	v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl	v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	tbl	v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	tbl	v_gft5_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
	tbl	v_gft5_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b

	eor	v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
	eor	v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
	eor	v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
	eor	v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b
	eor	v_p5_0.16b, v_gft5_hi.16b, v_p5_0.16b
	eor	v_p5_0.16b, v_p5_0.16b, v_gft5_lo.16b

	/* same exit test as .Lloop64_vects: "blt" also terminates when the
	 * vector count is not positive, where an equality test never would */
	cmp	x_vec_i, x_vec
	blt	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p1_0, [x_dest1, x_pos]
	str	q_p2_0, [x_dest2, x_pos]
	str	q_p3_0, [x_dest3, x_pos]
	str	q_p4_0, [x_dest4, x_pos]
	str	q_p5_0, [x_dest5, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len
	ble	.Lloop16

.Lloop16_end:
	/* x_len is biased by -16 here, so x_pos - x_len == 16 means x_pos
	 * has reached the real length and everything is done */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.Lreturn_pass

.Llessthan16_init:
	/* 1..15 bytes left: redo the last full 16 bytes at offset len - 16,
	 * overlapping output that has already been written */
	mov	x_pos, x_len
	b	.Lloop16

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret
482