1/**************************************************************
2  Copyright (c) 2019 Huawei Technologies Co., Ltd.
3
4  Redistribution and use in source and binary forms, with or without
5  modification, are permitted provided that the following conditions
6  are met:
7    * Redistributions of source code must retain the above copyright
8      notice, this list of conditions and the following disclaimer.
9    * Redistributions in binary form must reproduce the above copyright
10      notice, this list of conditions and the following disclaimer in
11      the documentation and/or other materials provided with the
12      distribution.
13    * Neither the name of Huawei Corporation nor the names of its
14      contributors may be used to endorse or promote products derived
15      from this software without specific prior written permission.
16
17  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28**********************************************************************/
.text

.global gf_4vect_dot_prod_neon
.type gf_4vect_dot_prod_neon, %function


/*
 * Register-alias plan for gf_4vect_dot_prod_neon.
 *
 * Arguments (AAPCS64 integer argument registers x0-x4):
 *   x_len  - number of bytes to process per vector (must be >= 16)
 *   x_vec  - number of source vectors; rescaled to vec*8 (pointer-array
 *            byte size) at function entry
 *   x_tbl  - base of the GF(2^8) multiply tables (32 bytes per
 *            source/dest pair; presumably the ec_init_tables layout —
 *            TODO confirm against the caller)
 *   x_src  - array of source-buffer pointers
 *   x_dest - array of 4 destination-buffer pointers
 */

/* arguments */
x_len		.req	x0
x_vec		.req	x1
x_tbl		.req	x2
x_src		.req	x3
x_dest		.req	x4

/* returns: 0 = pass, 1 = fail (len < 16) */
w_ret		.req	w0

/* local variables (all caller-saved x5-x15 — no GP spills needed) */
x_vec_i		.req	x5
x_ptr		.req	x6
x_pos		.req	x7
x_tmp		.req	x8
x_dest1		.req	x9
x_tbl1		.req	x10
x_dest2		.req	x11
x_tbl2		.req	x12
x_dest3		.req	x13
x_tbl3		.req	x14
/* x_dest itself is dead once dest1-3 are loaded, so dest4 reuses it */
x_dest4		.req	x_dest
x_tbl4		.req	x15

/* vectors */
/* v0: constant 0x0f mask used to split each byte into two nibbles */
v_mask0f	.req	v0
q_mask0f	.req	q0
/* v1-v3: scratch for table-lookup results and the low-nibble input */
v_tmp1_lo	.req	v1
v_tmp1_hi	.req	v2
v_tmp1		.req	v3
q_tmp1		.req	q3

/* accumulators: v_pD_C = dest D (1-4), 16-byte chunk C (0-3) of the
 * 64-byte main-loop block.  Note chunks 0 and 3 plus the data registers
 * land in v4-v15; v8-v15 are callee-saved (low 64 bits) under AAPCS64,
 * hence the d8-d15 spill around the 64-byte loop. */
v_p1_0		.req	v4
v_p2_0		.req	v5
v_p3_0		.req	v6
v_p4_0		.req	v7

q_p1_0		.req	q4
q_p2_0		.req	q5
q_p3_0		.req	q6
q_p4_0		.req	q7

/* the four 16-byte source chunks loaded each 64-byte iteration */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

v_p1_3		.req	v12
v_p2_3		.req	v13
v_p3_3		.req	v14
v_p4_3		.req	v15
q_p1_3		.req	q12
q_p2_3		.req	q13
q_p3_3		.req	q14
q_p4_3		.req	q15

/* per-source multiply tables: gftD_{lo,hi} = 16-entry nibble lookup
 * tables for destination D's coefficient on the current source */
v_gft1_lo	.req	v16
v_gft1_hi	.req	v17
v_gft2_lo	.req	v18
v_gft2_hi	.req	v19
v_gft3_lo	.req	v20
v_gft3_hi	.req	v21
v_gft4_lo	.req	v22
v_gft4_hi	.req	v23
q_gft1_lo	.req	q16
q_gft1_hi	.req	q17
q_gft2_lo	.req	q18
q_gft2_hi	.req	q19
q_gft3_lo	.req	q20
q_gft3_hi	.req	q21
q_gft4_lo	.req	q22
q_gft4_hi	.req	q23

v_p1_1		.req	v24
v_p1_2		.req	v25
v_p2_1		.req	v26
v_p2_2		.req	v27
v_p3_1		.req	v28
v_p3_2		.req	v29
v_p4_1		.req	v30
v_p4_2		.req	v31

q_p1_1		.req	q24
q_p1_2		.req	q25
q_p2_1		.req	q26
q_p2_2		.req	q27
q_p3_1		.req	q28
q_p3_2		.req	q29
q_p4_1		.req	q30
q_p4_2		.req	q31

/* 16-byte-loop view of the scratch registers: there the temporaries
 * double as the data register and its nibble splits */
v_data		.req	v_tmp1
q_data		.req	q_tmp1
v_data_lo	.req	v_tmp1_lo
v_data_hi	.req	v_tmp1_hi
134
/*
 * gf_4vect_dot_prod_neon
 *
 * GF(2^8) dot product of `vec` source buffers into 4 destination
 * buffers, using the classic 4-bit split-table method: each byte is
 * split into low/high nibbles, each nibble indexes a 16-entry TBL
 * lookup table, and the two lookups are XORed into the accumulator.
 *
 * In:  x0 = len   (bytes per buffer; must be >= 16)
 *      x1 = vec   (number of source buffers)
 *      x2 = tbl   (multiply tables: 32 bytes per source per dest)
 *      x3 = src   (array of vec source pointers)
 *      x4 = dest  (array of 4 destination pointers)
 * Out: w0 = 0 on success, 1 if len < 16
 *
 * Main path handles 64 bytes per iteration; a 16-byte loop covers the
 * remainder, and a final overlapping 16-byte pass covers any tail that
 * is not a multiple of 16 (so destinations are written, never read,
 * and a partial tail is recomputed over the last full 16 bytes).
 */
gf_4vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.return_fail

	movi	v_mask0f.16b, #0x0f	/* nibble mask constant */
	mov	x_pos, #0		/* x_pos = byte offset processed so far */
	lsl	x_vec, x_vec, #3	/* x_vec = vec * 8 = src pointer-array size */
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]	/* clobbers x_dest (aliased) — loaded last */

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack: v8-v15 low halves are callee-saved
	 * (AAPCS64) and the 64-byte loop uses them as data/accumulators.
	 * 64-byte frame keeps sp 16-byte aligned. */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias len by 64 so the loop condition below is a simple
	 * x_pos <= x_len; restored after the loop */
	sub	x_len, x_len, #64

.Lloop64:
	/* zero all 16 accumulators (4 dests x 4 chunks) */
	movi	v_p1_0.16b, #0
	movi	v_p1_1.16b, #0
	movi	v_p1_2.16b, #0
	movi	v_p1_3.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p2_1.16b, #0
	movi	v_p2_2.16b, #0
	movi	v_p2_3.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p3_1.16b, #0
	movi	v_p3_2.16b, #0
	movi	v_p3_3.16b, #0
	movi	v_p4_0.16b, #0
	movi	v_p4_1.16b, #0
	movi	v_p4_2.16b, #0
	movi	v_p4_3.16b, #0

	/* per-dest table cursors: tables for consecutive dests are
	 * vec*32 bytes apart (x_vec<<2 == vec*8*4 == vec*32) */
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl1, x_vec, lsl #2
	add	x_tbl3, x_tbl2, x_vec, lsl #2
	add	x_tbl4, x_tbl3, x_vec, lsl #2
	mov	x_vec_i, #0		/* byte index into the src pointer array */
	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]
	prfm	pldl1keep, [x_tbl4]

.Lloop64_vects:
	/* next source buffer; accumulate its contribution to all 4 dests */
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	add	x_ptr, x_ptr, x_pos

	/* 64 bytes of source data, interleaved with the 8 table loads
	 * (32 bytes per dest, post-incrementing each table cursor) */
	ldr	q_data_0, [x_ptr], #16
	ldr	q_data_1, [x_ptr], #16
	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32
	ldr	q_data_2, [x_ptr], #16
	ldr	q_data_3, [x_ptr], #16

	prfm	pldl1strm, [x_ptr]
	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]
	prfm	pldl1keep, [x_tbl4]

	/* data_0: split into nibbles, then for each dest do two TBL
	 * lookups and XOR both into the accumulator (GF multiply-add) */
	and	v_tmp1.16b, v_data_0.16b, v_mask0f.16b	/* low nibbles */
	ushr	v_data_0.16b, v_data_0.16b, #4		/* high nibbles */

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
	eor	v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
	eor	v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
	eor	v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_0.16b
	eor	v_p4_0.16b, v_tmp1_lo.16b, v_p4_0.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_tmp1_hi.16b

	/* data_1: same pattern, chunk-1 accumulators */
	and	v_tmp1.16b, v_data_1.16b, v_mask0f.16b
	ushr	v_data_1.16b, v_data_1.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
	eor	v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
	eor	v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
	eor	v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
	eor	v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
	eor	v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
	eor	v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_1.16b
	eor	v_p4_1.16b, v_tmp1_lo.16b, v_p4_1.16b
	eor	v_p4_1.16b, v_p4_1.16b, v_tmp1_hi.16b

	/* data_2: same pattern, chunk-2 accumulators */
	and	v_tmp1.16b, v_data_2.16b, v_mask0f.16b
	ushr	v_data_2.16b, v_data_2.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
	eor	v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
	eor	v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
	eor	v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
	eor	v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
	eor	v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
	eor	v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_2.16b
	eor	v_p4_2.16b, v_tmp1_lo.16b, v_p4_2.16b
	eor	v_p4_2.16b, v_p4_2.16b, v_tmp1_hi.16b

	/* data_3: same pattern, chunk-3 accumulators */
	and	v_tmp1.16b, v_data_3.16b, v_mask0f.16b
	ushr	v_data_3.16b, v_data_3.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
	eor	v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
	eor	v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
	eor	v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
	eor	v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
	eor	v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
	eor	v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft4_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft4_hi.16b}, v_data_3.16b
	eor	v_p4_3.16b, v_tmp1_lo.16b, v_p4_3.16b
	eor	v_p4_3.16b, v_p4_3.16b, v_tmp1_hi.16b

	/* more source vectors to fold in? (x_vec_i counts in 8-byte
	 * pointer slots, x_vec == vec*8) */
	cmp	x_vec_i, x_vec
	blt	.Lloop64_vects

.Lloop64_vects_end:
	/* write the 4 x 64-byte results */
	add	x_ptr, x_dest1, x_pos
	stp	q_p1_0, q_p1_1, [x_ptr], #32
	stp	q_p1_2, q_p1_3, [x_ptr]

	add	x_ptr, x_dest2, x_pos
	stp	q_p2_0, q_p2_1, [x_ptr], #32
	stp	q_p2_2, q_p2_3, [x_ptr]

	add	x_ptr, x_dest3, x_pos
	stp	q_p3_0, q_p3_1, [x_ptr], #32
	stp	q_p3_2, q_p3_3, [x_ptr]

	add	x_ptr, x_dest4, x_pos
	stp	q_p4_0, q_p4_1, [x_ptr], #32
	stp	q_p4_2, q_p4_3, [x_ptr]

	add	x_pos, x_pos, #64
	cmp	x_pos, x_len		/* x_len is biased by -64 here */
	ble	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	add	x_len, x_len, #64	/* undo the -64 bias */
	cmp	x_pos, x_len
	beq	.return_pass		/* len was a multiple of 64 — done */

.Lloop16_init:
	/* bias len by 16 for the same x_pos <= x_len loop condition */
	sub	x_len, x_len, #16
	cmp	x_pos, x_len
	bgt	.lessthan16_init	/* < 16 bytes remain: overlapping tail */

.Lloop16:
	/* one 16-byte chunk for all 4 dests */
	movi	v_p1_0.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p4_0.16b, #0
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl1, x_vec, lsl #2	/* + vec*32 per dest */
	add	x_tbl3, x_tbl2, x_vec, lsl #2
	add	x_tbl4, x_tbl3, x_vec, lsl #2
	mov	x_vec_i, #0

.Lloop16_vects:
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	ldr	q_data, [x_ptr, x_pos]

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32
	ldp	q_gft4_lo, q_gft4_hi, [x_tbl4], #32

	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]
	prfm	pldl1keep, [x_tbl4]

	/* nibble split; table registers are dead after their lookup, so
	 * the TBL results overwrite them in place */
	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl	v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl	v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	tbl	v_gft4_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_gft4_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b

	eor	v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
	eor	v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
	eor	v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b
	eor	v_p4_0.16b, v_gft4_hi.16b, v_p4_0.16b
	eor	v_p4_0.16b, v_p4_0.16b, v_gft4_lo.16b

	cmp	x_vec_i, x_vec
	bne	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p1_0, [x_dest1, x_pos]
	str	q_p2_0, [x_dest2, x_pos]
	str	q_p3_0, [x_dest3, x_pos]
	str	q_p4_0, [x_dest4, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len		/* x_len is biased by -16 here */
	ble	.Lloop16

.Lloop16_end:
	/* x_pos - (len-16) == 16 means x_pos == len: nothing left */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.return_pass

.lessthan16_init:
	/* tail < 16 bytes: rewind to len-16 and redo one full 16-byte
	 * chunk; the overlap recomputes the same values, so the rewrite
	 * is harmless (len >= 16 is guaranteed at entry) */
	mov	x_pos, x_len
	b	.Lloop16

.return_pass:
	mov	w_ret, #0
	ret

.return_fail:
	mov	w_ret, #1
	ret
422