1/**************************************************************
2  Copyright (c) 2019 Huawei Technologies Co., Ltd.
3
4  Redistribution and use in source and binary forms, with or without
5  modification, are permitted provided that the following conditions
6  are met:
7    * Redistributions of source code must retain the above copyright
8      notice, this list of conditions and the following disclaimer.
9    * Redistributions in binary form must reproduce the above copyright
10      notice, this list of conditions and the following disclaimer in
11      the documentation and/or other materials provided with the
12      distribution.
13    * Neither the name of Huawei Corporation nor the names of its
14      contributors may be used to endorse or promote products derived
15      from this software without specific prior written permission.
16
17  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28**********************************************************************/
29
.text

/* C prototype (isa-l convention — confirm against the library header):
 *   int gf_2vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
 *                              unsigned char **src, unsigned char **dest);
 * AAPCS64: args in x0-x4, result in w0.
 */
.global gf_2vect_dot_prod_neon
.type gf_2vect_dot_prod_neon, %function

35
/* arguments */
x_len		.req	x0	/* length in bytes of each buffer (>= 16) */
x_vec		.req	x1	/* source-buffer count; scaled to count*8 on entry */
x_tbl		.req	x2	/* GF nibble lookup tables: 32 B per src per dest */
x_src		.req	x3	/* array of source buffer pointers */
x_dest		.req	x4	/* array of two destination buffer pointers */

/* returns */
w_ret		.req	w0	/* 0 = pass, 1 = fail (len < 16) */

/* local variables */
x_vec_i		.req	x5	/* byte offset into the src pointer array */
x_ptr		.req	x6	/* current source (or destination) data pointer */
x_pos		.req	x7	/* byte position processed so far */
x_tmp		.req	x8	/* scratch for tail-size computation */
x_tbl1		.req	x9	/* walking table pointer for dest1 */
x_tbl2		.req	x10	/* walking table pointer for dest2 */
x_dest1		.req	x11	/* dest[0] */
x_dest2		.req	x12	/* dest[1] */

/* vectors */
/* lookup tables for the current source: lo/hi nibble, per destination */
v_gft1_lo	.req	v0
v_gft1_hi	.req	v1
v_gft2_lo	.req	v2
v_gft2_hi	.req	v3
q_gft1_lo	.req	q0
q_gft1_hi	.req	q1
q_gft2_lo	.req	q2
q_gft2_hi	.req	q3

/* constant 0x0f in every byte lane — splits bytes into nibbles for TBL */
v_mask0f	.req	v4
q_mask0f	.req	q4

/* scratch for the TBL results of one 16-byte chunk */
v_tmp1_lo	.req	v5
v_tmp1_hi	.req	v6
v_tmp1		.req	v7

/* eight 16-byte source chunks of the 128-byte main loop.
 * NOTE: v8-v15 are callee-saved under AAPCS64 (low 64 bits), hence the
 * d8-d15 spill/restore around .Lloop128. */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
v_data_4	.req	v12
v_data_5	.req	v13
v_data_6	.req	v14
v_data_7	.req	v15
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11
q_data_4	.req	q12
q_data_5	.req	q13
q_data_6	.req	q14
q_data_7	.req	q15

/* accumulators: p1_* for dest1, p2_* for dest2 (caller-saved v16-v31) */
v_p1_0		.req	v16
v_p1_1		.req	v17
v_p1_2		.req	v18
v_p1_3		.req	v19
v_p1_4		.req	v20
v_p1_5		.req	v21
v_p1_6		.req	v22
v_p1_7		.req	v23
v_p2_0		.req	v24
v_p2_1		.req	v25
v_p2_2		.req	v26
v_p2_3		.req	v27
v_p2_4		.req	v28
v_p2_5		.req	v29
v_p2_6		.req	v30
v_p2_7		.req	v31

q_p1_0		.req	q16
q_p1_1		.req	q17
q_p1_2		.req	q18
q_p1_3		.req	q19
q_p1_4		.req	q20
q_p1_5		.req	q21
q_p1_6		.req	q22
q_p1_7		.req	q23
q_p2_0		.req	q24
q_p2_1		.req	q25
q_p2_2		.req	q26
q_p2_3		.req	q27
q_p2_4		.req	q28
q_p2_5		.req	q29
q_p2_6		.req	q30
q_p2_7		.req	q31

/* aliases for the 16-byte tail loop; reuse caller-saved regs so the tail
 * path never touches v8-v15 and needs no spill */
v_p1		.req	v_p1_0
q_p1		.req	q_p1_0
v_p2		.req	v_p2_0
q_p2		.req	q_p2_0
v_data		.req	v_p1_1
q_data		.req	q_p1_1
v_data_lo	.req	v_p1_2
v_data_hi	.req	v_p1_3
132
/*
 * gf_2vect_dot_prod_neon — compute two GF(2^8) dot products over x_vec
 * source buffers, writing len bytes to dest[0] and dest[1].
 *
 * In:   x0 = len   bytes per buffer (must be >= 16)
 *       x1 = vec   number of source buffers
 *       x2 = tbl   nibble lookup tables, 32 B per source per destination
 *       x3 = src   array of vec source pointers
 *       x4 = dest  array of 2 destination pointers
 * Out:  w0 = 0 on success, 1 if len < 16
 *
 * Each byte product is formed with two 4-bit TBL lookups (low and high
 * nibble) XORed together; products across sources are XOR-accumulated.
 * Main loop handles 128 B per iteration; a 16 B loop handles the rest,
 * with a final overlapping chunk for a sub-16-byte tail.
 */
gf_2vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.Lreturn_fail

	movi	v_mask0f.16b, #0x0f	/* nibble mask for TBL index split */
	mov	x_pos, #0
	lsl	x_vec, x_vec, #3	/* vec count -> byte size of src ptr array */
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]

.Lloop128_init:
	/* less than 128 bytes, goto Lloop16_init */
	cmp	x_len, #128
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack (callee-saved halves, AAPCS64) */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias len by -128 so the loop test is "x_pos <= len-128" */
	sub	x_len, x_len, #128

.Lloop128:
	/* zero the 2 x 8 accumulators for this 128-byte stripe */
	movi	v_p1_0.16b, #0
	movi	v_p1_1.16b, #0
	movi	v_p1_2.16b, #0
	movi	v_p1_3.16b, #0
	movi	v_p1_4.16b, #0
	movi	v_p1_5.16b, #0
	movi	v_p1_6.16b, #0
	movi	v_p1_7.16b, #0

	movi	v_p2_0.16b, #0
	movi	v_p2_1.16b, #0
	movi	v_p2_2.16b, #0
	movi	v_p2_3.16b, #0
	movi	v_p2_4.16b, #0
	movi	v_p2_5.16b, #0
	movi	v_p2_6.16b, #0
	movi	v_p2_7.16b, #0

	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl, x_vec, lsl #2	/* tbl2 = tbl + vec*32 */
	mov	x_vec_i, #0

.Lloop128_vects:
	/* next source buffer, positioned at the current stripe */
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	add	x_ptr, x_ptr, x_pos

	ldp	q_data_0, q_data_1, [x_ptr], #32
	ldp	q_data_2, q_data_3, [x_ptr], #32

	/* per-source tables for dest1 and dest2 (32 B each, lo+hi) */
	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_data_4, q_data_5, [x_ptr], #32
	ldp	q_data_6, q_data_7, [x_ptr], #32
	prfm	pldl1strm, [x_ptr]	/* streaming data: read once */
	prfm	pldl1keep, [x_tbl1]	/* tables are revisited: keep */
	prfm	pldl1keep, [x_tbl2]

	/* data_0: p1 ^= gf_mul(data, gft1); p2 ^= gf_mul(data, gft2),
	 * each product = TBL(lo_tbl, lo_nibble) ^ TBL(hi_tbl, hi_nibble) */
	and	v_tmp1.16b, v_data_0.16b, v_mask0f.16b
	ushr	v_data_0.16b, v_data_0.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
	eor	v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
	eor	v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

	/* data_1 */
	and	v_tmp1.16b, v_data_1.16b, v_mask0f.16b
	ushr	v_data_1.16b, v_data_1.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
	eor	v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
	eor	v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
	eor	v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
	eor	v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

	/* data_2 */
	and	v_tmp1.16b, v_data_2.16b, v_mask0f.16b
	ushr	v_data_2.16b, v_data_2.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
	eor	v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
	eor	v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
	eor	v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
	eor	v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

	/* data_3 */
	and	v_tmp1.16b, v_data_3.16b, v_mask0f.16b
	ushr	v_data_3.16b, v_data_3.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
	eor	v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
	eor	v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
	eor	v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
	eor	v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

	/* data_4 */
	and	v_tmp1.16b, v_data_4.16b, v_mask0f.16b
	ushr	v_data_4.16b, v_data_4.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_4.16b
	eor	v_p1_4.16b, v_tmp1_lo.16b, v_p1_4.16b
	eor	v_p1_4.16b, v_p1_4.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_4.16b
	eor	v_p2_4.16b, v_tmp1_lo.16b, v_p2_4.16b
	eor	v_p2_4.16b, v_p2_4.16b, v_tmp1_hi.16b

	/* data_5 */
	and	v_tmp1.16b, v_data_5.16b, v_mask0f.16b
	ushr	v_data_5.16b, v_data_5.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_5.16b
	eor	v_p1_5.16b, v_tmp1_lo.16b, v_p1_5.16b
	eor	v_p1_5.16b, v_p1_5.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_5.16b
	eor	v_p2_5.16b, v_tmp1_lo.16b, v_p2_5.16b
	eor	v_p2_5.16b, v_p2_5.16b, v_tmp1_hi.16b

	/* data_6 */
	and	v_tmp1.16b, v_data_6.16b, v_mask0f.16b
	ushr	v_data_6.16b, v_data_6.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_6.16b
	eor	v_p1_6.16b, v_tmp1_lo.16b, v_p1_6.16b
	eor	v_p1_6.16b, v_p1_6.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_6.16b
	eor	v_p2_6.16b, v_tmp1_lo.16b, v_p2_6.16b
	eor	v_p2_6.16b, v_p2_6.16b, v_tmp1_hi.16b

	/* data_7 */
	and	v_tmp1.16b, v_data_7.16b, v_mask0f.16b
	ushr	v_data_7.16b, v_data_7.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_7.16b
	eor	v_p1_7.16b, v_tmp1_lo.16b, v_p1_7.16b
	eor	v_p1_7.16b, v_p1_7.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_7.16b
	eor	v_p2_7.16b, v_tmp1_lo.16b, v_p2_7.16b
	eor	v_p2_7.16b, v_p2_7.16b, v_tmp1_hi.16b

	cmp	x_vec_i, x_vec
	blt	.Lloop128_vects

.Lloop128_vects_end:
	/* store this 128-byte stripe of both results */
	add	x_ptr, x_dest1, x_pos
	stp	q_p1_0, q_p1_1, [x_ptr], #32
	stp	q_p1_2, q_p1_3, [x_ptr], #32
	stp	q_p1_4, q_p1_5, [x_ptr], #32
	stp	q_p1_6, q_p1_7, [x_ptr]

	add	x_ptr, x_dest2, x_pos
	stp	q_p2_0, q_p2_1, [x_ptr], #32
	stp	q_p2_2, q_p2_3, [x_ptr], #32
	stp	q_p2_4, q_p2_5, [x_ptr], #32
	stp	q_p2_6, q_p2_7, [x_ptr]

	add	x_pos, x_pos, #128
	cmp	x_pos, x_len	/* x_len is biased: loop while pos <= len-128 */
	ble	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	add	x_len, x_len, #128	/* undo the -128 bias */
	cmp	x_pos, x_len
	beq	.Lreturn_pass		/* length was a multiple of 128: done */

.Lloop16_init:
	/* bias len by -16 so the loop test is "x_pos <= len-16" */
	sub	x_len, x_len, #16
	cmp	x_pos, x_len
	bgt	.Llessthan16_init	/* remaining tail is < 16 bytes */

.Lloop16:
	movi	v_p1.16b, #0
	movi	v_p2.16b, #0
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl, x_vec, lsl #2	/* tbl2 = tbl + vec*32 */
	mov	x_vec_i, #0

.Lloop16_vects:
	ldr	x_ptr, [x_src, x_vec_i]
	ldr	q_data, [x_ptr, x_pos]
	add	x_vec_i, x_vec_i, #8

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	eor	v_p1.16b, v_tmp1_lo.16b, v_p1.16b
	eor	v_p1.16b, v_p1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	eor	v_p2.16b, v_tmp1_lo.16b, v_p2.16b
	eor	v_p2.16b, v_p2.16b, v_tmp1_hi.16b

	cmp	x_vec_i, x_vec
	bne	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p1, [x_dest1, x_pos]
	str	q_p2, [x_dest2, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len
	ble	.Lloop16

.Lloop16_end:
	/* if pos overshot biased len by exactly 16, all len bytes are done */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.Lreturn_pass

.Llessthan16_init:
	/* sub-16-byte tail: redo the final 16-byte chunk, overlapping the
	 * bytes already written, so the last tail bytes get covered too */
	mov	x_pos, x_len
	b	.Lloop16

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret

	.size	gf_2vect_dot_prod_neon, . - gf_2vect_dot_prod_neon
400