1/**************************************************************
2  Copyright (c) 2019 Huawei Technologies Co., Ltd.
3
4  Redistribution and use in source and binary forms, with or without
5  modification, are permitted provided that the following conditions
6  are met:
7    * Redistributions of source code must retain the above copyright
8      notice, this list of conditions and the following disclaimer.
9    * Redistributions in binary form must reproduce the above copyright
10      notice, this list of conditions and the following disclaimer in
11      the documentation and/or other materials provided with the
12      distribution.
13    * Neither the name of Huawei Corporation nor the names of its
14      contributors may be used to endorse or promote products derived
15      from this software without specific prior written permission.
16
17  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28**********************************************************************/
29
.text

/*
 * NOTE(review): C prototype inferred from register usage below —
 * presumably:
 *   int gf_3vect_dot_prod_neon(int len, int vlen, unsigned char *gftbls,
 *                              unsigned char **src, unsigned char **dest);
 * confirm against the header that declares this function.
 */
.global gf_3vect_dot_prod_neon
.type gf_3vect_dot_prod_neon, %function

35
/* arguments (AAPCS64: first integer arguments arrive in x0-x7) */
x_len		.req	x0	/* byte count to process; must be >= 16 */
x_vec		.req	x1	/* source-vector count; rescaled to vec*8 at entry */
x_tbl		.req	x2	/* GF(2^8) multiply tables, 32 bytes per (src, dest) pair */
x_src		.req	x3	/* array of source buffer pointers */
x_dest		.req	x4	/* array of 3 destination buffer pointers */

/* returns */
w_ret		.req	w0	/* 0 = pass, 1 = fail (len < 16) */

/* local variables */
x_vec_i		.req	x5	/* byte offset into the src pointer array */
x_ptr		.req	x6	/* current data pointer (src or dest) */
x_pos		.req	x7	/* current byte position within the buffers */
x_tmp		.req	x8	/* scratch for the tail-length test */
x_dest1		.req	x9	/* dest[0] */
x_tbl1		.req	x10	/* walking pointer through dest1's table set */
x_dest2		.req	x11	/* dest[1] */
x_tbl2		.req	x12	/* walking pointer through dest2's table set */
x_dest3		.req	x13	/* dest[2] */
x_tbl3		.req	x14	/* walking pointer through dest3's table set */

/* vectors */
/* per-destination table halves: *_lo is indexed by a byte's low
   nibble, *_hi by its high nibble (see the TBL sequences below) */
v_gft1_lo	.req	v0
v_gft1_hi	.req	v1
v_gft2_lo	.req	v2
v_gft2_hi	.req	v3
v_gft3_lo	.req	v4
v_gft3_hi	.req	v5
q_gft1_lo	.req	q0
q_gft1_hi	.req	q1
q_gft2_lo	.req	q2
q_gft2_hi	.req	q3
q_gft3_lo	.req	q4
q_gft3_hi	.req	q5

v_mask0f	.req	v6	/* 0x0f in every lane: nibble mask for TBL */
q_mask0f	.req	q6
v_tmp1		.req	v7	/* low-nibble scratch in the 64-byte loop */

/* input data for the 64-byte loop; v8-v13 have callee-saved low
   halves (AAPCS64) and are spilled to the stack before first use */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

v_tmp1_lo	.req	v12	/* TBL lookup results prior to accumulation */
v_tmp1_hi	.req	v13

/* accumulators: v_pJ_K is the K-th 16-byte slice for destination J */
v_p1_0		.req	v20
v_p1_1		.req	v21
v_p1_2		.req	v22
v_p1_3		.req	v23
v_p2_0		.req	v24
v_p2_1		.req	v25
v_p2_2		.req	v26
v_p2_3		.req	v27
v_p3_0		.req	v28
v_p3_1		.req	v29
v_p3_2		.req	v30
v_p3_3		.req	v31

q_p1_0		.req	q20
q_p1_1		.req	q21
q_p1_2		.req	q22
q_p1_3		.req	q23
q_p2_0		.req	q24
q_p2_1		.req	q25
q_p2_2		.req	q26
q_p2_3		.req	q27
q_p3_0		.req	q28
q_p3_1		.req	q29
q_p3_2		.req	q30
q_p3_3		.req	q31

/* 16-byte-loop aliases; safe because that loop only accumulates
   into the *_0 registers, leaving v21-v23 free for data */
v_data		.req	v_p1_1
q_data		.req	q_p1_1
v_data_lo	.req	v_p1_2
v_data_hi	.req	v_p1_3
118
119
/*
 * gf_3vect_dot_prod_neon — GF(2^8) dot product of x_vec source buffers
 * into three destination buffers.
 *
 * In:   x_len  = byte length to process (must be >= 16, else fail)
 *       x_vec  = number of source vectors
 *       x_tbl  = multiply tables, 32 bytes per (src, dest) pair
 *       x_src  = array of x_vec source buffer pointers
 *       x_dest = array of 3 destination buffer pointers
 * Out:  w_ret  = 0 on pass, 1 on fail (len < 16)
 *
 * Each byte product is formed with two TBL lookups — the low nibble
 * indexes a 16-entry *_lo table, the high nibble a 16-entry *_hi
 * table — and the results are XORed into running accumulators.
 * The main loop covers 64 bytes per destination per pass; a 16-byte
 * loop handles the remainder, and a final overlapped 16-byte pass
 * covers any tail that is not a multiple of 16.
 */
gf_3vect_dot_prod_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.Lreturn_fail

	movi	v_mask0f.16b, #0x0f		/* nibble mask for TBL indexing */
	mov	x_pos, #0
	lsl	x_vec, x_vec, #3		/* x_vec = vec * 8, byte size of src ptr array */
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack: AAPCS64 makes the low 64 bits of
	 * v8-v15 callee-saved, and the 64-byte path uses v8-v13 */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias x_len by -64 so "x_pos <= x_len" means >= 64 bytes left */
	sub	x_len, x_len, #64

.Lloop64:
	/* zero the twelve accumulators: 4 x 16 bytes per destination */
	movi	v_p1_0.16b, #0
	movi	v_p1_1.16b, #0
	movi	v_p1_2.16b, #0
	movi	v_p1_3.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p2_1.16b, #0
	movi	v_p2_2.16b, #0
	movi	v_p2_3.16b, #0
	movi	v_p3_0.16b, #0
	movi	v_p3_1.16b, #0
	movi	v_p3_2.16b, #0
	movi	v_p3_3.16b, #0

	/* per-dest table sets are vec*32 bytes apart
	 * (x_vec already holds vec*8, hence the extra lsl #2) */
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl1, x_vec, lsl #2
	add	x_tbl3, x_tbl2, x_vec, lsl #2
	mov	x_vec_i, #0

.Lloop64_vects:
	ldr	x_ptr, [x_src, x_vec_i]		/* next source buffer */
	add	x_vec_i, x_vec_i, #8
	add	x_ptr, x_ptr, x_pos

	ldr	q_data_0, [x_ptr], #16
	ldr	q_data_1, [x_ptr], #16

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32

	ldr	q_data_2, [x_ptr], #16
	ldr	q_data_3, [x_ptr], #16
	prfm	pldl1strm, [x_ptr]
	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1keep, [x_tbl2]
	prfm	pldl1keep, [x_tbl3]

	/* data_0: v_tmp1 = low nibbles, v_data_0 = high nibbles; each
	 * dest's lo/hi tables are looked up and XORed into its
	 * accumulator (same pattern repeats for data_1..data_3) */
	and	v_tmp1.16b, v_data_0.16b, v_mask0f.16b
	ushr	v_data_0.16b, v_data_0.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_0.16b
	eor	v_p1_0.16b, v_tmp1_lo.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_0.16b
	eor	v_p2_0.16b, v_tmp1_lo.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_0.16b
	eor	v_p3_0.16b, v_tmp1_lo.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_tmp1_hi.16b

	/* data_1 */
	and	v_tmp1.16b, v_data_1.16b, v_mask0f.16b
	ushr	v_data_1.16b, v_data_1.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_1.16b
	eor	v_p1_1.16b, v_tmp1_lo.16b, v_p1_1.16b
	eor	v_p1_1.16b, v_p1_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_1.16b
	eor	v_p2_1.16b, v_tmp1_lo.16b, v_p2_1.16b
	eor	v_p2_1.16b, v_p2_1.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_1.16b
	eor	v_p3_1.16b, v_tmp1_lo.16b, v_p3_1.16b
	eor	v_p3_1.16b, v_p3_1.16b, v_tmp1_hi.16b

	/* data_2 */
	and	v_tmp1.16b, v_data_2.16b, v_mask0f.16b
	ushr	v_data_2.16b, v_data_2.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_2.16b
	eor	v_p1_2.16b, v_tmp1_lo.16b, v_p1_2.16b
	eor	v_p1_2.16b, v_p1_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_2.16b
	eor	v_p2_2.16b, v_tmp1_lo.16b, v_p2_2.16b
	eor	v_p2_2.16b, v_p2_2.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_2.16b
	eor	v_p3_2.16b, v_tmp1_lo.16b, v_p3_2.16b
	eor	v_p3_2.16b, v_p3_2.16b, v_tmp1_hi.16b

	/* data_3 */
	and	v_tmp1.16b, v_data_3.16b, v_mask0f.16b
	ushr	v_data_3.16b, v_data_3.16b, #4

	tbl	v_tmp1_lo.16b, {v_gft1_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft1_hi.16b}, v_data_3.16b
	eor	v_p1_3.16b, v_tmp1_lo.16b, v_p1_3.16b
	eor	v_p1_3.16b, v_p1_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft2_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft2_hi.16b}, v_data_3.16b
	eor	v_p2_3.16b, v_tmp1_lo.16b, v_p2_3.16b
	eor	v_p2_3.16b, v_p2_3.16b, v_tmp1_hi.16b

	tbl	v_tmp1_lo.16b, {v_gft3_lo.16b}, v_tmp1.16b
	tbl	v_tmp1_hi.16b, {v_gft3_hi.16b}, v_data_3.16b
	eor	v_p3_3.16b, v_tmp1_lo.16b, v_p3_3.16b
	eor	v_p3_3.16b, v_p3_3.16b, v_tmp1_hi.16b

	cmp	x_vec_i, x_vec			/* all source vectors consumed? */
	blt	.Lloop64_vects

.Lloop64_vects_end:
	/* write 64 accumulated bytes to each destination */
	add	x_ptr, x_dest1, x_pos
	stp	q_p1_0, q_p1_1, [x_ptr], #32
	stp	q_p1_2, q_p1_3, [x_ptr]

	add	x_ptr, x_dest2, x_pos
	stp	q_p2_0, q_p2_1, [x_ptr], #32
	stp	q_p2_2, q_p2_3, [x_ptr]

	add	x_ptr, x_dest3, x_pos
	stp	q_p3_0, q_p3_1, [x_ptr], #32
	stp	q_p3_2, q_p3_3, [x_ptr]

	add	x_pos, x_pos, #64
	cmp	x_pos, x_len
	ble	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	add	x_len, x_len, #64		/* undo the -64 bias */
	cmp	x_pos, x_len
	beq	.Lreturn_pass			/* length was a multiple of 64 */

.Lloop16_init:
	/* bias x_len by -16: loop while x_pos + 16 <= total length */
	sub	x_len, x_len, #16
	cmp	x_pos, x_len
	bgt	.Llessthan16_init

.Lloop16:
	movi	v_p1_0.16b, #0
	movi	v_p2_0.16b, #0
	movi	v_p3_0.16b, #0
	mov	x_tbl1, x_tbl
	add	x_tbl2, x_tbl1, x_vec, lsl #2
	add	x_tbl3, x_tbl2, x_vec, lsl #2
	mov	x_vec_i, #0

.Lloop16_vects:
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	ldr	q_data, [x_ptr, x_pos]

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32
	ldp	q_gft2_lo, q_gft2_hi, [x_tbl2], #32
	ldp	q_gft3_lo, q_gft3_hi, [x_tbl3], #32

	/* same nibble-table multiply as the 64-byte loop, 16 bytes wide */
	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_gft1_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_gft1_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	tbl	v_gft2_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_gft2_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	tbl	v_gft3_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_gft3_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b

	eor	v_p1_0.16b, v_gft1_hi.16b, v_p1_0.16b
	eor	v_p1_0.16b, v_p1_0.16b, v_gft1_lo.16b
	eor	v_p2_0.16b, v_gft2_hi.16b, v_p2_0.16b
	eor	v_p2_0.16b, v_p2_0.16b, v_gft2_lo.16b
	eor	v_p3_0.16b, v_gft3_hi.16b, v_p3_0.16b
	eor	v_p3_0.16b, v_p3_0.16b, v_gft3_lo.16b

	cmp	x_vec_i, x_vec
	bne	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p1_0, [x_dest1, x_pos]
	str	q_p2_0, [x_dest2, x_pos]
	str	q_p3_0, [x_dest3, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len
	ble	.Lloop16

.Lloop16_end:
	/* x_pos - x_len == 16 exactly when the whole length was consumed */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.Lreturn_pass

.Llessthan16_init:
	/* tail < 16 bytes: redo the last (overlapping) 16-byte chunk so
	 * it ends exactly at the buffer end */
	mov	x_pos, x_len
	b	.Lloop16

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret

	.size	gf_3vect_dot_prod_neon, . - gf_3vect_dot_prod_neon
359