/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
/*
 * Target: AArch64 (NEON), GNU as syntax, ELF object format.
 *
 * Everything below up to the function label only gives symbolic names to
 * registers via .req; no code or data is emitted by these directives.
 */
.text

.global gf_vect_dot_prod_neon
.type gf_vect_dot_prod_neon, %function

/* arguments */
x_len		.req	x0	/* byte count; adjusted in place while looping */
x_vec		.req	x1	/* number of sources; scaled by 8 at function entry */
x_tbl		.req	x2	/* lookup tables, 32 bytes consumed per source */
x_src		.req	x3	/* array of 64-bit source buffer pointers */
x_dest1		.req	x4	/* destination buffer */

/* returns */
w_ret		.req	w0	/* shares x0 with x_len */

/* local variables */
x_vec_i		.req	x5	/* byte offset into the x_src pointer array */
x_ptr		.req	x6	/* current source / destination address */
x_pos		.req	x7	/* byte offset of the chunk being processed */
x_tmp		.req	x8	/* scratch for the end-of-loop distance check */
x_tbl1		.req	x9	/* walking copy of x_tbl, reset for every chunk */

/* vectors */
/* table pair of the current source: first 16 bytes, second 16 bytes */
v_gft1_lo	.req	v0
v_gft1_hi	.req	v1
q_gft1_lo	.req	q0
q_gft1_hi	.req	q1
/* constant 0x0f in every byte lane, selects the low nibble */
v_mask0f	.req	v2
q_mask0f	.req	q2

/*
 * Raw source data for the 128-byte loop (8 x 16 bytes).
 * v8-v15 are callee-saved in their low 64 bits under AAPCS64; the function
 * spills d8-d15 before entering the loop that uses these names.
 */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
v_data_4	.req	v12
v_data_5	.req	v13
v_data_6	.req	v14
v_data_7	.req	v15
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11
q_data_4	.req	q12
q_data_5	.req	q13
q_data_6	.req	q14
q_data_7	.req	q15

/* low-nibble indices, then the low-table lookup results */
v_data_0_lo	.req	v16
v_data_1_lo	.req	v17
v_data_2_lo	.req	v18
v_data_3_lo	.req	v19
v_data_4_lo	.req	v20
v_data_5_lo	.req	v21
v_data_6_lo	.req	v22
v_data_7_lo	.req	v23
/*
 * The high-nibble values reuse the raw data registers: the shifted data
 * overwrites the input in place once the low nibble has been extracted.
 */
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3
v_data_4_hi	.req	v_data_4
v_data_5_hi	.req	v_data_5
v_data_6_hi	.req	v_data_6
v_data_7_hi	.req	v_data_7

/* XOR accumulators for the 128-byte loop (8 x 16 bytes of output) */
v_p0		.req	v24
v_p1		.req	v25
v_p2		.req	v26
v_p3		.req	v27
v_p4		.req	v28
v_p5		.req	v29
v_p6		.req	v30
v_p7		.req	v31
q_p0		.req	q24
q_p1		.req	q25
q_p2		.req	q26
q_p3		.req	q27
q_p4		.req	q28
q_p5		.req	q29
q_p6		.req	q30
q_p7		.req	q31

/*
 * Names for the 16-byte loop. They reuse v24-v27, which is safe because the
 * 16-byte loop only runs outside the 128-byte loop.
 */
v_p		.req	v_p0
q_p		.req	q_p0
v_data		.req	v_p1
q_data		.req	q_p1
v_data_lo	.req	v_p2
v_data_hi	.req	v_p3
117
/*
 * gf_vect_dot_prod_neon
 *
 * For every byte position, each source byte is split into its low and high
 * nibble, each nibble indexes a 16-byte table (TBL), and the two results are
 * XORed into an accumulator shared by all sources. The accumulated bytes are
 * written to the destination. (Presumably the tables encode multiplication
 * by a GF(2^8) constant per source -- confirm against the table generator.)
 *
 * ABI:   AAPCS64
 * In:    x0 = x_len    number of bytes to produce, must be >= 16
 *        x1 = x_vec    number of source buffers; the inner loops test their
 *                      condition at the bottom, so at least one source is
 *                      always read -- callers must pass x_vec >= 1
 *        x2 = x_tbl    tables, 32 bytes per source: 16 bytes indexed by the
 *                      low nibble followed by 16 bytes indexed by the high
 *                      nibble
 *        x3 = x_src    array of x_vec 64-bit pointers to the source buffers
 *        x4 = x_dest1  destination buffer, x_len bytes
 * Out:   w0 = 0 on success, 1 if x_len < 16 (nothing is written)
 * Clobb: x0, x1, x5-x9, v0-v2, v16-v31, flags.
 *        v8-v15 are used on the 128-byte path; only d8-d15 (the callee-saved
 *        low halves) are saved and restored.
 * Stack: 64 bytes, only on the 128-byte path.
 *
 * Tail handling: when x_len is not a multiple of 16, the final 16 bytes are
 * computed at offset x_len - 16, overlapping bytes already written.
 *
 * NOTE(review): x_len and x_vec are consumed as full 64-bit signed values.
 * If the C prototype declares them as 32-bit int, the upper halves of x0/x1
 * are unspecified under AAPCS64 -- confirm the prototype and the callers.
 */
gf_vect_dot_prod_neon:
	/* fewer than 16 bytes cannot fill a single vector: fail */
	cmp	x_len, #16
	blt	.Lreturn_fail

	movi	v_mask0f.16b, #0x0f
	mov	x_pos, #0

	/* x_vec = source count * 8 = end byte offset into the pointer array */
	lsl	x_vec, x_vec, #3

.Lloop128_init:
	/* fewer than 128 bytes: go straight to the 16-byte loop */
	cmp	x_len, #128
	blt	.Lloop16_init

	/* the 128-byte loop uses v8-v15: save callee-saved d8 ~ d15 */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* bias x_len so "x_pos <= x_len" means a full 128-byte chunk remains */
	sub	x_len, x_len, #128

.Lloop128:
	/* clear the eight accumulators for this 128-byte chunk */
	movi	v_p0.16b, #0
	movi	v_p1.16b, #0
	movi	v_p2.16b, #0
	movi	v_p3.16b, #0
	movi	v_p4.16b, #0
	movi	v_p5.16b, #0
	movi	v_p6.16b, #0
	movi	v_p7.16b, #0

	/* restart at the first source and its table pair */
	mov	x_tbl1, x_tbl
	mov	x_vec_i, #0

.Lloop128_vects:
	/* x_ptr = src[i] + x_pos */
	ldr	x_ptr, [x_src, x_vec_i]
	add	x_vec_i, x_vec_i, #8
	add	x_ptr, x_ptr, x_pos

	/* table pair of this source; x_tbl1 advances to the next source */
	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32

	/* 128 bytes of source data; x_ptr is left at chunk offset 96 */
	ldp	q_data_0, q_data_1, [x_ptr], #32
	ldp	q_data_2, q_data_3, [x_ptr], #32
	ldp	q_data_4, q_data_5, [x_ptr], #32
	ldp	q_data_6, q_data_7, [x_ptr]

	prfm	pldl1keep, [x_tbl1]
	prfm	pldl1strm, [x_ptr]

	/* low nibbles -> v_data_N_lo */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b
	and	v_data_4_lo.16b, v_data_4.16b, v_mask0f.16b
	and	v_data_5_lo.16b, v_data_5.16b, v_mask0f.16b
	and	v_data_6_lo.16b, v_data_6.16b, v_mask0f.16b
	and	v_data_7_lo.16b, v_data_7.16b, v_mask0f.16b

	/* high nibbles, in place (v_data_N_hi aliases v_data_N) */
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4
	ushr	v_data_4_hi.16b, v_data_4.16b, #4
	ushr	v_data_5_hi.16b, v_data_5.16b, #4
	ushr	v_data_6_hi.16b, v_data_6.16b, #4
	ushr	v_data_7_hi.16b, v_data_7.16b, #4

	/* look the low nibbles up in the first table */
	tbl	v_data_0_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_data_1_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_data_2_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_data_3_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_data_4_lo.16b, {v_gft1_lo.16b}, v_data_4_lo.16b
	tbl	v_data_5_lo.16b, {v_gft1_lo.16b}, v_data_5_lo.16b
	tbl	v_data_6_lo.16b, {v_gft1_lo.16b}, v_data_6_lo.16b
	tbl	v_data_7_lo.16b, {v_gft1_lo.16b}, v_data_7_lo.16b

	/* look the high nibbles up in the second table */
	tbl	v_data_0_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	tbl	v_data_1_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	tbl	v_data_2_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	tbl	v_data_3_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	tbl	v_data_4_hi.16b, {v_gft1_hi.16b}, v_data_4_hi.16b
	tbl	v_data_5_hi.16b, {v_gft1_hi.16b}, v_data_5_hi.16b
	tbl	v_data_6_hi.16b, {v_gft1_hi.16b}, v_data_6_hi.16b
	tbl	v_data_7_hi.16b, {v_gft1_hi.16b}, v_data_7_hi.16b

	/* accumulate: p ^= lo_result ^ hi_result */
	eor	v_p0.16b, v_data_0_lo.16b, v_p0.16b
	eor	v_p0.16b, v_p0.16b, v_data_0_hi.16b
	eor	v_p1.16b, v_data_1_lo.16b, v_p1.16b
	eor	v_p1.16b, v_p1.16b, v_data_1_hi.16b
	eor	v_p2.16b, v_data_2_lo.16b, v_p2.16b
	eor	v_p2.16b, v_p2.16b, v_data_2_hi.16b
	eor	v_p3.16b, v_data_3_lo.16b, v_p3.16b
	eor	v_p3.16b, v_p3.16b, v_data_3_hi.16b
	eor	v_p4.16b, v_data_4_lo.16b, v_p4.16b
	eor	v_p4.16b, v_p4.16b, v_data_4_hi.16b
	eor	v_p5.16b, v_data_5_lo.16b, v_p5.16b
	eor	v_p5.16b, v_p5.16b, v_data_5_hi.16b
	eor	v_p6.16b, v_data_6_lo.16b, v_p6.16b
	eor	v_p6.16b, v_p6.16b, v_data_6_hi.16b
	eor	v_p7.16b, v_data_7_lo.16b, v_p7.16b
	eor	v_p7.16b, v_p7.16b, v_data_7_hi.16b

	/* next source, until the scaled source count is reached */
	cmp	x_vec_i, x_vec
	blt	.Lloop128_vects

.Lloop128_vects_end:
	/* write the 128 accumulated bytes to dest + x_pos */
	add	x_ptr, x_dest1, x_pos
	stp	q_p0, q_p1, [x_ptr], #32
	stp	q_p2, q_p3, [x_ptr], #32
	stp	q_p4, q_p5, [x_ptr], #32
	stp	q_p6, q_p7, [x_ptr]

	add	x_pos, x_pos, #128
	cmp	x_pos, x_len
	ble	.Lloop128

.Lloop128_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64

	/* undo the bias; finished if the length was a multiple of 128 */
	add	x_len, x_len, #128
	cmp	x_pos, x_len
	beq	.Lreturn_pass

.Lloop16_init:
	/* bias x_len so "x_pos <= x_len" means a full 16-byte chunk remains */
	sub	x_len, x_len, #16
	cmp	x_pos, x_len
	bgt	.Llessthan16_init

.Lloop16:
	movi	v_p.16b, #0
	mov	x_tbl1, x_tbl
	mov	x_vec_i, #0

.Lloop16_vects:
	/* 16 bytes from src[i] + x_pos */
	ldr	x_ptr, [x_src, x_vec_i]
	ldr	q_data, [x_ptr, x_pos]
	add	x_vec_i, x_vec_i, #8

	ldp	q_gft1_lo, q_gft1_hi, [x_tbl1], #32

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_data_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_data_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_p.16b, v_data_lo.16b, v_p.16b
	eor	v_p.16b, v_p.16b, v_data_hi.16b

	cmp	x_vec_i, x_vec
	blt	.Lloop16_vects

.Lloop16_vects_end:
	str	q_p, [x_dest1, x_pos]
	add	x_pos, x_pos, #16
	cmp	x_pos, x_len
	ble	.Lloop16

.Lloop16_end:
	/* x_len is still biased by 16: a distance of 16 means x_pos hit the end */
	sub	x_tmp, x_pos, x_len
	cmp	x_tmp, #16
	beq	.Lreturn_pass

.Llessthan16_init:
	/* fewer than 16 bytes left: redo the final 16 bytes at offset len - 16 */
	mov	x_pos, x_len
	b	.Lloop16

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret

.size gf_vect_dot_prod_neon, .-gf_vect_dot_prod_neon
299