/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/

/* Code section; export the routine and mark its symbol as an ELF function. */
.text
.global gf_6vect_mad_neon
.type gf_6vect_mad_neon, %function


/*
 * General-purpose register aliases for gf_6vect_mad_neon.
 * Several aliases deliberately share one physical register; each such case
 * is noted below.
 */

/* arguments */
x_len		.req	x0
x_vec		.req	x1
x_vec_i		.req	x2
x_tbl		.req	x3
x_src		.req	x4
x_dest		.req	x5

/* returns */
w_ret		.req	w0

/* local variables */
x_src_end	.req	x6
x_dest1		.req	x7
x_dest2		.req	x8
x_dest3		.req	x9
x_dest4		.req	x10
x_dest5		.req	x11
/* shares x5 with x_dest: the pointer array is not needed once loaded */
x_dest6		.req	x_dest
x_tmp		.req	x12
x_tbl1		.req	x13
x_tbl2		.req	x14
x_tbl3		.req	x15
x_tbl4		.req	x16
x_tbl5		.req	x17
/* shares x3 with x_tbl: the table base is not needed once tbl1..5 are set */
x_tbl6		.req	x_tbl
/*
 * Shares x0 with x_len / w_ret. x18 is the AAPCS64 platform register and
 * must not be used by portable code. x_len is dead by the time x_const is
 * written, and w_ret is only written afterwards.
 */
x_const		.req	x0

/*
 * Vector register aliases for gf_6vect_mad_neon.
 * v_* names are used with an arrangement suffix (.16b); q_* names are the
 * same registers as 128-bit load/store operands.
 */

/* vectors */
v_mask0f	.req	v0
v_tmp_lo	.req	v1
v_tmp_hi	.req	v2
v_tmp		.req	v3
q_tmp		.req	q3

/* lookup tables 1..3: low-nibble half (_lo) and high-nibble half (_hi) */
v_gft1_lo	.req	v4
v_gft1_hi	.req	v5
v_gft2_lo	.req	v6
v_gft2_hi	.req	v7
v_gft3_lo	.req	v16
v_gft3_hi	.req	v17
q_gft1_lo	.req	q4
q_gft1_hi	.req	q5
q_gft2_lo	.req	q6
q_gft2_hi	.req	q7
q_gft3_lo	.req	q16
q_gft3_hi	.req	q17

/*
 * Table 4 has its own registers. Tables 5 and 6 share registers with
 * tables 2 and 3, so the code reloads tables 2/3 and 5/6 on every pass.
 */
v_gft4_lo	.req	v18
v_gft4_hi	.req	v19
q_gft4_lo	.req	q18
q_gft4_hi	.req	q19
v_gft5_lo	.req	v_gft2_lo
v_gft5_hi	.req	v_gft2_hi
q_gft5_lo	.req	q_gft2_lo
q_gft5_hi	.req	q_gft2_hi
v_gft6_lo	.req	v_gft3_lo
v_gft6_hi	.req	v_gft3_hi
q_gft6_lo	.req	q_gft3_lo
q_gft6_hi	.req	q_gft3_hi

/*
 * Source data for the 64-byte loop (four 16-byte lanes). v8-v15 have
 * callee-saved low halves, which the routine saves around that loop.
 */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

/* low nibbles get their own registers; high nibbles overwrite the data */
v_data_0_lo	.req	v12
v_data_1_lo	.req	v13
v_data_2_lo	.req	v14
v_data_3_lo	.req	v15
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3

/* destination accumulators for dest1..3, four 16-byte lanes each */
v_d1_0		.req	v20
v_d1_1		.req	v21
v_d1_2		.req	v22
v_d1_3		.req	v23
v_d2_0		.req	v24
v_d2_1		.req	v25
v_d2_2		.req	v26
v_d2_3		.req	v27
v_d3_0		.req	v28
v_d3_1		.req	v29
v_d3_2		.req	v30
v_d3_3		.req	v31
q_d1_0		.req	q20
q_d1_1		.req	q21
q_d1_2		.req	q22
q_d1_3		.req	q23
q_d2_0		.req	q24
q_d2_1		.req	q25
q_d2_2		.req	q26
q_d2_3		.req	q27
q_d3_0		.req	q28
q_d3_1		.req	q29
q_d3_2		.req	q30
q_d3_3		.req	q31

/* dest4..6 accumulators share the dest1..3 registers (used after the store) */
v_d4_0		.req	v_d1_0
v_d4_1		.req	v_d1_1
v_d4_2		.req	v_d1_2
v_d4_3		.req	v_d1_3
q_d4_0		.req	q_d1_0
q_d4_1		.req	q_d1_1
q_d4_2		.req	q_d1_2
q_d4_3		.req	q_d1_3
v_d5_0		.req	v_d2_0
v_d5_1		.req	v_d2_1
v_d5_2		.req	v_d2_2
v_d5_3		.req	v_d2_3
q_d5_0		.req	q_d2_0
q_d5_1		.req	q_d2_1
q_d5_2		.req	q_d2_2
q_d5_3		.req	q_d2_3
v_d6_0		.req	v_d3_0
v_d6_1		.req	v_d3_1
v_d6_2		.req	v_d3_2
v_d6_3		.req	v_d3_3
q_d6_0		.req	q_d3_0
q_d6_1		.req	q_d3_1
q_d6_2		.req	q_d3_2
q_d6_3		.req	q_d3_3

/*
 * Source data for the 16-byte loop and the tail. These overlap lanes 1..3
 * of the dest1/dest4 accumulators, which those paths do not use (only the
 * _0 lanes are touched there).
 */
v_data		.req	v21
q_data		.req	q21
v_data_lo	.req	v22
v_data_hi	.req	v23

/*
 * gf_6vect_mad_neon
 *
 * For every source byte s = src[i] and every destination k = 1..6:
 *
 *	dest_k[i] ^= tbl_k[s & 0x0f] ^ tbl_k[16 + (s >> 4)]
 *
 * where tbl_k is the 32-byte table at
 *
 *	x_tbl + 32 * (x_vec_i + (k - 1) * x_vec)
 *
 * (presumably a GF(2^8) multiply-accumulate, going by the routine name;
 * the code itself only performs the nibble lookups and XORs shown above).
 *
 * In:    x_len   = number of bytes to process
 *        x_vec   = distance between consecutive tables, in 32-byte units
 *        x_vec_i = index of the first table, in 32-byte units
 *        x_tbl   = base address of the tables
 *        x_src   = source buffer
 *        x_dest  = address of six consecutive 8-byte destination pointers
 * Out:   w_ret   = 0 on success, 1 if x_len < 16 (nothing is written)
 * Clobb: x1-x17, the register bound to x_const, v0-v7, v16-v31, flags,
 *        and v8-v15 when x_len >= 64 (only d8-d15, the low halves, are
 *        saved and restored).
 * Stack: 64 bytes, only while the 64-byte loop runs.
 *
 * Processing order: 64-byte blocks, then 16-byte blocks, then one final
 * 16-byte window that ends exactly at the end of the buffers. That window
 * overlaps bytes already processed; a byte mask suppresses them.
 *
 * NOTE(review): x_len, x_vec and x_vec_i are used as full 64-bit values.
 * If the C prototype declares them as 32-bit ints, AAPCS64 leaves the upper
 * halves unspecified - confirm against the caller.
 */
gf_6vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.Lreturn_fail

	movi	v_mask0f.16b, #0x0f
	/* tables are 32 bytes each: turn index and stride into byte offsets */
	lsl	x_vec_i, x_vec_i, #5
	lsl	x_vec, x_vec, #5
	add	x_tbl1, x_tbl, x_vec_i
	add	x_tbl2, x_tbl1, x_vec
	add	x_tbl3, x_tbl2, x_vec
	add	x_tbl4, x_tbl3, x_vec
	add	x_tbl5, x_tbl4, x_vec
	/* x_tbl6 aliases x_tbl, so it is computed last */
	add	x_tbl6, x_tbl5, x_vec
	add	x_src_end, x_src, x_len
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]
	ldr	x_dest5, [x_dest, #8*4]
	/* x_dest6 aliases x_dest, so it is loaded last */
	ldr	x_dest6, [x_dest, #8*5]
	/* tables 1 and 4 own their registers and stay loaded throughout */
	ldr	q_gft1_lo, [x_tbl1]
	ldr	q_gft1_hi, [x_tbl1, #16]
	ldr	q_gft4_lo, [x_tbl4]
	ldr	q_gft4_hi, [x_tbl4, #16]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* x_src_end = last position from which a full 64-byte block fits */
	sub	x_src_end, x_src_end, #64

.Lloop64:
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	add	x_src, x_src, #64

	ldr	q_d1_0, [x_dest1, #16*0]
	ldr	q_d1_1, [x_dest1, #16*1]
	ldr	q_d1_2, [x_dest1, #16*2]
	ldr	q_d1_3, [x_dest1, #16*3]

	ldr	q_d2_0, [x_dest2, #16*0]
	ldr	q_d2_1, [x_dest2, #16*1]
	ldr	q_d2_2, [x_dest2, #16*2]
	ldr	q_d2_3, [x_dest2, #16*3]

	ldr	q_d3_0, [x_dest3, #16*0]
	ldr	q_d3_1, [x_dest3, #16*1]
	ldr	q_d3_2, [x_dest3, #16*2]
	ldr	q_d3_3, [x_dest3, #16*3]

	/* tables 2/3 share registers with 5/6: reload them every iteration */
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

	/* split every source byte into its low and high nibble */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

	/* the high nibbles overwrite the raw data registers */
	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* dest1 */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	eor	v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
	eor	v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	eor	v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
	eor	v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	eor	v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
	eor	v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b

	/* dest2 */
	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
	eor	v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
	eor	v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
	eor	v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
	eor	v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
	eor	v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
	eor	v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b

	/* dest3 */
	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
	eor	v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
	eor	v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
	eor	v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
	eor	v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
	eor	v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
	eor	v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

	/* write back dest1..3; their registers are reused for dest4..6 below */
	str	q_d1_0, [x_dest1, #16*0]
	str	q_d1_1, [x_dest1, #16*1]
	str	q_d1_2, [x_dest1, #16*2]
	str	q_d1_3, [x_dest1, #16*3]
	add	x_dest1, x_dest1, #64

	str	q_d2_0, [x_dest2, #16*0]
	str	q_d2_1, [x_dest2, #16*1]
	str	q_d2_2, [x_dest2, #16*2]
	str	q_d2_3, [x_dest2, #16*3]
	add	x_dest2, x_dest2, #64

	str	q_d3_0, [x_dest3, #16*0]
	str	q_d3_1, [x_dest3, #16*1]
	str	q_d3_2, [x_dest3, #16*2]
	str	q_d3_3, [x_dest3, #16*3]
	add	x_dest3, x_dest3, #64

	ldr	q_d4_0, [x_dest4, #16*0]
	ldr	q_d4_1, [x_dest4, #16*1]
	ldr	q_d4_2, [x_dest4, #16*2]
	ldr	q_d4_3, [x_dest4, #16*3]

	ldr	q_d5_0, [x_dest5, #16*0]
	ldr	q_d5_1, [x_dest5, #16*1]
	ldr	q_d5_2, [x_dest5, #16*2]
	ldr	q_d5_3, [x_dest5, #16*3]

	ldr	q_d6_0, [x_dest6, #16*0]
	ldr	q_d6_1, [x_dest6, #16*1]
	ldr	q_d6_2, [x_dest6, #16*2]
	ldr	q_d6_3, [x_dest6, #16*3]

	/* tables 5/6 replace tables 2/3 in the shared registers */
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]
	ldr	q_gft6_lo, [x_tbl6]
	ldr	q_gft6_hi, [x_tbl6, #16]

	/* dest4 */
	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
	eor	v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
	eor	v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
	eor	v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
	eor	v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
	eor	v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
	eor	v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
	eor	v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b

	/* dest5 */
	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
	eor	v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
	eor	v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
	eor	v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
	eor	v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
	eor	v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
	eor	v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
	eor	v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
	eor	v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b

	/* dest6 */
	tbl	v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_0_hi.16b
	eor	v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
	eor	v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_1_hi.16b
	eor	v_d6_1.16b, v_tmp_lo.16b, v_d6_1.16b
	eor	v_d6_1.16b, v_d6_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_2_hi.16b
	eor	v_d6_2.16b, v_tmp_lo.16b, v_d6_2.16b
	eor	v_d6_2.16b, v_d6_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_3_hi.16b
	eor	v_d6_3.16b, v_tmp_lo.16b, v_d6_3.16b
	eor	v_d6_3.16b, v_d6_3.16b, v_tmp_hi.16b

	str	q_d4_0, [x_dest4, #16*0]
	str	q_d4_1, [x_dest4, #16*1]
	str	q_d4_2, [x_dest4, #16*2]
	str	q_d4_3, [x_dest4, #16*3]
	add	x_dest4, x_dest4, #64

	str	q_d5_0, [x_dest5, #16*0]
	str	q_d5_1, [x_dest5, #16*1]
	str	q_d5_2, [x_dest5, #16*2]
	str	q_d5_3, [x_dest5, #16*3]
	add	x_dest5, x_dest5, #64

	str	q_d6_0, [x_dest6, #16*0]
	str	q_d6_1, [x_dest6, #16*1]
	str	q_d6_2, [x_dest6, #16*2]
	str	q_d6_3, [x_dest6, #16*3]
	add	x_dest6, x_dest6, #64

	/* continue while another full 64-byte block fits (unsigned compare) */
	cmp	x_src, x_src_end
	bls	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	/* undo the 64-byte bias: x_src_end is the true end of src again */
	add	x_src_end, x_src_end, #64

.Lloop16_init:
	/* x_src_end = start of the last full 16-byte window */
	sub	x_src_end, x_src_end, #16
	cmp	x_src, x_src_end
	bhi	.Llessthan16_init

.Lloop16:
	ldr	q_data, [x_src]

	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	/* dest1 */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	/* dest2 */
	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	/* dest3 */
	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	ldr	q_d4_0, [x_dest4]
	ldr	q_d5_0, [x_dest5]
	ldr	q_d6_0, [x_dest6]
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]
	ldr	q_gft6_lo, [x_tbl6]
	ldr	q_gft6_hi, [x_tbl6, #16]

	/* dest4 */
	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	eor	v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	/* dest5 */
	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
	eor	v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
	eor	v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

	/* dest6 */
	tbl	v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
	eor	v_d6_0.16b, v_tmp_lo.16b, v_d6_0.16b
	eor	v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b

	str	q_d4_0, [x_dest4]
	str	q_d5_0, [x_dest5]
	str	q_d6_0, [x_dest6]

	add	x_src, x_src, #16
	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_dest3, x_dest3, #16
	add	x_dest4, x_dest4, #16
	add	x_dest5, x_dest5, #16
	add	x_dest6, x_dest6, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.Llessthan16_init:
	/*
	 * x_tmp = how far x_src is past the start of the last 16-byte window.
	 * 16 means x_src reached the end of the buffer: nothing is left.
	 */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.Lreturn_pass

.Llessthan16:
	/*
	 * Step back by x_tmp so the final 16-byte window ends exactly at the end
	 * of every buffer. Its first x_tmp bytes were already processed.
	 */
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp
	sub	x_dest3, x_dest3, x_tmp
	sub	x_dest4, x_dest4, x_tmp
	sub	x_dest5, x_dest5, x_tmp
	sub	x_dest6, x_dest6, x_tmp

	/*
	 * PC-relative address of const_tbl (no literal pool, no absolute
	 * relocation in .text). Loading 16 bytes from const_tbl + 16 - x_tmp
	 * yields x_tmp bytes of 0x00 followed by 16 - x_tmp bytes of 0xff.
	 */
	adrp	x_const, const_tbl
	add	x_const, x_const, :lo12:const_tbl
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]

	ldr	q_data, [x_src]
	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	/* dest1: the mask zeroes the product for bytes already accumulated */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	/* dest2 */
	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	/* dest3 */
	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	ldr	q_d4_0, [x_dest4]
	ldr	q_d5_0, [x_dest5]
	ldr	q_d6_0, [x_dest6]
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]
	ldr	q_gft6_lo, [x_tbl6]
	ldr	q_gft6_hi, [x_tbl6, #16]

	/* dest4 */
	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	/* dest5 */
	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

	/* dest6 */
	tbl	v_tmp_lo.16b, {v_gft6_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft6_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d6_0.16b, v_d6_0.16b, v_tmp_hi.16b

	str	q_d4_0, [x_dest4]
	str	q_d5_0, [x_dest5]
	str	q_d6_0, [x_dest6]

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret

.size gf_6vect_mad_neon, .-gf_6vect_mad_neon

/*
 * Byte-mask source for the final partial block: 16 bytes of 0x00 followed
 * by 16 bytes of 0xff. A 16-byte load starting at const_tbl + 16 - n gives
 * n zero bytes followed by 16 - n bytes of 0xff. The table is only ever
 * read, so it is placed in read-only data.
 */
.section .rodata
.balign 8
const_tbl:
	.dword 0x0000000000000000, 0x0000000000000000
	.dword 0xffffffffffffffff, 0xffffffffffffffff
