/**************************************************************
  Copyright (c) 2019 Huawei Technologies Co., Ltd.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
    * Neither the name of Huawei Corporation nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
.text

.global gf_5vect_mad_neon
.type gf_5vect_mad_neon, %function

/*
 * Register aliases used by gf_5vect_mad_neon (AArch64, GNU as syntax).
 *
 * Several aliases deliberately share one physical register.  Each such
 * sharing is called out below because the function body is only correct
 * as long as the two names are never live at the same time.
 */

/* arguments */
x_len		.req	x0	/* number of bytes to process */
x_vec		.req	x1	/* distance between consecutive tables, in 32-byte units */
x_vec_i		.req	x2	/* index of the first table, in 32-byte units */
x_tbl		.req	x3	/* base address of the lookup tables */
x_src		.req	x4	/* source buffer */
x_dest		.req	x5	/* array of five destination pointers */

/* returns */
w_ret		.req	w0	/* 0 = pass, 1 = fail */

/* local variables */
x_src_end	.req	x6
x_dest1		.req	x7
x_dest2		.req	x8
x_dest3		.req	x9
x_dest4		.req	x10
/* shares x5 with x_dest: the pointer array is unreachable once dest5 is loaded */
x_dest5		.req	x_dest
x_tmp		.req	x11
x_tbl1		.req	x12
x_tbl2		.req	x13
x_tbl3		.req	x14
x_tbl4		.req	x15
x_tbl5		.req	x16
x_const		.req	x17

/* vectors */
v_mask0f	.req	v0	/* 0x0f in every byte lane */
v_tmp_lo	.req	v1
v_tmp_hi	.req	v2
v_tmp		.req	v3	/* tail byte mask */
q_tmp		.req	q3

/* per-destination lookup tables: lo = low-nibble table, hi = high-nibble table */
v_gft1_lo	.req	v4
v_gft1_hi	.req	v5
v_gft2_lo	.req	v6
v_gft2_hi	.req	v7
v_gft3_lo	.req	v16
v_gft3_hi	.req	v17
q_gft1_lo	.req	q4
q_gft1_hi	.req	q5
q_gft2_lo	.req	q6
q_gft2_hi	.req	q7
q_gft3_lo	.req	q16
q_gft3_hi	.req	q17

v_gft4_lo	.req	v18
v_gft4_hi	.req	v19
q_gft4_lo	.req	q18
q_gft4_hi	.req	q19
/*
 * Table 5 shares v6/v7 with table 2, so both tables have to be reloaded
 * from memory every time they are needed.
 */
v_gft5_lo	.req	v_gft2_lo
v_gft5_hi	.req	v_gft2_hi
q_gft5_lo	.req	q_gft2_lo
q_gft5_hi	.req	q_gft2_hi

/*
 * 64-byte path: four source vectors.  v8-v15 are callee-saved (low 64 bits)
 * under AAPCS64; the function spills d8-d15 before using them.
 */
v_data_0	.req	v8
v_data_1	.req	v9
v_data_2	.req	v10
v_data_3	.req	v11
q_data_0	.req	q8
q_data_1	.req	q9
q_data_2	.req	q10
q_data_3	.req	q11

v_data_0_lo	.req	v12
v_data_1_lo	.req	v13
v_data_2_lo	.req	v14
v_data_3_lo	.req	v15
/* the high nibbles are computed in place, overwriting the raw source bytes */
v_data_0_hi	.req	v_data_0
v_data_1_hi	.req	v_data_1
v_data_2_hi	.req	v_data_2
v_data_3_hi	.req	v_data_3

/* destination accumulators for dest1..dest3 */
v_d1_0		.req	v20
v_d1_1		.req	v21
v_d1_2		.req	v22
v_d1_3		.req	v23
v_d2_0		.req	v24
v_d2_1		.req	v25
v_d2_2		.req	v26
v_d2_3		.req	v27
v_d3_0		.req	v28
v_d3_1		.req	v29
v_d3_2		.req	v30
v_d3_3		.req	v31
q_d1_0		.req	q20
q_d1_1		.req	q21
q_d1_2		.req	q22
q_d1_3		.req	q23
q_d2_0		.req	q24
q_d2_1		.req	q25
q_d2_2		.req	q26
q_d2_3		.req	q27
q_d3_0		.req	q28
q_d3_1		.req	q29
q_d3_2		.req	q30
q_d3_3		.req	q31

/*
 * dest4/dest5 accumulators share registers with dest1/dest2; dest1 and
 * dest2 must be stored before dest4/dest5 are loaded.
 */
v_d4_0		.req	v_d1_0
v_d4_1		.req	v_d1_1
v_d4_2		.req	v_d1_2
v_d4_3		.req	v_d1_3
q_d4_0		.req	q_d1_0
q_d4_1		.req	q_d1_1
q_d4_2		.req	q_d1_2
q_d4_3		.req	q_d1_3
v_d5_0		.req	v_d2_0
v_d5_1		.req	v_d2_1
v_d5_2		.req	v_d2_2
v_d5_3		.req	v_d2_3
q_d5_0		.req	q_d2_0
q_d5_1		.req	q_d2_1
q_d5_2		.req	q_d2_2
q_d5_3		.req	q_d2_3

/*
 * 16-byte and tail paths: a single source vector.  v21-v23 overlap
 * v_d1_1..v_d1_3, which those paths do not use.
 */
v_data		.req	v21
q_data		.req	q21
v_data_lo	.req	v22
v_data_hi	.req	v23
154
/*
 * gf_5vect_mad_neon
 *
 * For every source byte s = src[i] and every destination k = 1..5:
 *
 *	dest_k[i] ^= tbl_k.lo[s & 0x0f] ^ tbl_k.hi[s >> 4]
 *
 * tbl_k is a 32-byte table (16-byte low-nibble half followed by a 16-byte
 * high-nibble half) located at
 *
 *	x_tbl + 32 * (x_vec_i + (k - 1) * x_vec)
 *
 * In:	x0 = x_len	number of bytes
 *	x1 = x_vec	table stride, in 32-byte units
 *	x2 = x_vec_i	first table index, in 32-byte units
 *	x3 = x_tbl	table base
 *	x4 = x_src	source buffer
 *	x5 = x_dest	array of five destination pointers
 * Out:	w0 = 0 on success, 1 if x_len < 16 (nothing is written)
 *
 * Clobbers: x0-x2, x4-x17, v0-v7, v16-v31, flags.
 * d8-d15 are spilled to 64 bytes of stack and restored on the >= 64 byte
 * path; the shorter paths do not touch them.
 *
 * NOTE(review): x_len, x_vec and x_vec_i are consumed as full 64-bit
 * registers.  If the C prototype declares them as 32-bit ints, AAPCS64 does
 * not define the upper halves -- confirm against the callers.
 */
gf_5vect_mad_neon:
	/* less than 16 bytes, return_fail */
	cmp	x_len, #16
	blt	.Lreturn_fail

	movi	v_mask0f.16b, #0x0f
	/* each table is 32 bytes: turn the index and the stride into byte offsets */
	lsl	x_vec_i, x_vec_i, #5
	lsl	x_vec, x_vec, #5
	add	x_tbl1, x_tbl, x_vec_i
	add	x_tbl2, x_tbl1, x_vec
	add	x_tbl3, x_tbl2, x_vec
	add	x_tbl4, x_tbl3, x_vec
	add	x_tbl5, x_tbl4, x_vec
	add	x_src_end, x_src, x_len
	ldr	x_dest1, [x_dest, #8*0]
	ldr	x_dest2, [x_dest, #8*1]
	ldr	x_dest3, [x_dest, #8*2]
	ldr	x_dest4, [x_dest, #8*3]
	/* must be last: x_dest5 overwrites the x_dest base register */
	ldr	x_dest5, [x_dest, #8*4]
	/* tables 1, 3 and 4 stay resident; 2 and 5 share v6/v7 and are reloaded on use */
	ldr	q_gft1_lo, [x_tbl1]
	ldr	q_gft1_hi, [x_tbl1, #16]
	ldr	q_gft3_lo, [x_tbl3]
	ldr	q_gft3_hi, [x_tbl3, #16]
	ldr	q_gft4_lo, [x_tbl4]
	ldr	q_gft4_hi, [x_tbl4, #16]

.Lloop64_init:
	/* less than 64 bytes, goto Lloop16_init */
	cmp	x_len, #64
	blt	.Lloop16_init

	/* save d8 ~ d15 to stack */
	sub	sp, sp, #64
	stp	d8, d9, [sp]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]

	/* x_src_end now marks the last position where a full 64-byte block still fits */
	sub	x_src_end, x_src_end, #64

.Lloop64:
	ldr	q_data_0, [x_src, #16*0]
	ldr	q_data_1, [x_src, #16*1]
	ldr	q_data_2, [x_src, #16*2]
	ldr	q_data_3, [x_src, #16*3]
	add	x_src, x_src, #64

	ldr	q_d1_0, [x_dest1, #16*0]
	ldr	q_d1_1, [x_dest1, #16*1]
	ldr	q_d1_2, [x_dest1, #16*2]
	ldr	q_d1_3, [x_dest1, #16*3]

	ldr	q_d2_0, [x_dest2, #16*0]
	ldr	q_d2_1, [x_dest2, #16*1]
	ldr	q_d2_2, [x_dest2, #16*2]
	ldr	q_d2_3, [x_dest2, #16*3]

	ldr	q_d3_0, [x_dest3, #16*0]
	ldr	q_d3_1, [x_dest3, #16*1]
	ldr	q_d3_2, [x_dest3, #16*2]
	ldr	q_d3_3, [x_dest3, #16*3]

	/* v6/v7 held table 5 at the end of the previous iteration */
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]

	/* split every source byte into its low and high nibble */
	and	v_data_0_lo.16b, v_data_0.16b, v_mask0f.16b
	and	v_data_1_lo.16b, v_data_1.16b, v_mask0f.16b
	and	v_data_2_lo.16b, v_data_2.16b, v_mask0f.16b
	and	v_data_3_lo.16b, v_data_3.16b, v_mask0f.16b

	ushr	v_data_0_hi.16b, v_data_0.16b, #4
	ushr	v_data_1_hi.16b, v_data_1.16b, #4
	ushr	v_data_2_hi.16b, v_data_2.16b, #4
	ushr	v_data_3_hi.16b, v_data_3.16b, #4

	/* dest1 */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_0_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_1_hi.16b
	eor	v_d1_1.16b, v_tmp_lo.16b, v_d1_1.16b
	eor	v_d1_1.16b, v_d1_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_2_hi.16b
	eor	v_d1_2.16b, v_tmp_lo.16b, v_d1_2.16b
	eor	v_d1_2.16b, v_d1_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_3_hi.16b
	eor	v_d1_3.16b, v_tmp_lo.16b, v_d1_3.16b
	eor	v_d1_3.16b, v_d1_3.16b, v_tmp_hi.16b

	/* dest2 */
	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_0_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_1_hi.16b
	eor	v_d2_1.16b, v_tmp_lo.16b, v_d2_1.16b
	eor	v_d2_1.16b, v_d2_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_2_hi.16b
	eor	v_d2_2.16b, v_tmp_lo.16b, v_d2_2.16b
	eor	v_d2_2.16b, v_d2_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_3_hi.16b
	eor	v_d2_3.16b, v_tmp_lo.16b, v_d2_3.16b
	eor	v_d2_3.16b, v_d2_3.16b, v_tmp_hi.16b

	/* dest3 */
	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_0_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_1_hi.16b
	eor	v_d3_1.16b, v_tmp_lo.16b, v_d3_1.16b
	eor	v_d3_1.16b, v_d3_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_2_hi.16b
	eor	v_d3_2.16b, v_tmp_lo.16b, v_d3_2.16b
	eor	v_d3_2.16b, v_d3_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_3_hi.16b
	eor	v_d3_3.16b, v_tmp_lo.16b, v_d3_3.16b
	eor	v_d3_3.16b, v_d3_3.16b, v_tmp_hi.16b

	/* dest1/dest2 must be written back before v20-v27 are reused for dest4/dest5 */
	str	q_d1_0, [x_dest1, #16*0]
	str	q_d1_1, [x_dest1, #16*1]
	str	q_d1_2, [x_dest1, #16*2]
	str	q_d1_3, [x_dest1, #16*3]
	add	x_dest1, x_dest1, #64

	str	q_d2_0, [x_dest2, #16*0]
	str	q_d2_1, [x_dest2, #16*1]
	str	q_d2_2, [x_dest2, #16*2]
	str	q_d2_3, [x_dest2, #16*3]
	add	x_dest2, x_dest2, #64

	str	q_d3_0, [x_dest3, #16*0]
	str	q_d3_1, [x_dest3, #16*1]
	str	q_d3_2, [x_dest3, #16*2]
	str	q_d3_3, [x_dest3, #16*3]
	add	x_dest3, x_dest3, #64

	ldr	q_d4_0, [x_dest4, #16*0]
	ldr	q_d4_1, [x_dest4, #16*1]
	ldr	q_d4_2, [x_dest4, #16*2]
	ldr	q_d4_3, [x_dest4, #16*3]

	ldr	q_d5_0, [x_dest5, #16*0]
	ldr	q_d5_1, [x_dest5, #16*1]
	ldr	q_d5_2, [x_dest5, #16*2]
	ldr	q_d5_3, [x_dest5, #16*3]

	/* table 2 is finished with: replace it by table 5 in v6/v7 */
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]

	/* dest4 */
	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_0_hi.16b
	eor	v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_1_hi.16b
	eor	v_d4_1.16b, v_tmp_lo.16b, v_d4_1.16b
	eor	v_d4_1.16b, v_d4_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_2_hi.16b
	eor	v_d4_2.16b, v_tmp_lo.16b, v_d4_2.16b
	eor	v_d4_2.16b, v_d4_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_3_hi.16b
	eor	v_d4_3.16b, v_tmp_lo.16b, v_d4_3.16b
	eor	v_d4_3.16b, v_d4_3.16b, v_tmp_hi.16b

	/* dest5 */
	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_0_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_0_hi.16b
	eor	v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
	eor	v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_1_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_1_hi.16b
	eor	v_d5_1.16b, v_tmp_lo.16b, v_d5_1.16b
	eor	v_d5_1.16b, v_d5_1.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_2_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_2_hi.16b
	eor	v_d5_2.16b, v_tmp_lo.16b, v_d5_2.16b
	eor	v_d5_2.16b, v_d5_2.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_3_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_3_hi.16b
	eor	v_d5_3.16b, v_tmp_lo.16b, v_d5_3.16b
	eor	v_d5_3.16b, v_d5_3.16b, v_tmp_hi.16b

	str	q_d4_0, [x_dest4, #16*0]
	str	q_d4_1, [x_dest4, #16*1]
	str	q_d4_2, [x_dest4, #16*2]
	str	q_d4_3, [x_dest4, #16*3]
	add	x_dest4, x_dest4, #64

	str	q_d5_0, [x_dest5, #16*0]
	str	q_d5_1, [x_dest5, #16*1]
	str	q_d5_2, [x_dest5, #16*2]
	str	q_d5_3, [x_dest5, #16*3]
	add	x_dest5, x_dest5, #64

	/* unsigned compare: continue while another full 64-byte block fits */
	cmp	x_src, x_src_end
	bls	.Lloop64

.Lloop64_end:
	/* restore d8 ~ d15 */
	ldp	d8,  d9,  [sp]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	add	sp, sp, #64
	/* undo the 64-byte bias: x_src_end is the true end of the source again */
	add	x_src_end, x_src_end, #64

.Lloop16_init:
	/* x_src_end now marks the last position where a full 16-byte block still fits */
	sub	x_src_end, x_src_end, #16
	cmp	x_src, x_src_end
	bhi	.Llessthan16_init

.Lloop16:
	ldr	q_data, [x_src]

	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_d1_0.16b, v_tmp_lo.16b, v_d1_0.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_d2_0.16b, v_tmp_lo.16b, v_d2_0.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_d3_0.16b, v_tmp_lo.16b, v_d3_0.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	ldr	q_d4_0, [x_dest4]
	ldr	q_d5_0, [x_dest5]
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	eor	v_d4_0.16b, v_tmp_lo.16b, v_d4_0.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
	eor	v_d5_0.16b, v_tmp_lo.16b, v_d5_0.16b
	eor	v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

	str	q_d4_0, [x_dest4]
	str	q_d5_0, [x_dest5]

	add	x_src, x_src, #16
	add	x_dest1, x_dest1, #16
	add	x_dest2, x_dest2, #16
	add	x_dest3, x_dest3, #16
	add	x_dest4, x_dest4, #16
	add	x_dest5, x_dest5, #16
	cmp	x_src, x_src_end
	bls	.Lloop16

.Llessthan16_init:
	/*
	 * x_src_end is still biased by -16, so
	 *	x_tmp = x_src - x_src_end = 16 - (bytes remaining)
	 * x_tmp == 16 means nothing is left.
	 */
	sub	x_tmp, x_src, x_src_end
	cmp	x_tmp, #16
	beq	.Lreturn_pass

.Llessthan16:
	/*
	 * 1..15 bytes remain.  Step every pointer back by x_tmp so that one
	 * final 16-byte vector ends exactly at the end of the buffers; its
	 * first x_tmp bytes were already processed and are masked out below.
	 */
	mov	x_src, x_src_end
	sub	x_dest1, x_dest1, x_tmp
	sub	x_dest2, x_dest2, x_tmp
	sub	x_dest3, x_dest3, x_tmp
	sub	x_dest4, x_dest4, x_tmp
	sub	x_dest5, x_dest5, x_tmp

	/*
	 * PC-relative address of const_tbl: no literal pool entry and no
	 * absolute relocation inside .text.
	 */
	adrp	x_const, const_tbl
	add	x_const, x_const, :lo12:const_tbl
	/* 16 bytes from const_tbl + 16 - x_tmp: x_tmp bytes of 0x00, then 0xff */
	sub	x_const, x_const, x_tmp
	ldr	q_tmp, [x_const, #16]

	ldr	q_data, [x_src]
	ldr	q_d1_0, [x_dest1]
	ldr	q_d2_0, [x_dest2]
	ldr	q_d3_0, [x_dest3]
	ldr	q_gft2_lo, [x_tbl2]
	ldr	q_gft2_hi, [x_tbl2, #16]

	and	v_data_lo.16b, v_data.16b, v_mask0f.16b
	ushr	v_data_hi.16b, v_data.16b, #4

	/* the mask zeroes the update for bytes that were already processed */
	tbl	v_tmp_lo.16b, {v_gft1_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft1_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d1_0.16b, v_d1_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft2_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft2_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d2_0.16b, v_d2_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft3_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft3_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d3_0.16b, v_d3_0.16b, v_tmp_hi.16b

	str	q_d1_0, [x_dest1]
	str	q_d2_0, [x_dest2]
	str	q_d3_0, [x_dest3]

	ldr	q_d4_0, [x_dest4]
	ldr	q_d5_0, [x_dest5]
	ldr	q_gft5_lo, [x_tbl5]
	ldr	q_gft5_hi, [x_tbl5, #16]

	tbl	v_tmp_lo.16b, {v_gft4_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft4_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d4_0.16b, v_d4_0.16b, v_tmp_hi.16b

	tbl	v_tmp_lo.16b, {v_gft5_lo.16b}, v_data_lo.16b
	tbl	v_tmp_hi.16b, {v_gft5_hi.16b}, v_data_hi.16b
	eor	v_tmp_hi.16b, v_tmp_lo.16b, v_tmp_hi.16b
	and	v_tmp_hi.16b, v_tmp_hi.16b, v_tmp.16b
	eor	v_d5_0.16b, v_d5_0.16b, v_tmp_hi.16b

	str	q_d4_0, [x_dest4]
	str	q_d5_0, [x_dest5]

.Lreturn_pass:
	mov	w_ret, #0
	ret

.Lreturn_fail:
	mov	w_ret, #1
	ret

.size gf_5vect_mad_neon, .-gf_5vect_mad_neon
529
/*
 * Byte mask for the final partial vector: 16 bytes of 0x00 followed by
 * 16 bytes of 0xff.  A 16-byte load from (const_tbl + 16 - n), n = 1..15,
 * yields n zero bytes followed by (16 - n) bytes of 0xff.
 * The table is never written, so it is placed in read-only data.
 */
.section .rodata
.balign 16
.type const_tbl, %object
const_tbl:
	.dword 0x0000000000000000, 0x0000000000000000
	.dword 0xffffffffffffffff, 0xffffffffffffffff
.size const_tbl, .-const_tbl
535