########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################

	.arch armv8-a+crc+crypto
	.text
	.align	3
	.global	crc16_t10dif_copy_pmull
	.type	crc16_t10dif_copy_pmull, %function

/* uint16_t crc16_t10dif_copy_pmull(uint16_t seed, uint8_t *dst, uint8_t *src, uint64_t len) */
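
/*
 * Strategy: buffers of at most 1023 bytes are processed entirely by the
 * byte-at-a-time table-lookup loop below. Longer buffers are folded 64
 * bytes per iteration with carry-less multiplies (PMULL); the remaining
 * tail (len mod 64 bytes) is then finished by the same table loop.
 */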

/* arguments */
w_seed		.req	w0
x_dst		.req	x1
x_src		.req	x2
x_len		.req	x3
w_len		.req	w3

/* returns */
w_ret		.req	w0

/* these are used as global temporary registers */
w_tmp			.req	w6
x_tmp			.req	x6
x_tmp1			.req	x7
x_tmp2			.req	x11

d_tmp1			.req	d0
d_tmp2			.req	d1
q_tmp1			.req	q0
q_tmp2			.req	q1
v_tmp1			.req	v0
v_tmp2			.req	v1

/* local variables */
w_counter		.req	w4
w_crc			.req	w0
x_crc			.req	x0
x_counter		.req	x4
x_crc16tab		.req	x5
x_src_saved		.req	x0
x_dst_saved		.req	x12

crc16_t10dif_copy_pmull:
	cmp	x_len, 1023
	sub	sp, sp, #16
	uxth	w_seed, w_seed
	bhi	.crc_fold

	mov	x_tmp, 0
	mov	w_counter, 0

.crc_table_loop_pre:
	cmp	x_len, x_tmp
	bls	.end

	sxtw	x_counter, w_counter
	adrp	x_crc16tab, .LANCHOR0
	sub	x_src, x_src, x_counter
	sub	x_dst, x_dst, x_counter
	add	x_crc16tab, x_crc16tab, :lo12:.LANCHOR0

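/*
 * Table-driven path: copy one byte and update the CRC from crc16tab.
 * Roughly equivalent C for one iteration (a sketch; variable names are
 * illustrative only):
 *
 *     uint8_t byte = src[counter];
 *     dst[counter++] = byte;
 *     crc = (uint16_t)((crc << 8) ^ crc16tab[(crc >> 8) ^ byte]);
 */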
	.align 2
.crc_table_loop:
	ldrb	w_tmp, [x_src, x_counter]
	strb	w_tmp, [x_dst, x_counter]
	add	x_counter, x_counter, 1
	cmp	x_len, x_counter
	eor	w_tmp, w_tmp, w_crc, lsr 8
	ldrh	w_tmp, [x_crc16tab, w_tmp, sxtw 1]
	eor	w_crc, w_tmp, w_crc, lsl 8
	uxth	w_crc, w_crc
	bhi	.crc_table_loop

.end:
	add	sp, sp, 16
	ret

/* carry-less multiplication, part1 - before loop */
q_x0			.req	q2
q_x1			.req	q3
q_x2			.req	q4
q_x3			.req	q5

v_x0			.req	v2
v_x1			.req	v3
v_x2			.req	v4
v_x3			.req	v5

d_x0			.req	d2
d_x1			.req	d3
d_x2			.req	d4
d_x3			.req	d5

// the following registers are only used in part1
d_tmp3			.req	d16
v_tmp3			.req	v16

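/*
 * Part 1: load the first 64 bytes and store them to dst, byte-reverse each
 * 16-byte lane with the shuffle mask (the fold constants assume this byte
 * order), and XOR the 16-bit seed into the top 16 bits of the first lane.
 */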
	.align 3
.crc_fold:
	fmov	d_tmp1, x_crc
	fmov	d_tmp2, xzr
	dup	d_tmp3, v_tmp2.d[0]
	shl	d_tmp1, d_tmp1, 48
	ins	v_tmp3.d[1], v_tmp1.d[0]

	and	x_counter, x_len, -64
	sub	x_counter, x_counter, #64
	cmp	x_counter, 63
	add	x_src_saved, x_src, 64
	add	x_dst_saved, x_dst, 64

	ldr	q_x0, [x_src]
	ldr	q_x1, [x_src, 16]
	ldr	q_x2, [x_src, 32]
	ldr	q_x3, [x_src, 48]

	str	q_x0, [x_dst]
	str	q_x1, [x_dst, 16]
	str	q_x2, [x_dst, 32]
	str	q_x3, [x_dst, 48]

	adrp	x_tmp, .shuffle_mask_lanchor
	ldr	q7, [x_tmp, :lo12:.shuffle_mask_lanchor]

	tbl	v_tmp1.16b, {v_x0.16b}, v7.16b
	eor	v_x0.16b, v_tmp3.16b, v_tmp1.16b

	tbl	v_x1.16b, {v_x1.16b}, v7.16b
	tbl	v_x2.16b, {v_x2.16b}, v7.16b
	tbl	v_x3.16b, {v_x3.16b}, v7.16b
	bls	.crc_fold_loop_end

/* carry-less multiplication, part2 - loop */
q_y0			.req	q28
q_y1			.req	q29
q_y2			.req	q30
q_y3			.req	q31

v_y0			.req	v28
v_y1			.req	v29
v_y2			.req	v30
v_y3			.req	v31

d_x0_h			.req	d24
d_x0_l			.req	d2
d_x1_h			.req	d25
d_x1_l			.req	d3
d_x2_h			.req	d26
d_x2_l			.req	d4
d_x3_h			.req	d27
d_x3_l			.req	d5

v_x0_h			.req	v24
v_x0_l			.req	v2
v_x1_h			.req	v25
v_x1_l			.req	v3
v_x2_h			.req	v26
v_x2_l			.req	v4
v_x3_h			.req	v27
v_x3_l			.req	v5

v_tmp1_x0		.req	v24
v_tmp1_x1		.req	v25
v_tmp1_x2		.req	v26
v_tmp1_x3		.req	v27

d_p4_h			.req	d19
v_p4_h			.req	v19
d_p4_l			.req	d17
v_p4_l			.req	v17

	mov	x_tmp, 0x371d0000		/* p4 [1] */
	fmov	d_p4_h, x_tmp
	mov	x_tmp, 0x87e70000		/* p4 [0] */
	fmov	d_p4_l, x_tmp

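/*
 * Part 2: each iteration copies the next 64 bytes to dst and folds them
 * into the four 128-bit accumulators. With "*" denoting a 64x64 -> 128-bit
 * carry-less multiply (PMULL), each lane i is updated as:
 *
 *     x[i] = (x[i].hi * p4[1]) ^ (x[i].lo * p4[0]) ^ byte_reverse(y[i])
 *
 * where y[i] is the i-th 16-byte block of the new data and p4 holds the
 * fold constants loaded above.
 */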
	.align 2
.crc_fold_loop:
	add	x_src_saved, x_src_saved, 64
	add	x_dst_saved, x_dst_saved, 64

	sub	x_counter, x_counter, #64
	cmp	x_counter, 63

	dup	d_x0_h, v_x0.d[1]
	dup	d_x1_h, v_x1.d[1]
	dup	d_x2_h, v_x2.d[1]
	dup	d_x3_h, v_x3.d[1]

	dup	d_x0_l, v_x0.d[0]
	dup	d_x1_l, v_x1.d[0]
	dup	d_x2_l, v_x2.d[0]
	dup	d_x3_l, v_x3.d[0]

	ldr	q_y0, [x_src_saved, -64]
	ldr	q_y1, [x_src_saved, -48]
	ldr	q_y2, [x_src_saved, -32]
	ldr	q_y3, [x_src_saved, -16]

	str	q_y0, [x_dst_saved, -64]
	str	q_y1, [x_dst_saved, -48]
	str	q_y2, [x_dst_saved, -32]
	str	q_y3, [x_dst_saved, -16]

	pmull	v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
	pmull	v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
	pmull	v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
	pmull	v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
	pmull	v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
	pmull	v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
	pmull	v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
	pmull	v_x3_l.1q, v_x3_l.1d, v_p4_l.1d

	tbl	v_y0.16b, {v_y0.16b}, v7.16b
	tbl	v_y1.16b, {v_y1.16b}, v7.16b
	tbl	v_y2.16b, {v_y2.16b}, v7.16b
	tbl	v_y3.16b, {v_y3.16b}, v7.16b

	eor	v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
	eor	v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
	eor	v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
	eor	v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b

	eor	v_x0.16b, v_tmp1_x0.16b, v_y0.16b
	eor	v_x1.16b, v_tmp1_x1.16b, v_y1.16b
	eor	v_x2.16b, v_tmp1_x2.16b, v_y2.16b
	eor	v_x3.16b, v_tmp1_x3.16b, v_y3.16b

	bhi	.crc_fold_loop

/* carry-less multiplication, part3 - after loop */
/* folding 512bit ---> 128bit */

// input parameters:
// v_x0 => v2
// v_x1 => v3
// v_x2 => v4
// v_x3 => v5

// v0, v1, v6, v16, v30 are used as temporary registers

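/*
 * Fold the four 128-bit accumulators down to a single one, left to right:
 * x1 ^= fold(x0), x2 ^= fold(x1), x3 ^= fold(x2), where
 * fold(x) = (x.hi * p1[1]) ^ (x.lo * p1[0]) with carry-less multiplies.
 * The combined 128-bit remainder ends up in v_x0 (v2).
 */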
.crc_fold_loop_end:
	mov	x_tmp, 0x4c1a0000	/* p1 [1] */
	fmov	d0, x_tmp
	mov	x_tmp, 0xfb0b0000	/* p1 [0] */
	fmov	d1, x_tmp

	and	w_counter, w_len, -64
	sxtw	x_tmp, w_counter

	add	x_src, x_src, x_tmp
	add	x_dst, x_dst, x_tmp

	dup	d6, v_x0.d[1]
	dup	d30, v_x0.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v30.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v30.16b
	eor	v_x1.16b, v6.16b, v_x1.16b

	dup	d6, v_x1.d[1]
	dup	d30, v_x1.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v16.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v16.16b
	eor	v_x2.16b, v6.16b, v_x2.16b

	dup	d_x0, v_x2.d[1]
	dup	d30, v_x2.d[0]
	pmull	v0.1q, v_x0.1d, v0.1d
	pmull	v_x0.1q, v30.1d, v1.1d
	eor	v1.16b, v0.16b, v_x0.16b
	eor	v_x0.16b, v1.16b, v_x3.16b

/* carry-less multiplication, part3 - after loop */
/* crc16 fold function */
d_16fold_p0_h		.req	d18
v_16fold_p0_h		.req	v18

d_16fold_p0_l		.req	d4
v_16fold_p0_l		.req	v4

v_16fold_from		.req	v_x0
d_16fold_from_h		.req	d3
v_16fold_from_h		.req	v3

v_16fold_zero		.req	v7

v_16fold_from1		.req	v16

v_16fold_from2		.req	v0
d_16fold_from2_h	.req	d6
v_16fold_from2_h	.req	v6

v_16fold_tmp		.req	v0

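/*
 * Fold the 128-bit remainder further: realign it with the two ext
 * instructions, then fold its high half twice with the p0 constants,
 * leaving a value small enough for the Barrett reduction that follows.
 */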
	movi	v_16fold_zero.4s, 0
	mov	x_tmp1, 0x2d560000		/* p0 [1] */
	mov	x_tmp2, 0x13680000		/* p0 [0] */

	ext	v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
	ext	v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4

	dup	d_16fold_from_h, v_16fold_from.d[1]
	fmov	d_16fold_p0_h, x_tmp1
	pmull	v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
	eor	v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b

	dup	d_16fold_from2_h, v_16fold_from2.d[1]
	fmov	d_16fold_p0_l, x_tmp2
	pmull	v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
	eor	v_x0.16b, v0.16b, v6.16b

/* carry-less multiplication, part3 - after loop */
/* crc16 barrett reduction function */

// input parameters:
// v_x0:			v2
// barrett reduction constant:	br[0], br[1]

d_br0	.req	d3
v_br0	.req	v3
d_br1	.req	d5
v_br1	.req	v5

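/*
 * Barrett reduction to the final 16-bit CRC. br[0] = 0x1f65a57f9 is the
 * precomputed reduction constant and br[1] = 0x18bb70000 is the T10-DIF
 * polynomial 0x18bb7 shifted left by 16; the result is extracted from bits
 * [31:16] below, and control then rejoins the table loop to process any
 * remaining tail bytes (len mod 64).
 */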
	mov	x_tmp1, 0x57f9			/* br[0] low */
	movk	x_tmp1, 0xf65a, lsl 16		/* br[0] high */
	movk	x_tmp1, 0x1, lsl 32
	fmov	d_br0, x_tmp1

	dup	d1, v_x0.d[0]
	dup	d1, v1.d[0]
	ext	v1.16b, v1.16b, v7.16b, #4
	pmull	v4.1q, v1.1d, v_br0.1d

	ext	v1.16b, v4.16b, v7.16b, #4
	mov	x_tmp1, 0x8bb70000		/* br[1] low */
	movk	x_tmp1, 0x1, lsl 32		/* br[1] high */

	fmov	d_br1, x_tmp1
	pmull	v_br1.1q, v1.1d, v_br1.1d
	eor	v_x0.16b, v_x0.16b, v_br1.16b

	umov	x0, v_x0.d[0]
	ubfx	x0, x0, 16, 16
	b	.crc_table_loop_pre

	.size	crc16_t10dif_copy_pmull, .-crc16_t10dif_copy_pmull

	.section	.rodata

	.align	4
.shuffle_mask_lanchor = . + 0
	.type	shuffle_mask, %object
	.size	shuffle_mask, 16
shuffle_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8
	.byte	7,   6,  5,  4,  3,  2, 1, 0

	.align	4
.LANCHOR0 = . + 0
	.type	crc16tab, %object
	.size	crc16tab, 512
crc16tab:
	.hword  0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
	.hword  0xefbd, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
	.hword  0x54cd, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
	.hword  0xbb70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
	.hword  0xa99a, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
	.hword  0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
	.hword  0xfd57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
	.hword  0x12ea, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
	.hword  0xd883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
	.hword  0x373e, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
	.hword  0x8c4e, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
	.hword  0x63f3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
	.hword  0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
	.hword  0x9ea4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
	.hword  0x25d4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
	.hword  0xca69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
	.hword  0x3ab1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
	.hword  0xd50c, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
	.hword  0x6e7c, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
	.hword  0x81c1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
	.hword  0x932b, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
	.hword  0x7c96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
	.hword  0xc7e6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
	.hword  0x285b, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
	.hword  0xe232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
	.hword  0x0d8f, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
	.hword  0xb6ff, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
	.hword  0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
	.hword  0x4ba8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
	.hword  0xa415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
	.hword  0x1f65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
	.hword  0xf0d8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3