########################################################################
#  Copyright(c) 2019 Arm Corporation All rights reserved.
#
#  Redistribution and use in source and binary forms, with or without
#  modification, are permitted provided that the following conditions
#  are met:
#    * Redistributions of source code must retain the above copyright
#      notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in
#      the documentation and/or other materials provided with the
#      distribution.
#    * Neither the name of Arm Corporation nor the names of its
#      contributors may be used to endorse or promote products derived
#      from this software without specific prior written permission.
#
#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
#  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#########################################################################

	.arch armv8-a+crc+crypto
	.text
	.align	3
	.global	crc16_t10dif_pmull
	.type	crc16_t10dif_pmull, %function

/* uint16_t crc16_t10dif_pmull(uint16_t seed, uint8_t *buf, uint64_t len) */
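
/*
 * Note (added annotation): buffers longer than 1023 bytes take the
 * PMULL folding path at .crc_fold below; shorter buffers and any tail
 * left over after folding go through the byte-at-a-time table loop.
 * Per byte, that loop is roughly equivalent to the C expression
 *
 *	crc = 0xffff & ((crc << 8) ^ crc16tab[(*buf++ ^ (crc >> 8)) & 0xff]);
 *
 * where crc16tab is the 256-entry table emitted in .rodata at the end
 * of this file.
 */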

/* arguments */
w_seed			.req	w0
x_buf			.req	x1
x_len			.req	x2
w_len			.req	w2

/* returns */
w_ret			.req	w0

/* these are used as global temporary registers */
w_tmp			.req	w5
x_tmp			.req	x5
x_tmp1			.req	x6
x_tmp2			.req	x7

d_tmp1			.req	d0
d_tmp2			.req	d1
q_tmp1			.req	q0
q_tmp2			.req	q1
v_tmp1			.req	v0
v_tmp2			.req	v1

/* local variables */
w_counter		.req	w3
w_crc			.req	w0
x_crc			.req	x0
x_counter		.req	x3
x_crc16tab		.req	x4
x_buf_saved		.req	x0

crc16_t10dif_pmull:
	cmp	x_len, 1023
	sub	sp, sp, #16
	uxth	w_seed, w_seed
	bhi	.crc_fold

	mov	x_tmp, 0
	mov	w_counter, 0

.crc_table_loop_pre:
	cmp	x_len, x_tmp
	bls	.end

	sxtw	x_counter, w_counter
	adrp	x_crc16tab, .LANCHOR0
	sub	x_buf, x_buf, x_counter
	add	x_crc16tab, x_crc16tab, :lo12:.LANCHOR0

	.align 2
.crc_table_loop:
	ldrb	w_tmp, [x_buf, x_counter]
	add	x_counter, x_counter, 1
	cmp	x_len, x_counter
	eor	w_tmp, w_tmp, w_crc, lsr 8
	ldrh	w_tmp, [x_crc16tab, w_tmp, sxtw 1]
	eor	w_crc, w_tmp, w_crc, lsl 8
	uxth	w_crc, w_crc
	bhi	.crc_table_loop

.end:
	add	sp, sp, 16
	ret

/* carry-less multiplication, part 1 - before loop */
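/*
 * Note (added annotation): part 1 loads the first 64 bytes as four
 * 128-bit blocks x0..x3, byte-reverses each block with TBL and the
 * shuffle mask from .rodata, and XORs the 16-bit seed (shifted into
 * the top of the first block) into x0 before entering the fold loop.
 */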
q_x0			.req	q2
q_x1			.req	q3
q_x2			.req	q4
q_x3			.req	q5

v_x0			.req	v2
v_x1			.req	v3
v_x2			.req	v4
v_x3			.req	v5

d_x0			.req	d2
d_x1			.req	d3
d_x2			.req	d4
d_x3			.req	d5

// the following registers are only used in part 1
d_tmp3			.req	d16
v_tmp3			.req	v16

	.align 3
.crc_fold:
	fmov	d_tmp1, x_crc
	fmov	d_tmp2, xzr
	dup	d_tmp3, v_tmp2.d[0]
	shl	d_tmp1, d_tmp1, 48
	ins	v_tmp3.d[1], v_tmp1.d[0]

	and	x_counter, x_len, -64
	sub	x_counter, x_counter, #64
	cmp	x_counter, 63
	add	x_buf_saved, x_buf, 64

	ldr	q_x0, [x_buf]
	ldr	q_x1, [x_buf, 16]
	ldr	q_x2, [x_buf, 32]
	ldr	q_x3, [x_buf, 48]

	adrp	x_tmp, .shuffle_mask_lanchor
	ldr	q7, [x_tmp, :lo12:.shuffle_mask_lanchor]

	tbl	v_tmp1.16b, {v_x0.16b}, v7.16b
	eor	v_x0.16b, v_tmp3.16b, v_tmp1.16b

	tbl	v_x1.16b, {v_x1.16b}, v7.16b
	tbl	v_x2.16b, {v_x2.16b}, v7.16b
	tbl	v_x3.16b, {v_x3.16b}, v7.16b
	bls	.crc_fold_loop_end

/* carry-less multiplication, part 2 - loop */
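/*
 * Note (added annotation): each pass of .crc_fold_loop consumes 64
 * bytes. For every 128-bit lane the update is, in carry-less GF(2)
 * arithmetic,
 *
 *	x_i = (x_i.hi64 * p4[1]) ^ (x_i.lo64 * p4[0]) ^ y_i
 *
 * where y_i is the next 16 bytes of input for that lane, byte-reversed
 * through the same shuffle mask, and p4[1]/p4[0] are the fold
 * constants 0x371d0000/0x87e70000 loaded just below.
 */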
q_y0			.req	q28
q_y1			.req	q29
q_y2			.req	q30
q_y3			.req	q31

v_y0			.req	v28
v_y1			.req	v29
v_y2			.req	v30
v_y3			.req	v31

d_x0_h			.req	d24
d_x0_l			.req	d2
d_x1_h			.req	d25
d_x1_l			.req	d3
d_x2_h			.req	d26
d_x2_l			.req	d4
d_x3_h			.req	d27
d_x3_l			.req	d5

v_x0_h			.req	v24
v_x0_l			.req	v2
v_x1_h			.req	v25
v_x1_l			.req	v3
v_x2_h			.req	v26
v_x2_l			.req	v4
v_x3_h			.req	v27
v_x3_l			.req	v5

v_tmp1_x0		.req	v24
v_tmp1_x1		.req	v25
v_tmp1_x2		.req	v26
v_tmp1_x3		.req	v27

d_p4_h			.req	d19
v_p4_h			.req	v19
d_p4_l			.req	d17
v_p4_l			.req	v17

	mov	x_tmp, 0x371d0000		/* p4 [1] */
	fmov	d_p4_h, x_tmp
	mov	x_tmp, 0x87e70000		/* p4 [0] */
	fmov	d_p4_l, x_tmp

	.align 2
.crc_fold_loop:
	add	x_buf_saved, x_buf_saved, 64
	sub	x_counter, x_counter, #64
	cmp	x_counter, 63

	dup	d_x0_h, v_x0.d[1]
	dup	d_x1_h, v_x1.d[1]
	dup	d_x2_h, v_x2.d[1]
	dup	d_x3_h, v_x3.d[1]

	dup	d_x0_l, v_x0.d[0]
	dup	d_x1_l, v_x1.d[0]
	dup	d_x2_l, v_x2.d[0]
	dup	d_x3_l, v_x3.d[0]

	ldr	q_y0, [x_buf_saved, -64]
	ldr	q_y1, [x_buf_saved, -48]
	ldr	q_y2, [x_buf_saved, -32]
	ldr	q_y3, [x_buf_saved, -16]

	pmull	v_x0_h.1q, v_x0_h.1d, v_p4_h.1d
	pmull	v_x0_l.1q, v_x0_l.1d, v_p4_l.1d
	pmull	v_x1_h.1q, v_x1_h.1d, v_p4_h.1d
	pmull	v_x1_l.1q, v_x1_l.1d, v_p4_l.1d
	pmull	v_x2_h.1q, v_x2_h.1d, v_p4_h.1d
	pmull	v_x2_l.1q, v_x2_l.1d, v_p4_l.1d
	pmull	v_x3_h.1q, v_x3_h.1d, v_p4_h.1d
	pmull	v_x3_l.1q, v_x3_l.1d, v_p4_l.1d

	tbl	v_y0.16b, {v_y0.16b}, v7.16b
	tbl	v_y1.16b, {v_y1.16b}, v7.16b
	tbl	v_y2.16b, {v_y2.16b}, v7.16b
	tbl	v_y3.16b, {v_y3.16b}, v7.16b

	eor	v_tmp1_x0.16b, v_x0_h.16b, v_x0_l.16b
	eor	v_tmp1_x1.16b, v_x1_h.16b, v_x1_l.16b
	eor	v_tmp1_x2.16b, v_x2_h.16b, v_x2_l.16b
	eor	v_tmp1_x3.16b, v_x3_h.16b, v_x3_l.16b

	eor	v_x0.16b, v_tmp1_x0.16b, v_y0.16b
	eor	v_x1.16b, v_tmp1_x1.16b, v_y1.16b
	eor	v_x2.16b, v_tmp1_x2.16b, v_y2.16b
	eor	v_x3.16b, v_tmp1_x3.16b, v_y3.16b

	bhi	.crc_fold_loop

/* carry-less multiplication, part 3 - after loop */
/* folding 512 bits ---> 128 bits */

// input parameters:
// v_x0 => v2
// v_x1 => v3
// v_x2 => v4
// v_x3 => v5

// v0, v1, v6, v30 are tmp registers
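
/*
 * Note (added annotation): the four accumulators are folded into one
 * with the shorter-distance constants p1[1]/p1[0] = 0x4c1a0000/
 * 0xfb0b0000: fold(x0) is XORed into x1, fold(x1) into x2 and
 * fold(x2) into x3, leaving a single 128-bit value in v2 (v_x0).
 * x_buf is also advanced past the 64-byte-aligned portion so the
 * table loop can later finish any remaining tail bytes.
 */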

.crc_fold_loop_end:
	mov	x_tmp, 0x4c1a0000	/* p1 [1] */
	fmov	d0, x_tmp
	mov	x_tmp, 0xfb0b0000	/* p1 [0] */
	fmov	d1, x_tmp

	and	w_counter, w_len, -64
	sxtw	x_tmp, w_counter
	add	x_buf, x_buf, x_tmp

	dup	d6, v_x0.d[1]
	dup	d30, v_x0.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v30.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v30.16b
	eor	v_x1.16b, v6.16b, v_x1.16b

	dup	d6, v_x1.d[1]
	dup	d30, v_x1.d[0]
	pmull	v6.1q, v6.1d, v0.1d
	pmull	v16.1q, v30.1d, v1.1d
	eor	v6.16b, v6.16b, v16.16b
	eor	v_x2.16b, v6.16b, v_x2.16b

	dup	d_x0, v_x2.d[1]
	dup	d30, v_x2.d[0]
	pmull	v0.1q, v_x0.1d, v0.1d
	pmull	v_x0.1q, v30.1d, v1.1d
	eor	v1.16b, v0.16b, v_x0.16b
	eor	v_x0.16b, v1.16b, v_x3.16b

/* carry-less multiplication, part 3 - after loop */
/* crc16 fold function */
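/*
 * Note (added annotation): the 128-bit value in v_x0 is reduced
 * further with p0[1]/p0[0] = 0x2d560000/0x13680000 - the upper half
 * is folded down with p0[1], then the upper half of that result with
 * p0[0] - producing the narrower value handed to the Barrett
 * reduction step below.
 */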
d_16fold_p0_h		.req	d18
v_16fold_p0_h		.req	v18

d_16fold_p0_l		.req	d4
v_16fold_p0_l		.req	v4

v_16fold_from		.req	v_x0
d_16fold_from_h		.req	d3
v_16fold_from_h		.req	v3

v_16fold_zero		.req	v7

v_16fold_from1		.req	v16

v_16fold_from2		.req	v0
d_16fold_from2_h	.req	d6
v_16fold_from2_h	.req	v6

v_16fold_tmp		.req	v0

	movi	v_16fold_zero.4s, 0
	mov	x_tmp1, 0x2d560000		/* p0 [1] */
	mov	x_tmp2, 0x13680000		/* p0 [0] */

	ext	v_16fold_tmp.16b, v_16fold_zero.16b, v_16fold_from.16b, #8
	ext	v_16fold_tmp.16b, v0.16b, v_16fold_zero.16b, #4

	dup	d_16fold_from_h, v_16fold_from.d[1]
	fmov	d_16fold_p0_h, x_tmp1
	pmull	v_16fold_from1.1q, v_16fold_from_h.1d, v_16fold_p0_h.1d
	eor	v_16fold_from2.16b, v_16fold_tmp.16b, v_16fold_from1.16b

	dup	d_16fold_from2_h, v_16fold_from2.d[1]
	fmov	d_16fold_p0_l, x_tmp2
	pmull	v6.1q, v_16fold_from2_h.1d, v_16fold_p0_l.1d
	eor	v_x0.16b, v0.16b, v6.16b

/* carry-less multiplication, part 3 - after loop */
/* crc16 Barrett reduction function */

// input parameters:
// v_x0:			v2
// Barrett reduction constants:	br[0], br[1]
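
/*
 * Note (added annotation): this is a Barrett reduction. br[0]
 * (0x1f65a57f9) appears to be the precomputed quotient constant and
 * br[1] (0x18bb70000) encodes the T10-DIF polynomial 0x18bb7 shifted
 * into position; the quotient estimate is multiplied back by the
 * polynomial, XORed with the remainder, and the 16-bit CRC is read
 * from bits [31:16] before branching back to the table loop for any
 * remaining tail bytes.
 */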

d_br0			.req	d3
v_br0			.req	v3
d_br1			.req	d5
v_br1			.req	v5

	mov	x_tmp1, 0x57f9			/* br[0] low */
	movk	x_tmp1, 0xf65a, lsl 16		/* br[0] high */
	movk	x_tmp1, 0x1, lsl 32
	fmov	d_br0, x_tmp1

	dup	d1, v_x0.d[0]
	dup	d1, v1.d[0]
	ext	v1.16b, v1.16b, v7.16b, #4
	pmull	v4.1q, v1.1d, v_br0.1d

	ext	v1.16b, v4.16b, v7.16b, #4
	mov	x_tmp1, 0x8bb70000		/* br[1] low */
	movk	x_tmp1, 0x1, lsl 32		/* br[1] high */

	fmov	d_br1, x_tmp1
	pmull	v_br1.1q, v1.1d, v_br1.1d
	eor	v_x0.16b, v_x0.16b, v_br1.16b

	umov	x0, v_x0.d[0]
	ubfx	x0, x0, 16, 16
	b	.crc_table_loop_pre

	.size	crc16_t10dif_pmull, .-crc16_t10dif_pmull

	.section	.rodata
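
/*
 * Note (added annotation): shuffle_mask, used as the TBL index vector
 * v7 above, reverses the byte order of each 16-byte block; the PMULL
 * folding path works on the data most-significant byte first.
 */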

	.align	4
.shuffle_mask_lanchor = . + 0
	.type	shuffle_mask, %object
	.size	shuffle_mask, 16
shuffle_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8
	.byte	7,   6,  5,  4,  3,  2, 1, 0
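
/*
 * Note (added annotation): crc16tab below is the standard 256-entry
 * lookup table for the T10-DIF polynomial 0x8bb7, indexed by
 * (data byte ^ high CRC byte) in the scalar table loop above.
 */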

	.align	4
.LANCHOR0 = . + 0
	.type	crc16tab, %object
	.size	crc16tab, 512
crc16tab:
	.hword  0x0000, 0x8bb7, 0x9cd9, 0x176e, 0xb205, 0x39b2, 0x2edc, 0xa56b
	.hword  0xefbd, 0x640a, 0x7364, 0xf8d3, 0x5db8, 0xd60f, 0xc161, 0x4ad6
	.hword  0x54cd, 0xdf7a, 0xc814, 0x43a3, 0xe6c8, 0x6d7f, 0x7a11, 0xf1a6
	.hword  0xbb70, 0x30c7, 0x27a9, 0xac1e, 0x0975, 0x82c2, 0x95ac, 0x1e1b
	.hword  0xa99a, 0x222d, 0x3543, 0xbef4, 0x1b9f, 0x9028, 0x8746, 0x0cf1
	.hword  0x4627, 0xcd90, 0xdafe, 0x5149, 0xf422, 0x7f95, 0x68fb, 0xe34c
	.hword  0xfd57, 0x76e0, 0x618e, 0xea39, 0x4f52, 0xc4e5, 0xd38b, 0x583c
	.hword  0x12ea, 0x995d, 0x8e33, 0x0584, 0xa0ef, 0x2b58, 0x3c36, 0xb781
	.hword  0xd883, 0x5334, 0x445a, 0xcfed, 0x6a86, 0xe131, 0xf65f, 0x7de8
	.hword  0x373e, 0xbc89, 0xabe7, 0x2050, 0x853b, 0x0e8c, 0x19e2, 0x9255
	.hword  0x8c4e, 0x07f9, 0x1097, 0x9b20, 0x3e4b, 0xb5fc, 0xa292, 0x2925
	.hword  0x63f3, 0xe844, 0xff2a, 0x749d, 0xd1f6, 0x5a41, 0x4d2f, 0xc698
	.hword  0x7119, 0xfaae, 0xedc0, 0x6677, 0xc31c, 0x48ab, 0x5fc5, 0xd472
	.hword  0x9ea4, 0x1513, 0x027d, 0x89ca, 0x2ca1, 0xa716, 0xb078, 0x3bcf
	.hword  0x25d4, 0xae63, 0xb90d, 0x32ba, 0x97d1, 0x1c66, 0x0b08, 0x80bf
	.hword  0xca69, 0x41de, 0x56b0, 0xdd07, 0x786c, 0xf3db, 0xe4b5, 0x6f02
	.hword  0x3ab1, 0xb106, 0xa668, 0x2ddf, 0x88b4, 0x0303, 0x146d, 0x9fda
	.hword  0xd50c, 0x5ebb, 0x49d5, 0xc262, 0x6709, 0xecbe, 0xfbd0, 0x7067
	.hword  0x6e7c, 0xe5cb, 0xf2a5, 0x7912, 0xdc79, 0x57ce, 0x40a0, 0xcb17
	.hword  0x81c1, 0x0a76, 0x1d18, 0x96af, 0x33c4, 0xb873, 0xaf1d, 0x24aa
	.hword  0x932b, 0x189c, 0x0ff2, 0x8445, 0x212e, 0xaa99, 0xbdf7, 0x3640
	.hword  0x7c96, 0xf721, 0xe04f, 0x6bf8, 0xce93, 0x4524, 0x524a, 0xd9fd
	.hword  0xc7e6, 0x4c51, 0x5b3f, 0xd088, 0x75e3, 0xfe54, 0xe93a, 0x628d
	.hword  0x285b, 0xa3ec, 0xb482, 0x3f35, 0x9a5e, 0x11e9, 0x0687, 0x8d30
	.hword  0xe232, 0x6985, 0x7eeb, 0xf55c, 0x5037, 0xdb80, 0xccee, 0x4759
	.hword  0x0d8f, 0x8638, 0x9156, 0x1ae1, 0xbf8a, 0x343d, 0x2353, 0xa8e4
	.hword  0xb6ff, 0x3d48, 0x2a26, 0xa191, 0x04fa, 0x8f4d, 0x9823, 0x1394
	.hword  0x5942, 0xd2f5, 0xc59b, 0x4e2c, 0xeb47, 0x60f0, 0x779e, 0xfc29
	.hword  0x4ba8, 0xc01f, 0xd771, 0x5cc6, 0xf9ad, 0x721a, 0x6574, 0xeec3
	.hword  0xa415, 0x2fa2, 0x38cc, 0xb37b, 0x1610, 0x9da7, 0x8ac9, 0x017e
	.hword  0x1f65, 0x94d2, 0x83bc, 0x080b, 0xad60, 0x26d7, 0x31b9, 0xba0e
	.hword  0xf0d8, 0x7b6f, 0x6c01, 0xe7b6, 0x42dd, 0xc96a, 0xde04, 0x55b3