1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2;  Copyright(c) 2011-2017 Intel Corporation All rights reserved.
3;
4;  Redistribution and use in source and binary forms, with or without
5;  modification, are permitted provided that the following conditions
6;  are met:
7;    * Redistributions of source code must retain the above copyright
8;      notice, this list of conditions and the following disclaimer.
9;    * Redistributions in binary form must reproduce the above copyright
10;      notice, this list of conditions and the following disclaimer in
11;      the documentation and/or other materials provided with the
12;      distribution.
13;    * Neither the name of Intel Corporation nor the names of its
14;      contributors may be used to endorse or promote products derived
15;      from this software without specific prior written permission.
16;
17;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
29;
30;       Function API:
31;       UINT16 crc16_t10dif_copy_by4(
32;               UINT16 init_crc, //initial CRC value, 16 bits
33;               unsigned char *dst, //buffer pointer destination for copy
34;               const unsigned char *src, //buffer pointer to calculate CRC on
35;               UINT64 len //buffer length in bytes (64-bit data)
36;       );
37;
38;       Authors:
39;               Erdinc Ozturk
40;               Vinodh Gopal
41;               James Guilford
42;
43;       Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
44;       URL: http://download.intel.com/design/intarch/papers/323102.pdf
45;
46
47%include "reg_sizes.asm"
48
; Prefetch look-ahead in bytes: the 64-byte fold loop issues prefetchnta
; at [arg3 + fetch_dist] and [arg3 + fetch_dist + 32] on every iteration.
49%define	fetch_dist	1024
50
51[bits 64]
52default rel
53
54section .text
; Map the four integer arguments onto registers according to the output
; format being assembled: the win64 branch uses rcx/rdx/r8/r9, every other
; format uses rdi/rsi/rdx/rcx.
; Going by the API comment at the top of the file and the loads/stores below:
;   arg1 = init_crc, arg2 = dst, arg3 = src, arg4 = len.
; tmp1 (r10) is a scratch register; arg1_low32 is the 32-bit view of arg1.
55%ifidn __OUTPUT_FORMAT__, win64
56	%xdefine        arg1 rcx
57	%xdefine        arg2 rdx
58	%xdefine        arg3 r8
59	%xdefine        arg4 r9
60	%xdefine        tmp1 r10
61	%xdefine        arg1_low32 ecx
62%else
63	%xdefine        arg1 rdi
64	%xdefine        arg2 rsi
65	%xdefine        arg3 rdx
66	%xdefine        arg4 rcx
67	%xdefine	tmp1 r10
68	%xdefine        arg1_low32 edi
69%endif
70
71align 16
; ---------------------------------------------------------------------
; crc16_t10dif_copy_by4
;   Copies arg4 (len) bytes from arg3 (src) to arg2 (dst) while folding
;   them into a CRC seeded with arg1 (init_crc). The 16-bit result is
;   returned in eax.
;
; Stack frame (16*4+8 = 72 bytes):
;   [rsp+16*0]  16-byte scratch buffer used by the len < 16 paths
;   [rsp+16*2]  saved xmm6
;   [rsp+16*3]  saved xmm7
;
; Registers written: rax, tmp1 (r10), r11, xmm0-xmm5, and the argument
; registers themselves. xmm6/xmm7 are saved here and restored at _cleanup.
; ---------------------------------------------------------------------
72mk_global 	crc16_t10dif_copy_by4, function
73crc16_t10dif_copy_by4:
74	endbranch		; macro not defined in this file - presumably the CET/IBT landing pad (confirm in reg_sizes.asm)
75
76	; adjust the 16-bit initial_crc value, scale it to 32 bits
	; (the shift also discards anything above bit 15 of the argument)
77	shl	arg1_low32, 16
78
79	; After this point, code flow is exactly same as a 32-bit CRC.
80	; The only difference is before returning eax, we will shift
81	; it right 16 bits, to scale back to 16 bits.
82
	; Reserve the frame. The extra 8 bytes bring rsp back to a 16-byte
	; boundary (rsp is 8 mod 16 at function entry), which the movdqa
	; accesses to the stack below require.
83	sub	rsp,16*4+8
84
85	; save xmm6 and xmm7 on the stack so they can be restored before
	; returning (they are callee-saved under the Windows x64 convention)
86	movdqa [rsp+16*2],xmm6
87	movdqa [rsp+16*3],xmm7
88
89	; check if smaller than 128B
90	cmp	arg4, 128
91
92	; for sizes less than 128, we can't fold 64B at a time...
	; NOTE(review): jl is a signed test, so a len with bit 63 set also
	; takes this branch.
93	jl	_less_than_128
94
95
96	; load the initial crc value
97	movd	xmm6, arg1_low32	; initial crc
98
99	; crc value does not need to be byte-reflected, but it needs to
100	; be moved to the high part of the register.
101	; because data will be byte-reflected and will align with
102	; initial crc at correct place.
103	pslldq	xmm6, 12
104
105	movdqa xmm7, [SHUF_MASK]	; xmm7 = byte-reversal mask, kept for the rest of the function
106	; receive the initial 64B data, xor the initial crc value
107	movdqu	xmm0, [arg3]
108	movdqu	xmm1, [arg3+16]
109	movdqu	xmm2, [arg3+32]
110	movdqu	xmm3, [arg3+48]
111
112	; copy initial data (stored before the in-register byte reversal,
	; so dst receives the bytes exactly as read from src)
113	movdqu	[arg2], xmm0
114	movdqu	[arg2+16], xmm1
115	movdqu	[arg2+32], xmm2
116	movdqu	[arg2+48], xmm3
117
118	pshufb	xmm0, xmm7
119	; XOR the initial_crc value
120	pxor	xmm0, xmm6
121	pshufb	xmm1, xmm7
122	pshufb	xmm2, xmm7
123	pshufb	xmm3, xmm7
124
125	movdqa	xmm6, [rk3]	;xmm6 has rk3 (low qword) and rk4 (high qword)
126					;imm value of pclmulqdq instruction
127					;will determine which constant to use
128	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
129	; we subtract 128 instead of 64 to save one instruction from the loop:
	; 64B are already loaded, so arg4 becomes (bytes not yet loaded) - 64
	; and the loop can branch directly on the flags of its own "sub arg4, 64"
130	sub	arg4, 128
131
132	; at this section of the code, there are 64*x+y (0<=y<64) bytes of
133	; buffer. The _fold_64_B_loop
134	; loop will fold 64B at a time until we have 64+y Bytes of buffer
135
136
137	; Main loop: fold 64B at a time. This section of the code folds 4 xmm
138	; registers (xmm0-xmm3) in parallel.
	; At the loop label on every iteration:
	;   xmm0-xmm3 = 64B of byte-reversed, folded state
	;   xmm6      = rk3 (low qword) : rk4 (high qword)
	;   xmm7      = SHUF_MASK
	;   arg4      = (bytes not yet loaded) - 64
139_fold_64_B_loop:
140
141	; update the buffer pointers (source and destination move together)
142	add	arg3, 64		;    buf += 64;
143	add	arg2, 64
144
145	prefetchnta [arg3+fetch_dist+0]
	; xmm4/xmm5 = copies of the state, so both halves can be multiplied
146	movdqu	xmm4, xmm0
147	movdqu	xmm5, xmm1
148
	; imm 0x11: high qword of the state times the high qword of xmm6 (rk4)
149	pclmulqdq	xmm0, xmm6 , 0x11
150	pclmulqdq	xmm1, xmm6 , 0x11
151
	; imm 0x00: low qword of the state times the low qword of xmm6 (rk3)
152	pclmulqdq	xmm4, xmm6, 0x0
153	pclmulqdq	xmm5, xmm6, 0x0
154
155	pxor	xmm0, xmm4
156	pxor	xmm1, xmm5
157
158	prefetchnta [arg3+fetch_dist+32]
	; same fold for the second pair of accumulators
159	movdqu	xmm4, xmm2
160	movdqu	xmm5, xmm3
161
162	pclmulqdq	xmm2, xmm6, 0x11
163	pclmulqdq	xmm3, xmm6, 0x11
164
165	pclmulqdq	xmm4, xmm6, 0x0
166	pclmulqdq	xmm5, xmm6, 0x0
167
168	pxor	xmm2, xmm4
169	pxor	xmm3, xmm5
170
	; load the next 64B, copy them to dst unchanged, then byte-reverse
	; them and xor them into the folded state
171	movdqu	xmm4, [arg3]
172	movdqu	xmm5, [arg3+16]
173	movdqu	[arg2], xmm4
174	movdqu	[arg2+16], xmm5
175	pshufb	xmm4, xmm7
176	pshufb	xmm5, xmm7
177	pxor	xmm0, xmm4
178	pxor	xmm1, xmm5
179
180	movdqu	xmm4, [arg3+32]
181	movdqu	xmm5, [arg3+48]
182	movdqu	[arg2+32], xmm4
183	movdqu	[arg2+48], xmm5
184	pshufb	xmm4, xmm7
185	pshufb	xmm5, xmm7
186
187	pxor	xmm2, xmm4
188	pxor	xmm3, xmm5
189
190	sub	arg4, 64
191
192	; check if there is another 64B in the buffer to be able to fold
	; (signed test on the result of the sub above: arg4 goes negative
	; once fewer than 64 unloaded bytes remain)
193	jge	_fold_64_B_loop
194	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
195
196
	; step both pointers past the last 64B block that was loaded
197	add	arg3, 64
198	add	arg2, 64
199	; at this point, the buffer pointer is pointing at the last y Bytes of the buffer
200	; the 64B of folded data is in 4 of the xmm registers: xmm0, xmm1, xmm2, xmm3
201
202
203	; fold the 4 xmm registers to 1 xmm register with different constants
204
205	movdqa	xmm6, [rk1]	;xmm6 has rk1 (low qword) and rk2 (high qword)
206					;imm value of pclmulqdq instruction will
207					;determine which constant to use
208
	; xmm1 ^= fold(xmm0)
209	movdqa	xmm4, xmm0
210	pclmulqdq	xmm0, xmm6, 0x11
211	pclmulqdq	xmm4, xmm6, 0x0
212	pxor	xmm1, xmm4
213	pxor	xmm1, xmm0
214
	; xmm2 ^= fold(xmm1)
215	movdqa	xmm4, xmm1
216	pclmulqdq	xmm1, xmm6, 0x11
217	pclmulqdq	xmm4, xmm6, 0x0
218	pxor	xmm2, xmm4
219	pxor	xmm2, xmm1
220
	; xmm3 ^= fold(xmm2); xmm3 now carries the whole folded state
221	movdqa	xmm4, xmm2
222	pclmulqdq	xmm2, xmm6, 0x11
223	pclmulqdq	xmm4, xmm6, 0x0
224	pxor	xmm3, xmm4
225	pxor	xmm3, xmm2
226
227
228	; arg4 is y-64 here. Adding 48 instead of 64 leaves it at y-16, which is the
229	; bias the 16B loop expects, and the flags of the add replace a separate cmp:
	; a negative result means fewer than 16 bytes remain in memory.
230	add	arg4, 64-16
231	jl	_final_reduction_for_128
232
233	; now we have 16+y bytes left to reduce. 16 Bytes
234	; is in register xmm3 and the rest is in memory
235	; we can fold 16 bytes at a time if y>=16
236	; continue folding 16B at a time
	;
	; This label is also the jump target of the _less_than_128 path.
	; Expected state at the label:
	;   xmm3 = 16B folded state, xmm6 = rk1:rk2, xmm7 = SHUF_MASK,
	;   arg4 = (bytes not yet loaded) - 16
237
238_16B_reduction_loop:
	; fold the current 16B state (high qword * rk2, low qword * rk1)
239	movdqa	xmm4, xmm3
240	pclmulqdq	xmm3, xmm6, 0x11
241	pclmulqdq	xmm4, xmm6, 0x0
242	pxor	xmm3, xmm4
	; load the next 16B, copy them to dst, byte-reverse and xor into the state
243	movdqu	xmm0, [arg3]
244	movdqu	[arg2], xmm0
245	pshufb	xmm0, xmm7
246	pxor	xmm3, xmm0
247	add	arg3, 16
248	add	arg2, 16
249	sub	arg4, 16
250	; instead of a cmp instruction, we utilize the flags with the jge instruction
251	; equivalent of: cmp arg4, 16-16
252	; check if there is any more 16B in the buffer to be able to fold
253	jge	_16B_reduction_loop
254
255	;now we have 16+z bytes left to reduce, where 0<= z < 16.
256	;first, we reduce the data in the xmm3 register
257
258
; Handle the 0..15 trailing bytes that did not fill a whole 16B block.
; On entry arg4 = (bytes not yet loaded) - 16; xmm3 = 16B folded state.
259_final_reduction_for_128:
260	; check if any more data to fold. If not, compute the CRC of the final 128 bits
	; (the add removes the -16 bias, so arg4 becomes the trailing byte count)
261	add	arg4, 16
262	je	_128_done
263
264	; here we are getting data that is less than 16 bytes.
265	; since we know that there was data before the pointer,
266	; we can offset the input pointer before the actual point,
267	; to receive exactly 16 bytes.
268	; after that the registers need to be adjusted.
	;
	; Also entered from _less_than_32 (16 < len < 32) with arg4 = len - 16,
	; xmm3 = first 16B xor crc, xmm6 = rk1:rk2, xmm7 = SHUF_MASK.
269_get_last_two_xmms:
270	movdqa	xmm2, xmm3
271
	; xmm1 = the last 16 bytes of the input; this window overlaps
	; 16-arg4 bytes that were already consumed. The same window is
	; stored to dst, which writes the arg4 new bytes and rewrites the
	; overlapped ones.
272	movdqu	xmm1, [arg3 - 16 + arg4]
273	movdqu	[arg2 - 16 + arg4], xmm1
274	pshufb	xmm1, xmm7
275
276	; get rid of the extra data that was loaded before
277	; load the shift constant (16 bytes starting at pshufb_shf_table + 16 - arg4)
278	lea	rax, [pshufb_shf_table + 16]
279	sub	rax, arg4
280	movdqu	xmm0, [rax]
281
282	; shift xmm2 to the left by arg4 bytes
283	pshufb	xmm2, xmm0
284
285	; shift xmm3 to the right by 16-arg4 bytes
	; (xor with mask1 toggles bit 7 of every pshufb control byte, turning
	; the left-shift control into the complementary right-shift control)
286	pxor	xmm0, [mask1]
287	pshufb	xmm3, xmm0
	; merge: take bytes from xmm2 where the control byte in xmm0 has bit 7
	; set, keep the freshly loaded bytes of xmm1 elsewhere
288	pblendvb	xmm1, xmm2	;xmm0 is implicit
289
290	; fold 16 Bytes
291	movdqa	xmm2, xmm1
292	movdqa	xmm4, xmm3
293	pclmulqdq	xmm3, xmm6, 0x11
294	pclmulqdq	xmm4, xmm6, 0x0
295	pxor	xmm3, xmm4
296	pxor	xmm3, xmm2
297
; Reduce the 128-bit folded state in xmm3 down to the value consumed by
; the Barrett step. Reached by fall-through and by jumps from the short
; input paths (_less_than_16_left, _exact_16_left).
298_128_done:
299	; compute crc of a 128-bit value
300	movdqa	xmm6, [rk5]	; rk5 (low qword) and rk6 (high qword) in xmm6
301	movdqa	xmm0, xmm3
302
303	;64b fold
	; imm 0x1: high qword of xmm3 times low qword of xmm6 (rk5);
	; the low qword of the original value is moved up and xored in
304	pclmulqdq	xmm3, xmm6, 0x1
305	pslldq	xmm0, 8
306	pxor	xmm3, xmm0
307
308	;32b fold
309	movdqa	xmm0, xmm3
310
	; mask2 clears the top 32 bits of xmm0
311	pand	xmm0, [mask2]
312
	; move the top 32 bits of xmm3 down to the bottom and multiply them
	; by rk6 (imm 0x10: low qword of xmm3 times high qword of xmm6)
313	psrldq	xmm3, 12
314	pclmulqdq	xmm3, xmm6, 0x10
315	pxor	xmm3, xmm0
316
317	;barrett reduction
	; Input: xmm3. Output: eax = dword 1 of the reduced value, still
	; scaled to 32 bits (the shift at _cleanup scales it back to 16 bits).
	; Also the jump target of the 1-, 2- and 3-byte input paths.
318_barrett:
319	movdqa	xmm6, [rk7]	; rk7 (low qword) and rk8 (high qword) in xmm6
320	movdqa	xmm0, xmm3
	; imm 0x01: high qword of xmm3 times rk7
321	pclmulqdq	xmm3, xmm6, 0x01
322	pslldq	xmm3, 4
	; imm 0x11: high qword of xmm3 times rk8
323	pclmulqdq	xmm3, xmm6, 0x11
324
325	pslldq	xmm3, 4
326	pxor	xmm3, xmm0
	; result is taken from bits 32..63 of xmm3
327	pextrd	eax, xmm3,1
328
; Common epilogue. eax holds the CRC scaled to 32 bits, either from
; _barrett or, for len == 0, the shifted initial value set in _less_than_32.
329_cleanup:
330	; scale the result back to 16 bits
331	shr	eax, 16
	; restore the registers saved in the prologue and release the frame
332	movdqa	xmm6, [rsp+16*2]
333	movdqa	xmm7, [rsp+16*3]
334	add	rsp,16*4+8
335	ret
336
337
338;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
339;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
340;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
341;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
342
343align 16
; Path for len < 128 (signed compare in the prologue). Inputs of 32 bytes
; or more are handled by seeding xmm3 with the first 16B and entering the
; 16B-at-a-time loop; shorter inputs go to _less_than_32.
344_less_than_128:
345
346	; check if there is enough buffer to be able to fold 16B at a time
347	cmp	arg4, 32
348	jl	_less_than_32
349	movdqa xmm7, [SHUF_MASK]	; xmm7 = byte-reversal mask
350
351	; if there is, load the constants
352	movdqa	xmm6, [rk1]	; rk1 and rk2 in xmm6
353
354	movd	xmm0, arg1_low32	; get the initial crc value
355	pslldq	xmm0, 12	; align it to its correct place
356	movdqu	xmm3, [arg3]	; load the plaintext
357	movdqu	[arg2], xmm3	; store copy
358	pshufb	xmm3, xmm7	; byte-reflect the plaintext
359	pxor	xmm3, xmm0	; xor the initial crc value
360
361
362	; update the buffer pointers
363	add	arg3, 16
364	add	arg2, 16
365
366	; update the counter. subtract 32 instead of 16 to save one instruction from the loop
	; (16B are already loaded, so arg4 becomes (bytes not yet loaded) - 16)
367	sub	arg4, 32
368
369	jmp	_16B_reduction_loop
370
371
372align 16
; Path for len < 32. Dispatches on the length:
;   len == 0       -> return the initial crc unchanged (via _cleanup)
;   len == 16      -> _exact_16_left
;   len <  16      -> _less_than_16_left
;   16 < len < 32  -> load the first 16B here, then _get_last_two_xmms
373_less_than_32:
374	; mov initial crc to the return value. this is necessary for zero-length buffers.
	; (arg1_low32 is already shifted left by 16; _cleanup shifts it back)
375	mov	eax, arg1_low32
376	test	arg4, arg4
377	je	_cleanup
378
379	movdqa xmm7, [SHUF_MASK]	; xmm7 = byte-reversal mask
380
381	movd	xmm0, arg1_low32	; get the initial crc value
382	pslldq	xmm0, 12		; align it to its correct place
383
	; one cmp feeds both branches: je for exactly 16, jl (signed) for fewer
384	cmp	arg4, 16
385	je	_exact_16_left
386	jl	_less_than_16_left
387
388	movdqu	xmm3, [arg3]	; load the plaintext
389	movdqu	[arg2], xmm3	; store the copy
390	pshufb	xmm3, xmm7	; byte-reflect the plaintext
391	pxor	xmm3, xmm0	; xor the initial crc value
392	add	arg3, 16
393	add	arg2, 16
394	sub	arg4, 16	; arg4 = number of trailing bytes (1..15)
395	movdqa	xmm6, [rk1]	; rk1 and rk2 in xmm6
396	jmp	_get_last_two_xmms
397
398
399align 16
; Path for 0 < len < 16. The input is copied to dst and, in parallel,
; into a zeroed 16-byte scratch slot at [rsp] so it can be loaded as one
; xmm value. Lengths below 4 branch to _only_less_than_4.
; On entry: xmm0 = initial crc in the top dword, xmm7 = SHUF_MASK.
; r11 is the write cursor into the scratch slot; tmp1 keeps the original
; length because arg4 is consumed while copying.
400_less_than_16_left:
401	; use stack space to load data less than 16 bytes, zero-out the 16B in memory first.
402
403	pxor	xmm1, xmm1
404	mov	r11, rsp
405	movdqa	[r11], xmm1
406
407	cmp	arg4, 4
408	jl	_only_less_than_4
409
410	;	backup the counter value
411	mov	tmp1, arg4
412	cmp	arg4, 8
413	jl	_less_than_8_left
414
415	; load 8 Bytes (copy to dst and to the scratch slot)
416	mov	rax, [arg3]
417	mov	[arg2], rax
418	mov	[r11], rax
419	add	r11, 8
420	sub	arg4, 8
421	add	arg3, 8
422	add	arg2, 8
423_less_than_8_left:
424
425	cmp	arg4, 4
426	jl	_less_than_4_left
427
428	; load 4 Bytes
429	mov	eax, [arg3]
430	mov	[arg2], eax
431	mov	[r11], eax
432	add	r11, 4
433	sub	arg4, 4
434	add	arg3, 4
435	add	arg2, 4
436_less_than_4_left:
437
438	cmp	arg4, 2
439	jl	_less_than_2_left
440
441	; load 2 Bytes
442	mov	ax, [arg3]
443	mov	[arg2], ax
444	mov	[r11], ax
445	add	r11, 2
446	sub	arg4, 2
447	add	arg3, 2
448	add	arg2, 2
449_less_than_2_left:
450	cmp	arg4, 1
451	jl	_zero_left
452
453	; load 1 Byte
454	mov	al, [arg3]
455	mov	[arg2], al
456	mov	[r11], al
457_zero_left:
	; load the zero-padded copy, byte-reverse it and xor in the crc
458	movdqa	xmm3, [rsp]
459	pshufb	xmm3, xmm7
460	pxor	xmm3, xmm0	; xor the initial crc value
461
462	; (disabled leftover instruction: shl tmp1, 4)
	; build the pshufb control from pshufb_shf_table + 16 - len and toggle
	; bit 7 of every control byte with mask1; the same control is the
	; "shift right by 16-arg4 bytes" one in _get_last_two_xmms
463	lea	rax, [pshufb_shf_table + 16]
464	sub	rax, tmp1
465	movdqu	xmm0, [rax]
466	pxor	xmm0, [mask1]
467
468	pshufb	xmm3, xmm0
469	jmp	_128_done
470
471align 16
; Path for len == 16: one 16B load, copy to dst, byte-reverse, xor the
; crc (xmm0) and go straight to the 128-bit reduction.
472_exact_16_left:
473	movdqu	xmm3, [arg3]
474	movdqu	[arg2], xmm3
475	pshufb	xmm3, xmm7
476	pxor	xmm3, xmm0	; xor the initial crc value
477
478	jmp	_128_done
479
; Path for len < 4, reached from _less_than_16_left, which has already
; zeroed the scratch slot at [rsp] and set r11 = rsp.
; len == 3 is handled here; shorter lengths continue at _only_less_than_3.
480_only_less_than_4:
481	cmp	arg4, 3
482	jl	_only_less_than_3
483
484	; load 3 Bytes, one at a time, into dst and the scratch slot
485	mov	al, [arg3]
486	mov	[arg2], al
487	mov	[r11], al
488
489	mov	al, [arg3+1]
490	mov	[arg2+1], al
491	mov	[r11+1], al
492
493	mov	al, [arg3+2]
494	mov	[arg2+2], al
495	mov	[r11+2], al
496
497	movdqa	xmm3, [rsp]
498	pshufb	xmm3, xmm7
499	pxor	xmm3, xmm0	; xor the initial crc value
500
	; shift right by 5 bytes (8 - len) and skip the folds at _128_done,
	; entering the Barrett step directly
501	psrldq	xmm3, 5
502
503	jmp	_barrett
; len < 3: len == 2 is handled here, len == 1 continues at
; _only_less_than_2. Relies on [rsp] having been zeroed and r11 = rsp
; (done in _less_than_16_left).
504_only_less_than_3:
505	cmp	arg4, 2
506	jl	_only_less_than_2
507
508	; load 2 Bytes, one at a time, into dst and the scratch slot
509	mov	al, [arg3]
510	mov	[arg2], al
511	mov	[r11], al
512
513	mov	al, [arg3+1]
514	mov	[arg2+1], al
515	mov	[r11+1], al
516
517	movdqa	xmm3, [rsp]
518	pshufb	xmm3, xmm7
519	pxor	xmm3, xmm0	; xor the initial crc value
520
	; shift right by 6 bytes (8 - len), then go directly to the Barrett step
521	psrldq	xmm3, 6
522
523	jmp	_barrett
; Remaining case of the short-input chain (len == 1 for valid inputs).
; Relies on [rsp] having been zeroed and r11 = rsp (done in
; _less_than_16_left).
524_only_less_than_2:
525
526	; load 1 Byte into dst and the scratch slot
527	mov	al, [arg3]
528	mov	[arg2],al
529	mov	[r11], al
530
531	movdqa	xmm3, [rsp]
532	pshufb	xmm3, xmm7
533	pxor	xmm3, xmm0	; xor the initial crc value
534
	; shift right by 7 bytes (8 - len), then go directly to the Barrett step
535	psrldq	xmm3, 7
536
537	jmp	_barrett
538
; NOTE(review): the code visible in this file only reads these constants,
; yet they are placed in the writable .data section - confirm whether a
; read-only section is preferred for the supported output formats.
539section .data
540
541; precomputed constants
542; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled to 32 bits)
;
; The constants are consumed in 16-byte pairs through aligned accesses
; (movdqa xmm6, [rkN] and pxor/pand with a memory operand), so each pair
; must start on a 16-byte boundary; the align below plus the 16-byte size
; of every pair provides that.
;   rk1:rk2  fold 16B at a time (4->1 fold, 16B loop, short inputs)
;   rk3:rk4  fold 64B at a time (_fold_64_B_loop)
;   rk5:rk6  128-bit -> 64-bit reduction (_128_done)
;   rk7:rk8  Barrett reduction (_barrett)
543align 16
544; Q = 0x18BB70000
545; rk1 = 2^(32*3) mod Q << 32
546; rk2 = 2^(32*5) mod Q << 32
547; rk3 = 2^(32*15) mod Q << 32
548; rk4 = 2^(32*17) mod Q << 32
549; rk5 = 2^(32*3) mod Q << 32
550; rk6 = 2^(32*2) mod Q << 32
551; rk7 = floor(2^64/Q)
552; rk8 = Q
553rk1:
554DQ 0x2d56000000000000
555rk2:
556DQ 0x06df000000000000
557rk3:
558DQ 0x044c000000000000
559rk4:
560DQ 0xe658000000000000
561rk5:
562DQ 0x2d56000000000000
563rk6:
564DQ 0x1368000000000000
565rk7:
566DQ 0x00000001f65a57f8
567rk8:
568DQ 0x000000018bb70000
; mask1: 0x80 in every byte; xored into a pshufb control vector to toggle
; bit 7 of each control byte
569mask1:
570dq 0x8080808080808080, 0x8080808080808080
; mask2: all ones except the top 32 bits; used with pand to clear them
571mask2:
572dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF
573
; SHUF_MASK: pshufb control 15,14,...,0 - reverses the byte order of a
; 16-byte register
574SHUF_MASK:
575dq 0x08090A0B0C0D0E0F, 0x0001020304050607
576
; 32-byte table read through a sliding 16-byte window: the code loads
; 16 bytes from pshufb_shf_table + 16 - n (n = number of bytes, 1..15)
; with an unaligned load and uses them as a pshufb control vector. The
; commented rows below list the window contents for each n.
577pshufb_shf_table:
578; use these values for shift constants for the pshufb instruction
579; different alignments result in values as shown:
580;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
581;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
582;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
583;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
584;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
585;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
586;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
587;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
588;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
589;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
590;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
591;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
592;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
593;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
594;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
; first 16 bytes: control bytes with bit 7 set (0x81..0x8f after a leading 0x00)
595dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
; second 16 bytes: plain indices 0x00..0x0e; the final byte is never
; inside a window for n >= 1
596dq 0x0706050403020100, 0x000e0d0c0b0a0908
597
; slversion is not defined in this file - presumably a macro from
; reg_sizes.asm that records version metadata for the function (confirm).
598;;;       func                   core, ver, snum
599slversion crc16_t10dif_copy_by4, 05,   02,  0000
600