1;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2; Copyright(c) 2011-2017 Intel Corporation All rights reserved. 3; 4; Redistribution and use in source and binary forms, with or without 5; modification, are permitted provided that the following conditions 6; are met: 7; * Redistributions of source code must retain the above copyright 8; notice, this list of conditions and the following disclaimer. 9; * Redistributions in binary form must reproduce the above copyright 10; notice, this list of conditions and the following disclaimer in 11; the documentation and/or other materials provided with the 12; distribution. 13; * Neither the name of Intel Corporation nor the names of its 14; contributors may be used to endorse or promote products derived 15; from this software without specific prior written permission. 16; 17; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;
; Function API:
;       UINT16 crc16_t10dif_copy_by4(
;               UINT16 init_crc,          // initial CRC value, 16 bits
;               unsigned char *dst,       // buffer pointer destination for copy
;               const unsigned char *src, // buffer pointer to calculate CRC on
;               UINT64 len                // buffer length in bytes (64-bit data)
;       );
;
; Computes the CRC16 T10-DIF checksum of src[0..len) while simultaneously
; copying the data to dst.  Returns the 16-bit CRC in ax (eax high bits zero
; after the final shr).
;
; Authors:
;       Erdinc Ozturk
;       Vinodh Gopal
;       James Guilford
;
; Reference paper titled "Fast CRC Computation for Generic Polynomials Using
; PCLMULQDQ Instruction"
; URL: http://download.intel.com/design/intarch/papers/323102.pdf
;

%include "reg_sizes.asm"

; software-prefetch distance (bytes ahead of the current read pointer)
%define fetch_dist 1024

[bits 64]
default rel

section .text

; Map the four arguments onto the platform calling convention.
; arg1 = init_crc, arg2 = dst, arg3 = src, arg4 = len.
%ifidn __OUTPUT_FORMAT__, win64
        %xdefine        arg1 rcx
        %xdefine        arg2 rdx
        %xdefine        arg3 r8
        %xdefine        arg4 r9
        %xdefine        tmp1 r10
        %xdefine        arg1_low32 ecx
%else
        %xdefine        arg1 rdi
        %xdefine        arg2 rsi
        %xdefine        arg3 rdx
        %xdefine        arg4 rcx
        %xdefine        tmp1 r10
        %xdefine        arg1_low32 edi
%endif

;-----------------------------------------------------------------------
; UINT16 crc16_t10dif_copy_by4(UINT16 init_crc, uchar *dst,
;                              const uchar *src, UINT64 len)
; Out:    eax = CRC16 (upper 16 bits of eax are zero)
; Clobb:  rax, r10, r11, arg registers, xmm0-xmm5, flags.
;         xmm6/xmm7 are used but saved/restored on the stack (required
;         callee-saved on Win64; harmless extra work on SysV).
; Stack:  16*4+8 bytes; rsp+0..31 is also scratch for the <16-byte tail.
; Requires SSE4.1 + PCLMULQDQ (pblendvb/pextrd/pclmulqdq).
;-----------------------------------------------------------------------
align 16
mk_global crc16_t10dif_copy_by4, function
crc16_t10dif_copy_by4:
	endbranch

	; adjust the 16-bit initial_crc value, scale it to 32 bits
	shl	arg1_low32, 16

	; After this point, code flow is exactly same as a 32-bit CRC.
	; The only difference is before returning eax, we will shift
	; it right 16 bits, to scale back to 16 bits.

	sub	rsp, 16*4+8

	; push the xmm registers into the stack to maintain
	movdqa	[rsp+16*2], xmm6
	movdqa	[rsp+16*3], xmm7

	; check if smaller than 128B
	cmp	arg4, 128

	; for sizes less than 128, we can't fold 64B at a time...
	jl	_less_than_128


	; load the initial crc value
	movd	xmm6, arg1_low32	; initial crc

	; crc value does not need to be byte-reflected, but it needs to
	; be moved to the high part of the register.
	; because data will be byte-reflected and will align with
	; initial crc at correct place.
	pslldq	xmm6, 12

	movdqa	xmm7, [SHUF_MASK]	; byte-reversal shuffle control
	; receive the initial 64B data, xor the initial crc value
	movdqu	xmm0, [arg3]
	movdqu	xmm1, [arg3+16]
	movdqu	xmm2, [arg3+32]
	movdqu	xmm3, [arg3+48]

	; copy initial data
	movdqu	[arg2], xmm0
	movdqu	[arg2+16], xmm1
	movdqu	[arg2+32], xmm2
	movdqu	[arg2+48], xmm3

	pshufb	xmm0, xmm7		; byte-reflect each 16B lane
	; XOR the initial_crc value
	pxor	xmm0, xmm6
	pshufb	xmm1, xmm7
	pshufb	xmm2, xmm7
	pshufb	xmm3, xmm7

	movdqa	xmm6, [rk3]	;xmm6 has rk3 and rk4
				;imm value of pclmulqdq instruction
				;will determine which constant to use
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
	; we subtract 128 instead of 64 to save one instruction from the loop
	sub	arg4, 128

	; at this section of the code, there is 64*x+y (0<=y<64) bytes of
	; buffer. The _fold_64_B_loop
	; loop will fold 64B at a time until we have 64+y Bytes of buffer


	; fold 64B at a time. This section of the code folds 4 xmm
	; registers in parallel
_fold_64_B_loop:

	; update the buffer pointer
	add	arg3, 64		; buf += 64;
	add	arg2, 64

	prefetchnta [arg3+fetch_dist+0]
	movdqu	xmm4, xmm0
	movdqu	xmm5, xmm1

	; fold each register: (hi64 * rk4) ^ (lo64 * rk3) ^ new data
	pclmulqdq	xmm0, xmm6, 0x11
	pclmulqdq	xmm1, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	pxor	xmm0, xmm4
	pxor	xmm1, xmm5

	prefetchnta [arg3+fetch_dist+32]
	movdqu	xmm4, xmm2
	movdqu	xmm5, xmm3

	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm3, xmm6, 0x11

	pclmulqdq	xmm4, xmm6, 0x0
	pclmulqdq	xmm5, xmm6, 0x0

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	; bring in the next 64B, copying it to dst as we go
	movdqu	xmm4, [arg3]
	movdqu	xmm5, [arg3+16]
	movdqu	[arg2], xmm4
	movdqu	[arg2+16], xmm5
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7
	pxor	xmm0, xmm4
	pxor	xmm1, xmm5

	movdqu	xmm4, [arg3+32]
	movdqu	xmm5, [arg3+48]
	movdqu	[arg2+32], xmm4
	movdqu	[arg2+48], xmm5
	pshufb	xmm4, xmm7
	pshufb	xmm5, xmm7

	pxor	xmm2, xmm4
	pxor	xmm3, xmm5

	sub	arg4, 64

	; check if there is another 64B in the buffer to be able to fold
	jge	_fold_64_B_loop
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


	add	arg3, 64
	add	arg2, 64
	; at this point, the buffer pointer is pointing at the last y Bytes
	; of the buffer the 64B of folded data is in 4 of the xmm registers:
	; xmm0, xmm1, xmm2, xmm3


	; fold the 4 xmm registers to 1 xmm register with different constants

	movdqa	xmm6, [rk1]	;xmm6 has rk1 and rk2
				;imm value of pclmulqdq instruction will
				;determine which constant to use

	; chain-fold xmm0 -> xmm1 -> xmm2 -> xmm3
	movdqa	xmm4, xmm0
	pclmulqdq	xmm0, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm1, xmm4
	pxor	xmm1, xmm0

	movdqa	xmm4, xmm1
	pclmulqdq	xmm1, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm2, xmm4
	pxor	xmm2, xmm1

	movdqa	xmm4, xmm2
	pclmulqdq	xmm2, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2


	; instead of 64, we add 48 to the loop counter to save 1 instruction
	; from the loop instead of a cmp instruction, we use the negative
	; flag with the jl instruction
	add	arg4, 64-16
	jl	_final_reduction_for_128

	; now we have 16+y bytes left to reduce. 16 Bytes
	; is in register xmm3 and the rest is in memory
	; we can fold 16 bytes at a time if y>=16
	; continue folding 16B at a time

_16B_reduction_loop:
	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	movdqu	xmm0, [arg3]
	movdqu	[arg2], xmm0		; keep the copy going
	pshufb	xmm0, xmm7
	pxor	xmm3, xmm0
	add	arg3, 16
	add	arg2, 16
	sub	arg4, 16
	; instead of a cmp instruction, we utilize the flags with the jge
	; instruction equivalent of: cmp arg4, 16-16
	; check if there is any more 16B in the buffer to be able to fold
	jge	_16B_reduction_loop

	;now we have 16+z bytes left to reduce, where 0<= z < 16.
	;first, we reduce the data in the xmm3 register


_final_reduction_for_128:
	; check if any more data to fold. If not, compute the CRC of the
	; final 128 bits
	add	arg4, 16
	je	_128_done

	; here we are getting data that is less than 16 bytes.
	; since we know that there was data before the pointer,
	; we can offset the input pointer before the actual point,
	; to receive exactly 16 bytes.
	; after that the registers need to be adjusted.
_get_last_two_xmms:
	movdqa	xmm2, xmm3

	movdqu	xmm1, [arg3 - 16 + arg4]
	movdqu	[arg2 - 16 + arg4], xmm1	; copy the (overlapping) tail
	pshufb	xmm1, xmm7

	; get rid of the extra data that was loaded before
	; load the shift constant
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, arg4
	movdqu	xmm0, [rax]

	; shift xmm2 to the left by arg4 bytes
	pshufb	xmm2, xmm0

	; shift xmm3 to the right by 16-arg4 bytes
	pxor	xmm0, [mask1]		; flip control: select opposite bytes
	pshufb	xmm3, xmm0
	pblendvb	xmm1, xmm2	;xmm0 is implicit blend control

	; fold 16 Bytes
	movdqa	xmm2, xmm1
	movdqa	xmm4, xmm3
	pclmulqdq	xmm3, xmm6, 0x11
	pclmulqdq	xmm4, xmm6, 0x0
	pxor	xmm3, xmm4
	pxor	xmm3, xmm2

_128_done:
	; compute crc of a 128-bit value
	movdqa	xmm6, [rk5]		; rk5 and rk6 in xmm6
	movdqa	xmm0, xmm3

	;64b fold
	pclmulqdq	xmm3, xmm6, 0x1
	pslldq	xmm0, 8
	pxor	xmm3, xmm0

	;32b fold
	movdqa	xmm0, xmm3

	pand	xmm0, [mask2]		; keep low 96 bits

	psrldq	xmm3, 12
	pclmulqdq	xmm3, xmm6, 0x10
	pxor	xmm3, xmm0

	;barrett reduction
_barrett:
	movdqa	xmm6, [rk7]		; rk7 (2^64/Q) and rk8 (Q) in xmm6
	movdqa	xmm0, xmm3
	pclmulqdq	xmm3, xmm6, 0x01
	pslldq	xmm3, 4
	pclmulqdq	xmm3, xmm6, 0x11

	pslldq	xmm3, 4
	pxor	xmm3, xmm0
	pextrd	eax, xmm3, 1		; extract the 32-bit remainder

_cleanup:
	; scale the result back to 16 bits
	shr	eax, 16
	movdqa	xmm6, [rsp+16*2]	; restore callee-saved xmm regs
	movdqa	xmm7, [rsp+16*3]
	add	rsp, 16*4+8
	ret


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

align 16
_less_than_128:

	; check if there is enough buffer to be able to fold 16B at a time
	cmp	arg4, 32
	jl	_less_than_32
	movdqa	xmm7, [SHUF_MASK]

	; if there is, load the constants
	movdqa	xmm6, [rk1]		; rk1 and rk2 in xmm6

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12		; align it to its correct place
	movdqu	xmm3, [arg3]		; load the plaintext
	movdqu	[arg2], xmm3		; store copy
	pshufb	xmm3, xmm7		; byte-reflect the plaintext
	pxor	xmm3, xmm0


	; update the buffer pointer
	add	arg3, 16
	add	arg2, 16

	; update the counter. subtract 32 instead of 16 to save one
	; instruction from the loop
	sub	arg4, 32

	jmp	_16B_reduction_loop


align 16
_less_than_32:
	; mov initial crc to the return value. this is necessary for
	; zero-length buffers.
	mov	eax, arg1_low32
	test	arg4, arg4
	je	_cleanup

	movdqa	xmm7, [SHUF_MASK]

	movd	xmm0, arg1_low32	; get the initial crc value
	pslldq	xmm0, 12		; align it to its correct place

	cmp	arg4, 16
	je	_exact_16_left
	jl	_less_than_16_left

	movdqu	xmm3, [arg3]		; load the plaintext
	movdqu	[arg2], xmm3		; store the copy
	pshufb	xmm3, xmm7		; byte-reflect the plaintext
	pxor	xmm3, xmm0		; xor the initial crc value
	add	arg3, 16
	add	arg2, 16
	sub	arg4, 16
	movdqa	xmm6, [rk1]		; rk1 and rk2 in xmm6
	jmp	_get_last_two_xmms


align 16
_less_than_16_left:
	; use stack space to load data less than 16 bytes, zero-out the 16B
	; in memory first.

	pxor	xmm1, xmm1
	mov	r11, rsp
	movdqa	[r11], xmm1

	cmp	arg4, 4
	jl	_only_less_than_4

	; backup the counter value
	mov	tmp1, arg4
	cmp	arg4, 8
	jl	_less_than_8_left

	; load 8 Bytes
	mov	rax, [arg3]
	mov	[arg2], rax
	mov	[r11], rax
	add	r11, 8
	sub	arg4, 8
	add	arg3, 8
	add	arg2, 8
_less_than_8_left:

	cmp	arg4, 4
	jl	_less_than_4_left

	; load 4 Bytes
	mov	eax, [arg3]
	mov	[arg2], eax
	mov	[r11], eax
	add	r11, 4
	sub	arg4, 4
	add	arg3, 4
	add	arg2, 4
_less_than_4_left:

	cmp	arg4, 2
	jl	_less_than_2_left

	; load 2 Bytes
	mov	ax, [arg3]
	mov	[arg2], ax
	mov	[r11], ax
	add	r11, 2
	sub	arg4, 2
	add	arg3, 2
	add	arg2, 2
_less_than_2_left:
	cmp	arg4, 1
	jl	_zero_left

	; load 1 Byte
	mov	al, [arg3]
	mov	[arg2], al
	mov	[r11], al
_zero_left:
	movdqa	xmm3, [rsp]		; zero-padded tail, now 16B
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0		; xor the initial crc value

	; shl tmp1, 4
	lea	rax, [pshufb_shf_table + 16]
	sub	rax, tmp1
	movdqu	xmm0, [rax]
	pxor	xmm0, [mask1]

	pshufb	xmm3, xmm0		; right-shift data into position
	jmp	_128_done

align 16
_exact_16_left:
	movdqu	xmm3, [arg3]
	movdqu	[arg2], xmm3
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0		; xor the initial crc value

	jmp	_128_done

_only_less_than_4:
	cmp	arg4, 3
	jl	_only_less_than_3

	; load 3 Bytes
	mov	al, [arg3]
	mov	[arg2], al
	mov	[r11], al

	mov	al, [arg3+1]
	mov	[arg2+1], al
	mov	[r11+1], al

	mov	al, [arg3+2]
	mov	[arg2+2], al
	mov	[r11+2], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0		; xor the initial crc value

	psrldq	xmm3, 5			; only 3 data bytes: skip 64b/32b folds

	jmp	_barrett
_only_less_than_3:
	cmp	arg4, 2
	jl	_only_less_than_2

	; load 2 Bytes
	mov	al, [arg3]
	mov	[arg2], al
	mov	[r11], al

	mov	al, [arg3+1]
	mov	[arg2+1], al
	mov	[r11+1], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0		; xor the initial crc value

	psrldq	xmm3, 6

	jmp	_barrett
_only_less_than_2:

	; load 1 Byte
	mov	al, [arg3]
	mov	[arg2], al
	mov	[r11], al

	movdqa	xmm3, [rsp]
	pshufb	xmm3, xmm7
	pxor	xmm3, xmm0		; xor the initial crc value

	psrldq	xmm3, 7

	jmp	_barrett

section .data

; precomputed constants
; these constants are precomputed from the poly: 0x8bb70000 (0x8bb7 scaled
; to 32 bits)
align 16
; Q = 0x18BB70000
; rk1 = 2^(32*3) mod Q << 32
; rk2 = 2^(32*5) mod Q << 32
; rk3 = 2^(32*15) mod Q << 32
; rk4 = 2^(32*17) mod Q << 32
; rk5 = 2^(32*3) mod Q << 32
; rk6 = 2^(32*2) mod Q << 32
; rk7 = floor(2^64/Q)
; rk8 = Q
rk1:
DQ 0x2d56000000000000
rk2:
DQ 0x06df000000000000
rk3:
DQ 0x044c000000000000
rk4:
DQ 0xe658000000000000
rk5:
DQ 0x2d56000000000000
rk6:
DQ 0x1368000000000000
rk7:
DQ 0x00000001f65a57f8
rk8:
DQ 0x000000018bb70000
mask1:
dq 0x8080808080808080, 0x8080808080808080
mask2:
dq 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF

SHUF_MASK:
dq 0x08090A0B0C0D0E0F, 0x0001020304050607

pshufb_shf_table:
; use these values for shift constants for the pshufb instruction
; different alignments result in values as shown:
;	dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1
;	dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-2) / shr2
;	dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-3) / shr3
;	dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4
;	dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5
;	dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6
;	dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9  (16-7) / shr7
;	dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8  (16-8) / shr8
;	dq 0x008f8e8d8c8b8a89, 0x0807060504030201 ; shl 7  (16-9) / shr9
;	dq 0x01008f8e8d8c8b8a, 0x0908070605040302 ; shl 6  (16-10) / shr10
;	dq 0x0201008f8e8d8c8b, 0x0a09080706050403 ; shl 5  (16-11) / shr11
;	dq 0x030201008f8e8d8c, 0x0b0a090807060504 ; shl 4  (16-12) / shr12
;	dq 0x04030201008f8e8d, 0x0c0b0a0908070605 ; shl 3  (16-13) / shr13
;	dq 0x0504030201008f8e, 0x0d0c0b0a09080706 ; shl 2  (16-14) / shr14
;	dq 0x060504030201008f, 0x0e0d0c0b0a090807 ; shl 1  (16-15) / shr15
dq 0x8786858483828100, 0x8f8e8d8c8b8a8988
dq 0x0706050403020100, 0x000e0d0c0b0a0908

;;; func        core, ver, snum
slversion crc16_t10dif_copy_by4, 05, 02, 0000