// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Copyright (c) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
// Copyright (c) 2017 International Business Machines Corp.
// All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#if defined (__clang__)
#include "third-party/gcc/ppc-asm.h"
#else
#include <ppc-asm.h>
#endif
#include "ppc-opcode.h"

#undef toc

#ifndef r1
#define r1 1
#endif

#ifndef r2
#define r2 2
#endif

        .section        .rodata
.balign 16

.byteswap_constant:
        /* byte reverse permute constant */
        .octa 0x0F0E0D0C0B0A09080706050403020100

#define __ASSEMBLY__
#include "crc32c_ppc_constants.h"

        .text

/*
 * Byte-swap the input data only when the memory byte order and the
 * CRC bit-reflection mode disagree (see the REFLECT / endianness
 * combinations below).
 */
#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

/* Non-volatile GPRs used as constant lvx offsets (16..112 bytes). */
#define off16           r25
#define off32           r26
#define off48           r27
#define off64           r28
#define off80           r29
#define off96           r30
#define off112          r31

/* Vector registers holding the current pair of folding constants. */
#define const1          v24
#define const2          v25

#define byteswap        v26
#define mask_32bit      v27
#define mask_64bit      v28
#define zeroes          v29

/* VPERM compiles away entirely when no byte swap is needed. */
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
/*
 * In:   r3 = initial CRC, r4 = data pointer, r5 = length in bytes
 * Out:  r3 = updated CRC
 *
 * Strategy: fold the input 128 bytes at a time with vpmsumd against a
 * table of precomputed constants (crc32c_ppc_constants.h), reduce the
 * eight 128-bit accumulators (v0-v7) to 64 bits, then perform a final
 * Barrett reduction.  Inputs shorter than 256 bytes take the .Lshort
 * path, which folds 16 bytes at a time.
 *
 * Register roles inside the function:
 *   r10     = saved initial CRC (returned unchanged for len == 0)
 *   r6      = bytes remaining in multiples of 256; r7 = block/loop counts
 *   v16-v23 = 128 bytes of input data; v8-v15 = vpmsumd products
 *   v0-v7   = running checksums
 */
FUNC_START(__crc32_vpmsum)
        /*
         * Save the non-volatile GPRs we use as offsets.  Stores are at
         * negative offsets from r1 (no frame is established) — this
         * relies on the ABI-protected area below the stack pointer;
         * TODO confirm against the target ELF ABI's red-zone size.
         */
        std     r31,-8(r1)
        std     r30,-16(r1)
        std     r29,-24(r1)
        std     r28,-32(r1)
        std     r27,-40(r1)
        std     r26,-48(r1)
        std     r25,-56(r1)

        li      off16,16
        li      off32,32
        li      off48,48
        li      off64,64
        li      off80,80
        li      off96,96
        li      off112,112
        li      r0,0            /* r0 = 0: first pass through the 1b loop */

        /* Enough room for saving 10 non volatile VMX registers */
        subi    r6,r1,56+10*16
        subi    r7,r1,56+2*16

        /* Save non-volatile vector registers v20-v29 below the GPR area. */
        stvx    v20,0,r6
        stvx    v21,off16,r6
        stvx    v22,off32,r6
        stvx    v23,off48,r6
        stvx    v24,off64,r6
        stvx    v25,off80,r6
        stvx    v26,off96,r6
        stvx    v27,off112,r6
        stvx    v28,0,r7
        stvx    v29,off16,r7

        mr      r10,r3          /* keep the initial CRC for the len==0 case */

        vxor    zeroes,zeroes,zeroes
        vspltisw v0,-1          /* v0 = all ones */

        /* Build 32-bit and 64-bit low-half masks from the all-ones vector. */
        vsldoi  mask_32bit,zeroes,v0,4
        vsldoi  mask_64bit,zeroes,v0,8

        /* Get the initial value into v8 */
        vxor    v8,v8,v8
        MTVRD(v8, r3)
#ifdef REFLECT
        vsldoi  v8,zeroes,v8,8  /* shift into bottom 32 bits */
#else
        vsldoi  v8,v8,zeroes,4  /* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
        /* Load the byte-reverse permute constant via the TOC. */
        addis   r3,r2,.byteswap_constant@toc@ha
        addi    r3,r3,.byteswap_constant@toc@l

        lvx     byteswap,0,r3
        addi    r3,r3,16
#endif

        cmpdi   r5,256
        blt     .Lshort

        /* Round the length down to a multiple of 256 bytes. */
        rldicr  r6,r5,0,56

        /* Checksum in blocks of MAX_SIZE */
1:      lis     r7,MAX_SIZE@h
        ori     r7,r7,MAX_SIZE@l
        mr      r9,r7
        cmpd    r6,r7
        bgt     2f
        mr      r7,r6
2:      subf    r6,r7,r6

        /* our main loop does 128 bytes at a time */
        srdi    r7,r7,7

        /*
         * Work out the offset into the constants table to start at. Each
         * constant is 16 bytes, and it is used against 128 bytes of input
         * data - 128 / 16 = 8
         */
        sldi    r8,r7,4
        srdi    r9,r9,3
        subf    r8,r8,r9

        /* We reduce our final 128 bytes in a separate step */
        addi    r7,r7,-1
        mtctr   r7

        addis   r3,r2,.constants@toc@ha
        addi    r3,r3,.constants@toc@l

        /* Find the start of our constants */
        add     r3,r3,r8

        /* zero v0-v7 which will contain our checksums */
        vxor    v0,v0,v0
        vxor    v1,v1,v1
        vxor    v2,v2,v2
        vxor    v3,v3,v3
        vxor    v4,v4,v4
        vxor    v5,v5,v5
        vxor    v6,v6,v6
        vxor    v7,v7,v7

        lvx     const1,0,r3

        /*
         * If we are looping back to consume more data we use the values
         * already in v16-v23.
         */
        cmpdi   r0,1
        beq     2f

        /* First warm up pass */
        lvx     v16,0,r4
        lvx     v17,off16,r4
        VPERM(v16,v16,v16,byteswap)
        VPERM(v17,v17,v17,byteswap)
        lvx     v18,off32,r4
        lvx     v19,off48,r4
        VPERM(v18,v18,v18,byteswap)
        VPERM(v19,v19,v19,byteswap)
        lvx     v20,off64,r4
        lvx     v21,off80,r4
        VPERM(v20,v20,v20,byteswap)
        VPERM(v21,v21,v21,byteswap)
        lvx     v22,off96,r4
        lvx     v23,off112,r4
        VPERM(v22,v22,v22,byteswap)
        VPERM(v23,v23,v23,byteswap)
        addi    r4,r4,8*16

        /* xor in initial value */
        vxor    v16,v16,v8

2:      bdz     .Lfirst_warm_up_done

        addi    r3,r3,16
        lvx     const2,0,r3

        /* Second warm up pass */
        VPMSUMD(v8,v16,const1)
        lvx     v16,0,r4
        VPERM(v16,v16,v16,byteswap)
        ori     r2,r2,0         /* no-op — presumably a dispatch-group
                                 * scheduling hint; confirm on target core */

        VPMSUMD(v9,v17,const1)
        lvx     v17,off16,r4
        VPERM(v17,v17,v17,byteswap)
        ori     r2,r2,0

        VPMSUMD(v10,v18,const1)
        lvx     v18,off32,r4
        VPERM(v18,v18,v18,byteswap)
        ori     r2,r2,0

        VPMSUMD(v11,v19,const1)
        lvx     v19,off48,r4
        VPERM(v19,v19,v19,byteswap)
        ori     r2,r2,0

        VPMSUMD(v12,v20,const1)
        lvx     v20,off64,r4
        VPERM(v20,v20,v20,byteswap)
        ori     r2,r2,0

        VPMSUMD(v13,v21,const1)
        lvx     v21,off80,r4
        VPERM(v21,v21,v21,byteswap)
        ori     r2,r2,0

        VPMSUMD(v14,v22,const1)
        lvx     v22,off96,r4
        VPERM(v22,v22,v22,byteswap)
        ori     r2,r2,0

        VPMSUMD(v15,v23,const1)
        lvx     v23,off112,r4
        VPERM(v23,v23,v23,byteswap)

        addi    r4,r4,8*16

        bdz     .Lfirst_cool_down

        /*
         * main loop. We modulo schedule it such that it takes three iterations
         * to complete - first iteration load, second iteration vpmsum, third
         * iteration xor.
         */
        .balign 16
4:      lvx     const1,0,r3
        addi    r3,r3,16
        ori     r2,r2,0

        vxor    v0,v0,v8
        VPMSUMD(v8,v16,const2)
        lvx     v16,0,r4
        VPERM(v16,v16,v16,byteswap)
        ori     r2,r2,0

        vxor    v1,v1,v9
        VPMSUMD(v9,v17,const2)
        lvx     v17,off16,r4
        VPERM(v17,v17,v17,byteswap)
        ori     r2,r2,0

        vxor    v2,v2,v10
        VPMSUMD(v10,v18,const2)
        lvx     v18,off32,r4
        VPERM(v18,v18,v18,byteswap)
        ori     r2,r2,0

        vxor    v3,v3,v11
        VPMSUMD(v11,v19,const2)
        lvx     v19,off48,r4
        VPERM(v19,v19,v19,byteswap)
        lvx     const2,0,r3     /* next constant; second half uses const1 */
        ori     r2,r2,0

        vxor    v4,v4,v12
        VPMSUMD(v12,v20,const1)
        lvx     v20,off64,r4
        VPERM(v20,v20,v20,byteswap)
        ori     r2,r2,0

        vxor    v5,v5,v13
        VPMSUMD(v13,v21,const1)
        lvx     v21,off80,r4
        VPERM(v21,v21,v21,byteswap)
        ori     r2,r2,0

        vxor    v6,v6,v14
        VPMSUMD(v14,v22,const1)
        lvx     v22,off96,r4
        VPERM(v22,v22,v22,byteswap)
        ori     r2,r2,0

        vxor    v7,v7,v15
        VPMSUMD(v15,v23,const1)
        lvx     v23,off112,r4
        VPERM(v23,v23,v23,byteswap)

        addi    r4,r4,8*16

        bdnz    4b

.Lfirst_cool_down:
        /* First cool down pass */
        lvx     const1,0,r3
        addi    r3,r3,16

        vxor    v0,v0,v8
        VPMSUMD(v8,v16,const1)
        ori     r2,r2,0

        vxor    v1,v1,v9
        VPMSUMD(v9,v17,const1)
        ori     r2,r2,0

        vxor    v2,v2,v10
        VPMSUMD(v10,v18,const1)
        ori     r2,r2,0

        vxor    v3,v3,v11
        VPMSUMD(v11,v19,const1)
        ori     r2,r2,0

        vxor    v4,v4,v12
        VPMSUMD(v12,v20,const1)
        ori     r2,r2,0

        vxor    v5,v5,v13
        VPMSUMD(v13,v21,const1)
        ori     r2,r2,0

        vxor    v6,v6,v14
        VPMSUMD(v14,v22,const1)
        ori     r2,r2,0

        vxor    v7,v7,v15
        VPMSUMD(v15,v23,const1)
        ori     r2,r2,0

.Lsecond_cool_down:
        /* Second cool down pass */
        vxor    v0,v0,v8
        vxor    v1,v1,v9
        vxor    v2,v2,v10
        vxor    v3,v3,v11
        vxor    v4,v4,v12
        vxor    v5,v5,v13
        vxor    v6,v6,v14
        vxor    v7,v7,v15

#ifdef REFLECT
        /*
         * vpmsumd produces a 96 bit result in the least significant bits
         * of the register. Since we are bit reflected we have to shift it
         * left 32 bits so it occupies the least significant bits in the
         * bit reflected domain.
         */
        vsldoi  v0,v0,zeroes,4
        vsldoi  v1,v1,zeroes,4
        vsldoi  v2,v2,zeroes,4
        vsldoi  v3,v3,zeroes,4
        vsldoi  v4,v4,zeroes,4
        vsldoi  v5,v5,zeroes,4
        vsldoi  v6,v6,zeroes,4
        vsldoi  v7,v7,zeroes,4
#endif

        /* xor with last 1024 bits */
        lvx     v8,0,r4
        lvx     v9,off16,r4
        VPERM(v8,v8,v8,byteswap)
        VPERM(v9,v9,v9,byteswap)
        lvx     v10,off32,r4
        lvx     v11,off48,r4
        VPERM(v10,v10,v10,byteswap)
        VPERM(v11,v11,v11,byteswap)
        lvx     v12,off64,r4
        lvx     v13,off80,r4
        VPERM(v12,v12,v12,byteswap)
        VPERM(v13,v13,v13,byteswap)
        lvx     v14,off96,r4
        lvx     v15,off112,r4
        VPERM(v14,v14,v14,byteswap)
        VPERM(v15,v15,v15,byteswap)

        addi    r4,r4,8*16

        vxor    v16,v0,v8
        vxor    v17,v1,v9
        vxor    v18,v2,v10
        vxor    v19,v3,v11
        vxor    v20,v4,v12
        vxor    v21,v5,v13
        vxor    v22,v6,v14
        vxor    v23,v7,v15

        /* r0 = 1 marks a repeat pass: v16-v23 already hold data. */
        li      r0,1
        cmpdi   r6,0
        addi    r6,r6,128
        bne     1b

        /* Work out how many bytes we have left */
        andi.   r5,r5,127

        /* Calculate where in the constant table we need to start */
        subfic  r6,r5,128
        add     r3,r3,r6

        /* How many 16 byte chunks are in the tail */
        srdi    r7,r5,4
        mtctr   r7

        /*
         * Reduce the previously calculated 1024 bits to 64 bits, shifting
         * 32 bits to include the trailing 32 bits of zeros
         */
        lvx     v0,0,r3
        lvx     v1,off16,r3
        lvx     v2,off32,r3
        lvx     v3,off48,r3
        lvx     v4,off64,r3
        lvx     v5,off80,r3
        lvx     v6,off96,r3
        lvx     v7,off112,r3
        addi    r3,r3,8*16

        VPMSUMW(v0,v16,v0)
        VPMSUMW(v1,v17,v1)
        VPMSUMW(v2,v18,v2)
        VPMSUMW(v3,v19,v3)
        VPMSUMW(v4,v20,v4)
        VPMSUMW(v5,v21,v5)
        VPMSUMW(v6,v22,v6)
        VPMSUMW(v7,v23,v7)

        /* Now reduce the tail (0 - 112 bytes) */
        cmpdi   r7,0
        beq     1f

        lvx     v16,0,r4
        lvx     v17,0,r3
        VPERM(v16,v16,v16,byteswap)
        VPMSUMW(v16,v16,v17)
        vxor    v0,v0,v16
        bdz     1f

        lvx     v16,off16,r4
        lvx     v17,off16,r3
        VPERM(v16,v16,v16,byteswap)
        VPMSUMW(v16,v16,v17)
        vxor    v0,v0,v16
        bdz     1f

        lvx     v16,off32,r4
        lvx     v17,off32,r3
        VPERM(v16,v16,v16,byteswap)
        VPMSUMW(v16,v16,v17)
        vxor    v0,v0,v16
        bdz     1f

        lvx     v16,off48,r4
        lvx     v17,off48,r3
        VPERM(v16,v16,v16,byteswap)
        VPMSUMW(v16,v16,v17)
        vxor    v0,v0,v16
        bdz     1f

        lvx     v16,off64,r4
        lvx     v17,off64,r3
        VPERM(v16,v16,v16,byteswap)
        VPMSUMW(v16,v16,v17)
        vxor    v0,v0,v16
        bdz     1f

        lvx     v16,off80,r4
        lvx     v17,off80,r3
        VPERM(v16,v16,v16,byteswap)
        VPMSUMW(v16,v16,v17)
        vxor    v0,v0,v16
        bdz     1f

        lvx     v16,off96,r4
        lvx     v17,off96,r3
        VPERM(v16,v16,v16,byteswap)
        VPMSUMW(v16,v16,v17)
        vxor    v0,v0,v16

        /* Now xor all the parallel chunks together */
1:      vxor    v0,v0,v1
        vxor    v2,v2,v3
        vxor    v4,v4,v5
        vxor    v6,v6,v7

        vxor    v0,v0,v2
        vxor    v4,v4,v6

        vxor    v0,v0,v4

.Lbarrett_reduction:
        /* Barrett constants */
        addis   r3,r2,.barrett_constants@toc@ha
        addi    r3,r3,.barrett_constants@toc@l

        lvx     const1,0,r3
        lvx     const2,off16,r3

        vsldoi  v1,v0,v0,8
        vxor    v0,v0,v1        /* xor two 64 bit results together */

#ifdef REFLECT
        /* shift left one bit */
        vspltisb v1,1
        vsl     v0,v0,v1
#endif

        vand    v0,v0,mask_64bit

#ifndef REFLECT
        /*
         * Now for the Barrett reduction algorithm. The idea is to calculate q,
         * the multiple of our polynomial that we need to subtract. By
         * doing the computation 2x bits higher (ie 64 bits) and shifting the
         * result back down 2x bits, we round down to the nearest multiple.
         */
        VPMSUMD(v1,v0,const1)   /* ma */
        vsldoi  v1,zeroes,v1,8  /* q = floor(ma/(2^64)) */
        VPMSUMD(v1,v1,const2)   /* qn */
        vxor    v0,v0,v1        /* a - qn, subtraction is xor in GF(2) */

        /*
         * Get the result into r3. We need to shift it left 8 bytes:
         * V0 [ 0 1 2 X ]
         * V0 [ 0 X 2 3 ]
         */
        vsldoi  v0,v0,zeroes,8  /* shift result into top 64 bits */
#else
        /*
         * The reflected version of Barrett reduction. Instead of bit
         * reflecting our data (which is expensive to do), we bit reflect our
         * constants and our algorithm, which means the intermediate data in
         * our vector registers goes from 0-63 instead of 63-0. We can reflect
         * the algorithm because we don't carry in mod 2 arithmetic.
         */
        vand    v1,v0,mask_32bit        /* bottom 32 bits of a */
        VPMSUMD(v1,v1,const1)           /* ma */
        vand    v1,v1,mask_32bit        /* bottom 32bits of ma */
        VPMSUMD(v1,v1,const2)           /* qn */
        vxor    v0,v0,v1                /* a - qn, subtraction is xor in GF(2) */

        /*
         * Since we are bit reflected, the result (ie the low 32 bits) is in
         * the high 32 bits. We just need to shift it left 4 bytes
         * V0 [ 0 1 X 3 ]
         * V0 [ 0 X 2 3 ]
         */
        vsldoi  v0,v0,zeroes,4          /* shift result into top 64 bits of */
#endif

        /* Get it into r3 */
        MFVRD(r3, v0)

.Lout:
        /* Restore the non-volatile vector and GPR registers saved above. */
        subi    r6,r1,56+10*16
        subi    r7,r1,56+2*16

        lvx     v20,0,r6
        lvx     v21,off16,r6
        lvx     v22,off32,r6
        lvx     v23,off48,r6
        lvx     v24,off64,r6
        lvx     v25,off80,r6
        lvx     v26,off96,r6
        lvx     v27,off112,r6
        lvx     v28,0,r7
        lvx     v29,off16,r7

        ld      r31,-8(r1)
        ld      r30,-16(r1)
        ld      r29,-24(r1)
        ld      r28,-32(r1)
        ld      r27,-40(r1)
        ld      r26,-48(r1)
        ld      r25,-56(r1)

        blr

.Lfirst_warm_up_done:
        /* Only one 128-byte block: multiply once and go straight to xor. */
        lvx     const1,0,r3
        addi    r3,r3,16

        VPMSUMD(v8,v16,const1)
        VPMSUMD(v9,v17,const1)
        VPMSUMD(v10,v18,const1)
        VPMSUMD(v11,v19,const1)
        VPMSUMD(v12,v20,const1)
        VPMSUMD(v13,v21,const1)
        VPMSUMD(v14,v22,const1)
        VPMSUMD(v15,v23,const1)

        b       .Lsecond_cool_down

.Lshort:
        /* Inputs of 1-255 bytes: fold 16 bytes at a time. */
        cmpdi   r5,0
        beq     .Lzero

        addis   r3,r2,.short_constants@toc@ha
        addi    r3,r3,.short_constants@toc@l

        /* Calculate where in the constant table we need to start */
        subfic  r6,r5,256
        add     r3,r3,r6

        /* How many 16 byte chunks? */
        srdi    r7,r5,4
        mtctr   r7

        /* v19/v20 accumulate alternate chunks (xored together at the end). */
        vxor    v19,v19,v19
        vxor    v20,v20,v20

        lvx     v0,0,r4
        lvx     v16,0,r3
        VPERM(v0,v0,v16,byteswap)
        vxor    v0,v0,v8        /* xor in initial value */
        VPMSUMW(v0,v0,v16)
        bdz     .Lv0

        lvx     v1,off16,r4
        lvx     v17,off16,r3
        VPERM(v1,v1,v17,byteswap)
        VPMSUMW(v1,v1,v17)
        bdz     .Lv1

        lvx     v2,off32,r4
        lvx     v16,off32,r3
        VPERM(v2,v2,v16,byteswap)
        VPMSUMW(v2,v2,v16)
        bdz     .Lv2

        lvx     v3,off48,r4
        lvx     v17,off48,r3
        VPERM(v3,v3,v17,byteswap)
        VPMSUMW(v3,v3,v17)
        bdz     .Lv3

        lvx     v4,off64,r4
        lvx     v16,off64,r3
        VPERM(v4,v4,v16,byteswap)
        VPMSUMW(v4,v4,v16)
        bdz     .Lv4

        lvx     v5,off80,r4
        lvx     v17,off80,r3
        VPERM(v5,v5,v17,byteswap)
        VPMSUMW(v5,v5,v17)
        bdz     .Lv5

        lvx     v6,off96,r4
        lvx     v16,off96,r3
        VPERM(v6,v6,v16,byteswap)
        VPMSUMW(v6,v6,v16)
        bdz     .Lv6

        lvx     v7,off112,r4
        lvx     v17,off112,r3
        VPERM(v7,v7,v17,byteswap)
        VPMSUMW(v7,v7,v17)
        bdz     .Lv7

        addi    r3,r3,128
        addi    r4,r4,128

        lvx     v8,0,r4
        lvx     v16,0,r3
        VPERM(v8,v8,v16,byteswap)
        VPMSUMW(v8,v8,v16)
        bdz     .Lv8

        lvx     v9,off16,r4
        lvx     v17,off16,r3
        VPERM(v9,v9,v17,byteswap)
        VPMSUMW(v9,v9,v17)
        bdz     .Lv9

        lvx     v10,off32,r4
        lvx     v16,off32,r3
        VPERM(v10,v10,v16,byteswap)
        VPMSUMW(v10,v10,v16)
        bdz     .Lv10

        lvx     v11,off48,r4
        lvx     v17,off48,r3
        VPERM(v11,v11,v17,byteswap)
        VPMSUMW(v11,v11,v17)
        bdz     .Lv11

        lvx     v12,off64,r4
        lvx     v16,off64,r3
        VPERM(v12,v12,v16,byteswap)
        VPMSUMW(v12,v12,v16)
        bdz     .Lv12

        lvx     v13,off80,r4
        lvx     v17,off80,r3
        VPERM(v13,v13,v17,byteswap)
        VPMSUMW(v13,v13,v17)
        bdz     .Lv13

        lvx     v14,off96,r4
        lvx     v16,off96,r3
        VPERM(v14,v14,v16,byteswap)
        VPMSUMW(v14,v14,v16)
        bdz     .Lv14

        lvx     v15,off112,r4
        lvx     v17,off112,r3
        VPERM(v15,v15,v17,byteswap)
        VPMSUMW(v15,v15,v17)

        /*
         * The early-exit bdz branches land here; each entry point xors
         * in every chunk at or below it by falling through.
         */
.Lv15:  vxor    v19,v19,v15
.Lv14:  vxor    v20,v20,v14
.Lv13:  vxor    v19,v19,v13
.Lv12:  vxor    v20,v20,v12
.Lv11:  vxor    v19,v19,v11
.Lv10:  vxor    v20,v20,v10
.Lv9:   vxor    v19,v19,v9
.Lv8:   vxor    v20,v20,v8
.Lv7:   vxor    v19,v19,v7
.Lv6:   vxor    v20,v20,v6
.Lv5:   vxor    v19,v19,v5
.Lv4:   vxor    v20,v20,v4
.Lv3:   vxor    v19,v19,v3
.Lv2:   vxor    v20,v20,v2
.Lv1:   vxor    v19,v19,v1
.Lv0:   vxor    v20,v20,v0

        vxor    v0,v19,v20

        b       .Lbarrett_reduction

.Lzero:
        /* len == 0: return the initial CRC unchanged. */
        mr      r3,r10
        b       .Lout

FUNC_END(__crc32_vpmsum)