// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Copyright (c) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
// Copyright (c) 2017 International Business Machines Corp.
// All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#include <ppc-asm.h>
#include "ppc-opcode.h"

#undef toc

#ifndef r1
#define r1 1
#endif

#ifndef r2
#define r2 2
#endif

	.section	.rodata
.balign 16

.byteswap_constant:
	/* byte reverse permute constant */
	.octa 0x0F0E0D0C0B0A09080706050403020100

#define __ASSEMBLY__
#include "crc32c_ppc_constants.h"

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

#define const1		v24
#define const2		v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */
FUNC_START(__crc32_vpmsum)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0

	/* Enough room for saving 10 non-volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7
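	/*
	 * Editorial note on the save area implied by the code above: the
	 * seven GPRs sit at r1-56 through r1-8, v20-v27 from r6 = r1-216
	 * upwards, and v28/v29 at r7 = r1-88 and r1-72. Everything stays
	 * within the red zone below the stack pointer (288 bytes in the
	 * 64-bit ELF ABI), so no stack frame needs to be established.
	 */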
	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, r3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9
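	/*
	 * Worked example (editorial, assuming MAX_SIZE = 32768 purely for
	 * illustration): after the shifts, r9 = MAX_SIZE/8 accounts for one
	 * 16-byte constant per 128-byte chunk of a full MAX_SIZE block, and
	 * r8 = (bytes/128)*16 covers the chunks we will actually process,
	 * so the subtract leaves r8 = (MAX_SIZE - bytes)/8, the byte offset
	 * of the first constant to use. For a 256-byte block that is
	 * (32768 - 256)/8 = 4064, i.e. we skip the first 254 of the 256
	 * constants.
	 */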
	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * main loop. We modulo schedule it such that it takes three
	 * iterations to complete - first iteration load, second iteration
	 * vpmsum, third iteration xor.
	 */
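	/*
	 * Sketch of one steady-state group in the loop below (editorial
	 * note, not from the original source): each group retires the
	 * vpmsum started two iterations ago, starts the vpmsum for the
	 * data loaded one iteration ago, and loads fresh data for the
	 * next iteration:
	 *
	 *   vxor    v0,v0,v8          <- xor in result of iteration i-2
	 *   VPMSUMD(v8,v16,const2)    <- multiply data from iteration i-1
	 *   lvx     v16,0,r4          <- load data for iteration i
	 *
	 * The ori r2,r2,0 instructions are no-ops, presumably placed to
	 * pad instruction dispatch groups on POWER8.
	 */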
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1	/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit

#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate
	 * q, the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
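	/*
	 * In equation form (an editorial sketch of the steps below): with
	 * n the CRC polynomial held in const2 and m = floor(x^64 / n) held
	 * in const1,
	 *
	 *   q      = floor((a * m) / 2^64)   <- vpmsumd, then shift down 64
	 *   result = a xor (q * n)           <- subtraction is xor in GF(2)
	 *
	 * which leaves the 32-bit remainder a mod n.
	 */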
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)	/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif

	/* Get it into r3 */
	MFVRD(r3, v0)

.Lout:
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)
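	/*
	 * Fall-through xor chain (editorial note): for a tail of N+1
	 * 16-byte chunks the bdz branches above enter at .LvN, and
	 * execution falls through to .Lv0, xoring each partial product
	 * into one of the two accumulators v19 and v20, which are then
	 * combined below.
	 */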
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(__crc32_vpmsum)
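/*
 * Editorial usage sketch (hypothetical caller, not part of this file),
 * matching the prototype documented above FUNC_START:
 *
 *   extern unsigned int __crc32_vpmsum(unsigned int crc, void *p,
 *                                      unsigned long len);
 *
 *   crc = __crc32_vpmsum(crc, buf, len);
 */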