// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// The vectorized implementation found below is a derived work
// from code written by Anton Blanchard <anton@au.ibm.com> found
// at https://github.com/antonblanchard/crc32-vpmsum. The original
// is dual licensed under GPL and Apache 2. As the copyright holder
// for the work, IBM has contributed this new work under
// the golang license.

// Changes include porting to Go assembler with modifications for
// the Go ABI for ppc64le.

#include "textflag.h"

#define POWER8_OFFSET 132

#define off16 R16
#define off32 R17
#define off48 R18
#define off64 R19
#define off80 R20
#define off96 R21
#define off112 R22

#define const1 V24
#define const2 V25

#define byteswap V26
#define mask_32bit V27
#define mask_64bit V28
#define zeroes V29

#define MAX_SIZE 32*1024
#define REFLECT

TEXT ·ppc64SlicingUpdateBy8(SB), NOSPLIT|NOFRAME, $0-44
    MOVWZ crc+0(FP), R3   // incoming crc
    MOVD table8+8(FP), R4 // *Table
    MOVD p+16(FP), R5
    MOVD p_len+24(FP), R6 // p len

    CMP $0,R6          // len == 0?
    BNE start
    MOVW R3,ret+40(FP) // return crc
    RET

start:
    NOR R3,R3,R7  // ^crc
    MOVWZ R7,R7   // 32 bits
    CMP R6,$16
    MOVD R6,CTR
    BLT short
    SRAD $3,R6,R8 // 8 byte chunks
    MOVD R8,CTR
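// The loop below is a standard slicing-by-8 update, 8 bytes per
// iteration. As a reference sketch only (not part of the build), the
// Go equivalent is roughly the following, where tab stands for the
// 8x256 uint32 table passed in as table8 and crc is the pre-inverted
// working value kept in R7:
//
//	crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
//	crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
//		tab[4][crc>>24] ^ tab[5][(crc>>16)&0xff] ^
//		tab[6][(crc>>8)&0xff] ^ tab[7][crc&0xff]
//	p = p[8:]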
loop:
    MOVWZ 0(R5),R8        // p[0:4] (little endian load)
    MOVWZ 4(R5),R9        // p[4:8]
    MOVD R4,R10           // &tab[0]
    XOR R7,R8,R7          // crc ^= p[0:4]
    RLDICL $40,R9,$56,R17 // p[7]
    SLD $2,R17,R17        // p[7]*4
    RLDICL $40,R7,$56,R8  // crc>>24
    ADD R17,R10,R17       // &tab[0][p[7]]
    SLD $2,R8,R8          // (crc>>24)*4
    RLDICL $48,R9,$56,R18 // p[6]
    SLD $2,R18,R18        // p[6]*4
    ADD $1024,R10,R10     // tab[1]
    MOVWZ 0(R17),R21      // tab[0][p[7]]
    RLDICL $56,R9,$56,R19 // p[5]
    ADD R10,R18,R18       // &tab[1][p[6]]
    SLD $2,R19,R19        // p[5]*4
    MOVWZ 0(R18),R22      // tab[1][p[6]]
    ADD $1024,R10,R10     // tab[2]
    XOR R21,R22,R21       // xor done R22
    ADD R19,R10,R19       // &tab[2][p[5]]
    ANDCC $255,R9,R20     // p[4]
    SLD $2,R20,R20        // p[4]*4
    MOVWZ 0(R19),R23      // tab[2][p[5]]
    ADD $1024,R10,R10     // &tab[3]
    ADD R20,R10,R20       // &tab[3][p[4]]
    XOR R21,R23,R21       // xor done R23
    ADD $1024,R10,R10     // &tab[4]
    MOVWZ 0(R20),R24      // tab[3][p[4]]
    ADD R10,R8,R23        // &tab[4][crc>>24]
    XOR R21,R24,R21       // xor done R24
    MOVWZ 0(R23),R25      // tab[4][crc>>24]
    RLDICL $48,R7,$56,R24 // crc>>16&0xFF
    XOR R21,R25,R21       // xor done R25
    ADD $1024,R10,R10     // &tab[5]
    SLD $2,R24,R24        // (crc>>16&0xFF)*4
    ADD R24,R10,R24       // &tab[5][crc>>16&0xFF]
    MOVWZ 0(R24),R26      // tab[5][crc>>16&0xFF]
    XOR R21,R26,R21       // xor done R26
    RLDICL $56,R7,$56,R25 // crc>>8&0xFF
    ADD $1024,R10,R10     // &tab[6]
    SLD $2,R25,R25        // (crc>>8&0xFF)*4
    ADD R25,R10,R25       // &tab[6][crc>>8&0xFF]
    MOVBZ R7,R26          // crc&0xFF
    ADD $1024,R10,R10     // &tab[7]
    MOVWZ 0(R25),R27      // tab[6][crc>>8&0xFF]
    SLD $2,R26,R26        // (crc&0xFF)*4
    XOR R21,R27,R21       // xor done R27
    ADD R26,R10,R26       // &tab[7][crc&0xFF]
    ADD $8,R5             // p = p[8:]
    MOVWZ 0(R26),R28      // tab[7][crc&0xFF]
    XOR R21,R28,R21       // xor done R28
    MOVWZ R21,R7          // crc for next round
    BC 16,0,loop          // bdnz: next 8 bytes
    ANDCC $7,R6,R8        // any leftover bytes?
    BEQ done              // none --> done
    MOVD R8,CTR           // byte count
    PCALIGN $16           // align short loop
short:
    MOVBZ 0(R5),R8   // get v
    MOVBZ R7,R9      // byte(crc) -> R9
    SRD $8,R7,R14    // crc>>8
    XOR R8,R9,R8     // byte(crc)^v -> R8
    ADD $1,R5        // ptr to next v
    SLD $2,R8        // convert index to bytes
    ADD R8,R4,R9     // &tab[byte(crc)^v]
    MOVWZ 0(R9),R10  // tab[byte(crc)^v]
    XOR R10,R14,R7   // loop crc in R7
    BC 16,0,short    // bdnz: next byte
done:
    NOR R7,R7,R7       // ^crc
    MOVW R7,ret+40(FP) // return crc
    RET

#ifdef BYTESWAP_DATA
DATA ·byteswapcons+0(SB)/8,$0x0706050403020100
DATA ·byteswapcons+8(SB)/8,$0x0f0e0d0c0b0a0908

GLOBL ·byteswapcons+0(SB),RODATA,$16
#endif

TEXT ·vectorCrc32(SB), NOSPLIT|NOFRAME, $0-36
    MOVWZ crc+0(FP), R3   // incoming crc
    MOVWZ ctab+4(FP), R14 // crc poly id
    MOVD p+8(FP), R4
    MOVD p_len+16(FP), R5 // p len

    // R3 = incoming crc
    // R14 = constant table identifier
    // R4 = address of bytes
    // R5 = length of bytes

    // defines for index loads

    MOVD $16,off16
    MOVD $32,off32
    MOVD $48,off48
    MOVD $64,off64
    MOVD $80,off80
    MOVD $96,off96
    MOVD $112,off112
    MOVD $0,R15

    MOVD R3,R10 // save initial crc

    NOR R3,R3,R3              // ^crc
    MOVWZ R3,R3               // 32 bits
    VXOR zeroes,zeroes,zeroes // clear the V reg
    VSPLTISW $-1,V0
    VSLDOI $4,V29,V0,mask_32bit // mask of the low 32 bits
    VSLDOI $8,V29,V0,mask_64bit // mask of the low 64 bits

    VXOR V8,V8,V8
    MTVSRD R3,VS40 // crc initial value VS40 = V8

#ifdef REFLECT
    VSLDOI $8,zeroes,V8,V8 // or: VSLDOI V29,V8,V27,4 for top 32 bits?
#else
    VSLDOI $4,V8,zeroes,V8
#endif

#ifdef BYTESWAP_DATA
    MOVD $·byteswapcons(SB),R3
    LVX (R3),byteswap
#endif

    CMPU R5,$256 // length of bytes
    BLT short

    RLDICR $0,R5,$56,R6 // chunk to process

    // First step for larger sizes
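// The large-size path below consumes the input in chunks of at most
// 32768 bytes, folding 128 bytes per iteration with VPMSUMD
// (carry-less multiply) against a table of precomputed constants.
// Roughly, each 16 byte lane is advanced as
//
//	acc = (acc xor data) * (x^N mod poly)
//
// with N chosen per iteration so that, when the chunk ends, all
// lanes line up and can simply be xor'ed together before the final
// reduction. (A sketch of the vpmsum scheme from the crc32-vpmsum
// code this file is derived from.)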
l1: MOVD $32768,R7 // max chunk size
    MOVD R7,R9
    CMP R6,R7   // compare R6, R7 (MAX SIZE)
    BGT top     // R6 > 32K: process a full 32K chunk
    MOVD R6,R7  // R6 <= 32K: chunk is what remains
top:
    SUB R7,R6,R6

    // mainloop does 128 bytes at a time
    SRD $7,R7

    // Determine the offset into the constants table to start with.
    // Each constant is 16 bytes, used against 128 bytes of data.
    SLD $4,R7,R8
    SRD $3,R9,R9
    SUB R8,R9,R8

    // The last iteration is reduced in a separate step
    ADD $-1,R7
    MOVD R7,CTR

    // Determine which constant table (depends on poly)
    CMP R14,$1
    BNE castTable
    MOVD $·IEEEConst(SB),R3
    BR startConst
castTable:
    MOVD $·CastConst(SB),R3

startConst:
    ADD R3,R8,R3 // starting point in constants table

    VXOR V0,V0,V0 // clear the V regs
    VXOR V1,V1,V1
    VXOR V2,V2,V2
    VXOR V3,V3,V3
    VXOR V4,V4,V4
    VXOR V5,V5,V5
    VXOR V6,V6,V6
    VXOR V7,V7,V7

    LVX (R3),const1 // loading constant values

    CMP R15,$1 // identify warm up pass
    BEQ next

    // First warm up pass: load the bytes to process
    LVX (R4),V16
    LVX (R4+off16),V17
    LVX (R4+off32),V18
    LVX (R4+off48),V19
    LVX (R4+off64),V20
    LVX (R4+off80),V21
    LVX (R4+off96),V22
    LVX (R4+off112),V23
    ADD $128,R4 // bump up to next 128 bytes in buffer

    VXOR V16,V8,V16 // xor in initial CRC in V8

next:
    BC 18,0,first_warm_up_done // bdz: branch if CTR is 0 after decrement

    ADD $16,R3      // bump up to next constants
    LVX (R3),const2 // table values

    VPMSUMD V16,const1,V8 // second warm up pass
    LVX (R4),V16          // load from buffer
    OR $0,R2,R2           // no-op; likely POWER8 dispatch group padding

    VPMSUMD V17,const1,V9 // vpmsumd with constants
    LVX (R4+off16),V17    // load next from buffer
    OR $0,R2,R2

    VPMSUMD V18,const1,V10 // vpmsumd with constants
    LVX (R4+off32),V18     // load next from buffer
    OR $0,R2,R2

    VPMSUMD V19,const1,V11 // vpmsumd with constants
    LVX (R4+off48),V19     // load next from buffer
    OR $0,R2,R2

    VPMSUMD V20,const1,V12 // vpmsumd with constants
    LVX (R4+off64),V20     // load next from buffer
    OR $0,R2,R2

    VPMSUMD V21,const1,V13 // vpmsumd with constants
    LVX (R4+off80),V21     // load next from buffer
    OR $0,R2,R2

    VPMSUMD V22,const1,V14 // vpmsumd with constants
    LVX (R4+off96),V22     // load next from buffer
    OR $0,R2,R2

    VPMSUMD V23,const1,V15 // vpmsumd with constants
    LVX (R4+off112),V23    // load next from buffer

    ADD $128,R4 // bump up to next 128 bytes in buffer

    BC 18,0,first_cool_down // bdz: no full loop iterations left
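// Main loop, software pipelined: the VPMSUMD results for the current
// 128 bytes land in V8-V15 and are xor'ed into the accumulators
// V0-V7 on the following iteration, while the loads of the next 128
// bytes and of the next constants (alternating const1/const2)
// overlap the multiplies.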
cool_top:
    LVX (R3),const1 // constants
    ADD $16,R3      // inc to next constants
    OR $0,R2,R2

    VXOR V0,V8,V0         // xor in previous vpmsumd
    VPMSUMD V16,const2,V8 // vpmsumd with constants
    LVX (R4),V16          // buffer
    OR $0,R2,R2

    VXOR V1,V9,V1         // xor in previous
    VPMSUMD V17,const2,V9 // vpmsumd with constants
    LVX (R4+off16),V17    // next in buffer
    OR $0,R2,R2

    VXOR V2,V10,V2         // xor in previous
    VPMSUMD V18,const2,V10 // vpmsumd with constants
    LVX (R4+off32),V18     // next in buffer
    OR $0,R2,R2

    VXOR V3,V11,V3         // xor in previous
    VPMSUMD V19,const2,V11 // vpmsumd with constants
    LVX (R4+off48),V19     // next in buffer
    LVX (R3),const2        // get next constant
    OR $0,R2,R2

    VXOR V4,V12,V4         // xor in previous
    VPMSUMD V20,const1,V12 // vpmsumd with constants
    LVX (R4+off64),V20     // next in buffer
    OR $0,R2,R2

    VXOR V5,V13,V5         // xor in previous
    VPMSUMD V21,const1,V13 // vpmsumd with constants
    LVX (R4+off80),V21     // next in buffer
    OR $0,R2,R2

    VXOR V6,V14,V6         // xor in previous
    VPMSUMD V22,const1,V14 // vpmsumd with constants
    LVX (R4+off96),V22     // next in buffer
    OR $0,R2,R2

    VXOR V7,V15,V7         // xor in previous
    VPMSUMD V23,const1,V15 // vpmsumd with constants
    LVX (R4+off112),V23    // next in buffer

    ADD $128,R4      // bump up buffer pointer
    BC 16,0,cool_top // bdnz: loop until CTR runs out

first_cool_down:

    // load the constants
    // xor in the previous value
    // vpmsumd the result with constants

    LVX (R3),const1
    ADD $16,R3

    VXOR V0,V8,V0
    VPMSUMD V16,const1,V8
    OR $0,R2,R2

    VXOR V1,V9,V1
    VPMSUMD V17,const1,V9
    OR $0,R2,R2

    VXOR V2,V10,V2
    VPMSUMD V18,const1,V10
    OR $0,R2,R2

    VXOR V3,V11,V3
    VPMSUMD V19,const1,V11
    OR $0,R2,R2

    VXOR V4,V12,V4
    VPMSUMD V20,const1,V12
    OR $0,R2,R2

    VXOR V5,V13,V5
    VPMSUMD V21,const1,V13
    OR $0,R2,R2

    VXOR V6,V14,V6
    VPMSUMD V22,const1,V14
    OR $0,R2,R2

    VXOR V7,V15,V7
    VPMSUMD V23,const1,V15
    OR $0,R2,R2

second_cool_down:

    VXOR V0,V8,V0
    VXOR V1,V9,V1
    VXOR V2,V10,V2
    VXOR V3,V11,V3
    VXOR V4,V12,V4
    VXOR V5,V13,V5
    VXOR V6,V14,V6
    VXOR V7,V15,V7

#ifdef REFLECT
    VSLDOI $4,V0,zeroes,V0
    VSLDOI $4,V1,zeroes,V1
    VSLDOI $4,V2,zeroes,V2
    VSLDOI $4,V3,zeroes,V3
    VSLDOI $4,V4,zeroes,V4
    VSLDOI $4,V5,zeroes,V5
    VSLDOI $4,V6,zeroes,V6
    VSLDOI $4,V7,zeroes,V7
#endif

    LVX (R4),V8
    LVX (R4+off16),V9
    LVX (R4+off32),V10
    LVX (R4+off48),V11
    LVX (R4+off64),V12
    LVX (R4+off80),V13
    LVX (R4+off96),V14
    LVX (R4+off112),V15

    ADD $128,R4

    VXOR V0,V8,V16
    VXOR V1,V9,V17
    VXOR V2,V10,V18
    VXOR V3,V11,V19
    VXOR V4,V12,V20
    VXOR V5,V13,V21
    VXOR V6,V14,V22
    VXOR V7,V15,V23

    MOVD $1,R15 // mark warm up pass done
    CMP $0,R6
    ADD $128,R6

    BNE l1        // more chunks to process
    ANDCC $127,R5 // bytes beyond the last full 128 byte block
    SUBC R5,$128,R6
    ADD R3,R6,R3  // adjust constants to line up with remaining bytes

    SRD $4,R5,R7 // remaining 16 byte chunks
    MOVD R7,CTR
    LVX (R3),V0
    LVX (R3+off16),V1
    LVX (R3+off32),V2
    LVX (R3+off48),V3
    LVX (R3+off64),V4
    LVX (R3+off80),V5
    LVX (R3+off96),V6
    LVX (R3+off112),V7

    ADD $128,R3

    VPMSUMW V16,V0,V0
    VPMSUMW V17,V1,V1
    VPMSUMW V18,V2,V2
    VPMSUMW V19,V3,V3
    VPMSUMW V20,V4,V4
    VPMSUMW V21,V5,V5
    VPMSUMW V22,V6,V6
    VPMSUMW V23,V7,V7

    // now reduce the tail

    CMP $0,R7
    BEQ next1

    LVX (R4),V16
    LVX (R3),V17
    VPMSUMW V16,V17,V16
    VXOR V0,V16,V0
    BC 18,0,next1

    LVX (R4+off16),V16
    LVX (R3+off16),V17
    VPMSUMW V16,V17,V16
    VXOR V0,V16,V0
    BC 18,0,next1

    LVX (R4+off32),V16
    LVX (R3+off32),V17
    VPMSUMW V16,V17,V16
    VXOR V0,V16,V0
    BC 18,0,next1

    LVX (R4+off48),V16
    LVX (R3+off48),V17
    VPMSUMW V16,V17,V16
    VXOR V0,V16,V0
    BC 18,0,next1

    LVX (R4+off64),V16
    LVX (R3+off64),V17
    VPMSUMW V16,V17,V16
    VXOR V0,V16,V0
    BC 18,0,next1

    LVX (R4+off80),V16
    LVX (R3+off80),V17
    VPMSUMW V16,V17,V16
    VXOR V0,V16,V0
    BC 18,0,next1

    LVX (R4+off96),V16
    LVX (R3+off96),V17
    VPMSUMW V16,V17,V16
    VXOR V0,V16,V0

next1:
    VXOR V0,V1,V0 // fold the accumulators together
    VXOR V2,V3,V2
    VXOR V4,V5,V4
    VXOR V6,V7,V6
    VXOR V0,V2,V0
    VXOR V4,V6,V4
    VXOR V0,V4,V0
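// Barrett reduction folds the remaining 64 bit value in V0 down to
// the final 32 bit crc using two constants: const1 holds the
// precomputed reciprocal floor(x^64 / poly) and const2 the
// polynomial itself. In outline (non-reflected case):
//
//	q = floor((v / x^32) * floor(x^64 / poly) / x^32)
//	crc = (v xor q*poly) mod x^32
//
// The REFLECT variant below performs the bit-reversed analogue of
// the same two multiplies. (Outline only; the exact layout comes
// from the generated IEEEBarConst/CastBarConst tables.)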
barrett_reduction:

    CMP R14,$1
    BNE barcstTable
    MOVD $·IEEEBarConst(SB),R3
    BR startbarConst
barcstTable:
    MOVD $·CastBarConst(SB),R3

startbarConst:
    LVX (R3),const1
    LVX (R3+off16),const2

    VSLDOI $8,V0,V0,V1
    VXOR V0,V1,V0 // xor the two 64 bit halves together

#ifdef REFLECT
    VSPLTISB $1,V1
    VSL V0,V1,V0 // shift left one bit
#endif

    VAND V0,mask_64bit,V0

#ifndef REFLECT

    VPMSUMD V0,const1,V1
    VSLDOI $8,zeroes,V1,V1
    VPMSUMD V1,const2,V1
    VXOR V0,V1,V0
    VSLDOI $8,V0,zeroes,V0

#else

    VAND V0,mask_32bit,V1
    VPMSUMD V1,const1,V1
    VAND V1,mask_32bit,V1
    VPMSUMD V1,const2,V1
    VXOR V0,V1,V0
    VSLDOI $4,V0,zeroes,V0

#endif

    MFVSRD VS32,R3 // VS32 = V0

    NOR R3,R3,R3 // return ^crc
    MOVW R3,ret+32(FP)
    RET

first_warm_up_done:

    LVX (R3),const1
    ADD $16,R3

    VPMSUMD V16,const1,V8
    VPMSUMD V17,const1,V9
    VPMSUMD V18,const1,V10
    VPMSUMD V19,const1,V11
    VPMSUMD V20,const1,V12
    VPMSUMD V21,const1,V13
    VPMSUMD V22,const1,V14
    VPMSUMD V23,const1,V15

    BR second_cool_down

short:
    CMP $0,R5
    BEQ zero

    // compute short constants

    CMP R14,$1
    BNE castshTable
    MOVD $·IEEEConst(SB),R3
    ADD $4080,R3
    BR startshConst
castshTable:
    MOVD $·CastConst(SB),R3
    ADD $4080,R3

startshConst:
    SUBC R5,$256,R6 // sub from 256
    ADD R3,R6,R3

    // calculate where to start

    SRD $4,R5,R7
    MOVD R7,CTR

    VXOR V19,V19,V19
    VXOR V20,V20,V20

    LVX (R4),V0
    LVX (R3),V16
    VXOR V0,V8,V0 // xor in initial crc
    VPMSUMW V0,V16,V0
    BC 18,0,v0

    LVX (R4+off16),V1
    LVX (R3+off16),V17
    VPMSUMW V1,V17,V1
    BC 18,0,v1

    LVX (R4+off32),V2
    LVX (R3+off32),V16
    VPMSUMW V2,V16,V2
    BC 18,0,v2

    LVX (R4+off48),V3
    LVX (R3+off48),V17
    VPMSUMW V3,V17,V3
    BC 18,0,v3

    LVX (R4+off64),V4
    LVX (R3+off64),V16
    VPMSUMW V4,V16,V4
    BC 18,0,v4

    LVX (R4+off80),V5
    LVX (R3+off80),V17
    VPMSUMW V5,V17,V5
    BC 18,0,v5

    LVX (R4+off96),V6
    LVX (R3+off96),V16
    VPMSUMW V6,V16,V6
    BC 18,0,v6

    LVX (R4+off112),V7
    LVX (R3+off112),V17
    VPMSUMW V7,V17,V7
    BC 18,0,v7

    ADD $128,R3
    ADD $128,R4

    LVX (R4),V8
    LVX (R3),V16
    VPMSUMW V8,V16,V8
    BC 18,0,v8

    LVX (R4+off16),V9
    LVX (R3+off16),V17
    VPMSUMW V9,V17,V9
    BC 18,0,v9

    LVX (R4+off32),V10
    LVX (R3+off32),V16
    VPMSUMW V10,V16,V10
    BC 18,0,v10

    LVX (R4+off48),V11
    LVX (R3+off48),V17
    VPMSUMW V11,V17,V11
    BC 18,0,v11

    LVX (R4+off64),V12
    LVX (R3+off64),V16
    VPMSUMW V12,V16,V12
    BC 18,0,v12

    LVX (R4+off80),V13
    LVX (R3+off80),V17
    VPMSUMW V13,V17,V13
    BC 18,0,v13

    LVX (R4+off96),V14
    LVX (R3+off96),V16
    VPMSUMW V14,V16,V14
    BC 18,0,v14

    LVX (R4+off112),V15
    LVX (R3+off112),V17
    VPMSUMW V15,V17,V15

    VXOR V19,V15,V19
v14: VXOR V20,V14,V20
v13: VXOR V19,V13,V19
v12: VXOR V20,V12,V20
v11: VXOR V19,V11,V19
v10: VXOR V20,V10,V20
v9: VXOR V19,V9,V19
v8: VXOR V20,V8,V20
v7: VXOR V19,V7,V19
v6: VXOR V20,V6,V20
v5: VXOR V19,V5,V19
v4: VXOR V20,V4,V20
v3: VXOR V19,V3,V19
v2: VXOR V20,V2,V20
v1: VXOR V19,V1,V19
v0: VXOR V20,V0,V20

    VXOR V19,V20,V0

    BR barrett_reduction

zero:
    // This case is the original crc, so just return it
    MOVW R10,ret+32(FP)
    RET