# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# ChaCha20 stream cipher core for POWER (ppc64le, ELFv2 ABI) using VSX.
# Processes four 64-byte ChaCha blocks per main-loop iteration (256 bytes),
# then handles any tail of fewer than 256 bytes bytewise via a scratch block.
#
# Apparent C-level contract, inferred from register usage below
# (NOTE(review): confirm against the actual caller/prototype):
#   chacha20vsx(size /*r3*/, dst /*r4*/, src /*r5*/,
#               key /*r6, 32 bytes*/, nonce /*r7, 12 bytes*/,
#               counter /*r8, initial 32-bit block counter*/)
#
# Vector register layout:
# vs0 - vs15 : buffer for xor
# vs32 - vs47 (v0 - v15) : 4 "converted" states
# vs48 - vs51 (v16 - v19) : original state
# vs52 - vs55 (v20 - v23) : "converted" constants
# vs56 (v24) : "converted" counter
# vs57 (v25) : increment for "converted" counter
# vs60 - vs63 (v28 - v31) : constants for rotate left or vpermxor
#
# "Converted" means the x4 interleaved form: each vector holds the same
# state-word position for all four blocks (lane i belongs to block i).

# Symbolic register names. The GNU assembler takes bare numbers for
# registers here, so these are plain cpp defines mapping name -> number.
#define r0 0
#define sp 1
#define r2 2
# ELFv2 integer argument registers r3..r8, named by their role:
#define rSIZE 3
#define rDST 4
#define rSRC 5
#define rKEY 6
#define rNONCE 7
#define rCNTR 8
#define r9 9
#define r10 10
#define r11 11
#define r12 12
#define r13 13
#define r14 14
#define r15 15
#define r16 16
#define r17 17
#define r18 18
#define r19 19
#define r20 20
#define r21 21
#define r22 22
#define r23 23
#define r24 24
#define r25 25
#define r26 26
#define r27 27
#define r28 28
#define r29 29
#define r30 30
#define r31 31

# AltiVec/VMX register names (v0-v31 alias vs32-vs63).
# NOTE: several macro invocations below pass vN defines as *immediates*
# (e.g. a vspltw word index or a macro column selector), relying on the
# define expanding to its raw number.
#define v0 0
#define v1 1
#define v2 2
#define v3 3
#define v4 4
#define v5 5
#define v6 6
#define v7 7
#define v8 8
#define v9 9
#define v10 10
#define v11 11
#define v12 12
#define v13 13
#define v14 14
#define v15 15
#define v16 16
#define v17 17
#define v18 18
#define v19 19
#define v20 20
#define v21 21
#define v22 22
#define v23 23
#define v24 24
#define v25 25
#define v26 26
#define v27 27
#define v28 28
#define v29 29
#define v30 30
#define v31 31

# VSX register names. vs0-vs31 overlap the FPRs (f0-f31 are the high
# doublewords); vs32-vs63 overlap v0-v31.
#define vs0 0
#define vs1 1
#define vs2 2
#define vs3 3
#define vs4 4
#define vs5 5
#define vs6 6
#define vs7 7
#define vs8 8
#define vs9 9
#define vs10 10
#define vs11 11
#define vs12 12
#define vs13 13
#define vs14 14
#define vs15 15
#define vs16 16
#define vs17 17
#define vs18 18
#define vs19 19
#define vs20 20
#define vs21 21
#define vs22 22
#define vs23 23
#define vs24 24
#define vs25 25
#define vs26 26
#define vs27 27
#define vs28 28
#define vs29 29
#define vs30 30
#define vs31 31
#define vs32 32
#define vs33 33
#define vs34 34
#define vs35 35
#define vs36 36
#define vs37 37
#define vs38 38
#define vs39 39
#define vs40 40
#define vs41 41
#define vs42 42
#define vs43 43
#define vs44 44
#define vs45 45
#define vs46 46
#define vs47 47
#define vs48 48
#define vs49 49
#define vs50 50
#define vs51 51
#define vs52 52
#define vs53 53
#define vs54 54
#define vs55 55
#define vs56 56
#define vs57 57
#define vs58 58
#define vs59 59
#define vs60 60
#define vs61 61
#define vs62 62
#define vs63 63

.abiversion 2
.section ".data"
.align 5
# NOTE(review): this routine stages the nonce, counters and the tail
# keystream through writable .data (st4/cntr/lblock below), which makes it
# non-reentrant and not thread-safe — confirm callers serialize access.
# Scratch keystream block for the (< 256 byte) tail.
lblock: .skip 256
# ChaCha constants "expa" "nd 3" "2-by" "te k" — row 0 of the state.
cnts0: .long 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
# The same four constants, each splatted across a full vector, for the
# x4 interleaved state (one constant word per vector, all four lanes).
cnts1: .long 0x61707865, 0x61707865, 0x61707865, 0x61707865
cnts2: .long 0x3320646e, 0x3320646e, 0x3320646e, 0x3320646e
cnts3: .long 0x79622d32, 0x79622d32, 0x79622d32, 0x79622d32
cnts4: .long 0x6b206574, 0x6b206574, 0x6b206574, 0x6b206574
# State row 3: [counter, nonce0, nonce1, nonce2]; the three nonce words
# are stored here at run time (offsets 84/88/92 from cnts0).
st4: .long 0, 0, 0, 0
# Per-lane block counters: counter+0 .. counter+3, written at run time.
cntr: .long 0, 0, 0, 0
# Counter increment per 256-byte iteration (4 blocks).
incr: .long 4, 4, 4, 4
# vpermxor control vector: rotate each 32-bit word left by 16.
rotl1: .long 0x22330011, 0x66774455, 0xAABB8899, 0xEEFFCCDD
# vrlw shift counts: rotate left by 12.
rotl2: .long 12, 12, 12, 12
# vpermxor control vector: rotate each 32-bit word left by 8.
rotl3: .long 0x11223300, 0x55667744, 0x99AABB88, 0xDDEEFFCC
# vrlw shift counts: rotate left by 7.
rotl4: .long 7, 7, 7, 7

.section ".text"
.align 5
.globl chacha20vsx
.type chacha20vsx, @function
chacha20vsx:
    # prologue
    # Establish TOC pointer (r2) from the global entry point (r12).
    addis 2, r12, .TOC.-chacha20vsx@ha
    addi 2, 2, .TOC.-chacha20vsx@l
    .localentry chacha20vsx, .-chacha20vsx
    # Save callee-saved GPRs r14-r31 below the stack pointer.
    # NOTE(review): no stack frame is allocated; together with the vector
    # save area below (down to sp-368) this exceeds the 288-byte ELFv2
    # protected ("red") zone — confirm a signal arriving mid-function
    # cannot clobber these saves.
    std r14, -8(sp)
    std r15, -16(sp)
    std r16, -24(sp)
    std r17, -32(sp)
    std r18, -40(sp)
    std r19, -48(sp)
    std r20, -56(sp)
    std r21, -64(sp)
    std r22, -72(sp)
    std r23, -80(sp)
    std r24, -88(sp)
    std r25, -96(sp)
    std r26, -104(sp)
    std r27, -112(sp)
    std r28, -120(sp)
    std r29, -128(sp)
    std r30, -136(sp)
    std r31, -144(sp)

    # r14 = base of the vector save area (sp-160).
    addi r14, sp, -160

    # Negative offsets for the vector save/restore slots.
    li r16, -16
    li r17, -32
    li r18, -48
    li r19, -64
    li r20, -80
    li r21, -96
    li r22, -112
    li r23, -128
    li r24, -144
    li r25, -160
    li r26, -176
    li r27, -192
    li r28, -208

    # save f14, f15 (vs14/vs15 overlap the callee-saved FPRs f14/f15)
    stxvw4x vs14, 0, r14
    stxvw4x vs15, r16, r14

    # save v20 - v31 (callee-saved vector registers, as vs52-vs63)
    stxvw4x vs52, r17, r14
    stxvw4x vs53, r18, r14
    stxvw4x vs54, r19, r14
    stxvw4x vs55, r20, r14
    stxvw4x vs56, r21, r14
    stxvw4x vs57, r22, r14
    stxvw4x vs58, r23, r14
    stxvw4x vs59, r24, r14
    stxvw4x vs60, r25, r14
    stxvw4x vs61, r26, r14
    stxvw4x vs62, r27, r14
    stxvw4x vs63, r28, r14

    # offset in src/dst (16-byte steps for the sixteen 256-byte lanes)
    li r17, 16
    li r18, 32
    li r19, 48
    li r20, 64
    li r21, 80
    li r22, 96
    li r23, 112
    li r24, 128
    li r25, 144
    li r26, 160
    li r27, 176
    li r28, 192
    li r29, 208
    li r30, 224
    li r31, 240

    # load const's address (r14 = &cnts0; all .data offsets are relative)
    addis r14, 2, cnts0@toc@ha
    addi r14, r14, cnts0@toc@l

    # save nonce to st4 (words 1..3 of state row 3, at cnts0+84/88/92)
    lwz r15, 0(rNONCE)
    stw r15, 84(r14)
    lwz r15, 4(rNONCE)
    stw r15, 88(r14)
    lwz r15, 8(rNONCE)
    stw r15, 92(r14)

    # load state to vectors:
    # v16 = constants row, v17/v18 = key rows, v19 = [ctr, nonce0..2]
    lxvw4x vs48, 0, r14
    lxvw4x vs49, 0, rKEY
    lxvw4x vs50, r17, rKEY
    lxvw4x vs51, r21, r14

    # load consts for x4 rounds (v20-v23 = splatted constant words)
    lxvw4x vs52, r17, r14
    lxvw4x vs53, r18, r14
    lxvw4x vs54, r19, r14
    lxvw4x vs55, r20, r14

    # counter: store counter+0..counter+3 into cntr (cnts0+96..108),
    # then load as the per-lane counter vector v24.
    stw rCNTR, 96(r14)
    addi rCNTR, rCNTR, 1
    stw rCNTR, 100(r14)
    addi rCNTR, rCNTR, 1
    stw rCNTR, 104(r14)
    addi rCNTR, rCNTR, 1
    stw rCNTR, 108(r14)
    lxvw4x vs56, r22, r14

    # load increment (v25 = {4,4,4,4})
    lxvw4x vs57, r23, r14

    # load rotl to vectors (v28-v31 = rotate-by-16/12/8/7 controls)
    lxvw4x vs60, r24, r14
    lxvw4x vs61, r25, r14
    lxvw4x vs62, r26, r14
    lxvw4x vs63, r27, r14

    # counter for loop = size/256 (quotient in r16; CR0 set by the dot form)
    li r15, 256
    divdu. r16, rSIZE, r15
    beq lastblock
    mtctr r16

mainloop:
    # init 16 vectors (4 states x4):
    # row 0 (v0-v3)   = splatted constants (copied from v20-v23)
    # row 1 (v4-v7)   = key words 0-3 splatted from v17
    # row 2 (v8-v11)  = key words 4-7 splatted from v18
    # row 3 (v12-v15) = per-lane counters (v24) + splatted nonce words.
    # The third vspltw operand is a word index: v0..v3 expand to 0..3.
    vor v0, v20, v20
    vor v1, v21, v21
    vor v2, v22, v22
    vor v3, v23, v23
    vspltw v4, v17, v0
    vspltw v5, v17, v1
    vspltw v6, v17, v2
    vspltw v7, v17, v3
    vspltw v8, v18, v0
    vspltw v9, v18, v1
    vspltw v10, v18, v2
    vspltw v11, v18, v3
    vor v12, v24, v24
    vspltw v13, v19, v1
    vspltw v14, v19, v2
    vspltw v15, v19, v3

# Quarter-round helper macros. \a is the first register of a 4-vector row;
# the second operand register index is computed as \b_y*4 + (\b_x+i)%4 for
# element i. With \b_x = 0 the rows line up (column rounds); with \b_x = 1
# the indices are shifted by one per element (diagonal rounds).
.macro _plus a b_y b_x
    vadduwm \a, \a, \b_y*4+(\b_x)%4
    vadduwm \a+1, \a+1, \b_y*4+(\b_x+1)%4
    vadduwm \a+2, \a+2, \b_y*4+(\b_x+2)%4
    vadduwm \a+3, \a+3, \b_y*4+(\b_x+3)%4
.endm

.macro _xor a b_y b_x
    vxor \a, \a, \b_y*4+(\b_x)%4
    vxor \a+1, \a+1, \b_y*4+(\b_x+1)%4
    vxor \a+2, \a+2, \b_y*4+(\b_x+2)%4
    vxor \a+3, \a+3, \b_y*4+(\b_x+3)%4
.endm

# Rotate each word of the row left by the per-word counts in \b.
.macro _rotl a b
    vrlw \a, \a, \b
    vrlw \a+1, \a+1, \b
    vrlw \a+2, \a+2, \b
    vrlw \a+3, \a+3, \b
.endm

# Fused xor+rotate: vpermxor with control \c implements (a ^ b) <<< 16
# or <<< 8 as a byte permutation (see rotl1/rotl3 above).
.macro _pxor a b_y b_x c
    vpermxor \a, \a, \b_y*4+(\b_x)%4, \c
    vpermxor \a+1, \a+1, \b_y*4+(\b_x+1)%4, \c
    vpermxor \a+2, \a+2, \b_y*4+(\b_x+2)%4, \c
    vpermxor \a+3, \a+3, \b_y*4+(\b_x+3)%4, \c
.endm

# One ChaCha double round over all four interleaved blocks.
# State rows: a = v0-v3, b = v4-v7, c = v8-v11, d = v12-v15.
# 00 01 02 03
# 04 05 06 07
# 08 09 10 11
# 12 13 14 15
.macro doubleround
    # column round
    _plus v0, v1, v0 # a+=b
    _pxor v12, v0, v0, v28 # d^=a; d<<<=16
    _plus v8, v3, v0 # c+=d
    _xor v4, v2, v0 # b^=c
    _rotl v4, v29 # b<<<=12
    _plus v0, v1, v0 # a+=b
    _pxor v12, v0, v0, v30 # d^=a; d<<<=8
    _plus v8, v3, v0 # c+=d
    _xor v4, v2, v0 # b^=c
    _rotl v4, v31 # b<<<=7

    # diagonal round
    _plus v0, v1, v1 # a+=b
    _pxor v12, v0, v1, v28 # d^=a; d<<<=16
    _plus v8, v3, v1 # c+=d
    _xor v4, v2, v1 # b^=c
    _rotl v4, v29 # b<<<=12
    _plus v0, v1, v1 # a+=b
    _pxor v12, v0, v1, v30 # d^=a; d<<<=8
    _plus v8, v3, v1 # c+=d
    _xor v4, v2, v1 # b^=c
    _rotl v4, v31 # b<<<=7
.endm

    # 20 rounds = 10 double rounds
    doubleround # 1
    doubleround # 2
    doubleround # 3
    doubleround # 4
    doubleround # 5
    doubleround # 6
    doubleround # 7
    doubleround # 8
    doubleround # 9
    doubleround # 10

    # counter += original counter (feed-forward for state word 12)
    vadduwm v12, v12, v24

# De-interleave one 4-vector group: transpose the 4x4 word layout so each
# vector holds 16 consecutive keystream bytes of a single block.
# Uses v26/v27 (vs58/vs59) as scratch.
.macro convert a
    vmrgew 26, 0+\a, 1+\a
    vmrgew 27, 2+\a, 3+\a
    vmrgow 0+\a, 0+\a, 1+\a
    vmrgow 2+\a, 2+\a, 3+\a
    xxmrghd 33+\a, 32+\a, 34+\a
    xxmrgld 35+\a, 32+\a, 34+\a
    xxmrghd 32+\a, 58, 59
    xxmrgld 34+\a, 58, 59
.endm

    convert 0
    convert 4
    convert 8
    convert 12

# Feed-forward: add the original state rows (v16-v19) to block \a's four
# de-interleaved rows (block \a's rows live in v\a, v\a+4, v\a+8, v\a+12).
.macro addition a
    vadduwm 0+\a, 0+\a, 16
    vadduwm 4+\a, 4+\a, 17
    vadduwm 8+\a, 8+\a, 18
    vadduwm 12+\a, 12+\a, 19
.endm

    addition 0
    addition 1
    addition 2
    addition 3

    # load text/cipher (256 bytes of input)
    lxvw4x vs0, 0, rSRC
    lxvw4x vs1, r17, rSRC
    lxvw4x vs2, r18, rSRC
    lxvw4x vs3, r19, rSRC
    lxvw4x vs4, r20, rSRC
    lxvw4x vs5, r21, rSRC
    lxvw4x vs6, r22, rSRC
    lxvw4x vs7, r23, rSRC
    lxvw4x vs8, r24, rSRC
    lxvw4x vs9, r25, rSRC
    lxvw4x vs10, r26, rSRC
    lxvw4x vs11, r27, rSRC
    lxvw4x vs12, r28, rSRC
    lxvw4x vs13, r29, rSRC
    lxvw4x vs14, r30, rSRC
    lxvw4x vs15, r31, rSRC
    # xor (encrypt/decrypt) — keystream order: block n's rows are
    # vs32+n, vs36+n, vs40+n, vs44+n.
    xxlxor vs0, vs0, vs32
    xxlxor vs1, vs1, vs36
    xxlxor vs2, vs2, vs40
    xxlxor vs3, vs3, vs44
    xxlxor vs4, vs4, vs33
    xxlxor vs5, vs5, vs37
    xxlxor vs6, vs6, vs41
    xxlxor vs7, vs7, vs45
    xxlxor vs8, vs8, vs34
    xxlxor vs9, vs9, vs38
    xxlxor vs10, vs10, vs42
    xxlxor vs11, vs11, vs46
    xxlxor vs12, vs12, vs35
    xxlxor vs13, vs13, vs39
    xxlxor vs14, vs14, vs43
    xxlxor vs15, vs15, vs47
    # store cipher/text
    stxvw4x vs0, 0, rDST
    stxvw4x vs1, r17, rDST
    stxvw4x vs2, r18, rDST
    stxvw4x vs3, r19, rDST
    stxvw4x vs4, r20, rDST
    stxvw4x vs5, r21, rDST
    stxvw4x vs6, r22, rDST
    stxvw4x vs7, r23, rDST
    stxvw4x vs8, r24, rDST
    stxvw4x vs9, r25, rDST
    stxvw4x vs10, r26, rDST
    stxvw4x vs11, r27, rDST
    stxvw4x vs12, r28, rDST
    stxvw4x vs13, r29, rDST
    stxvw4x vs14, r30, rDST
    stxvw4x vs15, r31, rDST

    # src/dst increment
    addi rSRC, rSRC, 256
    addi rDST, rDST, 256

    # counter increment (+4 per lane, one 256-byte iteration consumed)
    vadduwm v24, v24, v25

    bdnz mainloop

lastblock:
    # remainder: r16 = size - (size/256)*256; CR0 set by subf.
    mulld r16, r16, r15
    subf. r16, r16, rSIZE

    # check remainder — nothing left, skip the tail
    beq exitsub

    # r14 = &lblock (the 256-byte scratch block precedes cnts0)
    addi r14, r14, -256
    # last block x4: generate a full 256-byte keystream chunk into
    # lblock, then consume only the remaining r16 bytes of it.
    # init 16 vectors (4 states x4) — same layout as in mainloop
    vor v0, v20, v20
    vor v1, v21, v21
    vor v2, v22, v22
    vor v3, v23, v23
    vspltw v4, v17, v0
    vspltw v5, v17, v1
    vspltw v6, v17, v2
    vspltw v7, v17, v3
    vspltw v8, v18, v0
    vspltw v9, v18, v1
    vspltw v10, v18, v2
    vspltw v11, v18, v3
    vor v12, v24, v24
    vspltw v13, v19, v1
    vspltw v14, v19, v2
    vspltw v15, v19, v3

    doubleround # 1
    doubleround # 2
    doubleround # 3
    doubleround # 4
    doubleround # 5
    doubleround # 6
    doubleround # 7
    doubleround # 8
    doubleround # 9
    doubleround # 10

    # counter feed-forward, as in mainloop
    vadduwm v12, v12, v24

    convert 0
    convert 4
    convert 8
    convert 12

    addition 0
    addition 1
    addition 2
    addition 3

    # store vectors (raw keystream into lblock, in block order)
    stxvw4x vs32, 0, r14
    stxvw4x vs36, r17, r14
    stxvw4x vs40, r18, r14
    stxvw4x vs44, r19, r14
    stxvw4x vs33, r20, r14
    stxvw4x vs37, r21, r14
    stxvw4x vs41, r22, r14
    stxvw4x vs45, r23, r14
    stxvw4x vs34, r24, r14
    stxvw4x vs38, r25, r14
    stxvw4x vs42, r26, r14
    stxvw4x vs46, r27, r14
    stxvw4x vs35, r28, r14
    stxvw4x vs39, r29, r14
    stxvw4x vs43, r30, r14
    stxvw4x vs47, r31, r14

    # Bytewise xor of the remaining r16 bytes. CTR is loaded before the
    # loop clobbers r16 as a scratch register; rSIZE is reused as the
    # keystream pointer (lblock-1, pre-increment addressing).
    mtctr r16
    addi rSIZE, r14, -1
    addi rSRC, rSRC, -1
    addi rDST, rDST, -1
xorlast:
    lbzu r15, 1(rSIZE)
    lbzu r16, 1(rSRC)
    xor r15, r15, r16
    stbu r15, 1(rDST)
    bdnz xorlast

    # zeroing last block (scrub unused keystream bytes from lblock)
    xxlxor vs0, vs0, vs0
    stxvw4x vs0, 0, r14
    stxvw4x vs0, r17, r14
    stxvw4x vs0, r18, r14
    stxvw4x vs0, r19, r14
    stxvw4x vs0, r20, r14
    stxvw4x vs0, r21, r14
    stxvw4x vs0, r22, r14
    stxvw4x vs0, r23, r14
    stxvw4x vs0, r24, r14
    stxvw4x vs0, r25, r14
    stxvw4x vs0, r26, r14
    stxvw4x vs0, r27, r14
    stxvw4x vs0, r28, r14
    stxvw4x vs0, r29, r14
    stxvw4x vs0, r30, r14
    stxvw4x vs0, r31, r14

exitsub:
    # zeroing volatile registers (avoid leaking key/keystream material)
    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    xxlxor vs2, vs2, vs2
    xxlxor vs3, vs3, vs3
    xxlxor vs4, vs4, vs4
    xxlxor vs5, vs5, vs5
    xxlxor vs6, vs6, vs6
    xxlxor vs7, vs7, vs7
    xxlxor vs8, vs8, vs8
    xxlxor vs9, vs9, vs9
    xxlxor vs10, vs10, vs10
    xxlxor vs11, vs11, vs11
    xxlxor vs12, vs12, vs12
    xxlxor vs13, vs13, vs13

    xxlxor vs32, vs32, vs32
    xxlxor vs33, vs33, vs33
    xxlxor vs34, vs34, vs34
    xxlxor vs35, vs35, vs35
    xxlxor vs36, vs36, vs36
    xxlxor vs37, vs37, vs37
    xxlxor vs38, vs38, vs38
    xxlxor vs39, vs39, vs39
    xxlxor vs40, vs40, vs40
    xxlxor vs41, vs41, vs41
    xxlxor vs42, vs42, vs42
    xxlxor vs43, vs43, vs43
    xxlxor vs44, vs44, vs44
    xxlxor vs45, vs45, vs45
    xxlxor vs46, vs46, vs46
    xxlxor vs47, vs47, vs47
    xxlxor vs48, vs48, vs48
    xxlxor vs49, vs49, vs49
    xxlxor vs50, vs50, vs50
    xxlxor vs51, vs51, vs51

    # scrub the argument registers as well
    li rSIZE, 0
    li rDST, 0
    li rSRC, 0
    li rKEY, 0
    li rNONCE, 0
    li rCNTR, 0

    # epilogue: rebuild the save-area base/offsets, restore vectors + GPRs
    addi r14, sp, -160

    li r16, -16
    li r17, -32
    li r18, -48
    li r19, -64
    li r20, -80
    li r21, -96
    li r22, -112
    li r23, -128
    li r24, -144
    li r25, -160
    li r26, -176
    li r27, -192
    li r28, -208

    # load f14, f15
    lxvw4x vs14, 0, r14
    lxvw4x vs15, r16, r14

    # load v20 - v31
    lxvw4x vs52, r17, r14
    lxvw4x vs53, r18, r14
    lxvw4x vs54, r19, r14
    lxvw4x vs55, r20, r14
    lxvw4x vs56, r21, r14
    lxvw4x vs57, r22, r14
    lxvw4x vs58, r23, r14
    lxvw4x vs59, r24, r14
    lxvw4x vs60, r25, r14
    lxvw4x vs61, r26, r14
    lxvw4x vs62, r27, r14
    lxvw4x vs63, r28, r14

    ld r14, -8(sp)
    ld r15, -16(sp)
    ld r16, -24(sp)
    ld r17, -32(sp)
    ld r18, -40(sp)
    ld r19, -48(sp)
    ld r20, -56(sp)
    ld r21, -64(sp)
    ld r22, -72(sp)
    ld r23, -80(sp)
    ld r24, -88(sp)
    ld r25, -96(sp)
    ld r26, -104(sp)
    ld r27, -112(sp)
    ld r28, -120(sp)
    ld r29, -128(sp)
    ld r30, -136(sp)
    ld r31, -144(sp)

    blr