#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for PowerISA 2.07.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT SIMD implementation, but with
# disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
# POWER8 processor spends 9.8 cycles to process byte out of large
# buffer for r=1088, which matches SHA3-256. This is 17% better than
# scalar PPC64 code. It probably should be noted that if POWER8's
# successor can achieve higher scalar instruction issue rate, then
# this module will lose... And it does on POWER9 with 12.0 vs. 9.4.

# First (and only consumed) argument selects the ABI/endian "flavour",
# e.g. linux64le; a second optional argument is an output file name
# that is forwarded to ppc-xlate.pl.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
} else { die "nonsense $flavour"; }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# NOTE: low-precedence "or" is required here; with "||" the check would
# bind to the always-true argument string and open() failures would be
# silently ignored.
open STDOUT,"| $^X $xlate $flavour ".shift
    or die "can't call $xlate: $!";

$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

my $sp ="r1";

my $iotas = "r12";

########################################################################
# Register layout:
#
# v0		A[0][0] A[1][0]
# v1		A[0][1] A[1][1]
# v2		A[0][2] A[1][2]
# v3		A[0][3] A[1][3]
# v4		A[0][4] A[1][4]
#
# v5		A[2][0] A[3][0]
# v6		A[2][1] A[3][1]
# v7		A[2][2] A[3][2]
# v8		A[2][3] A[3][3]
# v9		A[2][4] A[3][4]
#
# v10		A[4][0] A[4][1]
# v11		A[4][2] A[4][3]
# v12		A[4][4] A[4][4]
#
# v13..25	rhotates[][]
# v26..31	volatile
#
$code.=<<___;
.machine	"any"
.text

.type	KeccakF1600_int,\@function
.align	5
KeccakF1600_int:
	li	r0,24
	mtctr	r0
	li	r0,0
	b	.Loop

.align	4
.Loop:
	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
	vxor	v26,v0, v5		; A[0..1][0]^A[2..3][0]
	vxor	v27,v1, v6		; A[0..1][1]^A[2..3][1]
	vxor	v28,v2, v7		; A[0..1][2]^A[2..3][2]
	vxor	v29,v3, v8		; A[0..1][3]^A[2..3][3]
	vxor	v30,v4, v9		; A[0..1][4]^A[2..3][4]
	vpermdi	v31,v26,v27,0b00	; A[0][0..1]^A[2][0..1]
	vpermdi	v26,v26,v27,0b11	; A[1][0..1]^A[3][0..1]
	vpermdi	v27,v28,v29,0b00	; A[0][2..3]^A[2][2..3]
	vpermdi	v28,v28,v29,0b11	; A[1][2..3]^A[3][2..3]
	vpermdi	v29,v30,v30,0b10	; A[1..0][4]^A[3..2][4]
	vxor	v26,v26,v31		; C[0..1]
	vxor	v27,v27,v28		; C[2..3]
	vxor	v28,v29,v30		; C[4..4]
	vspltisb v31,1
	vxor	v26,v26,v10		; C[0..1] ^= A[4][0..1]
	vxor	v27,v27,v11		; C[2..3] ^= A[4][2..3]
	vxor	v28,v28,v12		; C[4..4] ^= A[4][4..4], low!

	vrld	v29,v26,v31		; ROL64(C[0..1],1)
	vrld	v30,v27,v31		; ROL64(C[2..3],1)
	vrld	v31,v28,v31		; ROL64(C[4..4],1)
	vpermdi	v31,v31,v29,0b10
	vxor	v26,v26,v30		; C[0..1] ^= ROL64(C[2..3],1)
	vxor	v27,v27,v31		; C[2..3] ^= ROL64(C[4..0],1)
	vxor	v28,v28,v29		; C[4..4] ^= ROL64(C[0..1],1), low!

	vpermdi	v29,v26,v26,0b00	; C[0..0]
	vpermdi	v30,v28,v26,0b10	; C[4..0]
	vpermdi	v31,v28,v28,0b11	; C[4..4]
	vxor	v1, v1, v29		; A[0..1][1] ^= C[0..0]
	vxor	v6, v6, v29		; A[2..3][1] ^= C[0..0]
	vxor	v10,v10,v30		; A[4][0..1] ^= C[4..0]
	vxor	v0, v0, v31		; A[0..1][0] ^= C[4..4]
	vxor	v5, v5, v31		; A[2..3][0] ^= C[4..4]

	vpermdi	v29,v27,v27,0b00	; C[2..2]
	vpermdi	v30,v26,v26,0b11	; C[1..1]
	vpermdi	v31,v26,v27,0b10	; C[1..2]
	vxor	v3, v3, v29		; A[0..1][3] ^= C[2..2]
	vxor	v8, v8, v29		; A[2..3][3] ^= C[2..2]
	vxor	v2, v2, v30		; A[0..1][2] ^= C[1..1]
	vxor	v7, v7, v30		; A[2..3][2] ^= C[1..1]
	vxor	v11,v11,v31		; A[4][2..3] ^= C[1..2]

	vpermdi	v29,v27,v27,0b11	; C[3..3]
	vxor	v4, v4, v29		; A[0..1][4] ^= C[3..3]
	vxor	v9, v9, v29		; A[2..3][4] ^= C[3..3]
	vxor	v12,v12,v29		; A[4..4][4] ^= C[3..3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
	vrld	v26,v0, v13		; v0
	vrld	v1, v1, v14
	vrld	v27,v2, v15		; v2
	vrld	v28,v3, v16		; v3
	vrld	v4, v4, v17
	vrld	v5, v5, v18
	vrld	v6, v6, v19
	vrld	v29,v7, v20		; v7
	vrld	v8, v8, v21
	vrld	v9, v9, v22
	vrld	v10,v10,v23
	vrld	v30,v11,v24		; v11
	vrld	v12,v12,v25

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
	vpermdi	v0, v26,v28,0b00	; [0][0] [1][0] < [0][0] [0][3]
	vpermdi	v2, v29,v5, 0b00	; [0][2] [1][2] < [2][2] [2][0]
	vpermdi	v11,v9, v5, 0b01	; [4][2] [4][3] < [2][4] [3][0]
	vpermdi	v5, v1, v4, 0b00	; [2][0] [3][0] < [0][1] [0][4]
	vpermdi	v1, v1, v4, 0b11	; [0][1] [1][1] < [1][1] [1][4]
	vpermdi	v3, v8, v6, 0b11	; [0][3] [1][3] < [3][3] [3][1]
	vpermdi	v4, v12,v30,0b10	; [0][4] [1][4] < [4][4] [4][2]
	vpermdi	v7, v8, v6, 0b00	; [2][2] [3][2] < [2][3] [2][1]
	vpermdi	v6, v27,v26,0b11	; [2][1] [3][1] < [1][2] [1][0]
	vpermdi	v8, v9, v29,0b11	; [2][3] [3][3] < [3][4] [3][2]
	vpermdi	v12,v10,v10,0b11	; [4][4] [4][4] < [4][1] [4][1]
	vpermdi	v9, v10,v30,0b01	; [2][4] [3][4] < [4][0] [4][3]
	vpermdi	v10,v27,v28,0b01	; [4][0] [4][1] < [0][2] [1][3]

	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
	lvx_u	v31,$iotas,r0		; iotas[index]
	addic	r0,r0,16		; index++

	vandc	v26,v2, v1		; (~A[0..1][1] & A[0..1][2])
	vandc	v27,v3, v2		; (~A[0..1][2] & A[0..1][3])
	vandc	v28,v4, v3		; (~A[0..1][3] & A[0..1][4])
	vandc	v29,v0, v4		; (~A[0..1][4] & A[0..1][0])
	vandc	v30,v1, v0		; (~A[0..1][0] & A[0..1][1])
	vxor	v0, v0, v26		; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
	vxor	v1, v1, v27		; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
	vxor	v2, v2, v28		; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
	vxor	v3, v3, v29		; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
	vxor	v4, v4, v30		; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])

	vandc	v26,v7, v6		; (~A[2..3][1] & A[2..3][2])
	vandc	v27,v8, v7		; (~A[2..3][2] & A[2..3][3])
	vandc	v28,v9, v8		; (~A[2..3][3] & A[2..3][4])
	vandc	v29,v5, v9		; (~A[2..3][4] & A[2..3][0])
	vandc	v30,v6, v5		; (~A[2..3][0] & A[2..3][1])
	vxor	v5, v5, v26		; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
	vxor	v6, v6, v27		; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
	vxor	v7, v7, v28		; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vxor	v8, v8, v29		; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vxor	v9, v9, v30		; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vxor	v0, v0, v31		; A[0][0] ^= iotas[index++]

	vpermdi	v26,v10,v11,0b10	; A[4][1..2]
	vpermdi	v27,v12,v10,0b00	; A[4][4..0]
	vpermdi	v28,v11,v12,0b10	; A[4][3..4]
	vpermdi	v29,v10,v10,0b10	; A[4][1..0]
	vandc	v26,v11,v26		; (~A[4][1..2] & A[4][2..3])
	vandc	v27,v27,v28		; (~A[4][3..4] & A[4][4..0])
	vandc	v28,v10,v29		; (~A[4][1..0] & A[4][0..1])
	vxor	v10,v10,v26		; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
	vxor	v11,v11,v27		; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
	vxor	v12,v12,v28		; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])

	bdnz	.Loop

	vpermdi	v12,v12,v12,0b11	; broadcast A[4][4]
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,\@function
.align	5
KeccakF1600:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers

	li	r11,16
	lvx_4w	v0,0,r3			; load A[5][5]
	li	r10,32
	lvx_4w	v1,r11,r3
	addi	r11,r11,32
	lvx_4w	v2,r10,r3
	addi	r10,r10,32
	lvx_4w	v3,r11,r3
	addi	r11,r11,32
	lvx_4w	v4,r10,r3
	addi	r10,r10,32
	lvx_4w	v5,r11,r3
	addi	r11,r11,32
	lvx_4w	v6,r10,r3
	addi	r10,r10,32
	lvx_4w	v7,r11,r3
	addi	r11,r11,32
	lvx_4w	v8,r10,r3
	addi	r10,r10,32
	lvx_4w	v9,r11,r3
	addi	r11,r11,32
	lvx_4w	v10,r10,r3
	addi	r10,r10,32
	lvx_4w	v11,r11,r3
	lvx_splt v12,r10,r3

	bl	PICmeup

	li	r11,16
	lvx_u	v13,0,r12		; load rhotates
	li	r10,32
	lvx_u	v14,r11,r12
	addi	r11,r11,32
	lvx_u	v15,r10,r12
	addi	r10,r10,32
	lvx_u	v16,r11,r12
	addi	r11,r11,32
	lvx_u	v17,r10,r12
	addi	r10,r10,32
	lvx_u	v18,r11,r12
	addi	r11,r11,32
	lvx_u	v19,r10,r12
	addi	r10,r10,32
	lvx_u	v20,r11,r12
	addi	r11,r11,32
	lvx_u	v21,r10,r12
	addi	r10,r10,32
	lvx_u	v22,r11,r12
	addi	r11,r11,32
	lvx_u	v23,r10,r12
	addi	r10,r10,32
	lvx_u	v24,r11,r12
	lvx_u	v25,r10,r12
	addi	r12,r12,`16*16`		; points at iotas

	bl	KeccakF1600_int

	li	r11,16
	stvx_4w	v0,0,r3			; return A[5][5]
	li	r10,32
	stvx_4w	v1,r11,r3
	addi	r11,r11,32
	stvx_4w	v2,r10,r3
	addi	r10,r10,32
	stvx_4w	v3,r11,r3
	addi	r11,r11,32
	stvx_4w	v4,r10,r3
	addi	r10,r10,32
	stvx_4w	v5,r11,r3
	addi	r11,r11,32
	stvx_4w	v6,r10,r3
	addi	r10,r10,32
	stvx_4w	v7,r11,r3
	addi	r11,r11,32
	stvx_4w	v8,r10,r3
	addi	r10,r10,32
	stvx_4w	v9,r11,r3
	addi	r11,r11,32
	stvx_4w	v10,r10,r3
	addi	r10,r10,32
	stvx_4w	v11,r11,r3
	stvdx_u	v12,r10,r3

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,1,0
	.long	0
.size	KeccakF1600,.-KeccakF1600
___
{
my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	5
SHA3_absorb:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mflr	r8
	mfspr	r7, 256			; save vrsave
	stvx	v20,r10,$sp
	addi	r10,r10,32
	stvx	v21,r11,$sp
	addi	r11,r11,32
	stvx	v22,r10,$sp
	addi	r10,r10,32
	stvx	v23,r11,$sp
	addi	r11,r11,32
	stvx	v24,r10,$sp
	addi	r10,r10,32
	stvx	v25,r11,$sp
	addi	r11,r11,32
	stvx	v26,r10,$sp
	addi	r10,r10,32
	stvx	v27,r11,$sp
	addi	r11,r11,32
	stvx	v28,r10,$sp
	addi	r10,r10,32
	stvx	v29,r11,$sp
	addi	r11,r11,32
	stvx	v30,r10,$sp
	stvx	v31,r11,$sp
	stw	r7,`$FRAME-4`($sp)	; save vrsave
	li	r0, -1
	$PUSH	r8,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			; preserve all AltiVec registers

	li	r11,16
	lvx_4w	v0,0,$A_jagged		; load A[5][5]
	li	r10,32
	lvx_4w	v1,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v2,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v3,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v4,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v5,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v6,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v7,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v8,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v9,r11,$A_jagged
	addi	r11,r11,32
	lvx_4w	v10,r10,$A_jagged
	addi	r10,r10,32
	lvx_4w	v11,r11,$A_jagged
	lvx_splt v12,r10,$A_jagged

	bl	PICmeup

	li	r11,16
	lvx_u	v13,0,r12		; load rhotates
	li	r10,32
	lvx_u	v14,r11,r12
	addi	r11,r11,32
	lvx_u	v15,r10,r12
	addi	r10,r10,32
	lvx_u	v16,r11,r12
	addi	r11,r11,32
	lvx_u	v17,r10,r12
	addi	r10,r10,32
	lvx_u	v18,r11,r12
	addi	r11,r11,32
	lvx_u	v19,r10,r12
	addi	r10,r10,32
	lvx_u	v20,r11,r12
	addi	r11,r11,32
	lvx_u	v21,r10,r12
	addi	r10,r10,32
	lvx_u	v22,r11,r12
	addi	r11,r11,32
	lvx_u	v23,r10,r12
	addi	r10,r10,32
	lvx_u	v24,r11,r12
	lvx_u	v25,r10,r12
	li	r10,-32
	li	r11,-16
	addi	r12,r12,`16*16`		; points at iotas
	b	.Loop_absorb

.align	4
.Loop_absorb:
	$UCMP	$len,$bsz		; len < bsz?
	blt	.Labsorbed

	sub	$len,$len,$bsz		; len -= bsz
	srwi	r0,$bsz,3
	mtctr	r0

	lvx_u	v30,r10,r12		; permutation masks
	lvx_u	v31,r11,r12
	?vspltisb v27,7			; prepare masks for byte swap
	?vxor	v30,v30,v27		; on big-endian
	?vxor	v31,v31,v27

	vxor	v27,v27,v27		; zero
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v0, v0, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v1, v1, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v2, v2, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v3, v3, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v4, v4, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v0, v0, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v1, v1, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v2, v2, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v3, v3, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v4, v4, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v5, v5, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v6, v6, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v7, v7, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v8, v8, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v9, v9, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v5, v5, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v6, v6, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v7, v7, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v8, v8, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v9, v9, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v10, v10, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v10, v10, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v30
	vxor	v11, v11, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v11, v11, v26
	bdz	.Lprocess_block
	lvdx_u	v26,0,$inp
	addi	$inp,$inp,8
	vperm	v26,v26,v27,v31
	vxor	v12, v12, v26

.Lprocess_block:
	bl	KeccakF1600_int

	b	.Loop_absorb

.align	4
.Labsorbed:
	li	r11,16
	stvx_4w	v0,0,$A_jagged		; return A[5][5]
	li	r10,32
	stvx_4w	v1,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v2,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v3,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v4,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v5,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v6,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v7,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v8,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v9,r11,$A_jagged
	addi	r11,r11,32
	stvx_4w	v10,r10,$A_jagged
	addi	r10,r10,32
	stvx_4w	v11,r11,$A_jagged
	stvdx_u	v12,r10,$A_jagged

	mr	r3,$len			; return value
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtlr	r8
	mtspr	256, r7			; restore vrsave
	lvx	v20,r10,$sp
	addi	r10,r10,32
	lvx	v21,r11,$sp
	addi	r11,r11,32
	lvx	v22,r10,$sp
	addi	r10,r10,32
	lvx	v23,r11,$sp
	addi	r11,r11,32
	lvx	v24,r10,$sp
	addi	r10,r10,32
	lvx	v25,r11,$sp
	addi	r11,r11,32
	lvx	v26,r10,$sp
	addi	r10,r10,32
	lvx	v27,r11,$sp
	addi	r11,r11,32
	lvx	v28,r10,$sp
	addi	r10,r10,32
	lvx	v29,r11,$sp
	addi	r11,r11,32
	lvx	v30,r10,$sp
	lvx	v31,r11,$sp
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,0x04,1,0x80,0,4,0
	.long	0
.size	SHA3_absorb,.-SHA3_absorb
___
}
{
my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));

$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	5
SHA3_squeeze:
	mflr	r9			; r9 is not touched by KeccakF1600
	subi	$out,$out,1		; prepare for stbu
	addi	r8,$A_jagged,4		; prepare volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze
.align	4
.Loop_squeeze:
	lwzx	r7,r11,r8		; lo
	lwzx	r0,r11,$A_jagged	; hi
	${UCMP}i	$len,8
	blt	.Lsqueeze_tail

	stbu	r7,1($out)		; write lo
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	srwi	r7,r7,8
	stbu	r7,1($out)
	stbu	r0,1($out)		; write hi
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)
	srwi	r0,r0,8
	stbu	r0,1($out)

	subic.	$len,$len,8
	beqlr				; return if done

	subic.	r10,r10,8
	ble	.Loutput_expand

	addi	r11,r11,16		; calculate jagged index
	cmplwi	r11,`16*5`
	blt	.Loop_squeeze
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	cmplwi	r11,`16*5+8`
	subi	r11,r11,8
	beq	.Loop_squeeze
	addi	r11,r11,8
	cmplwi	r11,`16*10`
	subi	r11,r11,72
	beq	.Loop_squeeze
	addi	r11,r11,72
	blt	.Loop_squeeze
	subi	r11,r11,8
	b	.Loop_squeeze

.align	4
.Loutput_expand:
	bl	KeccakF1600
	mtlr	r9

	addi	r8,$A_jagged,4		; restore volatiles
	mr	r10,$bsz
	li	r11,0
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	mtctr	$len
	subic.	$len,$len,4
	ble	.Loop_tail_lo
	li	r8,4
	mtctr	r8
.Loop_tail_lo:
	stbu	r7,1($out)
	srdi	r7,r7,8
	bdnz	.Loop_tail_lo
	ble	.Lsqueeze_done
	mtctr	$len
.Loop_tail_hi:
	stbu	r0,1($out)
	srdi	r0,r0,8
	bdnz	.Loop_tail_hi

.Lsqueeze_done:
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,4,0
	.long	0
.size	SHA3_squeeze,.-SHA3_squeeze
___
}
$code.=<<___;
.align	6
PICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	r12    ; vvvvvv "distance" between . and 1st data entry
	addi	r12,r12,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
.type	rhotates,\@object
.align	6
rhotates:
	.quad	0,  36
	.quad	1,  44
	.quad	62,  6
	.quad	28, 55
	.quad	27, 20
	.quad	3,  41
	.quad	10, 45
	.quad	43, 15
	.quad	25, 21
	.quad	39,  8
	.quad	18,  2
	.quad	61, 56
	.quad	14, 14
.size	rhotates,.-rhotates
	.quad	0,0
	.quad	0x0001020304050607,0x1011121314151617
	.quad	0x1011121314151617,0x0001020304050607
.type	iotas,\@object
iotas:
	.quad	0x0000000000000001,0
	.quad	0x0000000000008082,0
	.quad	0x800000000000808a,0
	.quad	0x8000000080008000,0
	.quad	0x000000000000808b,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008009,0
	.quad	0x000000000000008a,0
	.quad	0x0000000000000088,0
	.quad	0x0000000080008009,0
	.quad	0x000000008000000a,0
	.quad	0x000000008000808b,0
	.quad	0x800000000000008b,0
	.quad	0x8000000000008089,0
	.quad	0x8000000000008003,0
	.quad	0x8000000000008002,0
	.quad	0x8000000000000080,0
	.quad	0x000000000000800a,0
	.quad	0x800000008000000a,0
	.quad	0x8000000080008081,0
	.quad	0x8000000000008080,0
	.quad	0x0000000080000001,0
	.quad	0x8000000080008008,0
.size	iotas,.-iotas
.asciz	"Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the accumulated assembly: evaluate `...` constant
# expressions, resolve `?`-prefixed ops (enabled on big-endian,
# commented out on little-endian), and emit one line at a time.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;

	if ($flavour =~ /le$/) {	# little-endian
	    s/\?([a-z]+)/;$1/;
	} else {			# big-endian
	    s/\?([a-z]+)/$1/;
	}

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";