#! /usr/bin/env perl
# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for PowerPC FPU.
#
# June 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# and improvement coefficients relative to gcc-generated code.
#
# Freescale e300	9.78/+30%
# PPC74x0		6.92/+50%
# PPC970		6.03/+80%
# POWER7		3.50/+30%
# POWER8		3.75/+10%

# Usage: <this script> <flavour> [<output>]
#
# <flavour> must contain "64" or "32"; it selects the pointer size and the
# matching compare/load/store mnemonics.  A flavour ending in "le" selects
# little-endian word placement.  Both arguments are handed on to
# ppc-xlate.pl, which receives the generated code on its standard input.

$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$UCMP	="cmpld";
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$UCMP	="cmplw";
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
} else { die "nonsense $flavour"; }

# 4 for little-endian flavours, 0 otherwise.  It is XOR-ed into the 0/4
# word offsets below to pick which 32-bit half of an 8-byte slot a stw/lwz
# addresses.
$LITTLE_ENDIAN = ($flavour=~/le$/) ? 4 : 0;

# Indexed word load used for key and input: plain on little-endian,
# byte-reversed on big-endian.
$LWXLE = $LITTLE_ENDIAN ? "lwzx" : "lwbrx";

# Directory part of $0 including the trailing separator, or "" when $0 has
# no directory component (the match then fails and $1 must not be used).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Optional second argument, passed through to the translator verbatim.
my $output = shift;
$output = "" unless defined $output;

# Low-precedence "or" is required here: with "||" the test would apply to
# the (always true) command string and a failed open would go unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

# Stack frame of poly1305_blocks_fpu: $LOCALS bytes of linkage area, six
# 8-byte scratch slots (addressed as $LOCALS+8*n) and an 18*8-byte save
# area for f14-f31.  poly1305_init_fpu allocates only $LOCALS bytes.
$LOCALS=6*$SIZE_T;
$FRAME=$LOCALS+6*8+18*8;

my $sp="r1";

# Integer argument registers.  $i3 aliases $padbit (r6).
my ($ctx,$inp,$len,$padbit) = map("r$_",(3..6));
my ($in0,$in1,$in2,$in3,$i1,$i2,$i3) = map("r$_",(7..12,6));

# All 32 floating-point registers are assigned, in order, f0..f31.
my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
    $two0,$two32,$two64,$two96,$two130,$five_two130,
    $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
    $s2lo,$s2hi,$s3lo,$s3hi,
    $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("f$_",(0..31));
# borrowings
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c3lo,$c3hi,$c1lo,$c1hi);

# Context layout as written by the code below, in 8-byte slots from $ctx:
#    0.. 3	biased hash value
#    4..11	r0lo,r0hi,r1lo,r1hi,r2lo,r2hi,r3lo,r3hi
#   12..17	s1lo,s1hi,s2lo,s2hi,s3lo,s3hi
#   18..23	first six entries of the constants table after LPICmeup
#
# .poly1305_init_fpu($ctx, $inp): stores the biased zero hash value and,
# unless $inp is zero, loads and clamps the 16-byte key at $inp and fills
# slots 4..23.  r3 is zero on return.  $len and $padbit are not arguments
# here; they are borrowed for the constants pointer and the saved lr.
#
# .poly1305_blocks_fpu($ctx, $inp, $len, $padbit): processes $len>>4
# 16-byte blocks from $inp and returns at once when that count is zero.
# $padbit is or-ed into the top word of the fourth input slot.
$code.=<<___;
.machine	"any"
.text

.globl	.poly1305_init_fpu
.align	6
.poly1305_init_fpu:
	$STU	$sp,-$LOCALS($sp)		# minimal frame
	mflr	$padbit
	$PUSH	$padbit,`$LOCALS+$LRSAVE`($sp)

	bl	LPICmeup

	xor	r0,r0,r0
	mtlr	$padbit				# restore lr

	lfd	$two0,8*0($len)			# load constants
	lfd	$two32,8*1($len)
	lfd	$two64,8*2($len)
	lfd	$two96,8*3($len)
	lfd	$two130,8*4($len)
	lfd	$five_two130,8*5($len)

	stfd	$two0,8*0($ctx)			# initial hash value, biased 0
	stfd	$two32,8*1($ctx)
	stfd	$two64,8*2($ctx)
	stfd	$two96,8*3($ctx)

	$UCMP	$inp,r0
	beq-	Lno_key

	lfd	$h3lo,8*13($len)		# new fpscr
	mffs	$h3hi				# old fpscr

	stfd	$two0,8*4($ctx)			# key "template"
	stfd	$two32,8*5($ctx)
	stfd	$two64,8*6($ctx)
	stfd	$two96,8*7($ctx)

	li	$in1,4
	li	$in2,8
	li	$in3,12
	$LWXLE	$in0,0,$inp			# load key
	$LWXLE	$in1,$in1,$inp
	$LWXLE	$in2,$in2,$inp
	$LWXLE	$in3,$in3,$inp

	lis	$i1,0xf000			#   0xf0000000
	ori	$i2,$i1,3			#   0xf0000003
	andc	$in0,$in0,$i1			# &=0x0fffffff
	andc	$in1,$in1,$i2			# &=0x0ffffffc
	andc	$in2,$in2,$i2
	andc	$in3,$in3,$i2

	stw	$in0,`8*4+(4^$LITTLE_ENDIAN)`($ctx)	# fill "template"
	stw	$in1,`8*5+(4^$LITTLE_ENDIAN)`($ctx)
	stw	$in2,`8*6+(4^$LITTLE_ENDIAN)`($ctx)
	stw	$in3,`8*7+(4^$LITTLE_ENDIAN)`($ctx)

	mtfsf	255,$h3lo			# fpscr
	stfd	$two0,8*18($ctx)		# copy constants to context
	stfd	$two32,8*19($ctx)
	stfd	$two64,8*20($ctx)
	stfd	$two96,8*21($ctx)
	stfd	$two130,8*22($ctx)
	stfd	$five_two130,8*23($ctx)

	lfd	$h0lo,8*4($ctx)			# load [biased] key
	lfd	$h1lo,8*5($ctx)
	lfd	$h2lo,8*6($ctx)
	lfd	$h3lo,8*7($ctx)

	fsub	$h0lo,$h0lo,$two0		# r0
	fsub	$h1lo,$h1lo,$two32		# r1
	fsub	$h2lo,$h2lo,$two64		# r2
	fsub	$h3lo,$h3lo,$two96		# r3

	lfd	$two0,8*6($len)			# more constants
	lfd	$two32,8*7($len)
	lfd	$two64,8*8($len)
	lfd	$two96,8*9($len)

	fmul	$h1hi,$h1lo,$five_two130	# s1
	fmul	$h2hi,$h2lo,$five_two130	# s2
	stfd	$h3hi,8*15($ctx)		# borrow slot for original fpscr
	fmul	$h3hi,$h3lo,$five_two130	# s3

	fadd	$h0hi,$h0lo,$two0
	stfd	$h1hi,8*12($ctx)		# put aside for now
	fadd	$h1hi,$h1lo,$two32
	stfd	$h2hi,8*13($ctx)
	fadd	$h2hi,$h2lo,$two64
	stfd	$h3hi,8*14($ctx)
	fadd	$h3hi,$h3lo,$two96

	fsub	$h0hi,$h0hi,$two0
	fsub	$h1hi,$h1hi,$two32
	fsub	$h2hi,$h2hi,$two64
	fsub	$h3hi,$h3hi,$two96

	lfd	$two0,8*10($len)		# more constants
	lfd	$two32,8*11($len)
	lfd	$two64,8*12($len)

	fsub	$h0lo,$h0lo,$h0hi
	fsub	$h1lo,$h1lo,$h1hi
	fsub	$h2lo,$h2lo,$h2hi
	fsub	$h3lo,$h3lo,$h3hi

	stfd	$h0hi,8*5($ctx)			# r0hi
	stfd	$h1hi,8*7($ctx)			# r1hi
	stfd	$h2hi,8*9($ctx)			# r2hi
	stfd	$h3hi,8*11($ctx)		# r3hi

	stfd	$h0lo,8*4($ctx)			# r0lo
	stfd	$h1lo,8*6($ctx)			# r1lo
	stfd	$h2lo,8*8($ctx)			# r2lo
	stfd	$h3lo,8*10($ctx)		# r3lo

	lfd	$h1lo,8*12($ctx)		# s1
	lfd	$h2lo,8*13($ctx)		# s2
	lfd	$h3lo,8*14($ctx)		# s3
	lfd	$h0lo,8*15($ctx)		# pull original fpscr

	fadd	$h1hi,$h1lo,$two0
	fadd	$h2hi,$h2lo,$two32
	fadd	$h3hi,$h3lo,$two64

	fsub	$h1hi,$h1hi,$two0
	fsub	$h2hi,$h2hi,$two32
	fsub	$h3hi,$h3hi,$two64

	fsub	$h1lo,$h1lo,$h1hi
	fsub	$h2lo,$h2lo,$h2hi
	fsub	$h3lo,$h3lo,$h3hi

	stfd	$h1hi,8*13($ctx)		# s1hi
	stfd	$h2hi,8*15($ctx)		# s2hi
	stfd	$h3hi,8*17($ctx)		# s3hi

	stfd	$h1lo,8*12($ctx)		# s1lo
	stfd	$h2lo,8*14($ctx)		# s2lo
	stfd	$h3lo,8*16($ctx)		# s3lo

	mtfsf	255,$h0lo			# restore fpscr
Lno_key:
	xor	r3,r3,r3
	addi	$sp,$sp,$LOCALS
	blr
	.long	0
	.byte	0,12,4,1,0x80,0,2,0
.size	.poly1305_init_fpu,.-.poly1305_init_fpu

.globl	.poly1305_blocks_fpu
.align	4
.poly1305_blocks_fpu:
	srwi.	$len,$len,4
	beq-	Labort

	$STU	$sp,-$FRAME($sp)
	mflr	r0
	stfd	f14,`$FRAME-8*18`($sp)
	stfd	f15,`$FRAME-8*17`($sp)
	stfd	f16,`$FRAME-8*16`($sp)
	stfd	f17,`$FRAME-8*15`($sp)
	stfd	f18,`$FRAME-8*14`($sp)
	stfd	f19,`$FRAME-8*13`($sp)
	stfd	f20,`$FRAME-8*12`($sp)
	stfd	f21,`$FRAME-8*11`($sp)
	stfd	f22,`$FRAME-8*10`($sp)
	stfd	f23,`$FRAME-8*9`($sp)
	stfd	f24,`$FRAME-8*8`($sp)
	stfd	f25,`$FRAME-8*7`($sp)
	stfd	f26,`$FRAME-8*6`($sp)
	stfd	f27,`$FRAME-8*5`($sp)
	stfd	f28,`$FRAME-8*4`($sp)
	stfd	f29,`$FRAME-8*3`($sp)
	stfd	f30,`$FRAME-8*2`($sp)
	stfd	f31,`$FRAME-8*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	xor	r0,r0,r0
	li	$in3,1
	mtctr	$len
	neg	$len,$len
	stw	r0,`$LOCALS+8*4+(0^$LITTLE_ENDIAN)`($sp)
	stw	$in3,`$LOCALS+8*4+(4^$LITTLE_ENDIAN)`($sp)

	lfd	$two0,8*18($ctx)		# load constants
	lfd	$two32,8*19($ctx)
	lfd	$two64,8*20($ctx)
	lfd	$two96,8*21($ctx)
	lfd	$two130,8*22($ctx)
	lfd	$five_two130,8*23($ctx)

	lfd	$h0lo,8*0($ctx)			# load [biased] hash value
	lfd	$h1lo,8*1($ctx)
	lfd	$h2lo,8*2($ctx)
	lfd	$h3lo,8*3($ctx)

	stfd	$two0,`$LOCALS+8*0`($sp)	# input "template"
	oris	$in3,$padbit,`(1023+52+96)<<4`
	stfd	$two32,`$LOCALS+8*1`($sp)
	stfd	$two64,`$LOCALS+8*2`($sp)
	stw	$in3,`$LOCALS+8*3+(0^$LITTLE_ENDIAN)`($sp)

	li	$i1,4
	li	$i2,8
	li	$i3,12
	$LWXLE	$in0,0,$inp			# load input
	$LWXLE	$in1,$i1,$inp
	$LWXLE	$in2,$i2,$inp
	$LWXLE	$in3,$i3,$inp
	addi	$inp,$inp,16

	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)	# fill "template"
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

	mffs	$x0				# original fpscr
	lfd	$x1,`$LOCALS+8*4`($sp)		# new fpscr
	lfd	$r0lo,8*4($ctx)			# load key
	lfd	$r0hi,8*5($ctx)
	lfd	$r1lo,8*6($ctx)
	lfd	$r1hi,8*7($ctx)
	lfd	$r2lo,8*8($ctx)
	lfd	$r2hi,8*9($ctx)
	lfd	$r3lo,8*10($ctx)
	lfd	$r3hi,8*11($ctx)
	lfd	$s1lo,8*12($ctx)
	lfd	$s1hi,8*13($ctx)
	lfd	$s2lo,8*14($ctx)
	lfd	$s2hi,8*15($ctx)
	lfd	$s3lo,8*16($ctx)
	lfd	$s3hi,8*17($ctx)

	stfd	$x0,`$LOCALS+8*4`($sp)		# save original fpscr
	mtfsf	255,$x1

	addic	$len,$len,1
	addze	r0,r0
	slwi.	r0,r0,4
	sub	$inp,$inp,r0			# conditional rewind

	lfd	$x0,`$LOCALS+8*0`($sp)
	lfd	$x1,`$LOCALS+8*1`($sp)
	lfd	$x2,`$LOCALS+8*2`($sp)
	lfd	$x3,`$LOCALS+8*3`($sp)

	fsub	$h0lo,$h0lo,$two0		# de-bias hash value
	$LWXLE	$in0,0,$inp			# modulo-scheduled input load
	fsub	$h1lo,$h1lo,$two32
	$LWXLE	$in1,$i1,$inp
	fsub	$h2lo,$h2lo,$two64
	$LWXLE	$in2,$i2,$inp
	fsub	$h3lo,$h3lo,$two96
	$LWXLE	$in3,$i3,$inp

	fsub	$x0,$x0,$two0			# de-bias input
	addi	$inp,$inp,16
	fsub	$x1,$x1,$two32
	fsub	$x2,$x2,$two64
	fsub	$x3,$x3,$two96

	fadd	$x0,$x0,$h0lo			# accumulate input
	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x1,$x1,$h1lo
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x2,$x2,$h2lo
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	fadd	$x3,$x3,$h3lo
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)

	b	Lentry

.align	4
Loop:
	fsub	$y0,$y0,$two0			# de-bias input
	addic	$len,$len,1
	fsub	$y1,$y1,$two32
	addze	r0,r0
	fsub	$y2,$y2,$two64
	slwi.	r0,r0,4
	fsub	$y3,$y3,$two96
	sub	$inp,$inp,r0			# conditional rewind

	fadd	$h0lo,$h0lo,$y0			# accumulate input
	fadd	$h0hi,$h0hi,$y1
	fadd	$h2lo,$h2lo,$y2
	fadd	$h2hi,$h2hi,$y3

	######################################### base 2^48 -> base 2^32
	fadd	$c1lo,$h1lo,$two64
	$LWXLE	$in0,0,$inp			# modulo-scheduled input load
	fadd	$c1hi,$h1hi,$two64
	$LWXLE	$in1,$i1,$inp
	fadd	$c3lo,$h3lo,$two130
	$LWXLE	$in2,$i2,$inp
	fadd	$c3hi,$h3hi,$two130
	$LWXLE	$in3,$i3,$inp
	fadd	$c0lo,$h0lo,$two32
	addi	$inp,$inp,16
	fadd	$c0hi,$h0hi,$two32
	fadd	$c2lo,$h2lo,$two96
	fadd	$c2hi,$h2hi,$two96

	fsub	$c1lo,$c1lo,$two64
	stw	$in0,`$LOCALS+8*0+(4^$LITTLE_ENDIAN)`($sp)	# fill "template"
	fsub	$c1hi,$c1hi,$two64
	stw	$in1,`$LOCALS+8*1+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c3lo,$c3lo,$two130
	stw	$in2,`$LOCALS+8*2+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c3hi,$c3hi,$two130
	stw	$in3,`$LOCALS+8*3+(4^$LITTLE_ENDIAN)`($sp)
	fsub	$c0lo,$c0lo,$two32
	fsub	$c0hi,$c0hi,$two32
	fsub	$c2lo,$c2lo,$two96
	fsub	$c2hi,$c2hi,$two96

	fsub	$h1lo,$h1lo,$c1lo
	fsub	$h1hi,$h1hi,$c1hi
	fsub	$h3lo,$h3lo,$c3lo
	fsub	$h3hi,$h3hi,$c3hi
	fsub	$h2lo,$h2lo,$c2lo
	fsub	$h2hi,$h2hi,$c2hi
	fsub	$h0lo,$h0lo,$c0lo
	fsub	$h0hi,$h0hi,$c0hi

	fadd	$h1lo,$h1lo,$c0lo
	fadd	$h1hi,$h1hi,$c0hi
	fadd	$h3lo,$h3lo,$c2lo
	fadd	$h3hi,$h3hi,$c2hi
	fadd	$h2lo,$h2lo,$c1lo
	fadd	$h2hi,$h2hi,$c1hi
	fmadd	$h0lo,$c3lo,$five_two130,$h0lo
	fmadd	$h0hi,$c3hi,$five_two130,$h0hi

	fadd	$x1,$h1lo,$h1hi
	lfd	$s1lo,8*12($ctx)		# reload constants
	fadd	$x3,$h3lo,$h3hi
	lfd	$s1hi,8*13($ctx)
	fadd	$x2,$h2lo,$h2hi
	lfd	$r3lo,8*10($ctx)
	fadd	$x0,$h0lo,$h0hi
	lfd	$r3hi,8*11($ctx)
Lentry:
	fmul	$h0lo,$s3lo,$x1
	fmul	$h0hi,$s3hi,$x1
	fmul	$h2lo,$r1lo,$x1
	fmul	$h2hi,$r1hi,$x1
	fmul	$h1lo,$r0lo,$x1
	fmul	$h1hi,$r0hi,$x1
	fmul	$h3lo,$r2lo,$x1
	fmul	$h3hi,$r2hi,$x1

	fmadd	$h0lo,$s1lo,$x3,$h0lo
	fmadd	$h0hi,$s1hi,$x3,$h0hi
	fmadd	$h2lo,$s3lo,$x3,$h2lo
	fmadd	$h2hi,$s3hi,$x3,$h2hi
	fmadd	$h1lo,$s2lo,$x3,$h1lo
	fmadd	$h1hi,$s2hi,$x3,$h1hi
	fmadd	$h3lo,$r0lo,$x3,$h3lo
	fmadd	$h3hi,$r0hi,$x3,$h3hi

	fmadd	$h0lo,$s2lo,$x2,$h0lo
	fmadd	$h0hi,$s2hi,$x2,$h0hi
	fmadd	$h2lo,$r0lo,$x2,$h2lo
	fmadd	$h2hi,$r0hi,$x2,$h2hi
	fmadd	$h1lo,$s3lo,$x2,$h1lo
	fmadd	$h1hi,$s3hi,$x2,$h1hi
	fmadd	$h3lo,$r1lo,$x2,$h3lo
	fmadd	$h3hi,$r1hi,$x2,$h3hi

	fmadd	$h0lo,$r0lo,$x0,$h0lo
	lfd	$y0,`$LOCALS+8*0`($sp)		# load [biased] input
	fmadd	$h0hi,$r0hi,$x0,$h0hi
	lfd	$y1,`$LOCALS+8*1`($sp)
	fmadd	$h2lo,$r2lo,$x0,$h2lo
	lfd	$y2,`$LOCALS+8*2`($sp)
	fmadd	$h2hi,$r2hi,$x0,$h2hi
	lfd	$y3,`$LOCALS+8*3`($sp)
	fmadd	$h1lo,$r1lo,$x0,$h1lo
	fmadd	$h1hi,$r1hi,$x0,$h1hi
	fmadd	$h3lo,$r3lo,$x0,$h3lo
	fmadd	$h3hi,$r3hi,$x0,$h3hi

	bdnz	Loop

	######################################### base 2^48 -> base 2^32
	fadd	$c0lo,$h0lo,$two32
	fadd	$c0hi,$h0hi,$two32
	fadd	$c2lo,$h2lo,$two96
	fadd	$c2hi,$h2hi,$two96
	fadd	$c1lo,$h1lo,$two64
	fadd	$c1hi,$h1hi,$two64
	fadd	$c3lo,$h3lo,$two130
	fadd	$c3hi,$h3hi,$two130

	fsub	$c0lo,$c0lo,$two32
	fsub	$c0hi,$c0hi,$two32
	fsub	$c2lo,$c2lo,$two96
	fsub	$c2hi,$c2hi,$two96
	fsub	$c1lo,$c1lo,$two64
	fsub	$c1hi,$c1hi,$two64
	fsub	$c3lo,$c3lo,$two130
	fsub	$c3hi,$c3hi,$two130

	fsub	$h1lo,$h1lo,$c1lo
	fsub	$h1hi,$h1hi,$c1hi
	fsub	$h3lo,$h3lo,$c3lo
	fsub	$h3hi,$h3hi,$c3hi
	fsub	$h2lo,$h2lo,$c2lo
	fsub	$h2hi,$h2hi,$c2hi
	fsub	$h0lo,$h0lo,$c0lo
	fsub	$h0hi,$h0hi,$c0hi

	fadd	$h1lo,$h1lo,$c0lo
	fadd	$h1hi,$h1hi,$c0hi
	fadd	$h3lo,$h3lo,$c2lo
	fadd	$h3hi,$h3hi,$c2hi
	fadd	$h2lo,$h2lo,$c1lo
	fadd	$h2hi,$h2hi,$c1hi
	fmadd	$h0lo,$c3lo,$five_two130,$h0lo
	fmadd	$h0hi,$c3hi,$five_two130,$h0hi

	fadd	$x1,$h1lo,$h1hi
	fadd	$x3,$h3lo,$h3hi
	fadd	$x2,$h2lo,$h2hi
	fadd	$x0,$h0lo,$h0hi

	lfd	$h0lo,`$LOCALS+8*4`($sp)	# pull saved fpscr
	fadd	$x1,$x1,$two32			# bias
	fadd	$x3,$x3,$two96
	fadd	$x2,$x2,$two64
	fadd	$x0,$x0,$two0

	stfd	$x1,8*1($ctx)			# store [biased] hash value
	stfd	$x3,8*3($ctx)
	stfd	$x2,8*2($ctx)
	stfd	$x0,8*0($ctx)

	mtfsf	255,$h0lo			# restore original fpscr
	lfd	f14,`$FRAME-8*18`($sp)
	lfd	f15,`$FRAME-8*17`($sp)
	lfd	f16,`$FRAME-8*16`($sp)
	lfd	f17,`$FRAME-8*15`($sp)
	lfd	f18,`$FRAME-8*14`($sp)
	lfd	f19,`$FRAME-8*13`($sp)
	lfd	f20,`$FRAME-8*12`($sp)
	lfd	f21,`$FRAME-8*11`($sp)
	lfd	f22,`$FRAME-8*10`($sp)
	lfd	f23,`$FRAME-8*9`($sp)
	lfd	f24,`$FRAME-8*8`($sp)
	lfd	f25,`$FRAME-8*7`($sp)
	lfd	f26,`$FRAME-8*6`($sp)
	lfd	f27,`$FRAME-8*5`($sp)
	lfd	f28,`$FRAME-8*4`($sp)
	lfd	f29,`$FRAME-8*3`($sp)
	lfd	f30,`$FRAME-8*2`($sp)
	lfd	f31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
Labort:
	blr
	.long	0
	.byte	0,12,4,1,0x80,0,4,0
.size	.poly1305_blocks_fpu,.-.poly1305_blocks_fpu
___
{
# .poly1305_emit_fpu($ctx, $mac, $nonce): reads the four biased hash slots
# from $ctx as integer words, masks off the exponent bits, folds the excess
# above bit 2 of the top word back in, selects between the value and
# value+5 ("compare to modulus"), adds the 16-byte nonce at $nonce and
# writes 16 bytes to $mac.  $mac and $nonce reuse the registers that carry
# $inp and $len in the functions above; $padbit (r6) is scratch here.
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3
   ) = map("r$_",(7..11,28..31));
my $mask = "r0";
# Local frame: room for the four saved non-volatile registers r28-r31.
my $FRAME = (6+4)*$SIZE_T;

$code.=<<___;
.globl	.poly1305_emit_fpu
.align	4
.poly1305_emit_fpu:
	$STU	$sp,-$FRAME($sp)
	mflr	r0
	$PUSH	r28,`$FRAME-$SIZE_T*4`($sp)
	$PUSH	r29,`$FRAME-$SIZE_T*3`($sp)
	$PUSH	r30,`$FRAME-$SIZE_T*2`($sp)
	$PUSH	r31,`$FRAME-$SIZE_T*1`($sp)
	$PUSH	r0,`$FRAME+$LRSAVE`($sp)

	lwz	$d0,`8*0+(0^$LITTLE_ENDIAN)`($ctx)	# load hash
	lwz	$h0,`8*0+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d1,`8*1+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h1,`8*1+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d2,`8*2+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h2,`8*2+(4^$LITTLE_ENDIAN)`($ctx)
	lwz	$d3,`8*3+(0^$LITTLE_ENDIAN)`($ctx)
	lwz	$h3,`8*3+(4^$LITTLE_ENDIAN)`($ctx)

	lis	$mask,0xfff0
	andc	$d0,$d0,$mask			# mask exponent
	andc	$d1,$d1,$mask
	andc	$d2,$d2,$mask
	andc	$d3,$d3,$mask			# can be partially reduced...
	li	$mask,3

	srwi	$padbit,$d3,2			# ... so reduce
	and	$h4,$d3,$mask
	andc	$d3,$d3,$mask
	add	$d3,$d3,$padbit
___
						if ($SIZE_T==4) {
# 32-bit flavour: five 32-bit limbs chained through the carry bit.
$code.=<<___;
	addc	$h0,$h0,$d3
	adde	$h1,$h1,$d0
	adde	$h2,$h2,$d1
	adde	$h3,$h3,$d2
	addze	$h4,$h4

	addic	$d0,$h0,5			# compare to modulus
	addze	$d1,$h1
	addze	$d2,$h2
	addze	$d3,$h3
	addze	$mask,$h4

	srwi	$mask,$mask,2			# did it carry/borrow?
	neg	$mask,$mask
	srawi	$mask,$mask,31			# mask

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h1,$h1,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	lwz	$d0,0($nonce)			# load nonce
	andc	$h2,$h2,$mask
	and	$d2,$d2,$mask
	or	$h1,$h1,$d1
	lwz	$d1,4($nonce)
	andc	$h3,$h3,$mask
	and	$d3,$d3,$mask
	or	$h2,$h2,$d2
	lwz	$d2,8($nonce)
	or	$h3,$h3,$d3
	lwz	$d3,12($nonce)

	addc	$h0,$h0,$d0			# accumulate nonce
	adde	$h1,$h1,$d1
	adde	$h2,$h2,$d2
	adde	$h3,$h3,$d3
___
						} else {
# 64-bit flavour: carries are propagated with shifts, then the four 32-bit
# limbs are packed pairwise into two 64-bit registers ($h0 and $h2).
$code.=<<___;
	add	$h0,$h0,$d3
	add	$h1,$h1,$d0
	add	$h2,$h2,$d1
	add	$h3,$h3,$d2

	srdi	$d0,$h0,32
	add	$h1,$h1,$d0
	srdi	$d1,$h1,32
	add	$h2,$h2,$d1
	srdi	$d2,$h2,32
	add	$h3,$h3,$d2
	srdi	$d3,$h3,32
	add	$h4,$h4,$d3

	insrdi	$h0,$h1,32,0
	insrdi	$h2,$h3,32,0

	addic	$d0,$h0,5			# compare to modulus
	addze	$d1,$h2
	addze	$d2,$h4

	srdi	$mask,$d2,2			# did it carry/borrow?
	neg	$mask,$mask
	sradi	$mask,$mask,63			# mask
	ld	$d2,0($nonce)			# load nonce
	ld	$d3,8($nonce)

	andc	$h0,$h0,$mask
	and	$d0,$d0,$mask
	andc	$h2,$h2,$mask
	and	$d1,$d1,$mask
	or	$h0,$h0,$d0
	or	$h2,$h2,$d1
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	rotldi	$d2,$d2,32			# flip nonce words
	rotldi	$d3,$d3,32
___
$code.=<<___;
	addc	$h0,$h0,$d2			# accumulate nonce
	adde	$h2,$h2,$d3

	srdi	$h1,$h0,32
	srdi	$h3,$h2,32
___
						}
# The result is written with plain word stores on little-endian flavours
# and with byte-reversed stores on big-endian ones.
$code.=<<___	if ($LITTLE_ENDIAN);
	stw	$h0,0($mac)			# write result
	stw	$h1,4($mac)
	stw	$h2,8($mac)
	stw	$h3,12($mac)
___
$code.=<<___	if (!$LITTLE_ENDIAN);
	li	$d1,4
	stwbrx	$h0,0,$mac			# write result
	li	$d2,8
	stwbrx	$h1,$d1,$mac
	li	$d3,12
	stwbrx	$h2,$d2,$mac
	stwbrx	$h3,$d3,$mac
___
$code.=<<___;
	$POP	r28,`$FRAME-$SIZE_T*4`($sp)
	$POP	r29,`$FRAME-$SIZE_T*3`($sp)
	$POP	r30,`$FRAME-$SIZE_T*2`($sp)
	$POP	r31,`$FRAME-$SIZE_T*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,1,0x80,4,3,0
.size	.poly1305_emit_fpu,.-.poly1305_emit_fpu
___
}
# Ugly hack here, because PPC assembler syntax seem to vary too
# much from platforms to platform...
#
# LPICmeup leaves the address of the constants table that follows it in
# $len.  poly1305_init_fpu reads entries 0..12 as doubles and entry 13 as
# the fpscr image.
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$len	# vvvvvv "distance" between . and 1st data entry
	addi	$len,$len,`64-8`	# borrow $len
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`

.quad	0x4330000000000000		# 2^(52+0)
.quad	0x4530000000000000		# 2^(52+32)
.quad	0x4730000000000000		# 2^(52+64)
.quad	0x4930000000000000		# 2^(52+96)
.quad	0x4b50000000000000		# 2^(52+130)

.quad	0x37f4000000000000		# 5/2^130

.quad	0x4430000000000000		# 2^(52+16+0)
.quad	0x4630000000000000		# 2^(52+16+32)
.quad	0x4830000000000000		# 2^(52+16+64)
.quad	0x4a30000000000000		# 2^(52+16+96)
.quad	0x3e30000000000000		# 2^(52+16+0-96)
.quad	0x4030000000000000		# 2^(52+16+32-96)
.quad	0x4230000000000000		# 2^(52+16+64-96)

.quad	0x0000000000000001		# fpscr: truncate, no exceptions
.asciz	"Poly1305 for PPC FPU, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___

# Replace every `expr` in the generated text with the value of the Perl
# expression, then hand the result to the translator through STDOUT.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";