#! /usr/bin/env perl
# Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project.
#
# Rights for redistribution and usage in source and binary forms are
# granted according to the OpenSSL license. Warranty of any kind is
# disclaimed.
# ====================================================================


# July 1999
#
# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c.
#
# The module is designed to work with either of the "new" MIPS ABI(5),
# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under
# IRIX 5.x not only because it doesn't support new ABIs but also
# because 5.x kernels put R4x00 CPU into 32-bit mode and all those
# 64-bit instructions (daddu, dmultu, etc.) found below gonna only
# cause illegal instruction exception:-(
#
# In addition the code depends on preprocessor flags set up by MIPSpro
# compiler driver (either as or cc) and therefore (probably?) can't be
# compiled by the GNU assembler. GNU C driver manages fine though...
# I mean as long as -mmips-as is specified or is the default option,
# because then it simply invokes /usr/bin/as which in turn takes
# perfect care of the preprocessor definitions. Another neat feature
# offered by the MIPSpro assembler is an optimization pass. This gave
# me the opportunity to have the code looking more regular as all those
# architecture dependent instruction rescheduling details were left to
# the assembler. Cool, huh?
#
# Performance improvement is astonishing! 'apps/openssl speed rsa dsa'
# goes way over 3 times faster!
#
# <appro@openssl.org>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc. with their 32-bit counterparts and
# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
# >3x performance improvement naturally does not apply to 32-bit code
# [because there is no instruction 32-bit compiler can't use], one
# has to be content with 40-85% improvement depending on benchmark and
# key length, more for longer keys.

# Command line: ABI flavour (o32, n32, 64, optionally tagged "nubi"),
# then an optional output file name (first argument that looks like a
# file name, e.g. "bn-mips.S").
$flavour = shift || "o32";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT to the requested output file. The original used an
# unchecked two-arg open; use three-arg form and fail loudly, but only
# when an output file was actually requested (otherwise keep stdout).
if ($output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Select 64-bit vs 32-bit instruction mnemonics and word sizes. $BNSZ
# is sizeof(BN_ULONG), $SZREG the register save slot size; $REG_S/$REG_L
# are the register store/load mnemonics used in prologues/epilogues.
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;
	$REG_S="sd";
	$REG_L="ld";
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	$code="#if !(defined (__mips_isa_rev) && (__mips_isa_rev >= 6))\n.set mips2\n#endif\n";
}

# Below is N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).

# NUBI ABI uses $v1 as gp-equivalent callee-saved slot.
$gp=$v1 if ($flavour =~ /nubi/i);

# Scratch register holding constant -4 (mask for 4-word unrolling).
$minus4=$v1;

# Common header: dispatch mfqt/mfrm (quotient/remainder fetch) between
# R6 (divu/modu three-operand) and pre-R6 (hi/lo) ISA variants.
$code.=<<___;
#include "mips_arch.h"

#if defined(_MIPS_ARCH_MIPS64R6)
# define ddivu(rs,rt)
# define mfqt(rd,rs,rt)	ddivu	rd,rs,rt
# define mfrm(rd,rs,rt)	dmodu	rd,rs,rt
#elif defined(_MIPS_ARCH_MIPS32R6)
# define divu(rs,rt)
# define mfqt(rd,rs,rt)	divu	rd,rs,rt
# define mfrm(rd,rs,rt)	modu	rd,rs,rt
#else
# define $DIVU(rs,rt)	$DIVU	$zero,rs,rt
# define mfqt(rd,rs,rt)	mflo	rd
# define mfrm(rd,rs,rt)	mfhi	rd
#endif

.rdata
.asciiz	"mips3.s, Version 1.2"
.asciiz	"MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>"

.text
.set	noat

.align	5
.globl	bn_mul_add_words
.ent	bn_mul_add_words
bn_mul_add_words:
	.set	noreorder
	bgtz	$a2,bn_mul_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words

.align	5
.ent	bn_mul_add_words_internal
bn_mul_add_words_internal:
___
# NUBI-only prologue: save $ra,$t0..$t3,$gp on the stack.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_mul_add_words(rp=$a0, ap=$a1, num=$a2, w=$a3):
# rp[i] += ap[i]*w, returning the final carry in $v0.
# Main loop is unrolled 4x; tail handles num%4 words.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_add_words_tail

.L_bn_mul_add_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	$LD	$t2,$BNSZ($a1)
	$LD	$t3,$BNSZ($a0)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0	# All manuals say it "compares 32-bit
				# values", but it seems to work fine
				# even on 64-bit registers.
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	$MULTU	($t2,$a3)
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at

	$LD	$ta2,3*$BNSZ($a1)
	$LD	$ta3,3*$BNSZ($a0)
	$ADDU	$t3,$v0
	sltu	$v0,$t3,$v0
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$t3,$at
	$ADDU	$v0,$t2
	$MULTU	($ta0,$a3)
	sltu	$at,$t3,$at
	$ST	$t3,$BNSZ($a0)
	$ADDU	$v0,$at

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	$ADDU	$ta1,$v0
	sltu	$v0,$ta1,$v0
	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$ta1,$at
	$ADDU	$v0,$ta0
	$MULTU	($ta2,$a3)
	sltu	$at,$ta1,$at
	$ST	$ta1,-2*$BNSZ($a0)
	$ADDU	$v0,$at


	and	$ta0,$a2,$minus4
	$ADDU	$ta3,$v0
	sltu	$v0,$ta3,$v0
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$ta3,$at
	$ADDU	$v0,$ta2
	sltu	$at,$ta3,$at
	$ST	$ta3,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_add_words_loop
	$ADDU	$v0,$at

	beqz	$a2,.L_bn_mul_add_words_return
	nop

.L_bn_mul_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,0($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,0($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,$BNSZ($a0)
	subu	$a2,1
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$at
	beqz	$a2,.L_bn_mul_add_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	$LD	$t1,2*$BNSZ($a0)
	$ADDU	$t1,$v0
	sltu	$v0,$t1,$v0
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$t1,$at
	$ADDU	$v0,$t0
	sltu	$at,$t1,$at
	$ST	$t1,2*$BNSZ($a0)
	$ADDU	$v0,$at

.L_bn_mul_add_words_return:
	.set	noreorder
___
# NUBI-only epilogue: restore saved registers.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_add_words_internal

.align	5
.globl	bn_mul_words
.ent	bn_mul_words
bn_mul_words:
	.set	noreorder
	bgtz	$a2,bn_mul_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words

.align	5
.ent	bn_mul_words_internal
bn_mul_words_internal:
___
# NUBI-only prologue (same save pattern as above).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_mul_words(rp=$a0, ap=$a1, num=$a2, w=$a3):
# rp[i] = ap[i]*w + carry, returning the final carry in $v0.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_mul_words_tail

.L_bn_mul_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$MULTU	($t2,$a3)
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0

	subu	$a2,4
	$PTR_ADD $a0,4*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($at,$t2,$a3)
	mfhi	($t2,$t2,$a3)
	$ADDU	$v0,$at
	sltu	$t3,$v0,$at
	$MULTU	($ta0,$a3)
	$ST	$v0,-3*$BNSZ($a0)
	$ADDU	$v0,$t3,$t2

	mflo	($at,$ta0,$a3)
	mfhi	($ta0,$ta0,$a3)
	$ADDU	$v0,$at
	sltu	$ta1,$v0,$at
	$MULTU	($ta2,$a3)
	$ST	$v0,-2*$BNSZ($a0)
	$ADDU	$v0,$ta1,$ta0

	and	$ta0,$a2,$minus4
	mflo	($at,$ta2,$a3)
	mfhi	($ta2,$ta2,$a3)
	$ADDU	$v0,$at
	sltu	$ta3,$v0,$at
	$ST	$v0,-$BNSZ($a0)
	.set	noreorder
	bgtz	$ta0,.L_bn_mul_words_loop
	$ADDU	$v0,$ta3,$ta2

	beqz	$a2,.L_bn_mul_words_return
	nop

.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
# NUBI-only epilogue for bn_mul_words.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_mul_words_internal

.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
# NUBI-only prologue for bn_sqr_words.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sqr_words(rp=$a0, ap=$a1, num=$a2):
# rp[2i],rp[2i+1] = low,high halves of ap[i]^2 (note rp advances at
# twice the rate of ap, hence 8*\$BNSZ pointer bump per 4 input words).
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4
	beqz	$ta0,.L_bn_sqr_words_tail

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)

	$MULTU	($t2,$t2)
	subu	$a2,4
	$PTR_ADD $a0,8*$BNSZ
	$PTR_ADD $a1,4*$BNSZ
	mflo	($t3,$t2,$t2)
	mfhi	($t2,$t2,$t2)
	$ST	$t3,-6*$BNSZ($a0)
	$ST	$t2,-5*$BNSZ($a0)

	$MULTU	($ta0,$ta0)
	mflo	($ta1,$ta0,$ta0)
	mfhi	($ta0,$ta0,$ta0)
	$ST	$ta1,-4*$BNSZ($a0)
	$ST	$ta0,-3*$BNSZ($a0)


	$MULTU	($ta2,$ta2)
	and	$ta0,$a2,$minus4
	mflo	($ta3,$ta2,$ta2)
	mfhi	($ta2,$ta2,$ta2)
	$ST	$ta3,-2*$BNSZ($a0)

	.set	noreorder
	bgtz	$ta0,.L_bn_sqr_words_loop
	$ST	$ta2,-$BNSZ($a0)

	beqz	$a2,.L_bn_sqr_words_return
	nop

.L_bn_sqr_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)
	$ST	$t0,$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$t0)
	subu	$a2,1
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,2*$BNSZ($a0)
	$ST	$t0,3*$BNSZ($a0)
	beqz	$a2,.L_bn_sqr_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$t0)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,4*$BNSZ($a0)
	$ST	$t0,5*$BNSZ($a0)

.L_bn_sqr_words_return:
	.set	noreorder
___
# NUBI-only epilogue for bn_sqr_words.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_sqr_words_internal

.align	5
.globl	bn_add_words
.ent	bn_add_words
bn_add_words:
	.set	noreorder
	bgtz	$a3,bn_add_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_add_words

.align	5
.ent	bn_add_words_internal
bn_add_words_internal:
___
# NUBI-only prologue for bn_add_words.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_add_words(rp=$a0, ap=$a1, bp=$a2, num=$a3):
# rp[i] = ap[i] + bp[i] + carry, returning the final carry in $v0.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_add_words_tail

.L_bn_add_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	$ADDU	$ta0,$t0
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta1,$t1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9

	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	$ADDU	$ta3,$t3
	sltu	$t9,$ta3,$t3
	$ADDU	$t3,$ta3,$v0
	sltu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_add_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_add_words_return
	nop

.L_bn_add_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	$ADDU	$ta0,$t0
	subu	$a3,1
	sltu	$t8,$ta0,$t0
	$ADDU	$t0,$ta0,$v0
	sltu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_add_words_return

	$LD	$t1,$BNSZ($a1)
	$LD	$ta1,$BNSZ($a2)
	$ADDU	$ta1,$t1
	subu	$a3,1
	sltu	$t9,$ta1,$t1
	$ADDU	$t1,$ta1,$v0
	sltu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_add_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	$ADDU	$ta2,$t2
	sltu	$t8,$ta2,$t2
	$ADDU	$t2,$ta2,$v0
	sltu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_add_words_return:
	.set	noreorder
___
# NUBI-only epilogue for bn_add_words.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0

.end	bn_add_words_internal

.align	5
.globl	bn_sub_words
.ent	bn_sub_words
bn_sub_words:
	.set	noreorder
	bgtz	$a3,bn_sub_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$zero
.end	bn_sub_words

.align	5
.ent	bn_sub_words_internal
bn_sub_words_internal:
___
# NUBI-only prologue for bn_sub_words.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_sub_words(rp=$a0, ap=$a1, bp=$a2, num=$a3):
# rp[i] = ap[i] - bp[i] - borrow, returning the final borrow in $v0.
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$at,$a3,$minus4
	beqz	$at,.L_bn_sub_words_tail

.L_bn_sub_words_loop:
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,4
	$LD	$t1,$BNSZ($a1)
	and	$at,$a3,$minus4
	$LD	$t2,2*$BNSZ($a1)
	$PTR_ADD $a2,4*$BNSZ
	$LD	$t3,3*$BNSZ($a1)
	$PTR_ADD $a0,4*$BNSZ
	$LD	$ta1,-3*$BNSZ($a2)
	$PTR_ADD $a1,4*$BNSZ
	$LD	$ta2,-2*$BNSZ($a2)
	$LD	$ta3,-$BNSZ($a2)
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,-4*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,-3*$BNSZ($a0)
	$ADDU	$v0,$t9


	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,-2*$BNSZ($a0)
	$ADDU	$v0,$t8

	sltu	$t9,$t3,$ta3
	$SUBU	$ta3,$t3,$ta3
	$SUBU	$t3,$ta3,$v0
	sgtu	$v0,$t3,$ta3
	$ST	$t3,-$BNSZ($a0)

	.set	noreorder
	bgtz	$at,.L_bn_sub_words_loop
	$ADDU	$v0,$t9

	beqz	$a3,.L_bn_sub_words_return
	nop

.L_bn_sub_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$LD	$ta0,0($a2)
	subu	$a3,1
	sltu	$t8,$t0,$ta0
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
# NUBI-only epilogue for bn_sub_words.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# bn_div_3_words below is emitted inside "#if 0", i.e. compiled out;
# it is kept as a historical reference (see the emitted C comment).
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_sub_words_internal

#if 0
/*
 * The bn_div_3_words entry point is re-used for constant-time interface.
 * Implementation is retained as historical reference.
 */
.align	5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
# NUBI-only prologue (within the disabled #if 0 region).
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	move	$ta3,$ra
	bal	bn_div_words_internal
	move	$ra,$ta3
	$MULTU	($ta2,$v0)
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
	mfhi	($t1,$ta2,$v0)
	mflo	($t0,$ta2,$v0)
	sltu	$t8,$t1,$a1
.L_bn_div_3_words_inner_loop:
	bnez	$t8,.L_bn_div_3_words_inner_loop_done
	sgeu	$at,$t2,$t0
	seq	$t9,$t1,$a1
	and	$at,$t9
	sltu	$t3,$t0,$ta2
	$ADDU	$a1,$a2
	$SUBU	$t1,$t3
	$SUBU	$t0,$ta2
	sltu	$t8,$t1,$a1
	sltu	$ta0,$a1,$a2
	or	$t8,$ta0
	.set	noreorder
	beqz	$at,.L_bn_div_3_words_inner_loop
	$SUBU	$v0,1
	$ADDU	$v0,1
	.set	reorder
.L_bn_div_3_words_inner_loop_done:
	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words_internal
#endif

.align	5
.globl	bn_div_words
.ent	bn_div_words
bn_div_words:
	.set	noreorder
	bnez	$a2,bn_div_words_internal
	li	$v0,-1		# I would rather signal div-by-zero
				# which can be done with 'break 7'
	jr	$ra
	move	$a0,$v0
.end	bn_div_words

.align	5
.ent	bn_div_words_internal
bn_div_words_internal:
___
# NUBI-only prologue for bn_div_words.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# Normalize the divisor: count leading zero bits into $t9 by shifting
# $a2 left until its top bit is set, then shift the h:l dividend
# ($a0:$a1) by the same amount; 'break 6' flags overflow.
$code.=<<___;
	move	$v1,$zero
	bltz	$a2,.L_bn_div_words_body
	move	$t9,$v1
	$SLL	$a2,1
	bgtz	$a2,.-4
	addu	$t9,1

	.set	reorder
	negu	$t1,$t9
	li	$t2,-1
	$SLL	$t2,$t1
	and	$t2,$a0
	$SRL	$at,$a1,$t1
	.set	noreorder
	beqz	$t2,.+12
	nop
	break	6		# signal overflow
	.set	reorder
	$SLL	$a0,$t9
	$SLL	$a1,$t9
	or	$a0,$at
___
# Perl-level register aliases for the division body: quotient digit,
# high half of the running remainder, high half of the divisor.
$QT=$ta0;
$HH=$ta1;
$DH=$v1;
$code.=<<___;
.L_bn_div_words_body:
	$SRL	$DH,$a2,4*$BNSZ	# bits
	sgeu	$at,$a0,$a2
	.set	noreorder
	beqz	$at,.+12
	nop
	$SUBU	$a0,$a2
	.set	reorder

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div1
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div1:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop1:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v0,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop1_done
	$SUBU	$t1,$v0
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop1
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop1_done:

	$SLL	$a1,4*$BNSZ	# bits
	$SUBU	$a0,$t3,$t0
	$SLL	$v0,$QT,4*$BNSZ	# bits

	li	$QT,-1
	$SRL	$HH,$a0,4*$BNSZ	# bits
	$SRL	$QT,4*$BNSZ	# q=0xffffffff
	beq	$DH,$HH,.L_bn_div_words_skip_div2
	$DIVU	($a0,$DH)
	mfqt	($QT,$a0,$DH)
.L_bn_div_words_skip_div2:
	$MULTU	($a2,$QT)
	$SLL	$t3,$a0,4*$BNSZ	# bits
	$SRL	$at,$a1,4*$BNSZ	# bits
	or	$t3,$at
	mflo	($t0,$a2,$QT)
	mfhi	($t1,$a2,$QT)
.L_bn_div_words_inner_loop2:
	sltu	$t2,$t3,$t0
	seq	$t8,$HH,$t1
	sltu	$at,$HH,$t1
	and	$t2,$t8
	sltu	$v1,$t0,$a2
	or	$at,$t2
	.set	noreorder
	beqz	$at,.L_bn_div_words_inner_loop2_done
	$SUBU	$t1,$v1
	$SUBU	$t0,$a2
	b	.L_bn_div_words_inner_loop2
	$SUBU	$QT,1
	.set	reorder
.L_bn_div_words_inner_loop2_done:

	$SUBU	$a0,$t3,$t0
	or	$v0,$QT
	$SRL	$v1,$a0,$t9	# $v1 contains remainder if anybody wants it
	$SRL	$a2,$t9		# restore $a2

	.set	noreorder
	move	$a1,$v1
___
# NUBI-only epilogue for bn_div_words.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0
.end	bn_div_words_internal
___
undef $HH; undef $QT; undef $DH;

# Register aliases for the comba multiplication/squaring routines:
# a[0..7], b[0..7] operand words and the three-word accumulator.
($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3);
($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3);

($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1);	# once we load a[7], no use for $a1
($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2);	# once we load b[7], no use for $a2

($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3);

$code.=<<___;

.align	5
.globl	bn_mul_comba8
.ent	bn_mul_comba8
bn_mul_comba8:
	.set	noreorder
___
# bn_mul_comba8 prologue: NUBI variant saves $ra,$s0-$s5,$t0-$t3,$gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,12*$SZREG,$ra
	.mask	0x803ff008,-$SZREG
	$PTR_SUB $sp,12*$SZREG
	$REG_S	$ra,11*$SZREG($sp)
	$REG_S	$s5,10*$SZREG($sp)
	$REG_S	$s4,9*$SZREG($sp)
	$REG_S	$s3,8*$SZREG($sp)
	$REG_S	$s2,7*$SZREG($sp)
	$REG_S	$s1,6*$SZREG($sp)
	$REG_S	$s0,5*$SZREG($sp)
$s0,5*$SZREG($sp) 1063 $REG_S $t3,4*$SZREG($sp) 1064 $REG_S $t2,3*$SZREG($sp) 1065 $REG_S $t1,2*$SZREG($sp) 1066 $REG_S $t0,1*$SZREG($sp) 1067 $REG_S $gp,0*$SZREG($sp) 1068___ 1069$code.=<<___ if ($flavour !~ /nubi/i); 1070 .frame $sp,6*$SZREG,$ra 1071 .mask 0x003f0000,-$SZREG 1072 $PTR_SUB $sp,6*$SZREG 1073 $REG_S $s5,5*$SZREG($sp) 1074 $REG_S $s4,4*$SZREG($sp) 1075 $REG_S $s3,3*$SZREG($sp) 1076 $REG_S $s2,2*$SZREG($sp) 1077 $REG_S $s1,1*$SZREG($sp) 1078 $REG_S $s0,0*$SZREG($sp) 1079___ 1080$code.=<<___; 1081 1082 .set reorder 1083 $LD $a_0,0($a1) # If compiled with -mips3 option on 1084 # R5000 box assembler barks on this 1085 # 1ine with "should not have mult/div 1086 # as last instruction in bb (R10K 1087 # bug)" warning. If anybody out there 1088 # has a clue about how to circumvent 1089 # this do send me a note. 1090 # <appro\@fy.chalmers.se> 1091 1092 $LD $b_0,0($a2) 1093 $LD $a_1,$BNSZ($a1) 1094 $LD $a_2,2*$BNSZ($a1) 1095 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3); 1096 $LD $a_3,3*$BNSZ($a1) 1097 $LD $b_1,$BNSZ($a2) 1098 $LD $b_2,2*$BNSZ($a2) 1099 $LD $b_3,3*$BNSZ($a2) 1100 mflo ($c_1,$a_0,$b_0) 1101 mfhi ($c_2,$a_0,$b_0) 1102 1103 $LD $a_4,4*$BNSZ($a1) 1104 $LD $a_5,5*$BNSZ($a1) 1105 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1); 1106 $LD $a_6,6*$BNSZ($a1) 1107 $LD $a_7,7*$BNSZ($a1) 1108 $LD $b_4,4*$BNSZ($a2) 1109 $LD $b_5,5*$BNSZ($a2) 1110 mflo ($t_1,$a_0,$b_1) 1111 mfhi ($t_2,$a_0,$b_1) 1112 $ADDU $c_2,$t_1 1113 sltu $at,$c_2,$t_1 1114 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1); 1115 $ADDU $c_3,$t_2,$at 1116 $LD $b_6,6*$BNSZ($a2) 1117 $LD $b_7,7*$BNSZ($a2) 1118 $ST $c_1,0($a0) # r[0]=c1; 1119 mflo ($t_1,$a_1,$b_0) 1120 mfhi ($t_2,$a_1,$b_0) 1121 $ADDU $c_2,$t_1 1122 sltu $at,$c_2,$t_1 1123 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2); 1124 $ADDU $t_2,$at 1125 $ADDU $c_3,$t_2 1126 sltu $c_1,$c_3,$t_2 1127 $ST $c_2,$BNSZ($a0) # r[1]=c2; 1128 1129 mflo ($t_1,$a_2,$b_0) 1130 mfhi ($t_2,$a_2,$b_0) 1131 $ADDU $c_3,$t_1 1132 sltu 
$at,$c_3,$t_1 1133 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2); 1134 $ADDU $t_2,$at 1135 $ADDU $c_1,$t_2 1136 mflo ($t_1,$a_1,$b_1) 1137 mfhi ($t_2,$a_1,$b_1) 1138 $ADDU $c_3,$t_1 1139 sltu $at,$c_3,$t_1 1140 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2); 1141 $ADDU $t_2,$at 1142 $ADDU $c_1,$t_2 1143 sltu $c_2,$c_1,$t_2 1144 mflo ($t_1,$a_0,$b_2) 1145 mfhi ($t_2,$a_0,$b_2) 1146 $ADDU $c_3,$t_1 1147 sltu $at,$c_3,$t_1 1148 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3); 1149 $ADDU $t_2,$at 1150 $ADDU $c_1,$t_2 1151 sltu $at,$c_1,$t_2 1152 $ADDU $c_2,$at 1153 $ST $c_3,2*$BNSZ($a0) # r[2]=c3; 1154 1155 mflo ($t_1,$a_0,$b_3) 1156 mfhi ($t_2,$a_0,$b_3) 1157 $ADDU $c_1,$t_1 1158 sltu $at,$c_1,$t_1 1159 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3); 1160 $ADDU $t_2,$at 1161 $ADDU $c_2,$t_2 1162 sltu $c_3,$c_2,$t_2 1163 mflo ($t_1,$a_1,$b_2) 1164 mfhi ($t_2,$a_1,$b_2) 1165 $ADDU $c_1,$t_1 1166 sltu $at,$c_1,$t_1 1167 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3); 1168 $ADDU $t_2,$at 1169 $ADDU $c_2,$t_2 1170 sltu $at,$c_2,$t_2 1171 $ADDU $c_3,$at 1172 mflo ($t_1,$a_2,$b_1) 1173 mfhi ($t_2,$a_2,$b_1) 1174 $ADDU $c_1,$t_1 1175 sltu $at,$c_1,$t_1 1176 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3); 1177 $ADDU $t_2,$at 1178 $ADDU $c_2,$t_2 1179 sltu $at,$c_2,$t_2 1180 $ADDU $c_3,$at 1181 mflo ($t_1,$a_3,$b_0) 1182 mfhi ($t_2,$a_3,$b_0) 1183 $ADDU $c_1,$t_1 1184 sltu $at,$c_1,$t_1 1185 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1); 1186 $ADDU $t_2,$at 1187 $ADDU $c_2,$t_2 1188 sltu $at,$c_2,$t_2 1189 $ADDU $c_3,$at 1190 $ST $c_1,3*$BNSZ($a0) # r[3]=c1; 1191 1192 mflo ($t_1,$a_4,$b_0) 1193 mfhi ($t_2,$a_4,$b_0) 1194 $ADDU $c_2,$t_1 1195 sltu $at,$c_2,$t_1 1196 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1); 1197 $ADDU $t_2,$at 1198 $ADDU $c_3,$t_2 1199 sltu $c_1,$c_3,$t_2 1200 mflo ($t_1,$a_3,$b_1) 1201 mfhi ($t_2,$a_3,$b_1) 1202 $ADDU $c_2,$t_1 1203 sltu $at,$c_2,$t_1 1204 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1); 1205 $ADDU 
$t_2,$at 1206 $ADDU $c_3,$t_2 1207 sltu $at,$c_3,$t_2 1208 $ADDU $c_1,$at 1209 mflo ($t_1,$a_2,$b_2) 1210 mfhi ($t_2,$a_2,$b_2) 1211 $ADDU $c_2,$t_1 1212 sltu $at,$c_2,$t_1 1213 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1); 1214 $ADDU $t_2,$at 1215 $ADDU $c_3,$t_2 1216 sltu $at,$c_3,$t_2 1217 $ADDU $c_1,$at 1218 mflo ($t_1,$a_1,$b_3) 1219 mfhi ($t_2,$a_1,$b_3) 1220 $ADDU $c_2,$t_1 1221 sltu $at,$c_2,$t_1 1222 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1); 1223 $ADDU $t_2,$at 1224 $ADDU $c_3,$t_2 1225 sltu $at,$c_3,$t_2 1226 $ADDU $c_1,$at 1227 mflo ($t_1,$a_0,$b_4) 1228 mfhi ($t_2,$a_0,$b_4) 1229 $ADDU $c_2,$t_1 1230 sltu $at,$c_2,$t_1 1231 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2); 1232 $ADDU $t_2,$at 1233 $ADDU $c_3,$t_2 1234 sltu $at,$c_3,$t_2 1235 $ADDU $c_1,$at 1236 $ST $c_2,4*$BNSZ($a0) # r[4]=c2; 1237 1238 mflo ($t_1,$a_0,$b_5) 1239 mfhi ($t_2,$a_0,$b_5) 1240 $ADDU $c_3,$t_1 1241 sltu $at,$c_3,$t_1 1242 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2); 1243 $ADDU $t_2,$at 1244 $ADDU $c_1,$t_2 1245 sltu $c_2,$c_1,$t_2 1246 mflo ($t_1,$a_1,$b_4) 1247 mfhi ($t_2,$a_1,$b_4) 1248 $ADDU $c_3,$t_1 1249 sltu $at,$c_3,$t_1 1250 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2); 1251 $ADDU $t_2,$at 1252 $ADDU $c_1,$t_2 1253 sltu $at,$c_1,$t_2 1254 $ADDU $c_2,$at 1255 mflo ($t_1,$a_2,$b_3) 1256 mfhi ($t_2,$a_2,$b_3) 1257 $ADDU $c_3,$t_1 1258 sltu $at,$c_3,$t_1 1259 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2); 1260 $ADDU $t_2,$at 1261 $ADDU $c_1,$t_2 1262 sltu $at,$c_1,$t_2 1263 $ADDU $c_2,$at 1264 mflo ($t_1,$a_3,$b_2) 1265 mfhi ($t_2,$a_3,$b_2) 1266 $ADDU $c_3,$t_1 1267 sltu $at,$c_3,$t_1 1268 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2); 1269 $ADDU $t_2,$at 1270 $ADDU $c_1,$t_2 1271 sltu $at,$c_1,$t_2 1272 $ADDU $c_2,$at 1273 mflo ($t_1,$a_4,$b_1) 1274 mfhi ($t_2,$a_4,$b_1) 1275 $ADDU $c_3,$t_1 1276 sltu $at,$c_3,$t_1 1277 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2); 1278 $ADDU $t_2,$at 1279 $ADDU $c_1,$t_2 1280 sltu 
$at,$c_1,$t_2 1281 $ADDU $c_2,$at 1282 mflo ($t_1,$a_5,$b_0) 1283 mfhi ($t_2,$a_5,$b_0) 1284 $ADDU $c_3,$t_1 1285 sltu $at,$c_3,$t_1 1286 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3); 1287 $ADDU $t_2,$at 1288 $ADDU $c_1,$t_2 1289 sltu $at,$c_1,$t_2 1290 $ADDU $c_2,$at 1291 $ST $c_3,5*$BNSZ($a0) # r[5]=c3; 1292 1293 mflo ($t_1,$a_6,$b_0) 1294 mfhi ($t_2,$a_6,$b_0) 1295 $ADDU $c_1,$t_1 1296 sltu $at,$c_1,$t_1 1297 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3); 1298 $ADDU $t_2,$at 1299 $ADDU $c_2,$t_2 1300 sltu $c_3,$c_2,$t_2 1301 mflo ($t_1,$a_5,$b_1) 1302 mfhi ($t_2,$a_5,$b_1) 1303 $ADDU $c_1,$t_1 1304 sltu $at,$c_1,$t_1 1305 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3); 1306 $ADDU $t_2,$at 1307 $ADDU $c_2,$t_2 1308 sltu $at,$c_2,$t_2 1309 $ADDU $c_3,$at 1310 mflo ($t_1,$a_4,$b_2) 1311 mfhi ($t_2,$a_4,$b_2) 1312 $ADDU $c_1,$t_1 1313 sltu $at,$c_1,$t_1 1314 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3); 1315 $ADDU $t_2,$at 1316 $ADDU $c_2,$t_2 1317 sltu $at,$c_2,$t_2 1318 $ADDU $c_3,$at 1319 mflo ($t_1,$a_3,$b_3) 1320 mfhi ($t_2,$a_3,$b_3) 1321 $ADDU $c_1,$t_1 1322 sltu $at,$c_1,$t_1 1323 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3); 1324 $ADDU $t_2,$at 1325 $ADDU $c_2,$t_2 1326 sltu $at,$c_2,$t_2 1327 $ADDU $c_3,$at 1328 mflo ($t_1,$a_2,$b_4) 1329 mfhi ($t_2,$a_2,$b_4) 1330 $ADDU $c_1,$t_1 1331 sltu $at,$c_1,$t_1 1332 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3); 1333 $ADDU $t_2,$at 1334 $ADDU $c_2,$t_2 1335 sltu $at,$c_2,$t_2 1336 $ADDU $c_3,$at 1337 mflo ($t_1,$a_1,$b_5) 1338 mfhi ($t_2,$a_1,$b_5) 1339 $ADDU $c_1,$t_1 1340 sltu $at,$c_1,$t_1 1341 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3); 1342 $ADDU $t_2,$at 1343 $ADDU $c_2,$t_2 1344 sltu $at,$c_2,$t_2 1345 $ADDU $c_3,$at 1346 mflo ($t_1,$a_0,$b_6) 1347 mfhi ($t_2,$a_0,$b_6) 1348 $ADDU $c_1,$t_1 1349 sltu $at,$c_1,$t_1 1350 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1); 1351 $ADDU $t_2,$at 1352 $ADDU $c_2,$t_2 1353 sltu $at,$c_2,$t_2 1354 $ADDU $c_3,$at 1355 $ST 
$c_1,6*$BNSZ($a0) # r[6]=c1; 1356 1357 mflo ($t_1,$a_0,$b_7) 1358 mfhi ($t_2,$a_0,$b_7) 1359 $ADDU $c_2,$t_1 1360 sltu $at,$c_2,$t_1 1361 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1); 1362 $ADDU $t_2,$at 1363 $ADDU $c_3,$t_2 1364 sltu $c_1,$c_3,$t_2 1365 mflo ($t_1,$a_1,$b_6) 1366 mfhi ($t_2,$a_1,$b_6) 1367 $ADDU $c_2,$t_1 1368 sltu $at,$c_2,$t_1 1369 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1); 1370 $ADDU $t_2,$at 1371 $ADDU $c_3,$t_2 1372 sltu $at,$c_3,$t_2 1373 $ADDU $c_1,$at 1374 mflo ($t_1,$a_2,$b_5) 1375 mfhi ($t_2,$a_2,$b_5) 1376 $ADDU $c_2,$t_1 1377 sltu $at,$c_2,$t_1 1378 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1); 1379 $ADDU $t_2,$at 1380 $ADDU $c_3,$t_2 1381 sltu $at,$c_3,$t_2 1382 $ADDU $c_1,$at 1383 mflo ($t_1,$a_3,$b_4) 1384 mfhi ($t_2,$a_3,$b_4) 1385 $ADDU $c_2,$t_1 1386 sltu $at,$c_2,$t_1 1387 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1); 1388 $ADDU $t_2,$at 1389 $ADDU $c_3,$t_2 1390 sltu $at,$c_3,$t_2 1391 $ADDU $c_1,$at 1392 mflo ($t_1,$a_4,$b_3) 1393 mfhi ($t_2,$a_4,$b_3) 1394 $ADDU $c_2,$t_1 1395 sltu $at,$c_2,$t_1 1396 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1); 1397 $ADDU $t_2,$at 1398 $ADDU $c_3,$t_2 1399 sltu $at,$c_3,$t_2 1400 $ADDU $c_1,$at 1401 mflo ($t_1,$a_5,$b_2) 1402 mfhi ($t_2,$a_5,$b_2) 1403 $ADDU $c_2,$t_1 1404 sltu $at,$c_2,$t_1 1405 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1); 1406 $ADDU $t_2,$at 1407 $ADDU $c_3,$t_2 1408 sltu $at,$c_3,$t_2 1409 $ADDU $c_1,$at 1410 mflo ($t_1,$a_6,$b_1) 1411 mfhi ($t_2,$a_6,$b_1) 1412 $ADDU $c_2,$t_1 1413 sltu $at,$c_2,$t_1 1414 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1); 1415 $ADDU $t_2,$at 1416 $ADDU $c_3,$t_2 1417 sltu $at,$c_3,$t_2 1418 $ADDU $c_1,$at 1419 mflo ($t_1,$a_7,$b_0) 1420 mfhi ($t_2,$a_7,$b_0) 1421 $ADDU $c_2,$t_1 1422 sltu $at,$c_2,$t_1 1423 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2); 1424 $ADDU $t_2,$at 1425 $ADDU $c_3,$t_2 1426 sltu $at,$c_3,$t_2 1427 $ADDU $c_1,$at 1428 $ST $c_2,7*$BNSZ($a0) # r[7]=c2; 1429 1430 mflo 
($t_1,$a_7,$b_1) 1431 mfhi ($t_2,$a_7,$b_1) 1432 $ADDU $c_3,$t_1 1433 sltu $at,$c_3,$t_1 1434 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2); 1435 $ADDU $t_2,$at 1436 $ADDU $c_1,$t_2 1437 sltu $c_2,$c_1,$t_2 1438 mflo ($t_1,$a_6,$b_2) 1439 mfhi ($t_2,$a_6,$b_2) 1440 $ADDU $c_3,$t_1 1441 sltu $at,$c_3,$t_1 1442 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2); 1443 $ADDU $t_2,$at 1444 $ADDU $c_1,$t_2 1445 sltu $at,$c_1,$t_2 1446 $ADDU $c_2,$at 1447 mflo ($t_1,$a_5,$b_3) 1448 mfhi ($t_2,$a_5,$b_3) 1449 $ADDU $c_3,$t_1 1450 sltu $at,$c_3,$t_1 1451 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2); 1452 $ADDU $t_2,$at 1453 $ADDU $c_1,$t_2 1454 sltu $at,$c_1,$t_2 1455 $ADDU $c_2,$at 1456 mflo ($t_1,$a_4,$b_4) 1457 mfhi ($t_2,$a_4,$b_4) 1458 $ADDU $c_3,$t_1 1459 sltu $at,$c_3,$t_1 1460 $MULTU ($a_3,$b_5) # mul_add_c(a[3],b[5],c3,c1,c2); 1461 $ADDU $t_2,$at 1462 $ADDU $c_1,$t_2 1463 sltu $at,$c_1,$t_2 1464 $ADDU $c_2,$at 1465 mflo ($t_1,$a_3,$b_5) 1466 mfhi ($t_2,$a_3,$b_5) 1467 $ADDU $c_3,$t_1 1468 sltu $at,$c_3,$t_1 1469 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2); 1470 $ADDU $t_2,$at 1471 $ADDU $c_1,$t_2 1472 sltu $at,$c_1,$t_2 1473 $ADDU $c_2,$at 1474 mflo ($t_1,$a_2,$b_6) 1475 mfhi ($t_2,$a_2,$b_6) 1476 $ADDU $c_3,$t_1 1477 sltu $at,$c_3,$t_1 1478 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2); 1479 $ADDU $t_2,$at 1480 $ADDU $c_1,$t_2 1481 sltu $at,$c_1,$t_2 1482 $ADDU $c_2,$at 1483 mflo ($t_1,$a_1,$b_7) 1484 mfhi ($t_2,$a_1,$b_7) 1485 $ADDU $c_3,$t_1 1486 sltu $at,$c_3,$t_1 1487 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3); 1488 $ADDU $t_2,$at 1489 $ADDU $c_1,$t_2 1490 sltu $at,$c_1,$t_2 1491 $ADDU $c_2,$at 1492 $ST $c_3,8*$BNSZ($a0) # r[8]=c3; 1493 1494 mflo ($t_1,$a_2,$b_7) 1495 mfhi ($t_2,$a_2,$b_7) 1496 $ADDU $c_1,$t_1 1497 sltu $at,$c_1,$t_1 1498 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3); 1499 $ADDU $t_2,$at 1500 $ADDU $c_2,$t_2 1501 sltu $c_3,$c_2,$t_2 1502 mflo ($t_1,$a_3,$b_6) 1503 mfhi ($t_2,$a_3,$b_6) 1504 $ADDU $c_1,$t_1 
1505 sltu $at,$c_1,$t_1 1506 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3); 1507 $ADDU $t_2,$at 1508 $ADDU $c_2,$t_2 1509 sltu $at,$c_2,$t_2 1510 $ADDU $c_3,$at 1511 mflo ($t_1,$a_4,$b_5) 1512 mfhi ($t_2,$a_4,$b_5) 1513 $ADDU $c_1,$t_1 1514 sltu $at,$c_1,$t_1 1515 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3); 1516 $ADDU $t_2,$at 1517 $ADDU $c_2,$t_2 1518 sltu $at,$c_2,$t_2 1519 $ADDU $c_3,$at 1520 mflo ($t_1,$a_5,$b_4) 1521 mfhi ($t_2,$a_5,$b_4) 1522 $ADDU $c_1,$t_1 1523 sltu $at,$c_1,$t_1 1524 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3); 1525 $ADDU $t_2,$at 1526 $ADDU $c_2,$t_2 1527 sltu $at,$c_2,$t_2 1528 $ADDU $c_3,$at 1529 mflo ($t_1,$a_6,$b_3) 1530 mfhi ($t_2,$a_6,$b_3) 1531 $ADDU $c_1,$t_1 1532 sltu $at,$c_1,$t_1 1533 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3); 1534 $ADDU $t_2,$at 1535 $ADDU $c_2,$t_2 1536 sltu $at,$c_2,$t_2 1537 $ADDU $c_3,$at 1538 mflo ($t_1,$a_7,$b_2) 1539 mfhi ($t_2,$a_7,$b_2) 1540 $ADDU $c_1,$t_1 1541 sltu $at,$c_1,$t_1 1542 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1); 1543 $ADDU $t_2,$at 1544 $ADDU $c_2,$t_2 1545 sltu $at,$c_2,$t_2 1546 $ADDU $c_3,$at 1547 $ST $c_1,9*$BNSZ($a0) # r[9]=c1; 1548 1549 mflo ($t_1,$a_7,$b_3) 1550 mfhi ($t_2,$a_7,$b_3) 1551 $ADDU $c_2,$t_1 1552 sltu $at,$c_2,$t_1 1553 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1); 1554 $ADDU $t_2,$at 1555 $ADDU $c_3,$t_2 1556 sltu $c_1,$c_3,$t_2 1557 mflo ($t_1,$a_6,$b_4) 1558 mfhi ($t_2,$a_6,$b_4) 1559 $ADDU $c_2,$t_1 1560 sltu $at,$c_2,$t_1 1561 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1); 1562 $ADDU $t_2,$at 1563 $ADDU $c_3,$t_2 1564 sltu $at,$c_3,$t_2 1565 $ADDU $c_1,$at 1566 mflo ($t_1,$a_5,$b_5) 1567 mfhi ($t_2,$a_5,$b_5) 1568 $ADDU $c_2,$t_1 1569 sltu $at,$c_2,$t_1 1570 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1); 1571 $ADDU $t_2,$at 1572 $ADDU $c_3,$t_2 1573 sltu $at,$c_3,$t_2 1574 $ADDU $c_1,$at 1575 mflo ($t_1,$a_4,$b_6) 1576 mfhi ($t_2,$a_4,$b_6) 1577 $ADDU $c_2,$t_1 1578 sltu $at,$c_2,$t_1 1579 $MULTU ($a_3,$b_7) 
# mul_add_c(a[3],b[7],c2,c3,c1); 1580 $ADDU $t_2,$at 1581 $ADDU $c_3,$t_2 1582 sltu $at,$c_3,$t_2 1583 $ADDU $c_1,$at 1584 mflo ($t_1,$a_3,$b_7) 1585 mfhi ($t_2,$a_3,$b_7) 1586 $ADDU $c_2,$t_1 1587 sltu $at,$c_2,$t_1 1588 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2); 1589 $ADDU $t_2,$at 1590 $ADDU $c_3,$t_2 1591 sltu $at,$c_3,$t_2 1592 $ADDU $c_1,$at 1593 $ST $c_2,10*$BNSZ($a0) # r[10]=c2; 1594 1595 mflo ($t_1,$a_4,$b_7) 1596 mfhi ($t_2,$a_4,$b_7) 1597 $ADDU $c_3,$t_1 1598 sltu $at,$c_3,$t_1 1599 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2); 1600 $ADDU $t_2,$at 1601 $ADDU $c_1,$t_2 1602 sltu $c_2,$c_1,$t_2 1603 mflo ($t_1,$a_5,$b_6) 1604 mfhi ($t_2,$a_5,$b_6) 1605 $ADDU $c_3,$t_1 1606 sltu $at,$c_3,$t_1 1607 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2); 1608 $ADDU $t_2,$at 1609 $ADDU $c_1,$t_2 1610 sltu $at,$c_1,$t_2 1611 $ADDU $c_2,$at 1612 mflo ($t_1,$a_6,$b_5) 1613 mfhi ($t_2,$a_6,$b_5) 1614 $ADDU $c_3,$t_1 1615 sltu $at,$c_3,$t_1 1616 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2); 1617 $ADDU $t_2,$at 1618 $ADDU $c_1,$t_2 1619 sltu $at,$c_1,$t_2 1620 $ADDU $c_2,$at 1621 mflo ($t_1,$a_7,$b_4) 1622 mfhi ($t_2,$a_7,$b_4) 1623 $ADDU $c_3,$t_1 1624 sltu $at,$c_3,$t_1 1625 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3); 1626 $ADDU $t_2,$at 1627 $ADDU $c_1,$t_2 1628 sltu $at,$c_1,$t_2 1629 $ADDU $c_2,$at 1630 $ST $c_3,11*$BNSZ($a0) # r[11]=c3; 1631 1632 mflo ($t_1,$a_7,$b_5) 1633 mfhi ($t_2,$a_7,$b_5) 1634 $ADDU $c_1,$t_1 1635 sltu $at,$c_1,$t_1 1636 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3); 1637 $ADDU $t_2,$at 1638 $ADDU $c_2,$t_2 1639 sltu $c_3,$c_2,$t_2 1640 mflo ($t_1,$a_6,$b_6) 1641 mfhi ($t_2,$a_6,$b_6) 1642 $ADDU $c_1,$t_1 1643 sltu $at,$c_1,$t_1 1644 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3); 1645 $ADDU $t_2,$at 1646 $ADDU $c_2,$t_2 1647 sltu $at,$c_2,$t_2 1648 $ADDU $c_3,$at 1649 mflo ($t_1,$a_5,$b_7) 1650 mfhi ($t_2,$a_5,$b_7) 1651 $ADDU $c_1,$t_1 1652 sltu $at,$c_1,$t_1 1653 $MULTU ($a_6,$b_7) # 
# mul_add_c(a[6],b[7],c2,c3,c1); -- annotation for the $MULTU issued above
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)	# r[12]=c1;

	mflo	($t_1,$a_6,$b_7)
	mfhi	($t_2,$a_6,$b_7)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_7,$b_6)		# mul_add_c(a[7],b[6],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_7,$b_6)
	mfhi	($t_2,$a_7,$b_6)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_7,$b_7)		# mul_add_c(a[7],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,13*$BNSZ($a0)	# r[13]=c2;

	mflo	($t_1,$a_7,$b_7)
	mfhi	($t_2,$a_7,$b_7)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# bn_mul_comba8 epilogue: NUBI ABI additionally restores $t0..$t3 and $gp.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
# bn_mul_comba4(r,a,b): 4x4-word comba multiplication, r[0..7] = a[0..3]*b[0..3].
# Each mul_add_c step accumulates hi:lo of one product into a rotating
# (c1,c2,c3) carry-save triple; sltu extracts the carry after each addition.
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	($c_1,$a_0,$b_0)
	mfhi	($c_2,$a_0,$b_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$b_1)
	mfhi	($t_2,$a_0,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	($t_1,$a_1,$b_0)
	mfhi	($t_2,$a_1,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	($t_1,$a_2,$b_0)
	mfhi	($t_2,$a_2,$b_0)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	($t_1,$a_1,$b_1)
	mfhi	($t_2,$a_1,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_0,$b_2)
	mfhi	($t_2,$a_0,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	($t_1,$a_0,$b_3)
	mfhi	($t_2,$a_0,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_1,$b_2)
	mfhi	($t_2,$a_1,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_2,$b_1)
	mfhi	($t_2,$a_2,$b_1)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_3,$b_0)
	mfhi	($t_2,$a_3,$b_0)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_1)
	mfhi	($t_2,$a_3,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_2,$b_2)
	mfhi	($t_2,$a_2,$b_2)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_1,$b_3)
	mfhi	($t_2,$a_1,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	($t_1,$a_2,$b_3)
	mfhi	($t_2,$a_2,$b_3)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_3,$b_2)
	mfhi	($t_2,$a_3,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_3)
	mfhi	($t_2,$a_3,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___

# Squaring reuses the multiplication register set: a[4..7] alias b[0..3].
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

# Emit one doubled-product carry-save step for the squaring routines:
# fold $hi:$lo (previous product, counted twice) into the rotating
# ($c0,$c1,$c2) accumulator triple, while issuing the *next* multiplication
# ($an x $bn) so it overlaps with the additions.
sub add_c2 () {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,	# !$warm denotes first call with specific sequence of
		# $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn	# these two are arguments for multiplication which
		# result is used in *next* step [which is why it's
		# commented as "forward multiplication" below];
   )=@_;
$code.=<<___;
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	$MULTU	($an,$bn)		# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
$code.=<<___ if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
___
$code.=<<___ if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
___
$code.=<<___;
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
	mflo	($lo,$an,$bn)
	mfhi	($hi,$an,$bn)
___
}

$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
1967 $LD $a_5,5*$BNSZ($a1) 1968 $LD $a_6,6*$BNSZ($a1) 1969 $LD $a_7,7*$BNSZ($a1) 1970 mflo ($c_1,$a_0,$a_0) 1971 mfhi ($c_2,$a_0,$a_0) 1972 $ST $c_1,0($a0) 1973 1974 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1); 1975 mflo ($t_1,$a_0,$a_1) 1976 mfhi ($t_2,$a_0,$a_1) 1977 slt $c_1,$t_2,$zero 1978 $SLL $t_2,1 1979 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2); 1980 slt $a2,$t_1,$zero 1981 $ADDU $t_2,$a2 1982 $SLL $t_1,1 1983 $ADDU $c_2,$t_1 1984 sltu $at,$c_2,$t_1 1985 $ADDU $c_3,$t_2,$at 1986 $ST $c_2,$BNSZ($a0) 1987 sltu $at,$c_3,$t_2 1988 $ADDU $c_1,$at 1989 mflo ($t_1,$a_2,$a_0) 1990 mfhi ($t_2,$a_2,$a_0) 1991___ 1992 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, 1993 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); 1994$code.=<<___; 1995 $ADDU $c_3,$t_1 1996 sltu $at,$c_3,$t_1 1997 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3); 1998 $ADDU $t_2,$at 1999 $ADDU $c_1,$t_2 2000 sltu $at,$c_1,$t_2 2001 $ADDU $c_2,$at 2002 $ST $c_3,2*$BNSZ($a0) 2003 mflo ($t_1,$a_0,$a_3) 2004 mfhi ($t_2,$a_0,$a_3) 2005___ 2006 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, 2007 $a_1,$a_2); # mul_add_c2(a[1],b[2],c1,c2,c3); 2008 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, 2009 $a_4,$a_0); # mul_add_c2(a[4],b[0],c2,c3,c1); 2010$code.=<<___; 2011 $ST $c_1,3*$BNSZ($a0) 2012___ 2013 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, 2014 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); 2015 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, 2016 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); 2017$code.=<<___; 2018 $ADDU $c_2,$t_1 2019 sltu $at,$c_2,$t_1 2020 $MULTU ($a_0,$a_5) # mul_add_c2(a[0],b[5],c3,c1,c2); 2021 $ADDU $t_2,$at 2022 $ADDU $c_3,$t_2 2023 sltu $at,$c_3,$t_2 2024 $ADDU $c_1,$at 2025 $ST $c_2,4*$BNSZ($a0) 2026 mflo ($t_1,$a_0,$a_5) 2027 mfhi ($t_2,$a_0,$a_5) 2028___ 2029 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, 2030 $a_1,$a_4); # mul_add_c2(a[1],b[4],c3,c1,c2); 2031 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, 2032 $a_2,$a_3); # mul_add_c2(a[2],b[3],c3,c1,c2); 2033 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, 2034 $a_6,$a_0); # 
mul_add_c2(a[6],b[0],c1,c2,c3); 2035$code.=<<___; 2036 $ST $c_3,5*$BNSZ($a0) 2037___ 2038 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, 2039 $a_5,$a_1); # mul_add_c2(a[5],b[1],c1,c2,c3); 2040 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, 2041 $a_4,$a_2); # mul_add_c2(a[4],b[2],c1,c2,c3); 2042 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, 2043 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); 2044$code.=<<___; 2045 $ADDU $c_1,$t_1 2046 sltu $at,$c_1,$t_1 2047 $MULTU ($a_0,$a_7) # mul_add_c2(a[0],b[7],c2,c3,c1); 2048 $ADDU $t_2,$at 2049 $ADDU $c_2,$t_2 2050 sltu $at,$c_2,$t_2 2051 $ADDU $c_3,$at 2052 $ST $c_1,6*$BNSZ($a0) 2053 mflo ($t_1,$a_0,$a_7) 2054 mfhi ($t_2,$a_0,$a_7) 2055___ 2056 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, 2057 $a_1,$a_6); # mul_add_c2(a[1],b[6],c2,c3,c1); 2058 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, 2059 $a_2,$a_5); # mul_add_c2(a[2],b[5],c2,c3,c1); 2060 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, 2061 $a_3,$a_4); # mul_add_c2(a[3],b[4],c2,c3,c1); 2062 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, 2063 $a_7,$a_1); # mul_add_c2(a[7],b[1],c3,c1,c2); 2064$code.=<<___; 2065 $ST $c_2,7*$BNSZ($a0) 2066___ 2067 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, 2068 $a_6,$a_2); # mul_add_c2(a[6],b[2],c3,c1,c2); 2069 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, 2070 $a_5,$a_3); # mul_add_c2(a[5],b[3],c3,c1,c2); 2071 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, 2072 $a_4,$a_4); # mul_add_c(a[4],b[4],c3,c1,c2); 2073$code.=<<___; 2074 $ADDU $c_3,$t_1 2075 sltu $at,$c_3,$t_1 2076 $MULTU ($a_2,$a_7) # mul_add_c2(a[2],b[7],c1,c2,c3); 2077 $ADDU $t_2,$at 2078 $ADDU $c_1,$t_2 2079 sltu $at,$c_1,$t_2 2080 $ADDU $c_2,$at 2081 $ST $c_3,8*$BNSZ($a0) 2082 mflo ($t_1,$a_2,$a_7) 2083 mfhi ($t_2,$a_2,$a_7) 2084___ 2085 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, 2086 $a_3,$a_6); # mul_add_c2(a[3],b[6],c1,c2,c3); 2087 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, 2088 $a_4,$a_5); # mul_add_c2(a[4],b[5],c1,c2,c3); 2089 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, 2090 $a_7,$a_3); # mul_add_c2(a[7],b[3],c2,c3,c1); 2091$code.=<<___; 2092 $ST $c_1,9*$BNSZ($a0) 2093___ 2094 
&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, 2095 $a_6,$a_4); # mul_add_c2(a[6],b[4],c2,c3,c1); 2096 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1, 2097 $a_5,$a_5); # mul_add_c(a[5],b[5],c2,c3,c1); 2098$code.=<<___; 2099 $ADDU $c_2,$t_1 2100 sltu $at,$c_2,$t_1 2101 $MULTU ($a_4,$a_7) # mul_add_c2(a[4],b[7],c3,c1,c2); 2102 $ADDU $t_2,$at 2103 $ADDU $c_3,$t_2 2104 sltu $at,$c_3,$t_2 2105 $ADDU $c_1,$at 2106 $ST $c_2,10*$BNSZ($a0) 2107 mflo ($t_1,$a_4,$a_7) 2108 mfhi ($t_2,$a_4,$a_7) 2109___ 2110 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, 2111 $a_5,$a_6); # mul_add_c2(a[5],b[6],c3,c1,c2); 2112 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1, 2113 $a_7,$a_5); # mul_add_c2(a[7],b[5],c1,c2,c3); 2114$code.=<<___; 2115 $ST $c_3,11*$BNSZ($a0) 2116___ 2117 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, 2118 $a_6,$a_6); # mul_add_c(a[6],b[6],c1,c2,c3); 2119$code.=<<___; 2120 $ADDU $c_1,$t_1 2121 sltu $at,$c_1,$t_1 2122 $MULTU ($a_6,$a_7) # mul_add_c2(a[6],b[7],c2,c3,c1); 2123 $ADDU $t_2,$at 2124 $ADDU $c_2,$t_2 2125 sltu $at,$c_2,$t_2 2126 $ADDU $c_3,$at 2127 $ST $c_1,12*$BNSZ($a0) 2128 mflo ($t_1,$a_6,$a_7) 2129 mfhi ($t_2,$a_6,$a_7) 2130___ 2131 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, 2132 $a_7,$a_7); # mul_add_c(a[7],b[7],c3,c1,c2); 2133$code.=<<___; 2134 $ST $c_2,13*$BNSZ($a0) 2135 2136 $ADDU $c_3,$t_1 2137 sltu $at,$c_3,$t_1 2138 $ADDU $t_2,$at 2139 $ADDU $c_1,$t_2 2140 $ST $c_3,14*$BNSZ($a0) 2141 $ST $c_1,15*$BNSZ($a0) 2142 2143 .set noreorder 2144___ 2145$code.=<<___ if ($flavour =~ /nubi/i); 2146 $REG_L $t3,4*$SZREG($sp) 2147 $REG_L $t2,3*$SZREG($sp) 2148 $REG_L $t1,2*$SZREG($sp) 2149 $REG_L $t0,1*$SZREG($sp) 2150 $REG_L $gp,0*$SZREG($sp) 2151 $PTR_ADD $sp,6*$SZREG 2152___ 2153$code.=<<___; 2154 jr $ra 2155 nop 2156.end bn_sqr_comba8 2157 2158.align 5 2159.globl bn_sqr_comba4 2160.ent bn_sqr_comba4 2161bn_sqr_comba4: 2162___ 2163$code.=<<___ if ($flavour =~ /nubi/i); 2164 .frame $sp,6*$SZREG,$ra 2165 .mask 0x8000f008,-$SZREG 2166 .set noreorder 2167 $PTR_SUB $sp,6*$SZREG 2168 $REG_S $ra,5*$SZREG($sp) 2169 $REG_S 
$t3,4*$SZREG($sp) 2170 $REG_S $t2,3*$SZREG($sp) 2171 $REG_S $t1,2*$SZREG($sp) 2172 $REG_S $t0,1*$SZREG($sp) 2173 $REG_S $gp,0*$SZREG($sp) 2174___ 2175$code.=<<___; 2176 .set reorder 2177 $LD $a_0,0($a1) 2178 $LD $a_1,$BNSZ($a1) 2179 $MULTU ($a_0,$a_0) # mul_add_c(a[0],b[0],c1,c2,c3); 2180 $LD $a_2,2*$BNSZ($a1) 2181 $LD $a_3,3*$BNSZ($a1) 2182 mflo ($c_1,$a_0,$a_0) 2183 mfhi ($c_2,$a_0,$a_0) 2184 $ST $c_1,0($a0) 2185 2186 $MULTU ($a_0,$a_1) # mul_add_c2(a[0],b[1],c2,c3,c1); 2187 mflo ($t_1,$a_0,$a_1) 2188 mfhi ($t_2,$a_0,$a_1) 2189 slt $c_1,$t_2,$zero 2190 $SLL $t_2,1 2191 $MULTU ($a_2,$a_0) # mul_add_c2(a[2],b[0],c3,c1,c2); 2192 slt $a2,$t_1,$zero 2193 $ADDU $t_2,$a2 2194 $SLL $t_1,1 2195 $ADDU $c_2,$t_1 2196 sltu $at,$c_2,$t_1 2197 $ADDU $c_3,$t_2,$at 2198 $ST $c_2,$BNSZ($a0) 2199 sltu $at,$c_3,$t_2 2200 $ADDU $c_1,$at 2201 mflo ($t_1,$a_2,$a_0) 2202 mfhi ($t_2,$a_2,$a_0) 2203___ 2204 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, 2205 $a_1,$a_1); # mul_add_c(a[1],b[1],c3,c1,c2); 2206$code.=<<___; 2207 $ADDU $c_3,$t_1 2208 sltu $at,$c_3,$t_1 2209 $MULTU ($a_0,$a_3) # mul_add_c2(a[0],b[3],c1,c2,c3); 2210 $ADDU $t_2,$at 2211 $ADDU $c_1,$t_2 2212 sltu $at,$c_1,$t_2 2213 $ADDU $c_2,$at 2214 $ST $c_3,2*$BNSZ($a0) 2215 mflo ($t_1,$a_0,$a_3) 2216 mfhi ($t_2,$a_0,$a_3) 2217___ 2218 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0, 2219 $a_1,$a_2); # mul_add_c2(a2[1],b[2],c1,c2,c3); 2220 &add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1, 2221 $a_3,$a_1); # mul_add_c2(a[3],b[1],c2,c3,c1); 2222$code.=<<___; 2223 $ST $c_1,3*$BNSZ($a0) 2224___ 2225 &add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0, 2226 $a_2,$a_2); # mul_add_c(a[2],b[2],c2,c3,c1); 2227$code.=<<___; 2228 $ADDU $c_2,$t_1 2229 sltu $at,$c_2,$t_1 2230 $MULTU ($a_2,$a_3) # mul_add_c2(a[2],b[3],c3,c1,c2); 2231 $ADDU $t_2,$at 2232 $ADDU $c_3,$t_2 2233 sltu $at,$c_3,$t_2 2234 $ADDU $c_1,$at 2235 $ST $c_2,4*$BNSZ($a0) 2236 mflo ($t_1,$a_2,$a_3) 2237 mfhi ($t_2,$a_2,$a_3) 2238___ 2239 &add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0, 2240 $a_3,$a_3); # mul_add_c(a[3],b[3],c1,c2,c3); 
2241$code.=<<___; 2242 $ST $c_3,5*$BNSZ($a0) 2243 2244 $ADDU $c_1,$t_1 2245 sltu $at,$c_1,$t_1 2246 $ADDU $t_2,$at 2247 $ADDU $c_2,$t_2 2248 $ST $c_1,6*$BNSZ($a0) 2249 $ST $c_2,7*$BNSZ($a0) 2250 2251 .set noreorder 2252___ 2253$code.=<<___ if ($flavour =~ /nubi/i); 2254 $REG_L $t3,4*$SZREG($sp) 2255 $REG_L $t2,3*$SZREG($sp) 2256 $REG_L $t1,2*$SZREG($sp) 2257 $REG_L $t0,1*$SZREG($sp) 2258 $REG_L $gp,0*$SZREG($sp) 2259 $PTR_ADD $sp,6*$SZREG 2260___ 2261$code.=<<___; 2262 jr $ra 2263 nop 2264.end bn_sqr_comba4 2265___ 2266print $code; 2267close STDOUT or die "error closing STDOUT: $!"; 2268