1#! /usr/bin/env perl 2# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9# 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. 13# 14# Rights for redistribution and usage in source and binary forms are 15# granted according to the OpenSSL license. Warranty of any kind is 16# disclaimed. 17# ==================================================================== 18 19 20# July 1999 21# 22# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. 23# 24# The module is designed to work with either of the "new" MIPS ABI(5), 25# namely N32 or N64, offered by IRIX 6.x. It's not meant to work under 26# IRIX 5.x not only because it doesn't support new ABIs but also 27# because 5.x kernels put R4x00 CPU into 32-bit mode and all those 28# 64-bit instructions (daddu, dmultu, etc.) found below gonna only 29# cause illegal instruction exception:-( 30# 31# In addition the code depends on preprocessor flags set up by MIPSpro 32# compiler driver (either as or cc) and therefore (probably?) can't be 33# compiled by the GNU assembler. GNU C driver manages fine though... 34# I mean as long as -mmips-as is specified or is the default option, 35# because then it simply invokes /usr/bin/as which in turn takes 36# perfect care of the preprocessor definitions. Another neat feature 37# offered by the MIPSpro assembler is an optimization pass. This gave 38# me the opportunity to have the code looking more regular as all those 39# architecture dependent instruction rescheduling details were left to 40# the assembler. Cool, huh? 41# 42# Performance improvement is astonishing! 
#						'apps/openssl speed rsa dsa'
# goes way over 3 times faster!
#
#					<appro@openssl.org>

# October 2010
#
# Adapt the module even for 32-bit ABIs and other OSes. The former was
# achieved by mechanical replacement of 64-bit arithmetic instructions
# such as dmultu, daddu, etc. with their 32-bit counterparts and
# adjusting offsets denoting multiples of BN_ULONG. Above mentioned
# >3x performance improvement naturally does not apply to 32-bit code
# [because there is no instruction 32-bit compiler can't use], one
# has to be content with 40-85% improvement depending on benchmark
# and key length, more for longer keys.

# First argument selects the ABI flavour (o32 is the default; the code
# below also recognizes n32/64 and the "nubi" variants).
$flavour = shift || "o32";

# Scan the remaining arguments for the first one that looks like an
# output file name (word characters followed by an extension); anything
# before it is discarded.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the requested output file. The historic
# two-argument, unchecked "open STDOUT,\">$output\"" failed silently
# (and concatenated the name into the mode string); use a checked
# three-argument open instead, and only redirect when a file name was
# actually supplied.
if (defined $output && $output ne '') {
	open STDOUT, '>', $output
		or die "can't open $output: $!";
}

# Select 64-bit vs 32-bit mnemonics and word sizes.  $BNSZ is
# sizeof(BN_ULONG); $SZREG is the size of a saved register slot;
# $REG_S/$REG_L are the save/restore instructions used by the NUBI
# prologues/epilogues.
if ($flavour =~ /64|n32/i) {
	$LD="ld";
	$ST="sd";
	$MULTU="dmultu";
	$DIVU="ddivu";
	$ADDU="daddu";
	$SUBU="dsubu";
	$SRL="dsrl";
	$SLL="dsll";
	$BNSZ=8;
	$PTR_ADD="daddu";
	$PTR_SUB="dsubu";
	$SZREG=8;
	$REG_S="sd";
	$REG_L="ld";
} else {
	$LD="lw";
	$ST="sw";
	$MULTU="multu";
	$DIVU="divu";
	$ADDU="addu";
	$SUBU="subu";
	$SRL="srl";
	$SLL="sll";
	$BNSZ=4;
	$PTR_ADD="addu";
	$PTR_SUB="subu";
	$SZREG=4;
	$REG_S="sw";
	$REG_L="lw";
	$code=".set	mips2\n";	# 32-bit code needs at least MIPS II
}

# Below is N32/64 register layout used in the original module.
#
($zero,$at,$v0,$v1)=map("\$$_",(0..3));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7);
#
# No special adaptation is required for O32. NUBI on the other hand
# is treated by saving/restoring ($v1,$t0..$t3).
106 107$gp=$v1 if ($flavour =~ /nubi/i); 108 109$minus4=$v1; 110 111$code.=<<___; 112#include "mips_arch.h" 113 114#if defined(_MIPS_ARCH_MIPS64R6) 115# define ddivu(rs,rt) 116# define mfqt(rd,rs,rt) ddivu rd,rs,rt 117# define mfrm(rd,rs,rt) dmodu rd,rs,rt 118#elif defined(_MIPS_ARCH_MIPS32R6) 119# define divu(rs,rt) 120# define mfqt(rd,rs,rt) divu rd,rs,rt 121# define mfrm(rd,rs,rt) modu rd,rs,rt 122#else 123# define $DIVU(rs,rt) $DIVU $zero,rs,rt 124# define mfqt(rd,rs,rt) mflo rd 125# define mfrm(rd,rs,rt) mfhi rd 126#endif 127 128.rdata 129.asciiz "mips3.s, Version 1.2" 130.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov <appro\@fy.chalmers.se>" 131 132.text 133.set noat 134 135.align 5 136.globl bn_mul_add_words 137.ent bn_mul_add_words 138bn_mul_add_words: 139 .set noreorder 140 bgtz $a2,bn_mul_add_words_internal 141 move $v0,$zero 142 jr $ra 143 move $a0,$v0 144.end bn_mul_add_words 145 146.align 5 147.ent bn_mul_add_words_internal 148bn_mul_add_words_internal: 149___ 150$code.=<<___ if ($flavour =~ /nubi/i); 151 .frame $sp,6*$SZREG,$ra 152 .mask 0x8000f008,-$SZREG 153 .set noreorder 154 $PTR_SUB $sp,6*$SZREG 155 $REG_S $ra,5*$SZREG($sp) 156 $REG_S $t3,4*$SZREG($sp) 157 $REG_S $t2,3*$SZREG($sp) 158 $REG_S $t1,2*$SZREG($sp) 159 $REG_S $t0,1*$SZREG($sp) 160 $REG_S $gp,0*$SZREG($sp) 161___ 162$code.=<<___; 163 .set reorder 164 li $minus4,-4 165 and $ta0,$a2,$minus4 166 beqz $ta0,.L_bn_mul_add_words_tail 167 168.L_bn_mul_add_words_loop: 169 $LD $t0,0($a1) 170 $MULTU ($t0,$a3) 171 $LD $t1,0($a0) 172 $LD $t2,$BNSZ($a1) 173 $LD $t3,$BNSZ($a0) 174 $LD $ta0,2*$BNSZ($a1) 175 $LD $ta1,2*$BNSZ($a0) 176 $ADDU $t1,$v0 177 sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit 178 # values", but it seems to work fine 179 # even on 64-bit registers. 
180 mflo ($at,$t0,$a3) 181 mfhi ($t0,$t0,$a3) 182 $ADDU $t1,$at 183 $ADDU $v0,$t0 184 $MULTU ($t2,$a3) 185 sltu $at,$t1,$at 186 $ST $t1,0($a0) 187 $ADDU $v0,$at 188 189 $LD $ta2,3*$BNSZ($a1) 190 $LD $ta3,3*$BNSZ($a0) 191 $ADDU $t3,$v0 192 sltu $v0,$t3,$v0 193 mflo ($at,$t2,$a3) 194 mfhi ($t2,$t2,$a3) 195 $ADDU $t3,$at 196 $ADDU $v0,$t2 197 $MULTU ($ta0,$a3) 198 sltu $at,$t3,$at 199 $ST $t3,$BNSZ($a0) 200 $ADDU $v0,$at 201 202 subu $a2,4 203 $PTR_ADD $a0,4*$BNSZ 204 $PTR_ADD $a1,4*$BNSZ 205 $ADDU $ta1,$v0 206 sltu $v0,$ta1,$v0 207 mflo ($at,$ta0,$a3) 208 mfhi ($ta0,$ta0,$a3) 209 $ADDU $ta1,$at 210 $ADDU $v0,$ta0 211 $MULTU ($ta2,$a3) 212 sltu $at,$ta1,$at 213 $ST $ta1,-2*$BNSZ($a0) 214 $ADDU $v0,$at 215 216 217 and $ta0,$a2,$minus4 218 $ADDU $ta3,$v0 219 sltu $v0,$ta3,$v0 220 mflo ($at,$ta2,$a3) 221 mfhi ($ta2,$ta2,$a3) 222 $ADDU $ta3,$at 223 $ADDU $v0,$ta2 224 sltu $at,$ta3,$at 225 $ST $ta3,-$BNSZ($a0) 226 .set noreorder 227 bgtz $ta0,.L_bn_mul_add_words_loop 228 $ADDU $v0,$at 229 230 beqz $a2,.L_bn_mul_add_words_return 231 nop 232 233.L_bn_mul_add_words_tail: 234 .set reorder 235 $LD $t0,0($a1) 236 $MULTU ($t0,$a3) 237 $LD $t1,0($a0) 238 subu $a2,1 239 $ADDU $t1,$v0 240 sltu $v0,$t1,$v0 241 mflo ($at,$t0,$a3) 242 mfhi ($t0,$t0,$a3) 243 $ADDU $t1,$at 244 $ADDU $v0,$t0 245 sltu $at,$t1,$at 246 $ST $t1,0($a0) 247 $ADDU $v0,$at 248 beqz $a2,.L_bn_mul_add_words_return 249 250 $LD $t0,$BNSZ($a1) 251 $MULTU ($t0,$a3) 252 $LD $t1,$BNSZ($a0) 253 subu $a2,1 254 $ADDU $t1,$v0 255 sltu $v0,$t1,$v0 256 mflo ($at,$t0,$a3) 257 mfhi ($t0,$t0,$a3) 258 $ADDU $t1,$at 259 $ADDU $v0,$t0 260 sltu $at,$t1,$at 261 $ST $t1,$BNSZ($a0) 262 $ADDU $v0,$at 263 beqz $a2,.L_bn_mul_add_words_return 264 265 $LD $t0,2*$BNSZ($a1) 266 $MULTU ($t0,$a3) 267 $LD $t1,2*$BNSZ($a0) 268 $ADDU $t1,$v0 269 sltu $v0,$t1,$v0 270 mflo ($at,$t0,$a3) 271 mfhi ($t0,$t0,$a3) 272 $ADDU $t1,$at 273 $ADDU $v0,$t0 274 sltu $at,$t1,$at 275 $ST $t1,2*$BNSZ($a0) 276 $ADDU $v0,$at 277 278.L_bn_mul_add_words_return: 279 .set 
noreorder 280___ 281$code.=<<___ if ($flavour =~ /nubi/i); 282 $REG_L $t3,4*$SZREG($sp) 283 $REG_L $t2,3*$SZREG($sp) 284 $REG_L $t1,2*$SZREG($sp) 285 $REG_L $t0,1*$SZREG($sp) 286 $REG_L $gp,0*$SZREG($sp) 287 $PTR_ADD $sp,6*$SZREG 288___ 289$code.=<<___; 290 jr $ra 291 move $a0,$v0 292.end bn_mul_add_words_internal 293 294.align 5 295.globl bn_mul_words 296.ent bn_mul_words 297bn_mul_words: 298 .set noreorder 299 bgtz $a2,bn_mul_words_internal 300 move $v0,$zero 301 jr $ra 302 move $a0,$v0 303.end bn_mul_words 304 305.align 5 306.ent bn_mul_words_internal 307bn_mul_words_internal: 308___ 309$code.=<<___ if ($flavour =~ /nubi/i); 310 .frame $sp,6*$SZREG,$ra 311 .mask 0x8000f008,-$SZREG 312 .set noreorder 313 $PTR_SUB $sp,6*$SZREG 314 $REG_S $ra,5*$SZREG($sp) 315 $REG_S $t3,4*$SZREG($sp) 316 $REG_S $t2,3*$SZREG($sp) 317 $REG_S $t1,2*$SZREG($sp) 318 $REG_S $t0,1*$SZREG($sp) 319 $REG_S $gp,0*$SZREG($sp) 320___ 321$code.=<<___; 322 .set reorder 323 li $minus4,-4 324 and $ta0,$a2,$minus4 325 beqz $ta0,.L_bn_mul_words_tail 326 327.L_bn_mul_words_loop: 328 $LD $t0,0($a1) 329 $MULTU ($t0,$a3) 330 $LD $t2,$BNSZ($a1) 331 $LD $ta0,2*$BNSZ($a1) 332 $LD $ta2,3*$BNSZ($a1) 333 mflo ($at,$t0,$a3) 334 mfhi ($t0,$t0,$a3) 335 $ADDU $v0,$at 336 sltu $t1,$v0,$at 337 $MULTU ($t2,$a3) 338 $ST $v0,0($a0) 339 $ADDU $v0,$t1,$t0 340 341 subu $a2,4 342 $PTR_ADD $a0,4*$BNSZ 343 $PTR_ADD $a1,4*$BNSZ 344 mflo ($at,$t2,$a3) 345 mfhi ($t2,$t2,$a3) 346 $ADDU $v0,$at 347 sltu $t3,$v0,$at 348 $MULTU ($ta0,$a3) 349 $ST $v0,-3*$BNSZ($a0) 350 $ADDU $v0,$t3,$t2 351 352 mflo ($at,$ta0,$a3) 353 mfhi ($ta0,$ta0,$a3) 354 $ADDU $v0,$at 355 sltu $ta1,$v0,$at 356 $MULTU ($ta2,$a3) 357 $ST $v0,-2*$BNSZ($a0) 358 $ADDU $v0,$ta1,$ta0 359 360 and $ta0,$a2,$minus4 361 mflo ($at,$ta2,$a3) 362 mfhi ($ta2,$ta2,$a3) 363 $ADDU $v0,$at 364 sltu $ta3,$v0,$at 365 $ST $v0,-$BNSZ($a0) 366 .set noreorder 367 bgtz $ta0,.L_bn_mul_words_loop 368 $ADDU $v0,$ta3,$ta2 369 370 beqz $a2,.L_bn_mul_words_return 371 nop 372 
# Tail of bn_mul_words: handle the 1..3 words left over after the
# 4-words-at-a-time main loop.  $v0 carries the running high-word
# (carry); each step multiplies one word of a[] ($a1) by the scalar
# ($a3), adds the carry, stores the low word to r[] ($a0) and forms
# the next carry from mfhi plus the addition's carry-out ($t1).
.L_bn_mul_words_tail:
	.set	reorder
	$LD	$t0,0($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at		# carry out of low-word addition
	$ST	$v0,0($a0)
	$ADDU	$v0,$t1,$t0		# next carry = hi + carry-out
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,$BNSZ($a1)
	$MULTU	($t0,$a3)
	subu	$a2,1
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,$BNSZ($a0)
	$ADDU	$v0,$t1,$t0
	beqz	$a2,.L_bn_mul_words_return

	$LD	$t0,2*$BNSZ($a1)
	$MULTU	($t0,$a3)
	mflo	($at,$t0,$a3)
	mfhi	($t0,$t0,$a3)
	$ADDU	$v0,$at
	sltu	$t1,$v0,$at
	$ST	$v0,2*$BNSZ($a0)
	$ADDU	$v0,$t1,$t0

.L_bn_mul_words_return:
	.set	noreorder
___
# NUBI epilogue: restore ($v1/$gp,$t0..$t3) and pop the frame set up
# in the matching prologue.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0			# return the final carry in $v0
.end	bn_mul_words_internal

# bn_sqr_words(r,a,n): squares each word of a[] into a double-width
# result in r[] (r has 2*n words).  The stub below returns 0
# immediately for n<=0, branching to the internal body otherwise
# (delay-slot scheduling, hence .set noreorder).
.align	5
.globl	bn_sqr_words
.ent	bn_sqr_words
bn_sqr_words:
	.set	noreorder
	bgtz	$a2,bn_sqr_words_internal
	move	$v0,$zero
	jr	$ra
	move	$a0,$v0
.end	bn_sqr_words

.align	5
.ent	bn_sqr_words_internal
bn_sqr_words_internal:
___
# NUBI prologue: save $ra, $t0..$t3 and $gp (aliased to $v1 for NUBI)
# in a 6-slot frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	li	$minus4,-4
	and	$ta0,$a2,$minus4	# $ta0 = n & ~3: words handled by the
	beqz	$ta0,.L_bn_sqr_words_tail	# unrolled-by-4 loop below

.L_bn_sqr_words_loop:
	$LD	$t0,0($a1)
	$MULTU	($t0,$t0)		# start a[0]^2 early, loads overlap it
	$LD	$t2,$BNSZ($a1)
	$LD	$ta0,2*$BNSZ($a1)
	$LD	$ta2,3*$BNSZ($a1)
	mflo	($t1,$t0,$t0)
	mfhi	($t0,$t0,$t0)
	$ST	$t1,0($a0)		# low word of square
	$ST	$t0,$BNSZ($a0)		# high word of square

	$MULTU	($t2,$t2)
subu $a2,4 468 $PTR_ADD $a0,8*$BNSZ 469 $PTR_ADD $a1,4*$BNSZ 470 mflo ($t3,$t2,$t2) 471 mfhi ($t2,$t2,$t2) 472 $ST $t3,-6*$BNSZ($a0) 473 $ST $t2,-5*$BNSZ($a0) 474 475 $MULTU ($ta0,$ta0) 476 mflo ($ta1,$ta0,$ta0) 477 mfhi ($ta0,$ta0,$ta0) 478 $ST $ta1,-4*$BNSZ($a0) 479 $ST $ta0,-3*$BNSZ($a0) 480 481 482 $MULTU ($ta2,$ta2) 483 and $ta0,$a2,$minus4 484 mflo ($ta3,$ta2,$ta2) 485 mfhi ($ta2,$ta2,$ta2) 486 $ST $ta3,-2*$BNSZ($a0) 487 488 .set noreorder 489 bgtz $ta0,.L_bn_sqr_words_loop 490 $ST $ta2,-$BNSZ($a0) 491 492 beqz $a2,.L_bn_sqr_words_return 493 nop 494 495.L_bn_sqr_words_tail: 496 .set reorder 497 $LD $t0,0($a1) 498 $MULTU ($t0,$t0) 499 subu $a2,1 500 mflo ($t1,$t0,$t0) 501 mfhi ($t0,$t0,$t0) 502 $ST $t1,0($a0) 503 $ST $t0,$BNSZ($a0) 504 beqz $a2,.L_bn_sqr_words_return 505 506 $LD $t0,$BNSZ($a1) 507 $MULTU ($t0,$t0) 508 subu $a2,1 509 mflo ($t1,$t0,$t0) 510 mfhi ($t0,$t0,$t0) 511 $ST $t1,2*$BNSZ($a0) 512 $ST $t0,3*$BNSZ($a0) 513 beqz $a2,.L_bn_sqr_words_return 514 515 $LD $t0,2*$BNSZ($a1) 516 $MULTU ($t0,$t0) 517 mflo ($t1,$t0,$t0) 518 mfhi ($t0,$t0,$t0) 519 $ST $t1,4*$BNSZ($a0) 520 $ST $t0,5*$BNSZ($a0) 521 522.L_bn_sqr_words_return: 523 .set noreorder 524___ 525$code.=<<___ if ($flavour =~ /nubi/i); 526 $REG_L $t3,4*$SZREG($sp) 527 $REG_L $t2,3*$SZREG($sp) 528 $REG_L $t1,2*$SZREG($sp) 529 $REG_L $t0,1*$SZREG($sp) 530 $REG_L $gp,0*$SZREG($sp) 531 $PTR_ADD $sp,6*$SZREG 532___ 533$code.=<<___; 534 jr $ra 535 move $a0,$v0 536 537.end bn_sqr_words_internal 538 539.align 5 540.globl bn_add_words 541.ent bn_add_words 542bn_add_words: 543 .set noreorder 544 bgtz $a3,bn_add_words_internal 545 move $v0,$zero 546 jr $ra 547 move $a0,$v0 548.end bn_add_words 549 550.align 5 551.ent bn_add_words_internal 552bn_add_words_internal: 553___ 554$code.=<<___ if ($flavour =~ /nubi/i); 555 .frame $sp,6*$SZREG,$ra 556 .mask 0x8000f008,-$SZREG 557 .set noreorder 558 $PTR_SUB $sp,6*$SZREG 559 $REG_S $ra,5*$SZREG($sp) 560 $REG_S $t3,4*$SZREG($sp) 561 $REG_S $t2,3*$SZREG($sp) 562 $REG_S 
$t1,2*$SZREG($sp) 563 $REG_S $t0,1*$SZREG($sp) 564 $REG_S $gp,0*$SZREG($sp) 565___ 566$code.=<<___; 567 .set reorder 568 li $minus4,-4 569 and $at,$a3,$minus4 570 beqz $at,.L_bn_add_words_tail 571 572.L_bn_add_words_loop: 573 $LD $t0,0($a1) 574 $LD $ta0,0($a2) 575 subu $a3,4 576 $LD $t1,$BNSZ($a1) 577 and $at,$a3,$minus4 578 $LD $t2,2*$BNSZ($a1) 579 $PTR_ADD $a2,4*$BNSZ 580 $LD $t3,3*$BNSZ($a1) 581 $PTR_ADD $a0,4*$BNSZ 582 $LD $ta1,-3*$BNSZ($a2) 583 $PTR_ADD $a1,4*$BNSZ 584 $LD $ta2,-2*$BNSZ($a2) 585 $LD $ta3,-$BNSZ($a2) 586 $ADDU $ta0,$t0 587 sltu $t8,$ta0,$t0 588 $ADDU $t0,$ta0,$v0 589 sltu $v0,$t0,$ta0 590 $ST $t0,-4*$BNSZ($a0) 591 $ADDU $v0,$t8 592 593 $ADDU $ta1,$t1 594 sltu $t9,$ta1,$t1 595 $ADDU $t1,$ta1,$v0 596 sltu $v0,$t1,$ta1 597 $ST $t1,-3*$BNSZ($a0) 598 $ADDU $v0,$t9 599 600 $ADDU $ta2,$t2 601 sltu $t8,$ta2,$t2 602 $ADDU $t2,$ta2,$v0 603 sltu $v0,$t2,$ta2 604 $ST $t2,-2*$BNSZ($a0) 605 $ADDU $v0,$t8 606 607 $ADDU $ta3,$t3 608 sltu $t9,$ta3,$t3 609 $ADDU $t3,$ta3,$v0 610 sltu $v0,$t3,$ta3 611 $ST $t3,-$BNSZ($a0) 612 613 .set noreorder 614 bgtz $at,.L_bn_add_words_loop 615 $ADDU $v0,$t9 616 617 beqz $a3,.L_bn_add_words_return 618 nop 619 620.L_bn_add_words_tail: 621 .set reorder 622 $LD $t0,0($a1) 623 $LD $ta0,0($a2) 624 $ADDU $ta0,$t0 625 subu $a3,1 626 sltu $t8,$ta0,$t0 627 $ADDU $t0,$ta0,$v0 628 sltu $v0,$t0,$ta0 629 $ST $t0,0($a0) 630 $ADDU $v0,$t8 631 beqz $a3,.L_bn_add_words_return 632 633 $LD $t1,$BNSZ($a1) 634 $LD $ta1,$BNSZ($a2) 635 $ADDU $ta1,$t1 636 subu $a3,1 637 sltu $t9,$ta1,$t1 638 $ADDU $t1,$ta1,$v0 639 sltu $v0,$t1,$ta1 640 $ST $t1,$BNSZ($a0) 641 $ADDU $v0,$t9 642 beqz $a3,.L_bn_add_words_return 643 644 $LD $t2,2*$BNSZ($a1) 645 $LD $ta2,2*$BNSZ($a2) 646 $ADDU $ta2,$t2 647 sltu $t8,$ta2,$t2 648 $ADDU $t2,$ta2,$v0 649 sltu $v0,$t2,$ta2 650 $ST $t2,2*$BNSZ($a0) 651 $ADDU $v0,$t8 652 653.L_bn_add_words_return: 654 .set noreorder 655___ 656$code.=<<___ if ($flavour =~ /nubi/i); 657 $REG_L $t3,4*$SZREG($sp) 658 $REG_L $t2,3*$SZREG($sp) 659 
$REG_L $t1,2*$SZREG($sp) 660 $REG_L $t0,1*$SZREG($sp) 661 $REG_L $gp,0*$SZREG($sp) 662 $PTR_ADD $sp,6*$SZREG 663___ 664$code.=<<___; 665 jr $ra 666 move $a0,$v0 667 668.end bn_add_words_internal 669 670.align 5 671.globl bn_sub_words 672.ent bn_sub_words 673bn_sub_words: 674 .set noreorder 675 bgtz $a3,bn_sub_words_internal 676 move $v0,$zero 677 jr $ra 678 move $a0,$zero 679.end bn_sub_words 680 681.align 5 682.ent bn_sub_words_internal 683bn_sub_words_internal: 684___ 685$code.=<<___ if ($flavour =~ /nubi/i); 686 .frame $sp,6*$SZREG,$ra 687 .mask 0x8000f008,-$SZREG 688 .set noreorder 689 $PTR_SUB $sp,6*$SZREG 690 $REG_S $ra,5*$SZREG($sp) 691 $REG_S $t3,4*$SZREG($sp) 692 $REG_S $t2,3*$SZREG($sp) 693 $REG_S $t1,2*$SZREG($sp) 694 $REG_S $t0,1*$SZREG($sp) 695 $REG_S $gp,0*$SZREG($sp) 696___ 697$code.=<<___; 698 .set reorder 699 li $minus4,-4 700 and $at,$a3,$minus4 701 beqz $at,.L_bn_sub_words_tail 702 703.L_bn_sub_words_loop: 704 $LD $t0,0($a1) 705 $LD $ta0,0($a2) 706 subu $a3,4 707 $LD $t1,$BNSZ($a1) 708 and $at,$a3,$minus4 709 $LD $t2,2*$BNSZ($a1) 710 $PTR_ADD $a2,4*$BNSZ 711 $LD $t3,3*$BNSZ($a1) 712 $PTR_ADD $a0,4*$BNSZ 713 $LD $ta1,-3*$BNSZ($a2) 714 $PTR_ADD $a1,4*$BNSZ 715 $LD $ta2,-2*$BNSZ($a2) 716 $LD $ta3,-$BNSZ($a2) 717 sltu $t8,$t0,$ta0 718 $SUBU $ta0,$t0,$ta0 719 $SUBU $t0,$ta0,$v0 720 sgtu $v0,$t0,$ta0 721 $ST $t0,-4*$BNSZ($a0) 722 $ADDU $v0,$t8 723 724 sltu $t9,$t1,$ta1 725 $SUBU $ta1,$t1,$ta1 726 $SUBU $t1,$ta1,$v0 727 sgtu $v0,$t1,$ta1 728 $ST $t1,-3*$BNSZ($a0) 729 $ADDU $v0,$t9 730 731 732 sltu $t8,$t2,$ta2 733 $SUBU $ta2,$t2,$ta2 734 $SUBU $t2,$ta2,$v0 735 sgtu $v0,$t2,$ta2 736 $ST $t2,-2*$BNSZ($a0) 737 $ADDU $v0,$t8 738 739 sltu $t9,$t3,$ta3 740 $SUBU $ta3,$t3,$ta3 741 $SUBU $t3,$ta3,$v0 742 sgtu $v0,$t3,$ta3 743 $ST $t3,-$BNSZ($a0) 744 745 .set noreorder 746 bgtz $at,.L_bn_sub_words_loop 747 $ADDU $v0,$t9 748 749 beqz $a3,.L_bn_sub_words_return 750 nop 751 752.L_bn_sub_words_tail: 753 .set reorder 754 $LD $t0,0($a1) 755 $LD $ta0,0($a2) 756 subu 
	$a3,1			# second operand of the split "subu $a3,1"
# Tail of bn_sub_words: 1..3 leftover words.  $v0 is the running
# borrow; each step computes a[i]-b[i], subtracts the borrow, and
# re-derives the new borrow from the two comparisons ($t8 and sgtu).
	sltu	$t8,$t0,$ta0		# borrow out of a[i]-b[i]
	$SUBU	$ta0,$t0,$ta0
	$SUBU	$t0,$ta0,$v0
	sgtu	$v0,$t0,$ta0		# borrow out of (a[i]-b[i])-carry
	$ST	$t0,0($a0)
	$ADDU	$v0,$t8
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t1,$BNSZ($a1)
	subu	$a3,1
	$LD	$ta1,$BNSZ($a2)
	sltu	$t9,$t1,$ta1
	$SUBU	$ta1,$t1,$ta1
	$SUBU	$t1,$ta1,$v0
	sgtu	$v0,$t1,$ta1
	$ST	$t1,$BNSZ($a0)
	$ADDU	$v0,$t9
	beqz	$a3,.L_bn_sub_words_return

	$LD	$t2,2*$BNSZ($a1)
	$LD	$ta2,2*$BNSZ($a2)
	sltu	$t8,$t2,$ta2
	$SUBU	$ta2,$t2,$ta2
	$SUBU	$t2,$ta2,$v0
	sgtu	$v0,$t2,$ta2
	$ST	$t2,2*$BNSZ($a0)
	$ADDU	$v0,$t8

.L_bn_sub_words_return:
	.set	noreorder
___
# NUBI epilogue: restore ($gp,$t0..$t3) and pop the 6-slot frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	move	$a0,$v0			# return the final borrow
.end	bn_sub_words_internal

# bn_div_3_words: divides the 2-word quantity at m[-1..0] by d.
# Arguments are repacked into registers before tail-calling the
# internal body; the early-out returns (BN_ULONG)-1 when the high
# word equals the divisor (quotient would overflow).
.align 5
.globl	bn_div_3_words
.ent	bn_div_3_words
bn_div_3_words:
	.set	noreorder
	move	$a3,$a0		# we know that bn_div_words does not
				# touch $a3, $ta2, $ta3 and preserves $a2
				# so that we can save two arguments
				# and return address in registers
				# instead of stack:-)

	$LD	$a0,($a3)
	move	$ta2,$a1
	bne	$a0,$a2,bn_div_3_words_internal
	$LD	$a1,-$BNSZ($a3)		# delay slot: load low word
	li	$v0,-1
	jr	$ra
	move	$a0,$v0
.end	bn_div_3_words

.align	5
.ent	bn_div_3_words_internal
bn_div_3_words_internal:
___
# NUBI prologue: save $ra, $t0..$t3 and $gp in a 6-slot frame.
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	move	$ta3,$ra		# preserve $ra across the call
	bal	bn_div_words_internal	# initial quotient estimate in $v0
	move	$ra,$ta3
	$MULTU	($ta2,$v0)		# q * d0 for the correction loop
	$LD	$t2,-2*$BNSZ($a3)
	move	$ta0,$zero
mfhi ($t1,$ta2,$v0) 846 mflo ($t0,$ta2,$v0) 847 sltu $t8,$t1,$a1 848.L_bn_div_3_words_inner_loop: 849 bnez $t8,.L_bn_div_3_words_inner_loop_done 850 sgeu $at,$t2,$t0 851 seq $t9,$t1,$a1 852 and $at,$t9 853 sltu $t3,$t0,$ta2 854 $ADDU $a1,$a2 855 $SUBU $t1,$t3 856 $SUBU $t0,$ta2 857 sltu $t8,$t1,$a1 858 sltu $ta0,$a1,$a2 859 or $t8,$ta0 860 .set noreorder 861 beqz $at,.L_bn_div_3_words_inner_loop 862 $SUBU $v0,1 863 $ADDU $v0,1 864 .set reorder 865.L_bn_div_3_words_inner_loop_done: 866 .set noreorder 867___ 868$code.=<<___ if ($flavour =~ /nubi/i); 869 $REG_L $t3,4*$SZREG($sp) 870 $REG_L $t2,3*$SZREG($sp) 871 $REG_L $t1,2*$SZREG($sp) 872 $REG_L $t0,1*$SZREG($sp) 873 $REG_L $gp,0*$SZREG($sp) 874 $PTR_ADD $sp,6*$SZREG 875___ 876$code.=<<___; 877 jr $ra 878 move $a0,$v0 879.end bn_div_3_words_internal 880 881.align 5 882.globl bn_div_words 883.ent bn_div_words 884bn_div_words: 885 .set noreorder 886 bnez $a2,bn_div_words_internal 887 li $v0,-1 # I would rather signal div-by-zero 888 # which can be done with 'break 7' 889 jr $ra 890 move $a0,$v0 891.end bn_div_words 892 893.align 5 894.ent bn_div_words_internal 895bn_div_words_internal: 896___ 897$code.=<<___ if ($flavour =~ /nubi/i); 898 .frame $sp,6*$SZREG,$ra 899 .mask 0x8000f008,-$SZREG 900 .set noreorder 901 $PTR_SUB $sp,6*$SZREG 902 $REG_S $ra,5*$SZREG($sp) 903 $REG_S $t3,4*$SZREG($sp) 904 $REG_S $t2,3*$SZREG($sp) 905 $REG_S $t1,2*$SZREG($sp) 906 $REG_S $t0,1*$SZREG($sp) 907 $REG_S $gp,0*$SZREG($sp) 908___ 909$code.=<<___; 910 move $v1,$zero 911 bltz $a2,.L_bn_div_words_body 912 move $t9,$v1 913 $SLL $a2,1 914 bgtz $a2,.-4 915 addu $t9,1 916 917 .set reorder 918 negu $t1,$t9 919 li $t2,-1 920 $SLL $t2,$t1 921 and $t2,$a0 922 $SRL $at,$a1,$t1 923 .set noreorder 924 beqz $t2,.+12 925 nop 926 break 6 # signal overflow 927 .set reorder 928 $SLL $a0,$t9 929 $SLL $a1,$t9 930 or $a0,$at 931___ 932$QT=$ta0; 933$HH=$ta1; 934$DH=$v1; 935$code.=<<___; 936.L_bn_div_words_body: 937 $SRL $DH,$a2,4*$BNSZ # bits 938 sgeu 
$at,$a0,$a2 939 .set noreorder 940 beqz $at,.+12 941 nop 942 $SUBU $a0,$a2 943 .set reorder 944 945 li $QT,-1 946 $SRL $HH,$a0,4*$BNSZ # bits 947 $SRL $QT,4*$BNSZ # q=0xffffffff 948 beq $DH,$HH,.L_bn_div_words_skip_div1 949 $DIVU ($a0,$DH) 950 mfqt ($QT,$a0,$DH) 951.L_bn_div_words_skip_div1: 952 $MULTU ($a2,$QT) 953 $SLL $t3,$a0,4*$BNSZ # bits 954 $SRL $at,$a1,4*$BNSZ # bits 955 or $t3,$at 956 mflo ($t0,$a2,$QT) 957 mfhi ($t1,$a2,$QT) 958.L_bn_div_words_inner_loop1: 959 sltu $t2,$t3,$t0 960 seq $t8,$HH,$t1 961 sltu $at,$HH,$t1 962 and $t2,$t8 963 sltu $v0,$t0,$a2 964 or $at,$t2 965 .set noreorder 966 beqz $at,.L_bn_div_words_inner_loop1_done 967 $SUBU $t1,$v0 968 $SUBU $t0,$a2 969 b .L_bn_div_words_inner_loop1 970 $SUBU $QT,1 971 .set reorder 972.L_bn_div_words_inner_loop1_done: 973 974 $SLL $a1,4*$BNSZ # bits 975 $SUBU $a0,$t3,$t0 976 $SLL $v0,$QT,4*$BNSZ # bits 977 978 li $QT,-1 979 $SRL $HH,$a0,4*$BNSZ # bits 980 $SRL $QT,4*$BNSZ # q=0xffffffff 981 beq $DH,$HH,.L_bn_div_words_skip_div2 982 $DIVU ($a0,$DH) 983 mfqt ($QT,$a0,$DH) 984.L_bn_div_words_skip_div2: 985 $MULTU ($a2,$QT) 986 $SLL $t3,$a0,4*$BNSZ # bits 987 $SRL $at,$a1,4*$BNSZ # bits 988 or $t3,$at 989 mflo ($t0,$a2,$QT) 990 mfhi ($t1,$a2,$QT) 991.L_bn_div_words_inner_loop2: 992 sltu $t2,$t3,$t0 993 seq $t8,$HH,$t1 994 sltu $at,$HH,$t1 995 and $t2,$t8 996 sltu $v1,$t0,$a2 997 or $at,$t2 998 .set noreorder 999 beqz $at,.L_bn_div_words_inner_loop2_done 1000 $SUBU $t1,$v1 1001 $SUBU $t0,$a2 1002 b .L_bn_div_words_inner_loop2 1003 $SUBU $QT,1 1004 .set reorder 1005.L_bn_div_words_inner_loop2_done: 1006 1007 $SUBU $a0,$t3,$t0 1008 or $v0,$QT 1009 $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it 1010 $SRL $a2,$t9 # restore $a2 1011 1012 .set noreorder 1013 move $a1,$v1 1014___ 1015$code.=<<___ if ($flavour =~ /nubi/i); 1016 $REG_L $t3,4*$SZREG($sp) 1017 $REG_L $t2,3*$SZREG($sp) 1018 $REG_L $t1,2*$SZREG($sp) 1019 $REG_L $t0,1*$SZREG($sp) 1020 $REG_L $gp,0*$SZREG($sp) 1021 $PTR_ADD $sp,6*$SZREG 
1022___ 1023$code.=<<___; 1024 jr $ra 1025 move $a0,$v0 1026.end bn_div_words_internal 1027___ 1028undef $HH; undef $QT; undef $DH; 1029 1030($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); 1031($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); 1032 1033($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 1034($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 1035 1036($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); 1037 1038$code.=<<___; 1039 1040.align 5 1041.globl bn_mul_comba8 1042.ent bn_mul_comba8 1043bn_mul_comba8: 1044 .set noreorder 1045___ 1046$code.=<<___ if ($flavour =~ /nubi/i); 1047 .frame $sp,12*$SZREG,$ra 1048 .mask 0x803ff008,-$SZREG 1049 $PTR_SUB $sp,12*$SZREG 1050 $REG_S $ra,11*$SZREG($sp) 1051 $REG_S $s5,10*$SZREG($sp) 1052 $REG_S $s4,9*$SZREG($sp) 1053 $REG_S $s3,8*$SZREG($sp) 1054 $REG_S $s2,7*$SZREG($sp) 1055 $REG_S $s1,6*$SZREG($sp) 1056 $REG_S $s0,5*$SZREG($sp) 1057 $REG_S $t3,4*$SZREG($sp) 1058 $REG_S $t2,3*$SZREG($sp) 1059 $REG_S $t1,2*$SZREG($sp) 1060 $REG_S $t0,1*$SZREG($sp) 1061 $REG_S $gp,0*$SZREG($sp) 1062___ 1063$code.=<<___ if ($flavour !~ /nubi/i); 1064 .frame $sp,6*$SZREG,$ra 1065 .mask 0x003f0000,-$SZREG 1066 $PTR_SUB $sp,6*$SZREG 1067 $REG_S $s5,5*$SZREG($sp) 1068 $REG_S $s4,4*$SZREG($sp) 1069 $REG_S $s3,3*$SZREG($sp) 1070 $REG_S $s2,2*$SZREG($sp) 1071 $REG_S $s1,1*$SZREG($sp) 1072 $REG_S $s0,0*$SZREG($sp) 1073___ 1074$code.=<<___; 1075 1076 .set reorder 1077 $LD $a_0,0($a1) # If compiled with -mips3 option on 1078 # R5000 box assembler barks on this 1079 # 1ine with "should not have mult/div 1080 # as last instruction in bb (R10K 1081 # bug)" warning. If anybody out there 1082 # has a clue about how to circumvent 1083 # this do send me a note. 
1084 # <appro\@fy.chalmers.se> 1085 1086 $LD $b_0,0($a2) 1087 $LD $a_1,$BNSZ($a1) 1088 $LD $a_2,2*$BNSZ($a1) 1089 $MULTU ($a_0,$b_0) # mul_add_c(a[0],b[0],c1,c2,c3); 1090 $LD $a_3,3*$BNSZ($a1) 1091 $LD $b_1,$BNSZ($a2) 1092 $LD $b_2,2*$BNSZ($a2) 1093 $LD $b_3,3*$BNSZ($a2) 1094 mflo ($c_1,$a_0,$b_0) 1095 mfhi ($c_2,$a_0,$b_0) 1096 1097 $LD $a_4,4*$BNSZ($a1) 1098 $LD $a_5,5*$BNSZ($a1) 1099 $MULTU ($a_0,$b_1) # mul_add_c(a[0],b[1],c2,c3,c1); 1100 $LD $a_6,6*$BNSZ($a1) 1101 $LD $a_7,7*$BNSZ($a1) 1102 $LD $b_4,4*$BNSZ($a2) 1103 $LD $b_5,5*$BNSZ($a2) 1104 mflo ($t_1,$a_0,$b_1) 1105 mfhi ($t_2,$a_0,$b_1) 1106 $ADDU $c_2,$t_1 1107 sltu $at,$c_2,$t_1 1108 $MULTU ($a_1,$b_0) # mul_add_c(a[1],b[0],c2,c3,c1); 1109 $ADDU $c_3,$t_2,$at 1110 $LD $b_6,6*$BNSZ($a2) 1111 $LD $b_7,7*$BNSZ($a2) 1112 $ST $c_1,0($a0) # r[0]=c1; 1113 mflo ($t_1,$a_1,$b_0) 1114 mfhi ($t_2,$a_1,$b_0) 1115 $ADDU $c_2,$t_1 1116 sltu $at,$c_2,$t_1 1117 $MULTU ($a_2,$b_0) # mul_add_c(a[2],b[0],c3,c1,c2); 1118 $ADDU $t_2,$at 1119 $ADDU $c_3,$t_2 1120 sltu $c_1,$c_3,$t_2 1121 $ST $c_2,$BNSZ($a0) # r[1]=c2; 1122 1123 mflo ($t_1,$a_2,$b_0) 1124 mfhi ($t_2,$a_2,$b_0) 1125 $ADDU $c_3,$t_1 1126 sltu $at,$c_3,$t_1 1127 $MULTU ($a_1,$b_1) # mul_add_c(a[1],b[1],c3,c1,c2); 1128 $ADDU $t_2,$at 1129 $ADDU $c_1,$t_2 1130 mflo ($t_1,$a_1,$b_1) 1131 mfhi ($t_2,$a_1,$b_1) 1132 $ADDU $c_3,$t_1 1133 sltu $at,$c_3,$t_1 1134 $MULTU ($a_0,$b_2) # mul_add_c(a[0],b[2],c3,c1,c2); 1135 $ADDU $t_2,$at 1136 $ADDU $c_1,$t_2 1137 sltu $c_2,$c_1,$t_2 1138 mflo ($t_1,$a_0,$b_2) 1139 mfhi ($t_2,$a_0,$b_2) 1140 $ADDU $c_3,$t_1 1141 sltu $at,$c_3,$t_1 1142 $MULTU ($a_0,$b_3) # mul_add_c(a[0],b[3],c1,c2,c3); 1143 $ADDU $t_2,$at 1144 $ADDU $c_1,$t_2 1145 sltu $at,$c_1,$t_2 1146 $ADDU $c_2,$at 1147 $ST $c_3,2*$BNSZ($a0) # r[2]=c3; 1148 1149 mflo ($t_1,$a_0,$b_3) 1150 mfhi ($t_2,$a_0,$b_3) 1151 $ADDU $c_1,$t_1 1152 sltu $at,$c_1,$t_1 1153 $MULTU ($a_1,$b_2) # mul_add_c(a[1],b[2],c1,c2,c3); 1154 $ADDU $t_2,$at 1155 $ADDU $c_2,$t_2 1156 sltu 
$c_3,$c_2,$t_2 1157 mflo ($t_1,$a_1,$b_2) 1158 mfhi ($t_2,$a_1,$b_2) 1159 $ADDU $c_1,$t_1 1160 sltu $at,$c_1,$t_1 1161 $MULTU ($a_2,$b_1) # mul_add_c(a[2],b[1],c1,c2,c3); 1162 $ADDU $t_2,$at 1163 $ADDU $c_2,$t_2 1164 sltu $at,$c_2,$t_2 1165 $ADDU $c_3,$at 1166 mflo ($t_1,$a_2,$b_1) 1167 mfhi ($t_2,$a_2,$b_1) 1168 $ADDU $c_1,$t_1 1169 sltu $at,$c_1,$t_1 1170 $MULTU ($a_3,$b_0) # mul_add_c(a[3],b[0],c1,c2,c3); 1171 $ADDU $t_2,$at 1172 $ADDU $c_2,$t_2 1173 sltu $at,$c_2,$t_2 1174 $ADDU $c_3,$at 1175 mflo ($t_1,$a_3,$b_0) 1176 mfhi ($t_2,$a_3,$b_0) 1177 $ADDU $c_1,$t_1 1178 sltu $at,$c_1,$t_1 1179 $MULTU ($a_4,$b_0) # mul_add_c(a[4],b[0],c2,c3,c1); 1180 $ADDU $t_2,$at 1181 $ADDU $c_2,$t_2 1182 sltu $at,$c_2,$t_2 1183 $ADDU $c_3,$at 1184 $ST $c_1,3*$BNSZ($a0) # r[3]=c1; 1185 1186 mflo ($t_1,$a_4,$b_0) 1187 mfhi ($t_2,$a_4,$b_0) 1188 $ADDU $c_2,$t_1 1189 sltu $at,$c_2,$t_1 1190 $MULTU ($a_3,$b_1) # mul_add_c(a[3],b[1],c2,c3,c1); 1191 $ADDU $t_2,$at 1192 $ADDU $c_3,$t_2 1193 sltu $c_1,$c_3,$t_2 1194 mflo ($t_1,$a_3,$b_1) 1195 mfhi ($t_2,$a_3,$b_1) 1196 $ADDU $c_2,$t_1 1197 sltu $at,$c_2,$t_1 1198 $MULTU ($a_2,$b_2) # mul_add_c(a[2],b[2],c2,c3,c1); 1199 $ADDU $t_2,$at 1200 $ADDU $c_3,$t_2 1201 sltu $at,$c_3,$t_2 1202 $ADDU $c_1,$at 1203 mflo ($t_1,$a_2,$b_2) 1204 mfhi ($t_2,$a_2,$b_2) 1205 $ADDU $c_2,$t_1 1206 sltu $at,$c_2,$t_1 1207 $MULTU ($a_1,$b_3) # mul_add_c(a[1],b[3],c2,c3,c1); 1208 $ADDU $t_2,$at 1209 $ADDU $c_3,$t_2 1210 sltu $at,$c_3,$t_2 1211 $ADDU $c_1,$at 1212 mflo ($t_1,$a_1,$b_3) 1213 mfhi ($t_2,$a_1,$b_3) 1214 $ADDU $c_2,$t_1 1215 sltu $at,$c_2,$t_1 1216 $MULTU ($a_0,$b_4) # mul_add_c(a[0],b[4],c2,c3,c1); 1217 $ADDU $t_2,$at 1218 $ADDU $c_3,$t_2 1219 sltu $at,$c_3,$t_2 1220 $ADDU $c_1,$at 1221 mflo ($t_1,$a_0,$b_4) 1222 mfhi ($t_2,$a_0,$b_4) 1223 $ADDU $c_2,$t_1 1224 sltu $at,$c_2,$t_1 1225 $MULTU ($a_0,$b_5) # mul_add_c(a[0],b[5],c3,c1,c2); 1226 $ADDU $t_2,$at 1227 $ADDU $c_3,$t_2 1228 sltu $at,$c_3,$t_2 1229 $ADDU $c_1,$at 1230 $ST $c_2,4*$BNSZ($a0) # 
r[4]=c2; 1231 1232 mflo ($t_1,$a_0,$b_5) 1233 mfhi ($t_2,$a_0,$b_5) 1234 $ADDU $c_3,$t_1 1235 sltu $at,$c_3,$t_1 1236 $MULTU ($a_1,$b_4) # mul_add_c(a[1],b[4],c3,c1,c2); 1237 $ADDU $t_2,$at 1238 $ADDU $c_1,$t_2 1239 sltu $c_2,$c_1,$t_2 1240 mflo ($t_1,$a_1,$b_4) 1241 mfhi ($t_2,$a_1,$b_4) 1242 $ADDU $c_3,$t_1 1243 sltu $at,$c_3,$t_1 1244 $MULTU ($a_2,$b_3) # mul_add_c(a[2],b[3],c3,c1,c2); 1245 $ADDU $t_2,$at 1246 $ADDU $c_1,$t_2 1247 sltu $at,$c_1,$t_2 1248 $ADDU $c_2,$at 1249 mflo ($t_1,$a_2,$b_3) 1250 mfhi ($t_2,$a_2,$b_3) 1251 $ADDU $c_3,$t_1 1252 sltu $at,$c_3,$t_1 1253 $MULTU ($a_3,$b_2) # mul_add_c(a[3],b[2],c3,c1,c2); 1254 $ADDU $t_2,$at 1255 $ADDU $c_1,$t_2 1256 sltu $at,$c_1,$t_2 1257 $ADDU $c_2,$at 1258 mflo ($t_1,$a_3,$b_2) 1259 mfhi ($t_2,$a_3,$b_2) 1260 $ADDU $c_3,$t_1 1261 sltu $at,$c_3,$t_1 1262 $MULTU ($a_4,$b_1) # mul_add_c(a[4],b[1],c3,c1,c2); 1263 $ADDU $t_2,$at 1264 $ADDU $c_1,$t_2 1265 sltu $at,$c_1,$t_2 1266 $ADDU $c_2,$at 1267 mflo ($t_1,$a_4,$b_1) 1268 mfhi ($t_2,$a_4,$b_1) 1269 $ADDU $c_3,$t_1 1270 sltu $at,$c_3,$t_1 1271 $MULTU ($a_5,$b_0) # mul_add_c(a[5],b[0],c3,c1,c2); 1272 $ADDU $t_2,$at 1273 $ADDU $c_1,$t_2 1274 sltu $at,$c_1,$t_2 1275 $ADDU $c_2,$at 1276 mflo ($t_1,$a_5,$b_0) 1277 mfhi ($t_2,$a_5,$b_0) 1278 $ADDU $c_3,$t_1 1279 sltu $at,$c_3,$t_1 1280 $MULTU ($a_6,$b_0) # mul_add_c(a[6],b[0],c1,c2,c3); 1281 $ADDU $t_2,$at 1282 $ADDU $c_1,$t_2 1283 sltu $at,$c_1,$t_2 1284 $ADDU $c_2,$at 1285 $ST $c_3,5*$BNSZ($a0) # r[5]=c3; 1286 1287 mflo ($t_1,$a_6,$b_0) 1288 mfhi ($t_2,$a_6,$b_0) 1289 $ADDU $c_1,$t_1 1290 sltu $at,$c_1,$t_1 1291 $MULTU ($a_5,$b_1) # mul_add_c(a[5],b[1],c1,c2,c3); 1292 $ADDU $t_2,$at 1293 $ADDU $c_2,$t_2 1294 sltu $c_3,$c_2,$t_2 1295 mflo ($t_1,$a_5,$b_1) 1296 mfhi ($t_2,$a_5,$b_1) 1297 $ADDU $c_1,$t_1 1298 sltu $at,$c_1,$t_1 1299 $MULTU ($a_4,$b_2) # mul_add_c(a[4],b[2],c1,c2,c3); 1300 $ADDU $t_2,$at 1301 $ADDU $c_2,$t_2 1302 sltu $at,$c_2,$t_2 1303 $ADDU $c_3,$at 1304 mflo ($t_1,$a_4,$b_2) 1305 mfhi 
($t_2,$a_4,$b_2) 1306 $ADDU $c_1,$t_1 1307 sltu $at,$c_1,$t_1 1308 $MULTU ($a_3,$b_3) # mul_add_c(a[3],b[3],c1,c2,c3); 1309 $ADDU $t_2,$at 1310 $ADDU $c_2,$t_2 1311 sltu $at,$c_2,$t_2 1312 $ADDU $c_3,$at 1313 mflo ($t_1,$a_3,$b_3) 1314 mfhi ($t_2,$a_3,$b_3) 1315 $ADDU $c_1,$t_1 1316 sltu $at,$c_1,$t_1 1317 $MULTU ($a_2,$b_4) # mul_add_c(a[2],b[4],c1,c2,c3); 1318 $ADDU $t_2,$at 1319 $ADDU $c_2,$t_2 1320 sltu $at,$c_2,$t_2 1321 $ADDU $c_3,$at 1322 mflo ($t_1,$a_2,$b_4) 1323 mfhi ($t_2,$a_2,$b_4) 1324 $ADDU $c_1,$t_1 1325 sltu $at,$c_1,$t_1 1326 $MULTU ($a_1,$b_5) # mul_add_c(a[1],b[5],c1,c2,c3); 1327 $ADDU $t_2,$at 1328 $ADDU $c_2,$t_2 1329 sltu $at,$c_2,$t_2 1330 $ADDU $c_3,$at 1331 mflo ($t_1,$a_1,$b_5) 1332 mfhi ($t_2,$a_1,$b_5) 1333 $ADDU $c_1,$t_1 1334 sltu $at,$c_1,$t_1 1335 $MULTU ($a_0,$b_6) # mul_add_c(a[0],b[6],c1,c2,c3); 1336 $ADDU $t_2,$at 1337 $ADDU $c_2,$t_2 1338 sltu $at,$c_2,$t_2 1339 $ADDU $c_3,$at 1340 mflo ($t_1,$a_0,$b_6) 1341 mfhi ($t_2,$a_0,$b_6) 1342 $ADDU $c_1,$t_1 1343 sltu $at,$c_1,$t_1 1344 $MULTU ($a_0,$b_7) # mul_add_c(a[0],b[7],c2,c3,c1); 1345 $ADDU $t_2,$at 1346 $ADDU $c_2,$t_2 1347 sltu $at,$c_2,$t_2 1348 $ADDU $c_3,$at 1349 $ST $c_1,6*$BNSZ($a0) # r[6]=c1; 1350 1351 mflo ($t_1,$a_0,$b_7) 1352 mfhi ($t_2,$a_0,$b_7) 1353 $ADDU $c_2,$t_1 1354 sltu $at,$c_2,$t_1 1355 $MULTU ($a_1,$b_6) # mul_add_c(a[1],b[6],c2,c3,c1); 1356 $ADDU $t_2,$at 1357 $ADDU $c_3,$t_2 1358 sltu $c_1,$c_3,$t_2 1359 mflo ($t_1,$a_1,$b_6) 1360 mfhi ($t_2,$a_1,$b_6) 1361 $ADDU $c_2,$t_1 1362 sltu $at,$c_2,$t_1 1363 $MULTU ($a_2,$b_5) # mul_add_c(a[2],b[5],c2,c3,c1); 1364 $ADDU $t_2,$at 1365 $ADDU $c_3,$t_2 1366 sltu $at,$c_3,$t_2 1367 $ADDU $c_1,$at 1368 mflo ($t_1,$a_2,$b_5) 1369 mfhi ($t_2,$a_2,$b_5) 1370 $ADDU $c_2,$t_1 1371 sltu $at,$c_2,$t_1 1372 $MULTU ($a_3,$b_4) # mul_add_c(a[3],b[4],c2,c3,c1); 1373 $ADDU $t_2,$at 1374 $ADDU $c_3,$t_2 1375 sltu $at,$c_3,$t_2 1376 $ADDU $c_1,$at 1377 mflo ($t_1,$a_3,$b_4) 1378 mfhi ($t_2,$a_3,$b_4) 1379 $ADDU $c_2,$t_1 1380 sltu 
$at,$c_2,$t_1 1381 $MULTU ($a_4,$b_3) # mul_add_c(a[4],b[3],c2,c3,c1); 1382 $ADDU $t_2,$at 1383 $ADDU $c_3,$t_2 1384 sltu $at,$c_3,$t_2 1385 $ADDU $c_1,$at 1386 mflo ($t_1,$a_4,$b_3) 1387 mfhi ($t_2,$a_4,$b_3) 1388 $ADDU $c_2,$t_1 1389 sltu $at,$c_2,$t_1 1390 $MULTU ($a_5,$b_2) # mul_add_c(a[5],b[2],c2,c3,c1); 1391 $ADDU $t_2,$at 1392 $ADDU $c_3,$t_2 1393 sltu $at,$c_3,$t_2 1394 $ADDU $c_1,$at 1395 mflo ($t_1,$a_5,$b_2) 1396 mfhi ($t_2,$a_5,$b_2) 1397 $ADDU $c_2,$t_1 1398 sltu $at,$c_2,$t_1 1399 $MULTU ($a_6,$b_1) # mul_add_c(a[6],b[1],c2,c3,c1); 1400 $ADDU $t_2,$at 1401 $ADDU $c_3,$t_2 1402 sltu $at,$c_3,$t_2 1403 $ADDU $c_1,$at 1404 mflo ($t_1,$a_6,$b_1) 1405 mfhi ($t_2,$a_6,$b_1) 1406 $ADDU $c_2,$t_1 1407 sltu $at,$c_2,$t_1 1408 $MULTU ($a_7,$b_0) # mul_add_c(a[7],b[0],c2,c3,c1); 1409 $ADDU $t_2,$at 1410 $ADDU $c_3,$t_2 1411 sltu $at,$c_3,$t_2 1412 $ADDU $c_1,$at 1413 mflo ($t_1,$a_7,$b_0) 1414 mfhi ($t_2,$a_7,$b_0) 1415 $ADDU $c_2,$t_1 1416 sltu $at,$c_2,$t_1 1417 $MULTU ($a_7,$b_1) # mul_add_c(a[7],b[1],c3,c1,c2); 1418 $ADDU $t_2,$at 1419 $ADDU $c_3,$t_2 1420 sltu $at,$c_3,$t_2 1421 $ADDU $c_1,$at 1422 $ST $c_2,7*$BNSZ($a0) # r[7]=c2; 1423 1424 mflo ($t_1,$a_7,$b_1) 1425 mfhi ($t_2,$a_7,$b_1) 1426 $ADDU $c_3,$t_1 1427 sltu $at,$c_3,$t_1 1428 $MULTU ($a_6,$b_2) # mul_add_c(a[6],b[2],c3,c1,c2); 1429 $ADDU $t_2,$at 1430 $ADDU $c_1,$t_2 1431 sltu $c_2,$c_1,$t_2 1432 mflo ($t_1,$a_6,$b_2) 1433 mfhi ($t_2,$a_6,$b_2) 1434 $ADDU $c_3,$t_1 1435 sltu $at,$c_3,$t_1 1436 $MULTU ($a_5,$b_3) # mul_add_c(a[5],b[3],c3,c1,c2); 1437 $ADDU $t_2,$at 1438 $ADDU $c_1,$t_2 1439 sltu $at,$c_1,$t_2 1440 $ADDU $c_2,$at 1441 mflo ($t_1,$a_5,$b_3) 1442 mfhi ($t_2,$a_5,$b_3) 1443 $ADDU $c_3,$t_1 1444 sltu $at,$c_3,$t_1 1445 $MULTU ($a_4,$b_4) # mul_add_c(a[4],b[4],c3,c1,c2); 1446 $ADDU $t_2,$at 1447 $ADDU $c_1,$t_2 1448 sltu $at,$c_1,$t_2 1449 $ADDU $c_2,$at 1450 mflo ($t_1,$a_4,$b_4) 1451 mfhi ($t_2,$a_4,$b_4) 1452 $ADDU $c_3,$t_1 1453 sltu $at,$c_3,$t_1 1454 $MULTU ($a_3,$b_5) # 
mul_add_c(a[3],b[5],c3,c1,c2); 1455 $ADDU $t_2,$at 1456 $ADDU $c_1,$t_2 1457 sltu $at,$c_1,$t_2 1458 $ADDU $c_2,$at 1459 mflo ($t_1,$a_3,$b_5) 1460 mfhi ($t_2,$a_3,$b_5) 1461 $ADDU $c_3,$t_1 1462 sltu $at,$c_3,$t_1 1463 $MULTU ($a_2,$b_6) # mul_add_c(a[2],b[6],c3,c1,c2); 1464 $ADDU $t_2,$at 1465 $ADDU $c_1,$t_2 1466 sltu $at,$c_1,$t_2 1467 $ADDU $c_2,$at 1468 mflo ($t_1,$a_2,$b_6) 1469 mfhi ($t_2,$a_2,$b_6) 1470 $ADDU $c_3,$t_1 1471 sltu $at,$c_3,$t_1 1472 $MULTU ($a_1,$b_7) # mul_add_c(a[1],b[7],c3,c1,c2); 1473 $ADDU $t_2,$at 1474 $ADDU $c_1,$t_2 1475 sltu $at,$c_1,$t_2 1476 $ADDU $c_2,$at 1477 mflo ($t_1,$a_1,$b_7) 1478 mfhi ($t_2,$a_1,$b_7) 1479 $ADDU $c_3,$t_1 1480 sltu $at,$c_3,$t_1 1481 $MULTU ($a_2,$b_7) # mul_add_c(a[2],b[7],c1,c2,c3); 1482 $ADDU $t_2,$at 1483 $ADDU $c_1,$t_2 1484 sltu $at,$c_1,$t_2 1485 $ADDU $c_2,$at 1486 $ST $c_3,8*$BNSZ($a0) # r[8]=c3; 1487 1488 mflo ($t_1,$a_2,$b_7) 1489 mfhi ($t_2,$a_2,$b_7) 1490 $ADDU $c_1,$t_1 1491 sltu $at,$c_1,$t_1 1492 $MULTU ($a_3,$b_6) # mul_add_c(a[3],b[6],c1,c2,c3); 1493 $ADDU $t_2,$at 1494 $ADDU $c_2,$t_2 1495 sltu $c_3,$c_2,$t_2 1496 mflo ($t_1,$a_3,$b_6) 1497 mfhi ($t_2,$a_3,$b_6) 1498 $ADDU $c_1,$t_1 1499 sltu $at,$c_1,$t_1 1500 $MULTU ($a_4,$b_5) # mul_add_c(a[4],b[5],c1,c2,c3); 1501 $ADDU $t_2,$at 1502 $ADDU $c_2,$t_2 1503 sltu $at,$c_2,$t_2 1504 $ADDU $c_3,$at 1505 mflo ($t_1,$a_4,$b_5) 1506 mfhi ($t_2,$a_4,$b_5) 1507 $ADDU $c_1,$t_1 1508 sltu $at,$c_1,$t_1 1509 $MULTU ($a_5,$b_4) # mul_add_c(a[5],b[4],c1,c2,c3); 1510 $ADDU $t_2,$at 1511 $ADDU $c_2,$t_2 1512 sltu $at,$c_2,$t_2 1513 $ADDU $c_3,$at 1514 mflo ($t_1,$a_5,$b_4) 1515 mfhi ($t_2,$a_5,$b_4) 1516 $ADDU $c_1,$t_1 1517 sltu $at,$c_1,$t_1 1518 $MULTU ($a_6,$b_3) # mul_add_c(a[6],b[3],c1,c2,c3); 1519 $ADDU $t_2,$at 1520 $ADDU $c_2,$t_2 1521 sltu $at,$c_2,$t_2 1522 $ADDU $c_3,$at 1523 mflo ($t_1,$a_6,$b_3) 1524 mfhi ($t_2,$a_6,$b_3) 1525 $ADDU $c_1,$t_1 1526 sltu $at,$c_1,$t_1 1527 $MULTU ($a_7,$b_2) # mul_add_c(a[7],b[2],c1,c2,c3); 1528 $ADDU 
$t_2,$at 1529 $ADDU $c_2,$t_2 1530 sltu $at,$c_2,$t_2 1531 $ADDU $c_3,$at 1532 mflo ($t_1,$a_7,$b_2) 1533 mfhi ($t_2,$a_7,$b_2) 1534 $ADDU $c_1,$t_1 1535 sltu $at,$c_1,$t_1 1536 $MULTU ($a_7,$b_3) # mul_add_c(a[7],b[3],c2,c3,c1); 1537 $ADDU $t_2,$at 1538 $ADDU $c_2,$t_2 1539 sltu $at,$c_2,$t_2 1540 $ADDU $c_3,$at 1541 $ST $c_1,9*$BNSZ($a0) # r[9]=c1; 1542 1543 mflo ($t_1,$a_7,$b_3) 1544 mfhi ($t_2,$a_7,$b_3) 1545 $ADDU $c_2,$t_1 1546 sltu $at,$c_2,$t_1 1547 $MULTU ($a_6,$b_4) # mul_add_c(a[6],b[4],c2,c3,c1); 1548 $ADDU $t_2,$at 1549 $ADDU $c_3,$t_2 1550 sltu $c_1,$c_3,$t_2 1551 mflo ($t_1,$a_6,$b_4) 1552 mfhi ($t_2,$a_6,$b_4) 1553 $ADDU $c_2,$t_1 1554 sltu $at,$c_2,$t_1 1555 $MULTU ($a_5,$b_5) # mul_add_c(a[5],b[5],c2,c3,c1); 1556 $ADDU $t_2,$at 1557 $ADDU $c_3,$t_2 1558 sltu $at,$c_3,$t_2 1559 $ADDU $c_1,$at 1560 mflo ($t_1,$a_5,$b_5) 1561 mfhi ($t_2,$a_5,$b_5) 1562 $ADDU $c_2,$t_1 1563 sltu $at,$c_2,$t_1 1564 $MULTU ($a_4,$b_6) # mul_add_c(a[4],b[6],c2,c3,c1); 1565 $ADDU $t_2,$at 1566 $ADDU $c_3,$t_2 1567 sltu $at,$c_3,$t_2 1568 $ADDU $c_1,$at 1569 mflo ($t_1,$a_4,$b_6) 1570 mfhi ($t_2,$a_4,$b_6) 1571 $ADDU $c_2,$t_1 1572 sltu $at,$c_2,$t_1 1573 $MULTU ($a_3,$b_7) # mul_add_c(a[3],b[7],c2,c3,c1); 1574 $ADDU $t_2,$at 1575 $ADDU $c_3,$t_2 1576 sltu $at,$c_3,$t_2 1577 $ADDU $c_1,$at 1578 mflo ($t_1,$a_3,$b_7) 1579 mfhi ($t_2,$a_3,$b_7) 1580 $ADDU $c_2,$t_1 1581 sltu $at,$c_2,$t_1 1582 $MULTU ($a_4,$b_7) # mul_add_c(a[4],b[7],c3,c1,c2); 1583 $ADDU $t_2,$at 1584 $ADDU $c_3,$t_2 1585 sltu $at,$c_3,$t_2 1586 $ADDU $c_1,$at 1587 $ST $c_2,10*$BNSZ($a0) # r[10]=c2; 1588 1589 mflo ($t_1,$a_4,$b_7) 1590 mfhi ($t_2,$a_4,$b_7) 1591 $ADDU $c_3,$t_1 1592 sltu $at,$c_3,$t_1 1593 $MULTU ($a_5,$b_6) # mul_add_c(a[5],b[6],c3,c1,c2); 1594 $ADDU $t_2,$at 1595 $ADDU $c_1,$t_2 1596 sltu $c_2,$c_1,$t_2 1597 mflo ($t_1,$a_5,$b_6) 1598 mfhi ($t_2,$a_5,$b_6) 1599 $ADDU $c_3,$t_1 1600 sltu $at,$c_3,$t_1 1601 $MULTU ($a_6,$b_5) # mul_add_c(a[6],b[5],c3,c1,c2); 1602 $ADDU $t_2,$at 1603 $ADDU 
$c_1,$t_2 1604 sltu $at,$c_1,$t_2 1605 $ADDU $c_2,$at 1606 mflo ($t_1,$a_6,$b_5) 1607 mfhi ($t_2,$a_6,$b_5) 1608 $ADDU $c_3,$t_1 1609 sltu $at,$c_3,$t_1 1610 $MULTU ($a_7,$b_4) # mul_add_c(a[7],b[4],c3,c1,c2); 1611 $ADDU $t_2,$at 1612 $ADDU $c_1,$t_2 1613 sltu $at,$c_1,$t_2 1614 $ADDU $c_2,$at 1615 mflo ($t_1,$a_7,$b_4) 1616 mfhi ($t_2,$a_7,$b_4) 1617 $ADDU $c_3,$t_1 1618 sltu $at,$c_3,$t_1 1619 $MULTU ($a_7,$b_5) # mul_add_c(a[7],b[5],c1,c2,c3); 1620 $ADDU $t_2,$at 1621 $ADDU $c_1,$t_2 1622 sltu $at,$c_1,$t_2 1623 $ADDU $c_2,$at 1624 $ST $c_3,11*$BNSZ($a0) # r[11]=c3; 1625 1626 mflo ($t_1,$a_7,$b_5) 1627 mfhi ($t_2,$a_7,$b_5) 1628 $ADDU $c_1,$t_1 1629 sltu $at,$c_1,$t_1 1630 $MULTU ($a_6,$b_6) # mul_add_c(a[6],b[6],c1,c2,c3); 1631 $ADDU $t_2,$at 1632 $ADDU $c_2,$t_2 1633 sltu $c_3,$c_2,$t_2 1634 mflo ($t_1,$a_6,$b_6) 1635 mfhi ($t_2,$a_6,$b_6) 1636 $ADDU $c_1,$t_1 1637 sltu $at,$c_1,$t_1 1638 $MULTU ($a_5,$b_7) # mul_add_c(a[5],b[7],c1,c2,c3); 1639 $ADDU $t_2,$at 1640 $ADDU $c_2,$t_2 1641 sltu $at,$c_2,$t_2 1642 $ADDU $c_3,$at 1643 mflo ($t_1,$a_5,$b_7) 1644 mfhi ($t_2,$a_5,$b_7) 1645 $ADDU $c_1,$t_1 1646 sltu $at,$c_1,$t_1 1647 $MULTU ($a_6,$b_7) # mul_add_c(a[6],b[7],c2,c3,c1); 1648 $ADDU $t_2,$at 1649 $ADDU $c_2,$t_2 1650 sltu $at,$c_2,$t_2 1651 $ADDU $c_3,$at 1652 $ST $c_1,12*$BNSZ($a0) # r[12]=c1; 1653 1654 mflo ($t_1,$a_6,$b_7) 1655 mfhi ($t_2,$a_6,$b_7) 1656 $ADDU $c_2,$t_1 1657 sltu $at,$c_2,$t_1 1658 $MULTU ($a_7,$b_6) # mul_add_c(a[7],b[6],c2,c3,c1); 1659 $ADDU $t_2,$at 1660 $ADDU $c_3,$t_2 1661 sltu $c_1,$c_3,$t_2 1662 mflo ($t_1,$a_7,$b_6) 1663 mfhi ($t_2,$a_7,$b_6) 1664 $ADDU $c_2,$t_1 1665 sltu $at,$c_2,$t_1 1666 $MULTU ($a_7,$b_7) # mul_add_c(a[7],b[7],c3,c1,c2); 1667 $ADDU $t_2,$at 1668 $ADDU $c_3,$t_2 1669 sltu $at,$c_3,$t_2 1670 $ADDU $c_1,$at 1671 $ST $c_2,13*$BNSZ($a0) # r[13]=c2; 1672 1673 mflo ($t_1,$a_7,$b_7) 1674 mfhi ($t_2,$a_7,$b_7) 1675 $ADDU $c_3,$t_1 1676 sltu $at,$c_3,$t_1 1677 $ADDU $t_2,$at 1678 $ADDU $c_1,$t_2 1679 $ST 
	$c_3,14*$BNSZ($a0)	# r[14]=c3;
	$ST	$c_1,15*$BNSZ($a0)	# r[15]=c1;

	.set	noreorder
___
# Epilogue of bn_mul_comba8: restore callee-saved registers and return.
# NUBI builds use a larger 12-slot frame that also holds $t0..$t3 and $gp;
# in both variants the stack is popped in the delay slot of "jr $ra".
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$s5,10*$SZREG($sp)
	$REG_L	$s4,9*$SZREG($sp)
	$REG_L	$s3,8*$SZREG($sp)
	$REG_L	$s2,7*$SZREG($sp)
	$REG_L	$s1,6*$SZREG($sp)
	$REG_L	$s0,5*$SZREG($sp)
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,12*$SZREG
___
$code.=<<___ if ($flavour !~ /nubi/i);
	$REG_L	$s5,5*$SZREG($sp)
	$REG_L	$s4,4*$SZREG($sp)
	$REG_L	$s3,3*$SZREG($sp)
	$REG_L	$s2,2*$SZREG($sp)
	$REG_L	$s1,1*$SZREG($sp)
	$REG_L	$s0,0*$SZREG($sp)
	jr	$ra
	$PTR_ADD $sp,6*$SZREG
___
# bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b):
# 4x4-word Comba multiplication, r[0..7] = a[0..3]*b[0..3].  Pointers
# arrive in $a0 (r), $a1 (a), $a2 (b).  Each result word is built in the
# rotating accumulator chain $c_1/$c_2/$c_3, with "sltu" collecting every
# carry-out; $MULTU for the next partial product is issued early so it
# overlaps the carry bookkeeping.
# NOTE(review): "mflo"/"mfhi" take extra operands here -- presumably
# expanded by a helper defined elsewhere in this file (not visible in
# this chunk); confirm against the file's preamble.
$code.=<<___;
.end	bn_mul_comba8

.align	5
.globl	bn_mul_comba4
.ent	bn_mul_comba4
bn_mul_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$b_0,0($a2)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$MULTU	($a_0,$b_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_3,3*$BNSZ($a1)
	$LD	$b_1,$BNSZ($a2)
	$LD	$b_2,2*$BNSZ($a2)
	$LD	$b_3,3*$BNSZ($a2)
	mflo	($c_1,$a_0,$b_0)
	mfhi	($c_2,$a_0,$b_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$b_1)		# mul_add_c(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$b_1)
	mfhi	($t_2,$a_0,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_0)		# mul_add_c(a[1],b[0],c2,c3,c1);
	$ADDU	$c_3,$t_2,$at
	mflo	($t_1,$a_1,$b_0)
	mfhi	($t_2,$a_1,$b_0)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_0)		# mul_add_c(a[2],b[0],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	$ST	$c_2,$BNSZ($a0)

	mflo	($t_1,$a_2,$b_0)
	mfhi	($t_2,$a_2,$b_0)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_1,$b_1)		# mul_add_c(a[1],b[1],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	mflo	($t_1,$a_1,$b_1)
	mfhi	($t_2,$a_1,$b_1)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_2)		# mul_add_c(a[0],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_0,$b_2)
	mfhi	($t_2,$a_0,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$b_3)		# mul_add_c(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)

	mflo	($t_1,$a_0,$b_3)
	mfhi	($t_2,$a_0,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_1,$b_2)		# mul_add_c(a[1],b[2],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$c_3,$c_2,$t_2
	mflo	($t_1,$a_1,$b_2)
	mfhi	($t_2,$a_1,$b_2)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_2,$b_1)		# mul_add_c(a[2],b[1],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_2,$b_1)
	mfhi	($t_2,$a_2,$b_1)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_0)		# mul_add_c(a[3],b[0],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	mflo	($t_1,$a_3,$b_0)
	mfhi	($t_2,$a_3,$b_0)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_3,$b_1)		# mul_add_c(a[3],b[1],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,3*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_1)
	mfhi	($t_2,$a_3,$b_1)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_2)		# mul_add_c(a[2],b[2],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$c_1,$c_3,$t_2
	mflo	($t_1,$a_2,$b_2)
	mfhi	($t_2,$a_2,$b_2)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_1,$b_3)		# mul_add_c(a[1],b[3],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	mflo	($t_1,$a_1,$b_3)
	mfhi	($t_2,$a_1,$b_3)
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$b_3)		# mul_add_c(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)

	mflo	($t_1,$a_2,$b_3)
	mfhi	($t_2,$a_2,$b_3)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_2)		# mul_add_c(a[3],b[2],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$c_2,$c_1,$t_2
	mflo	($t_1,$a_3,$b_2)
	mfhi	($t_2,$a_3,$b_2)
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_3,$b_3)		# mul_add_c(a[3],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,5*$BNSZ($a0)

	mflo	($t_1,$a_3,$b_3)
	mfhi	($t_2,$a_3,$b_3)
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_mul_comba4
___

# The squaring routines below have only one input array, so the registers
# that held b[0..3] during multiplication are reused for a[4..7].
($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3);

# add_c2() emits one "mul_add_c2"-style step for the Comba squaring code:
# the pending product in ($hi,$lo) is accumulated *twice* into the rotating
# accumulator ($c0,$c1,$c2) -- $lo twice into $c0, $hi twice into $c1,
# with every carry-out propagated -- while the multiplier is already
# started on the next product $an*$bn ("forward multiplication"), whose
# result is left in ($lo,$hi) for the next step.
sub add_c2 () {
my ($hi,$lo,$c0,$c1,$c2,
    $warm,	# !$warm denotes first call with specific sequence of
		# $c_[XYZ] when there is no Z-carry to accumulate yet;
    $an,$bn	# these two are arguments for multiplication which
		# result is used in *next* step [which is why it's
		# commented as "forward multiplication" below];
    )=@_;
# Doubled accumulation: $c0 += 2*$lo, the two carry-outs captured in $at
# and (reusing the register) $lo; $hi is folded into $c1 here and once
# more below.  $MULTU is issued early so the next product overlaps the
# carry bookkeeping.
$code.=<<___;
	$ADDU	$c0,$lo
	sltu	$at,$c0,$lo
	$MULTU	($an,$bn)		# forward multiplication
	$ADDU	$c0,$lo
	$ADDU	$at,$hi
	sltu	$lo,$c0,$lo
	$ADDU	$c1,$at
	$ADDU	$hi,$lo
___
# First call of a column (!$warm): no carry has reached $c2 yet, so it can
# be *set* from the carry-out instead of accumulated into.
$code.=<<___ if (!$warm);
	sltu	$c2,$c1,$at
	$ADDU	$c1,$hi
___
$code.=<<___ if ($warm);
	sltu	$at,$c1,$at
	$ADDU	$c1,$hi
	$ADDU	$c2,$at
___
# Propagate the final carry out of $c1, then pick up the forward product
# so the next step finds it in ($lo,$hi).
$code.=<<___;
	sltu	$hi,$c1,$hi
	$ADDU	$c2,$hi
	mflo	($lo,$an,$bn)
	mfhi	($hi,$an,$bn)
___
}

# bn_sqr_comba8(BN_ULONG *r,BN_ULONG *a): 8-word Comba squaring,
# r[0..15] = a[0..7]^2.  Off-diagonal products a[i]*a[j] (i!=j) are added
# twice -- through add_c2() or, for the first column, the inline slt/$SLL
# doubling sequence -- while diagonal products a[i]*a[i] are added once.
$code.=<<___;

.align	5
.globl	bn_sqr_comba8
.ent	bn_sqr_comba8
bn_sqr_comba8:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)

	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_4,4*$BNSZ($a1)
	$LD	$a_5,5*$BNSZ($a1)
	$LD	$a_6,6*$BNSZ($a1)
	$LD	$a_7,7*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_0);		# mul_add_c2(a[4],b[0],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_0,$a_5)		# mul_add_c2(a[0],b[5],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_5)
	mfhi	($t_2,$a_0,$a_5)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_4);		# mul_add_c2(a[1],b[4],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_2,$a_3);		# mul_add_c2(a[2],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_6,$a_0);		# mul_add_c2(a[6],b[0],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_5,$a_1);		# mul_add_c2(a[5],b[1],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_2);		# mul_add_c2(a[4],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_0,$a_7)		# mul_add_c2(a[0],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,6*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_7)
	mfhi	($t_2,$a_0,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_1,$a_6);		# mul_add_c2(a[1],b[6],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_2,$a_5);		# mul_add_c2(a[2],b[5],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_3,$a_4);		# mul_add_c2(a[3],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_7,$a_1);		# mul_add_c2(a[7],b[1],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,7*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_6,$a_2);		# mul_add_c2(a[6],b[2],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_5,$a_3);		# mul_add_c2(a[5],b[3],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_4,$a_4);		# mul_add_c(a[4],b[4],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_2,$a_7)		# mul_add_c2(a[2],b[7],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,8*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_7)
	mfhi	($t_2,$a_2,$a_7)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_3,$a_6);		# mul_add_c2(a[3],b[6],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_4,$a_5);		# mul_add_c2(a[4],b[5],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_7,$a_3);		# mul_add_c2(a[7],b[3],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,9*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_6,$a_4);		# mul_add_c2(a[6],b[4],c2,c3,c1);
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,1,
		$a_5,$a_5);		# mul_add_c(a[5],b[5],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_4,$a_7)		# mul_add_c2(a[4],b[7],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,10*$BNSZ($a0)
	mflo	($t_1,$a_4,$a_7)
	mfhi	($t_2,$a_4,$a_7)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_5,$a_6);		# mul_add_c2(a[5],b[6],c3,c1,c2);
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,1,
		$a_7,$a_5);		# mul_add_c2(a[7],b[5],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,11*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_6,$a_6);		# mul_add_c(a[6],b[6],c1,c2,c3);
$code.=<<___;
	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$MULTU	($a_6,$a_7)		# mul_add_c2(a[6],b[7],c2,c3,c1);
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	sltu	$at,$c_2,$t_2
	$ADDU	$c_3,$at
	$ST	$c_1,12*$BNSZ($a0)
	mflo	($t_1,$a_6,$a_7)
	mfhi	($t_2,$a_6,$a_7)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_7,$a_7);		# mul_add_c(a[7],b[7],c3,c1,c2);
$code.=<<___;
	$ST	$c_2,13*$BNSZ($a0)

	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	$ST	$c_3,14*$BNSZ($a0)
	$ST	$c_1,15*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
# bn_sqr_comba4(BN_ULONG *r,BN_ULONG *a): 4-word variant of the same
# Comba squaring scheme, r[0..7] = a[0..3]^2.
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba8

.align	5
.globl	bn_sqr_comba4
.ent	bn_sqr_comba4
bn_sqr_comba4:
___
$code.=<<___ if ($flavour =~ /nubi/i);
	.frame	$sp,6*$SZREG,$ra
	.mask	0x8000f008,-$SZREG
	.set	noreorder
	$PTR_SUB $sp,6*$SZREG
	$REG_S	$ra,5*$SZREG($sp)
	$REG_S	$t3,4*$SZREG($sp)
	$REG_S	$t2,3*$SZREG($sp)
	$REG_S	$t1,2*$SZREG($sp)
	$REG_S	$t0,1*$SZREG($sp)
	$REG_S	$gp,0*$SZREG($sp)
___
$code.=<<___;
	.set	reorder
	$LD	$a_0,0($a1)
	$LD	$a_1,$BNSZ($a1)
	$MULTU	($a_0,$a_0)		# mul_add_c(a[0],b[0],c1,c2,c3);
	$LD	$a_2,2*$BNSZ($a1)
	$LD	$a_3,3*$BNSZ($a1)
	mflo	($c_1,$a_0,$a_0)
	mfhi	($c_2,$a_0,$a_0)
	$ST	$c_1,0($a0)

	$MULTU	($a_0,$a_1)		# mul_add_c2(a[0],b[1],c2,c3,c1);
	mflo	($t_1,$a_0,$a_1)
	mfhi	($t_2,$a_0,$a_1)
	slt	$c_1,$t_2,$zero
	$SLL	$t_2,1
	$MULTU	($a_2,$a_0)		# mul_add_c2(a[2],b[0],c3,c1,c2);
	slt	$a2,$t_1,$zero
	$ADDU	$t_2,$a2
	$SLL	$t_1,1
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$ADDU	$c_3,$t_2,$at
	$ST	$c_2,$BNSZ($a0)
	mflo	($t_1,$a_2,$a_0)
	mfhi	($t_2,$a_2,$a_0)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_1,$a_1);		# mul_add_c(a[1],b[1],c3,c1,c2);
$code.=<<___;
	$ADDU	$c_3,$t_1
	sltu	$at,$c_3,$t_1
	$MULTU	($a_0,$a_3)		# mul_add_c2(a[0],b[3],c1,c2,c3);
	$ADDU	$t_2,$at
	$ADDU	$c_1,$t_2
	sltu	$at,$c_1,$t_2
	$ADDU	$c_2,$at
	$ST	$c_3,2*$BNSZ($a0)
	mflo	($t_1,$a_0,$a_3)
	mfhi	($t_2,$a_0,$a_3)
___
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,0,
		$a_1,$a_2);		# mul_add_c2(a[1],b[2],c1,c2,c3);
	&add_c2($t_2,$t_1,$c_1,$c_2,$c_3,1,
		$a_3,$a_1);		# mul_add_c2(a[3],b[1],c2,c3,c1);
$code.=<<___;
	$ST	$c_1,3*$BNSZ($a0)
___
	&add_c2($t_2,$t_1,$c_2,$c_3,$c_1,0,
		$a_2,$a_2);		# mul_add_c(a[2],b[2],c2,c3,c1);
$code.=<<___;
	$ADDU	$c_2,$t_1
	sltu	$at,$c_2,$t_1
	$MULTU	($a_2,$a_3)		# mul_add_c2(a[2],b[3],c3,c1,c2);
	$ADDU	$t_2,$at
	$ADDU	$c_3,$t_2
	sltu	$at,$c_3,$t_2
	$ADDU	$c_1,$at
	$ST	$c_2,4*$BNSZ($a0)
	mflo	($t_1,$a_2,$a_3)
	mfhi	($t_2,$a_2,$a_3)
___
	&add_c2($t_2,$t_1,$c_3,$c_1,$c_2,0,
		$a_3,$a_3);		# mul_add_c(a[3],b[3],c1,c2,c3);
$code.=<<___;
	$ST	$c_3,5*$BNSZ($a0)

	$ADDU	$c_1,$t_1
	sltu	$at,$c_1,$t_1
	$ADDU	$t_2,$at
	$ADDU	$c_2,$t_2
	$ST	$c_1,6*$BNSZ($a0)
	$ST	$c_2,7*$BNSZ($a0)

	.set	noreorder
___
$code.=<<___ if ($flavour =~ /nubi/i);
	$REG_L	$t3,4*$SZREG($sp)
	$REG_L	$t2,3*$SZREG($sp)
	$REG_L	$t1,2*$SZREG($sp)
	$REG_L	$t0,1*$SZREG($sp)
	$REG_L	$gp,0*$SZREG($sp)
	$PTR_ADD $sp,6*$SZREG
___
$code.=<<___;
	jr	$ra
	nop
.end	bn_sqr_comba4
___
print $code;
close STDOUT;	# NOTE(review): "close STDOUT or die" would catch write errors