1/*********************************************************************/ 2/* */ 3/* Optimized BLAS libraries */ 4/* By Kazushige Goto <kgoto@tacc.utexas.edu> */ 5/* */ 6/* Copyright (c) The University of Texas, 2009. All rights reserved. */ 7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */ 8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */ 9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */ 10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */ 11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */ 12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */ 13/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */ 14/* Under no circumstances shall University be liable for incidental, */ 15/* special, indirect, direct or consequential damages or loss of */ 16/* profits, interruption of business, or related expenses which may */ 17/* arise from use of Software or Documentation, including but not */ 18/* limited to those resulting from defects in Software and/or */ 19/* Documentation, or loss or inaccuracy of data of any kind. 
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#define M $4 26#define N $5 27#define K $6 28#define A $8 29#define B $9 30#define C $10 31#define LDC $11 32 33#define AO $12 34#define BO $13 35 36#define I $2 37#define J $3 38#define L $7 39 40#define CO1 $14 41#define CO2 $15 42#define CO3 $16 43#define CO4 $17 44#define CO5 $18 45#define CO6 $19 46#define CO7 $20 47#define CO8 $21 48 49#define OFFSET $22 50#define KK $23 51#define TEMP $24 52#define AORIG $25 53 54#define a1 $f0 55#define a2 $f1 56#define a3 $f27 57#define a4 $f28 58 59#define b1 $f2 60#define b2 $f3 61#define b3 $f4 62#define b4 $f5 63#define b5 $f6 64#define b6 $f7 65#define b7 $f8 66#define b8 $f9 67 68#define a5 b8 69 70#define c11 $f10 71#define c12 $f11 72#define c21 $f12 73#define c22 $f13 74#define c31 $f14 75#define c32 $f16 76#define c41 $f17 77#define c42 $f18 78#define c51 $f19 79#define c52 $f20 80#define c61 $f21 81#define c62 $f22 82#define c71 $f23 83#define c72 $f24 84#define c81 $f25 85#define c82 $f26 86 87#define ALPHA $f15 88 89 PROLOGUE 90 91 daddiu $sp, $sp, -144 92 93 SDARG $16, 0($sp) 94 SDARG $17, 8($sp) 95 SDARG $18, 16($sp) 96 SDARG $19, 24($sp) 97 SDARG $20, 32($sp) 98 SDARG $21, 40($sp) 99 sdc1 $f24, 48($sp) 100 sdc1 $f25, 56($sp) 101 sdc1 $f26, 64($sp) 102 sdc1 $f27, 72($sp) 103 sdc1 $f28, 80($sp) 104 105 SDARG $22, 88($sp) 106 SDARG $23, 96($sp) 107 SDARG $24, 104($sp) 108 SDARG $25, 112($sp) 109 110#ifndef __64BIT__ 111 sdc1 $f20,112($sp) 112 sdc1 $f21,120($sp) 113 sdc1 $f22,128($sp) 114 sdc1 $f23,136($sp) 115#endif 116 117 LDARG OFFSET, 144($sp) 118 119 dsll LDC, LDC, BASE_SHIFT 120 121#ifdef LN 122 mult M, K 123 mflo TEMP 124 125 dsll TEMP, TEMP, BASE_SHIFT 126 daddu A, A, TEMP 127 128 dsll TEMP, M, BASE_SHIFT 129 daddu C, C, TEMP 130#endif 131 132#ifdef RN 133 neg KK, OFFSET 134#endif 135 136#ifdef RT 137 mult N, K 138 mflo TEMP 139 140 dsll TEMP, TEMP, BASE_SHIFT 141 daddu B, B, 
TEMP 142 143 mult N, LDC 144 mflo TEMP 145 daddu C, C, TEMP 146 147 dsubu KK, N, OFFSET 148#endif 149 150 dsra J, N, 3 151 blez J, .L30 152 nop 153 154.L10: 155#ifdef RT 156 dsll TEMP, K, 3 + BASE_SHIFT 157 dsubu B, B, TEMP 158 159 dsll TEMP, LDC, 3 160 dsubu C, C, TEMP 161#endif 162 163 move CO1, C 164 MTC $0, c11 165 daddu CO2, C, LDC 166 daddu CO3, CO2, LDC 167 daddiu J, J, -1 168 daddu CO4, CO3, LDC 169 MOV c21, c11 170 daddu CO5, CO4, LDC 171 MOV c31, c11 172 daddu CO6, CO5, LDC 173 MOV c41, c11 174 daddu CO7, CO6, LDC 175 MOV c51, c11 176 daddu CO8, CO7, LDC 177 178#ifdef LN 179 daddu KK, M, OFFSET 180#endif 181 182#ifdef LT 183 move KK, OFFSET 184#endif 185 186#if defined(LN) || defined(RT) 187 move AORIG, A 188#else 189 move AO, A 190#endif 191#ifndef RT 192 daddu C, CO8, LDC 193#endif 194 195 andi I, M, 1 196 MOV c61, c11 197 blez I, .L20 198 MOV c71, c11 199 200#if defined(LT) || defined(RN) 201 LD a1, 0 * SIZE(AO) 202 LD a2, 1 * SIZE(AO) 203 LD a3, 2 * SIZE(AO) 204 LD a4, 3 * SIZE(AO) 205 206 LD b1, 0 * SIZE(B) 207 LD b2, 1 * SIZE(B) 208 LD b3, 2 * SIZE(B) 209 LD b4, 3 * SIZE(B) 210 LD b5, 4 * SIZE(B) 211 LD b6, 8 * SIZE(B) 212 LD b7, 12 * SIZE(B) 213 214 dsra L, KK, 2 215 MOV c81, c11 216 217 blez L, .L25 218 move BO, B 219#else 220 221#ifdef LN 222 dsll TEMP, K, 0 + BASE_SHIFT 223 dsubu AORIG, AORIG, TEMP 224#endif 225 226 dsll L, KK, 0 + BASE_SHIFT 227 dsll TEMP, KK, 3 + BASE_SHIFT 228 229 daddu AO, AORIG, L 230 daddu BO, B, TEMP 231 232 dsubu TEMP, K, KK 233 234 LD a1, 0 * SIZE(AO) 235 LD a2, 1 * SIZE(AO) 236 LD a3, 2 * SIZE(AO) 237 LD a4, 3 * SIZE(AO) 238 239 LD b1, 0 * SIZE(BO) 240 LD b2, 1 * SIZE(BO) 241 LD b3, 2 * SIZE(BO) 242 LD b4, 3 * SIZE(BO) 243 LD b5, 4 * SIZE(BO) 244 LD b6, 8 * SIZE(BO) 245 LD b7, 12 * SIZE(BO) 246 247 dsra L, TEMP, 2 248 MOV c81, c11 249 250 blez L, .L25 251 NOP 252#endif 253 .align 3 254 255.L22: 256 MADD c11, c11, a1, b1 257 LD b1, 16 * SIZE(BO) 258 MADD c21, c21, a1, b2 259 LD b2, 5 * SIZE(BO) 260 MADD c31, c31, a1, b3 
261 LD b3, 6 * SIZE(BO) 262 MADD c41, c41, a1, b4 263 LD b4, 7 * SIZE(BO) 264 265 MADD c51, c51, a1, b5 266 LD b5, 20 * SIZE(BO) 267 MADD c61, c61, a1, b2 268 LD b2, 9 * SIZE(BO) 269 MADD c71, c71, a1, b3 270 LD b3, 10 * SIZE(BO) 271 MADD c81, c81, a1, b4 272 LD b4, 11 * SIZE(BO) 273 274 LD a1, 4 * SIZE(AO) 275 daddiu L, L, -1 276 277 MADD c11, c11, a2, b6 278 LD b6, 24 * SIZE(BO) 279 MADD c21, c21, a2, b2 280 LD b2, 13 * SIZE(BO) 281 MADD c31, c31, a2, b3 282 LD b3, 14 * SIZE(BO) 283 MADD c41, c41, a2, b4 284 LD b4, 15 * SIZE(BO) 285 286 MADD c51, c51, a2, b7 287 LD b7, 28 * SIZE(BO) 288 MADD c61, c61, a2, b2 289 LD b2, 17 * SIZE(BO) 290 MADD c71, c71, a2, b3 291 LD b3, 18 * SIZE(BO) 292 MADD c81, c81, a2, b4 293 LD b4, 19 * SIZE(BO) 294 295 LD a2, 5 * SIZE(AO) 296 daddiu AO, AO, 4 * SIZE 297 298 MADD c11, c11, a3, b1 299 LD b1, 32 * SIZE(BO) 300 MADD c21, c21, a3, b2 301 LD b2, 21 * SIZE(BO) 302 MADD c31, c31, a3, b3 303 LD b3, 22 * SIZE(BO) 304 MADD c41, c41, a3, b4 305 LD b4, 23 * SIZE(BO) 306 307 MADD c51, c51, a3, b5 308 LD b5, 36 * SIZE(BO) 309 MADD c61, c61, a3, b2 310 LD b2, 25 * SIZE(BO) 311 MADD c71, c71, a3, b3 312 LD b3, 26 * SIZE(BO) 313 MADD c81, c81, a3, b4 314 LD b4, 27 * SIZE(BO) 315 316 LD a3, 2 * SIZE(AO) 317 daddiu BO, BO, 32 * SIZE 318 319 MADD c11, c11, a4, b6 320 LD b6, 8 * SIZE(BO) 321 MADD c21, c21, a4, b2 322 LD b2, -3 * SIZE(BO) 323 MADD c31, c31, a4, b3 324 LD b3, -2 * SIZE(BO) 325 MADD c41, c41, a4, b4 326 LD b4, -1 * SIZE(BO) 327 328 MADD c51, c51, a4, b7 329 LD b7, 12 * SIZE(BO) 330 MADD c61, c61, a4, b2 331 LD b2, 1 * SIZE(BO) 332 MADD c71, c71, a4, b3 333 LD b3, 2 * SIZE(BO) 334 MADD c81, c81, a4, b4 335 LD b4, 3 * SIZE(BO) 336 bgtz L, .L22 337 LD a4, 3 * SIZE(AO) 338 .align 3 339 340.L25: 341#if defined(LT) || defined(RN) 342 andi L, KK, 3 343#else 344 andi L, TEMP, 3 345#endif 346 NOP 347 blez L, .L28 348 NOP 349 .align 3 350 351.L26: 352 MADD c11, c11, a1, b1 353 LD b1, 8 * SIZE(BO) 354 MADD c21, c21, a1, b2 355 LD b2, 5 * 
SIZE(BO) 356 MADD c31, c31, a1, b3 357 LD b3, 6 * SIZE(BO) 358 MADD c41, c41, a1, b4 359 LD b4, 7 * SIZE(BO) 360 361 daddiu L, L, -1 362 MOV a2, a2 363 daddiu AO, AO, 1 * SIZE 364 daddiu BO, BO, 8 * SIZE 365 366 MADD c51, c51, a1, b5 367 LD b5, 4 * SIZE(BO) 368 MADD c61, c61, a1, b2 369 LD b2, 1 * SIZE(BO) 370 MADD c71, c71, a1, b3 371 LD b3, 2 * SIZE(BO) 372 MADD c81, c81, a1, b4 373 LD a1, 0 * SIZE(AO) 374 375 bgtz L, .L26 376 LD b4, 3 * SIZE(BO) 377 378.L28: 379#if defined(LN) || defined(RT) 380#ifdef LN 381 daddiu TEMP, KK, -1 382#else 383 daddiu TEMP, KK, -8 384#endif 385 386 dsll L, TEMP, 0 + BASE_SHIFT 387 dsll TEMP, TEMP, 3 + BASE_SHIFT 388 daddu AO, AORIG, L 389 daddu BO, B, TEMP 390#endif 391 392 393#if defined(LN) || defined(LT) 394 LD b1, 0 * SIZE(BO) 395 LD b2, 1 * SIZE(BO) 396 LD b3, 2 * SIZE(BO) 397 LD b4, 3 * SIZE(BO) 398 LD b5, 4 * SIZE(BO) 399 LD b6, 5 * SIZE(BO) 400 LD b7, 6 * SIZE(BO) 401 LD b8, 7 * SIZE(BO) 402 403 SUB c11, b1, c11 404 SUB c21, b2, c21 405 SUB c31, b3, c31 406 SUB c41, b4, c41 407 SUB c51, b5, c51 408 SUB c61, b6, c61 409 SUB c71, b7, c71 410 SUB c81, b8, c81 411#else 412 LD b1, 0 * SIZE(AO) 413 LD b2, 1 * SIZE(AO) 414 LD b3, 2 * SIZE(AO) 415 LD b4, 3 * SIZE(AO) 416 LD b5, 4 * SIZE(AO) 417 LD b6, 5 * SIZE(AO) 418 LD b7, 6 * SIZE(AO) 419 LD b8, 7 * SIZE(AO) 420 421 SUB c11, b1, c11 422 SUB c21, b2, c21 423 SUB c31, b3, c31 424 SUB c41, b4, c41 425 SUB c51, b5, c51 426 SUB c61, b6, c61 427 SUB c71, b7, c71 428 SUB c81, b8, c81 429#endif 430 431#if defined(LN) || defined(LT) 432 LD b1, 0 * SIZE(AO) 433 434 MUL c11, b1, c11 435 MUL c21, b1, c21 436 MUL c31, b1, c31 437 MUL c41, b1, c41 438 MUL c51, b1, c51 439 MUL c61, b1, c61 440 MUL c71, b1, c71 441 MUL c81, b1, c81 442#endif 443 444#ifdef RN 445 LD b1, 0 * SIZE(BO) 446 LD b2, 1 * SIZE(BO) 447 LD b3, 2 * SIZE(BO) 448 LD b4, 3 * SIZE(BO) 449 LD b5, 4 * SIZE(BO) 450 LD b6, 5 * SIZE(BO) 451 LD b7, 6 * SIZE(BO) 452 LD b8, 7 * SIZE(BO) 453 454 MUL c11, b1, c11 455 456 NMSUB c21, c21, 
b2, c11 457 NMSUB c31, c31, b3, c11 458 NMSUB c41, c41, b4, c11 459 NMSUB c51, c51, b5, c11 460 NMSUB c61, c61, b6, c11 461 NMSUB c71, c71, b7, c11 462 NMSUB c81, c81, b8, c11 463 464 LD b2, 9 * SIZE(BO) 465 LD b3, 10 * SIZE(BO) 466 LD b4, 11 * SIZE(BO) 467 LD b5, 12 * SIZE(BO) 468 LD b6, 13 * SIZE(BO) 469 LD b7, 14 * SIZE(BO) 470 LD b8, 15 * SIZE(BO) 471 472 MUL c21, b2, c21 473 474 NMSUB c31, c31, b3, c21 475 NMSUB c41, c41, b4, c21 476 NMSUB c51, c51, b5, c21 477 NMSUB c61, c61, b6, c21 478 NMSUB c71, c71, b7, c21 479 NMSUB c81, c81, b8, c21 480 481 LD b3, 18 * SIZE(BO) 482 LD b4, 19 * SIZE(BO) 483 LD b5, 20 * SIZE(BO) 484 LD b6, 21 * SIZE(BO) 485 LD b7, 22 * SIZE(BO) 486 LD b8, 23 * SIZE(BO) 487 488 MUL c31, b3, c31 489 490 NMSUB c41, c41, b4, c31 491 NMSUB c51, c51, b5, c31 492 NMSUB c61, c61, b6, c31 493 NMSUB c71, c71, b7, c31 494 NMSUB c81, c81, b8, c31 495 496 LD b4, 27 * SIZE(BO) 497 LD b5, 28 * SIZE(BO) 498 LD b6, 29 * SIZE(BO) 499 LD b7, 30 * SIZE(BO) 500 LD b8, 31 * SIZE(BO) 501 502 MUL c41, b4, c41 503 504 NMSUB c51, c51, b5, c41 505 NMSUB c61, c61, b6, c41 506 NMSUB c71, c71, b7, c41 507 NMSUB c81, c81, b8, c41 508 509 LD b5, 36 * SIZE(BO) 510 LD b6, 37 * SIZE(BO) 511 LD b7, 38 * SIZE(BO) 512 LD b8, 39 * SIZE(BO) 513 514 MUL c51, b5, c51 515 516 NMSUB c61, c61, b6, c51 517 NMSUB c71, c71, b7, c51 518 NMSUB c81, c81, b8, c51 519 520 LD b6, 45 * SIZE(BO) 521 LD b7, 46 * SIZE(BO) 522 LD b8, 47 * SIZE(BO) 523 524 MUL c61, b6, c61 525 526 NMSUB c71, c71, b7, c61 527 NMSUB c81, c81, b8, c61 528 529 LD b7, 54 * SIZE(BO) 530 LD b8, 55 * SIZE(BO) 531 532 MUL c71, b7, c71 533 534 NMSUB c81, c81, b8, c71 535 536 LD b8, 63 * SIZE(BO) 537 538 MUL c81, b8, c81 539#endif 540 541#ifdef RT 542 LD b1, 63 * SIZE(BO) 543 LD b2, 62 * SIZE(BO) 544 LD b3, 61 * SIZE(BO) 545 LD b4, 60 * SIZE(BO) 546 LD b5, 59 * SIZE(BO) 547 LD b6, 58 * SIZE(BO) 548 LD b7, 57 * SIZE(BO) 549 LD b8, 56 * SIZE(BO) 550 551 MUL c81, b1, c81 552 553 NMSUB c71, c71, b2, c81 554 NMSUB c61, c61, b3, 
c81 555 NMSUB c51, c51, b4, c81 556 NMSUB c41, c41, b5, c81 557 NMSUB c31, c31, b6, c81 558 NMSUB c21, c21, b7, c81 559 NMSUB c11, c11, b8, c81 560 561 LD b2, 54 * SIZE(BO) 562 LD b3, 53 * SIZE(BO) 563 LD b4, 52 * SIZE(BO) 564 LD b5, 51 * SIZE(BO) 565 LD b6, 50 * SIZE(BO) 566 LD b7, 49 * SIZE(BO) 567 LD b8, 48 * SIZE(BO) 568 569 MUL c71, b2, c71 570 571 NMSUB c61, c61, b3, c71 572 NMSUB c51, c51, b4, c71 573 NMSUB c41, c41, b5, c71 574 NMSUB c31, c31, b6, c71 575 NMSUB c21, c21, b7, c71 576 NMSUB c11, c11, b8, c71 577 578 LD b3, 45 * SIZE(BO) 579 LD b4, 44 * SIZE(BO) 580 LD b5, 43 * SIZE(BO) 581 LD b6, 42 * SIZE(BO) 582 LD b7, 41 * SIZE(BO) 583 LD b8, 40 * SIZE(BO) 584 585 MUL c61, b3, c61 586 587 NMSUB c51, c51, b4, c61 588 NMSUB c41, c41, b5, c61 589 NMSUB c31, c31, b6, c61 590 NMSUB c21, c21, b7, c61 591 NMSUB c11, c11, b8, c61 592 593 LD b4, 36 * SIZE(BO) 594 LD b5, 35 * SIZE(BO) 595 LD b6, 34 * SIZE(BO) 596 LD b7, 33 * SIZE(BO) 597 LD b8, 32 * SIZE(BO) 598 599 MUL c51, b4, c51 600 601 NMSUB c41, c41, b5, c51 602 NMSUB c31, c31, b6, c51 603 NMSUB c21, c21, b7, c51 604 NMSUB c11, c11, b8, c51 605 606 LD b5, 27 * SIZE(BO) 607 LD b6, 26 * SIZE(BO) 608 LD b7, 25 * SIZE(BO) 609 LD b8, 24 * SIZE(BO) 610 611 MUL c41, b5, c41 612 613 NMSUB c31, c31, b6, c41 614 NMSUB c21, c21, b7, c41 615 NMSUB c11, c11, b8, c41 616 617 LD b6, 18 * SIZE(BO) 618 LD b7, 17 * SIZE(BO) 619 LD b8, 16 * SIZE(BO) 620 621 MUL c31, b6, c31 622 623 NMSUB c21, c21, b7, c31 624 NMSUB c11, c11, b8, c31 625 626 LD b7, 9 * SIZE(BO) 627 LD b8, 8 * SIZE(BO) 628 629 MUL c21, b7, c21 630 631 NMSUB c11, c11, b8, c21 632 633 LD b8, 0 * SIZE(BO) 634 635 MUL c11, b8, c11 636#endif 637 638#ifdef LN 639 daddiu CO1, CO1, -1 * SIZE 640 daddiu CO2, CO2, -1 * SIZE 641 daddiu CO3, CO3, -1 * SIZE 642 daddiu CO4, CO4, -1 * SIZE 643 daddiu CO5, CO5, -1 * SIZE 644 daddiu CO6, CO6, -1 * SIZE 645 daddiu CO7, CO7, -1 * SIZE 646 daddiu CO8, CO8, -1 * SIZE 647#endif 648 649#if defined(LN) || defined(LT) 650 ST c11, 0 * 
SIZE(BO) 651 ST c21, 1 * SIZE(BO) 652 ST c31, 2 * SIZE(BO) 653 ST c41, 3 * SIZE(BO) 654 ST c51, 4 * SIZE(BO) 655 ST c61, 5 * SIZE(BO) 656 ST c71, 6 * SIZE(BO) 657 ST c81, 7 * SIZE(BO) 658#else 659 ST c11, 0 * SIZE(AO) 660 ST c21, 1 * SIZE(AO) 661 ST c31, 2 * SIZE(AO) 662 ST c41, 3 * SIZE(AO) 663 ST c51, 4 * SIZE(AO) 664 ST c61, 5 * SIZE(AO) 665 ST c71, 6 * SIZE(AO) 666 ST c81, 7 * SIZE(AO) 667#endif 668 669 ST c11, 0 * SIZE(CO1) 670 ST c21, 0 * SIZE(CO2) 671 ST c31, 0 * SIZE(CO3) 672 ST c41, 0 * SIZE(CO4) 673 ST c51, 0 * SIZE(CO5) 674 ST c61, 0 * SIZE(CO6) 675 ST c71, 0 * SIZE(CO7) 676 ST c81, 0 * SIZE(CO8) 677 678 MTC $0, c11 679 680#ifndef LN 681 daddiu CO1, CO1, 1 * SIZE 682 daddiu CO2, CO2, 1 * SIZE 683 daddiu CO3, CO3, 1 * SIZE 684 daddiu CO4, CO4, 1 * SIZE 685 daddiu CO5, CO5, 1 * SIZE 686 daddiu CO6, CO6, 1 * SIZE 687 daddiu CO7, CO7, 1 * SIZE 688 daddiu CO8, CO8, 1 * SIZE 689#endif 690 691 MOV c21, c11 692 693#ifdef RT 694 dsll TEMP, K, BASE_SHIFT 695 daddu AORIG, AORIG, TEMP 696#endif 697 698 MOV c31, c11 699 700#if defined(LT) || defined(RN) 701 dsubu TEMP, K, KK 702 dsll L, TEMP, 0 + BASE_SHIFT 703 dsll TEMP, TEMP, 3 + BASE_SHIFT 704 daddu AO, AO, L 705 daddu BO, BO, TEMP 706#endif 707 708 MOV c41, c11 709 710#ifdef LT 711 daddiu KK, KK, 1 712#endif 713 714#ifdef LN 715 daddiu KK, KK, -1 716#endif 717 .align 3 718 719.L20: 720 dsra I, M, 1 721 MOV c51, c11 722 blez I, .L29 723 MOV c61, c11 724 725.L11: 726#if defined(LT) || defined(RN) 727 LD a1, 0 * SIZE(AO) 728 MOV c71, c11 729 LD b1, 0 * SIZE(B) 730 MOV c81, c11 731 732 LD a3, 4 * SIZE(AO) 733 MOV c12, c11 734 LD b2, 1 * SIZE(B) 735 MOV c22, c11 736 737 dsra L, KK, 2 738 MOV c32, c11 739 LD b3, 2 * SIZE(B) 740 MOV c42, c11 741 742 LD b4, 3 * SIZE(B) 743 MOV c52, c11 744 LD b5, 4 * SIZE(B) 745 MOV c62, c11 746 747 LD b6, 8 * SIZE(B) 748 MOV c72, c11 749 LD b7, 12 * SIZE(B) 750 MOV c82, c11 751 752 blez L, .L15 753 move BO, B 754#else 755 756#ifdef LN 757 dsll TEMP, K, 1 + BASE_SHIFT 758 dsubu AORIG, 
AORIG, TEMP 759#endif 760 761 dsll L, KK, 1 + BASE_SHIFT 762 dsll TEMP, KK, 3 + BASE_SHIFT 763 764 daddu AO, AORIG, L 765 daddu BO, B, TEMP 766 767 dsubu TEMP, K, KK 768 769 LD a1, 0 * SIZE(AO) 770 MOV c71, c11 771 LD b1, 0 * SIZE(BO) 772 MOV c81, c11 773 774 LD a3, 4 * SIZE(AO) 775 MOV c12, c11 776 LD b2, 1 * SIZE(BO) 777 MOV c22, c11 778 779 MOV c32, c11 780 LD b3, 2 * SIZE(BO) 781 MOV c42, c11 782 783 LD b4, 3 * SIZE(BO) 784 MOV c52, c11 785 LD b5, 4 * SIZE(BO) 786 MOV c62, c11 787 788 LD b6, 8 * SIZE(BO) 789 MOV c72, c11 790 LD b7, 12 * SIZE(BO) 791 MOV c82, c11 792 793 dsra L, TEMP, 2 794 blez L, .L15 795 NOP 796#endif 797 798 MADD c11, c11, a1, b1 799 LD a2, 1 * SIZE(AO) 800 MADD c21, c21, a1, b2 801 daddiu L, L, -1 802 MADD c31, c31, a1, b3 803 blez L, .L13 804 MADD c41, c41, a1, b4 805 NOP 806 .align 3 807 808.L12: 809 MADD c12, c12, a2, b1 810 LD b1, 16 * SIZE(BO) 811 MADD c22, c22, a2, b2 812 LD b2, 5 * SIZE(BO) 813 MADD c32, c32, a2, b3 814 LD b3, 6 * SIZE(BO) 815 MADD c42, c42, a2, b4 816 LD b4, 7 * SIZE(BO) 817 818 MADD c51, c51, a1, b5 819 NOP 820 MADD c61, c61, a1, b2 821 LD a4, 2 * SIZE(AO) 822 MADD c71, c71, a1, b3 823 NOP 824 MADD c81, c81, a1, b4 825 LD a1, 8 * SIZE(AO) 826 827 MADD c52, c52, a2, b5 828 LD b5, 20 * SIZE(BO) 829 MADD c62, c62, a2, b2 830 LD b2, 9 * SIZE(BO) 831 MADD c72, c72, a2, b3 832 LD b3, 10 * SIZE(BO) 833 MADD c82, c82, a2, b4 834 LD b4, 11 * SIZE(BO) 835 836 MADD c11, c11, a4, b6 837 LD a2, 3 * SIZE(AO) 838 MADD c21, c21, a4, b2 839 NOP 840 MADD c31, c31, a4, b3 841 NOP 842 MADD c41, c41, a4, b4 843 NOP 844 845 MADD c12, c12, a2, b6 846 LD b6, 24 * SIZE(BO) 847 MADD c22, c22, a2, b2 848 LD b2, 13 * SIZE(BO) 849 MADD c32, c32, a2, b3 850 LD b3, 14 * SIZE(BO) 851 MADD c42, c42, a2, b4 852 LD b4, 15 * SIZE(BO) 853 854 MADD c51, c51, a4, b7 855 NOP 856 MADD c61, c61, a4, b2 857 NOP 858 MADD c71, c71, a4, b3 859 NOP 860 MADD c81, c81, a4, b4 861 NOP 862 863 MADD c52, c52, a2, b7 864 LD b7, 28 * SIZE(BO) 865 MADD c62, c62, a2, b2 
866 LD b2, 17 * SIZE(BO) 867 MADD c72, c72, a2, b3 868 LD b3, 18 * SIZE(BO) 869 MADD c82, c82, a2, b4 870 LD b4, 19 * SIZE(BO) 871 872 MADD c11, c11, a3, b1 873 LD a2, 5 * SIZE(AO) 874 MADD c21, c21, a3, b2 875 NOP 876 MADD c31, c31, a3, b3 877 NOP 878 MADD c41, c41, a3, b4 879 NOP 880 881 MADD c12, c12, a2, b1 882 LD b1, 32 * SIZE(BO) 883 MADD c22, c22, a2, b2 884 LD b2, 21 * SIZE(BO) 885 MADD c32, c32, a2, b3 886 LD b3, 22 * SIZE(BO) 887 MADD c42, c42, a2, b4 888 LD b4, 23 * SIZE(BO) 889 890 MADD c51, c51, a3, b5 891 NOP 892 MADD c61, c61, a3, b2 893 LD a4, 6 * SIZE(AO) 894 MADD c71, c71, a3, b3 895 NOP 896 MADD c81, c81, a3, b4 897 LD a3, 12 * SIZE(AO) 898 899 MADD c52, c52, a2, b5 900 LD b5, 36 * SIZE(BO) 901 MADD c62, c62, a2, b2 902 LD b2, 25 * SIZE(BO) 903 MADD c72, c72, a2, b3 904 LD b3, 26 * SIZE(BO) 905 MADD c82, c82, a2, b4 906 LD b4, 27 * SIZE(BO) 907 908 MADD c11, c11, a4, b6 909 LD a2, 7 * SIZE(AO) 910 MADD c21, c21, a4, b2 911 NOP 912 MADD c31, c31, a4, b3 913 NOP 914 MADD c41, c41, a4, b4 915 daddiu L, L, -1 916 917 MADD c12, c12, a2, b6 918 LD b6, 40 * SIZE(BO) 919 MADD c22, c22, a2, b2 920 LD b2, 29 * SIZE(BO) 921 MADD c32, c32, a2, b3 922 LD b3, 30 * SIZE(BO) 923 MADD c42, c42, a2, b4 924 LD b4, 31 * SIZE(BO) 925 926 MADD c51, c51, a4, b7 927 daddiu BO, BO, 32 * SIZE 928 MADD c61, c61, a4, b2 929 daddiu AO, AO, 8 * SIZE 930 MADD c71, c71, a4, b3 931 NOP 932 MADD c81, c81, a4, b4 933 NOP 934 935 MADD c52, c52, a2, b7 936 LD b7, 12 * SIZE(BO) 937 MADD c62, c62, a2, b2 938 LD b2, 1 * SIZE(BO) 939 MADD c72, c72, a2, b3 940 LD b3, 2 * SIZE(BO) 941 MADD c82, c82, a2, b4 942 LD b4, 3 * SIZE(BO) 943 944 MADD c11, c11, a1, b1 945 LD a2, 1 * SIZE(AO) 946 MADD c21, c21, a1, b2 947 NOP 948 MADD c31, c31, a1, b3 949 bgtz L, .L12 950 MADD c41, c41, a1, b4 951 NOP 952 .align 3 953 954.L13: 955 MADD c12, c12, a2, b1 956 LD b1, 16 * SIZE(BO) 957 MADD c22, c22, a2, b2 958 LD b2, 5 * SIZE(BO) 959 MADD c32, c32, a2, b3 960 LD b3, 6 * SIZE(BO) 961 MADD c42, c42, a2, 
b4 962 LD b4, 7 * SIZE(BO) 963 964 MADD c51, c51, a1, b5 965 NOP 966 MADD c61, c61, a1, b2 967 LD a4, 2 * SIZE(AO) 968 MADD c71, c71, a1, b3 969 NOP 970 MADD c81, c81, a1, b4 971 LD a1, 8 * SIZE(AO) 972 973 MADD c52, c52, a2, b5 974 LD b5, 20 * SIZE(BO) 975 MADD c62, c62, a2, b2 976 LD b2, 9 * SIZE(BO) 977 MADD c72, c72, a2, b3 978 LD b3, 10 * SIZE(BO) 979 MADD c82, c82, a2, b4 980 LD b4, 11 * SIZE(BO) 981 982 MADD c11, c11, a4, b6 983 LD a2, 3 * SIZE(AO) 984 MADD c21, c21, a4, b2 985 NOP 986 MADD c31, c31, a4, b3 987 NOP 988 MADD c41, c41, a4, b4 989 NOP 990 991 MADD c12, c12, a2, b6 992 LD b6, 24 * SIZE(BO) 993 MADD c22, c22, a2, b2 994 LD b2, 13 * SIZE(BO) 995 MADD c32, c32, a2, b3 996 LD b3, 14 * SIZE(BO) 997 MADD c42, c42, a2, b4 998 LD b4, 15 * SIZE(BO) 999 1000 MADD c51, c51, a4, b7 1001 NOP 1002 MADD c61, c61, a4, b2 1003 NOP 1004 MADD c71, c71, a4, b3 1005 NOP 1006 MADD c81, c81, a4, b4 1007 NOP 1008 1009 MADD c52, c52, a2, b7 1010 LD b7, 28 * SIZE(BO) 1011 MADD c62, c62, a2, b2 1012 LD b2, 17 * SIZE(BO) 1013 MADD c72, c72, a2, b3 1014 LD b3, 18 * SIZE(BO) 1015 MADD c82, c82, a2, b4 1016 LD b4, 19 * SIZE(BO) 1017 1018 MADD c11, c11, a3, b1 1019 LD a2, 5 * SIZE(AO) 1020 MADD c21, c21, a3, b2 1021 NOP 1022 MADD c31, c31, a3, b3 1023 NOP 1024 MADD c41, c41, a3, b4 1025 NOP 1026 1027 MADD c12, c12, a2, b1 1028 LD b1, 32 * SIZE(BO) 1029 MADD c22, c22, a2, b2 1030 LD b2, 21 * SIZE(BO) 1031 MADD c32, c32, a2, b3 1032 LD b3, 22 * SIZE(BO) 1033 MADD c42, c42, a2, b4 1034 LD b4, 23 * SIZE(BO) 1035 1036 MADD c51, c51, a3, b5 1037 NOP 1038 MADD c61, c61, a3, b2 1039 LD a4, 6 * SIZE(AO) 1040 MADD c71, c71, a3, b3 1041 NOP 1042 MADD c81, c81, a3, b4 1043 LD a3, 12 * SIZE(AO) 1044 1045 MADD c52, c52, a2, b5 1046 LD b5, 36 * SIZE(BO) 1047 MADD c62, c62, a2, b2 1048 LD b2, 25 * SIZE(BO) 1049 MADD c72, c72, a2, b3 1050 LD b3, 26 * SIZE(BO) 1051 MADD c82, c82, a2, b4 1052 LD b4, 27 * SIZE(BO) 1053 1054 MADD c11, c11, a4, b6 1055 LD a2, 7 * SIZE(AO) 1056 MADD c21, c21, a4, b2 
1057 NOP 1058 MADD c31, c31, a4, b3 1059 NOP 1060 MADD c41, c41, a4, b4 1061 NOP 1062 1063 MADD c12, c12, a2, b6 1064 LD b6, 40 * SIZE(BO) 1065 MADD c22, c22, a2, b2 1066 LD b2, 29 * SIZE(BO) 1067 MADD c32, c32, a2, b3 1068 LD b3, 30 * SIZE(BO) 1069 MADD c42, c42, a2, b4 1070 LD b4, 31 * SIZE(BO) 1071 1072 MADD c51, c51, a4, b7 1073 daddiu BO, BO, 32 * SIZE 1074 MADD c61, c61, a4, b2 1075 daddiu AO, AO, 8 * SIZE 1076 MADD c71, c71, a4, b3 1077 NOP 1078 MADD c81, c81, a4, b4 1079 NOP 1080 1081 MADD c52, c52, a2, b7 1082 LD b7, 12 * SIZE(BO) 1083 MADD c62, c62, a2, b2 1084 LD b2, 1 * SIZE(BO) 1085 MADD c72, c72, a2, b3 1086 LD b3, 2 * SIZE(BO) 1087 MADD c82, c82, a2, b4 1088 LD b4, 3 * SIZE(BO) 1089 .align 3 1090 1091.L15: 1092#if defined(LT) || defined(RN) 1093 andi L, KK, 3 1094#else 1095 andi L, TEMP, 3 1096#endif 1097 blez L, .L18 1098 NOP 1099 .align 3 1100 1101.L16: 1102 MADD c11, c11, a1, b1 1103 LD a2, 1 * SIZE(AO) 1104 MADD c21, c21, a1, b2 1105 NOP 1106 MADD c31, c31, a1, b3 1107 NOP 1108 MADD c41, c41, a1, b4 1109 NOP 1110 1111 MADD c12, c12, a2, b1 1112 LD b1, 8 * SIZE(BO) 1113 MADD c22, c22, a2, b2 1114 LD b2, 5 * SIZE(BO) 1115 MADD c32, c32, a2, b3 1116 LD b3, 6 * SIZE(BO) 1117 MADD c42, c42, a2, b4 1118 LD b4, 7 * SIZE(BO) 1119 1120 MADD c51, c51, a1, b5 1121 daddiu L, L, -1 1122 MADD c61, c61, a1, b2 1123 daddiu AO, AO, 2 * SIZE 1124 MADD c71, c71, a1, b3 1125 daddiu BO, BO, 8 * SIZE 1126 MADD c81, c81, a1, b4 1127 LD a1, 0 * SIZE(AO) 1128 1129 MADD c52, c52, a2, b5 1130 LD b5, 4 * SIZE(BO) 1131 MADD c62, c62, a2, b2 1132 LD b2, 1 * SIZE(BO) 1133 MADD c72, c72, a2, b3 1134 LD b3, 2 * SIZE(BO) 1135 MADD c82, c82, a2, b4 1136 bgtz L, .L16 1137 LD b4, 3 * SIZE(BO) 1138 1139.L18: 1140#if defined(LN) || defined(RT) 1141#ifdef LN 1142 daddiu TEMP, KK, -2 1143#else 1144 daddiu TEMP, KK, -8 1145#endif 1146 1147 dsll L, TEMP, 1 + BASE_SHIFT 1148 dsll TEMP, TEMP, 3 + BASE_SHIFT 1149 daddu AO, AORIG, L 1150 daddu BO, B, TEMP 1151#endif 1152 1153#if defined(LN) 
|| defined(LT) 1154 LD b1, 0 * SIZE(BO) 1155 LD b2, 1 * SIZE(BO) 1156 LD b3, 2 * SIZE(BO) 1157 LD b4, 3 * SIZE(BO) 1158 1159 SUB c11, b1, c11 1160 LD b5, 4 * SIZE(BO) 1161 SUB c21, b2, c21 1162 LD b6, 5 * SIZE(BO) 1163 SUB c31, b3, c31 1164 LD b7, 6 * SIZE(BO) 1165 SUB c41, b4, c41 1166 LD b8, 7 * SIZE(BO) 1167 1168 SUB c51, b5, c51 1169 LD b1, 8 * SIZE(BO) 1170 SUB c61, b6, c61 1171 LD b2, 9 * SIZE(BO) 1172 SUB c71, b7, c71 1173 LD b3, 10 * SIZE(BO) 1174 SUB c81, b8, c81 1175 LD b4, 11 * SIZE(BO) 1176 1177 SUB c12, b1, c12 1178 LD b5, 12 * SIZE(BO) 1179 SUB c22, b2, c22 1180 LD b6, 13 * SIZE(BO) 1181 SUB c32, b3, c32 1182 LD b7, 14 * SIZE(BO) 1183 SUB c42, b4, c42 1184 LD b8, 15 * SIZE(BO) 1185 1186 SUB c52, b5, c52 1187#ifdef LN 1188 LD b1, 3 * SIZE(AO) 1189#else 1190 LD b1, 0 * SIZE(AO) 1191#endif 1192 SUB c62, b6, c62 1193 SUB c72, b7, c72 1194 SUB c82, b8, c82 1195#else 1196 LD b1, 0 * SIZE(AO) 1197 LD b2, 1 * SIZE(AO) 1198 LD b3, 2 * SIZE(AO) 1199 LD b4, 3 * SIZE(AO) 1200 1201 SUB c11, b1, c11 1202 LD b5, 4 * SIZE(AO) 1203 SUB c12, b2, c12 1204 LD b6, 5 * SIZE(AO) 1205 SUB c21, b3, c21 1206 LD b7, 6 * SIZE(AO) 1207 SUB c22, b4, c22 1208 LD b8, 7 * SIZE(AO) 1209 1210 SUB c31, b5, c31 1211 LD b1, 8 * SIZE(AO) 1212 SUB c32, b6, c32 1213 LD b2, 9 * SIZE(AO) 1214 SUB c41, b7, c41 1215 LD b3, 10 * SIZE(AO) 1216 SUB c42, b8, c42 1217 LD b4, 11 * SIZE(AO) 1218 1219 LD b5, 12 * SIZE(AO) 1220 SUB c51, b1, c51 1221 LD b6, 13 * SIZE(AO) 1222 SUB c52, b2, c52 1223 LD b7, 14 * SIZE(AO) 1224 SUB c61, b3, c61 1225 LD b8, 15 * SIZE(AO) 1226 SUB c62, b4, c62 1227 1228 SUB c71, b5, c71 1229 SUB c72, b6, c72 1230 SUB c81, b7, c81 1231 SUB c82, b8, c82 1232#endif 1233 1234#ifdef LN 1235 MUL c12, b1, c12 1236 LD b2, 2 * SIZE(AO) 1237 MUL c22, b1, c22 1238 MUL c32, b1, c32 1239 MUL c42, b1, c42 1240 MUL c52, b1, c52 1241 MUL c62, b1, c62 1242 MUL c72, b1, c72 1243 MUL c82, b1, c82 1244 1245 NMSUB c11, c11, b2, c12 1246 LD b3, 0 * SIZE(AO) 1247 NMSUB c21, c21, b2, c22 1248 NMSUB 
c31, c31, b2, c32 1249 NMSUB c41, c41, b2, c42 1250 NMSUB c51, c51, b2, c52 1251 NMSUB c61, c61, b2, c62 1252 NMSUB c71, c71, b2, c72 1253 NMSUB c81, c81, b2, c82 1254 1255 MUL c11, b3, c11 1256 daddiu CO1, CO1, -2 * SIZE 1257 MUL c21, b3, c21 1258 daddiu CO2, CO2, -2 * SIZE 1259 MUL c31, b3, c31 1260 daddiu CO3, CO3, -2 * SIZE 1261 MUL c41, b3, c41 1262 daddiu CO4, CO4, -2 * SIZE 1263 MUL c51, b3, c51 1264 daddiu CO5, CO5, -2 * SIZE 1265 MUL c61, b3, c61 1266 daddiu CO6, CO6, -2 * SIZE 1267 MUL c71, b3, c71 1268 daddiu CO7, CO7, -2 * SIZE 1269 MUL c81, b3, c81 1270 daddiu CO8, CO8, -2 * SIZE 1271#endif 1272 1273#ifdef LT 1274 MUL c11, b1, c11 1275 LD b2, 1 * SIZE(AO) 1276 MUL c21, b1, c21 1277 MUL c31, b1, c31 1278 MUL c41, b1, c41 1279 MUL c51, b1, c51 1280 MUL c61, b1, c61 1281 MUL c71, b1, c71 1282 MUL c81, b1, c81 1283 1284 NMSUB c12, c12, b2, c11 1285 LD b3, 3 * SIZE(AO) 1286 NMSUB c22, c22, b2, c21 1287 NMSUB c32, c32, b2, c31 1288 NMSUB c42, c42, b2, c41 1289 NMSUB c52, c52, b2, c51 1290 NMSUB c62, c62, b2, c61 1291 NMSUB c72, c72, b2, c71 1292 NMSUB c82, c82, b2, c81 1293 1294 MUL c12, b3, c12 1295 MUL c22, b3, c22 1296 MUL c32, b3, c32 1297 MUL c42, b3, c42 1298 MUL c52, b3, c52 1299 MUL c62, b3, c62 1300 MUL c72, b3, c72 1301 MUL c82, b3, c82 1302#endif 1303 1304#ifdef RN 1305 LD b1, 0 * SIZE(BO) 1306 LD b2, 1 * SIZE(BO) 1307 LD b3, 2 * SIZE(BO) 1308 LD b4, 3 * SIZE(BO) 1309 1310 MUL c11, b1, c11 1311 MUL c12, b1, c12 1312 LD b5, 4 * SIZE(BO) 1313 1314 NMSUB c21, c21, b2, c11 1315 NMSUB c22, c22, b2, c12 1316 LD b6, 5 * SIZE(BO) 1317 NMSUB c31, c31, b3, c11 1318 NMSUB c32, c32, b3, c12 1319 LD b7, 6 * SIZE(BO) 1320 NMSUB c41, c41, b4, c11 1321 NMSUB c42, c42, b4, c12 1322 LD b8, 7 * SIZE(BO) 1323 1324 NMSUB c51, c51, b5, c11 1325 NMSUB c52, c52, b5, c12 1326 LD b2, 9 * SIZE(BO) 1327 NMSUB c61, c61, b6, c11 1328 NMSUB c62, c62, b6, c12 1329 LD b3, 10 * SIZE(BO) 1330 NMSUB c71, c71, b7, c11 1331 NMSUB c72, c72, b7, c12 1332 LD b4, 11 * SIZE(BO) 1333 NMSUB 
c81, c81, b8, c11 1334 NMSUB c82, c82, b8, c12 1335 LD b5, 12 * SIZE(BO) 1336 1337 MUL c21, b2, c21 1338 MUL c22, b2, c22 1339 LD b6, 13 * SIZE(BO) 1340 1341 NMSUB c31, c31, b3, c21 1342 NMSUB c32, c32, b3, c22 1343 LD b7, 14 * SIZE(BO) 1344 NMSUB c41, c41, b4, c21 1345 NMSUB c42, c42, b4, c22 1346 LD b8, 15 * SIZE(BO) 1347 NMSUB c51, c51, b5, c21 1348 NMSUB c52, c52, b5, c22 1349 LD b3, 18 * SIZE(BO) 1350 NMSUB c61, c61, b6, c21 1351 NMSUB c62, c62, b6, c22 1352 LD b4, 19 * SIZE(BO) 1353 NMSUB c71, c71, b7, c21 1354 NMSUB c72, c72, b7, c22 1355 LD b5, 20 * SIZE(BO) 1356 NMSUB c81, c81, b8, c21 1357 NMSUB c82, c82, b8, c22 1358 LD b6, 21 * SIZE(BO) 1359 1360 MUL c31, b3, c31 1361 MUL c32, b3, c32 1362 LD b7, 22 * SIZE(BO) 1363 1364 NMSUB c41, c41, b4, c31 1365 NMSUB c42, c42, b4, c32 1366 LD b8, 23 * SIZE(BO) 1367 NMSUB c51, c51, b5, c31 1368 NMSUB c52, c52, b5, c32 1369 LD b4, 27 * SIZE(BO) 1370 NMSUB c61, c61, b6, c31 1371 NMSUB c62, c62, b6, c32 1372 LD b5, 28 * SIZE(BO) 1373 NMSUB c71, c71, b7, c31 1374 NMSUB c72, c72, b7, c32 1375 LD b6, 29 * SIZE(BO) 1376 NMSUB c81, c81, b8, c31 1377 NMSUB c82, c82, b8, c32 1378 LD b7, 30 * SIZE(BO) 1379 1380 MUL c41, b4, c41 1381 MUL c42, b4, c42 1382 LD b8, 31 * SIZE(BO) 1383 1384 NMSUB c51, c51, b5, c41 1385 NMSUB c52, c52, b5, c42 1386 LD b5, 36 * SIZE(BO) 1387 NMSUB c61, c61, b6, c41 1388 NMSUB c62, c62, b6, c42 1389 LD b6, 37 * SIZE(BO) 1390 NMSUB c71, c71, b7, c41 1391 NMSUB c72, c72, b7, c42 1392 LD b7, 38 * SIZE(BO) 1393 NMSUB c81, c81, b8, c41 1394 NMSUB c82, c82, b8, c42 1395 LD b8, 39 * SIZE(BO) 1396 1397 MUL c51, b5, c51 1398 MUL c52, b5, c52 1399 1400 NMSUB c61, c61, b6, c51 1401 NMSUB c62, c62, b6, c52 1402 LD b6, 45 * SIZE(BO) 1403 NMSUB c71, c71, b7, c51 1404 NMSUB c72, c72, b7, c52 1405 LD b7, 46 * SIZE(BO) 1406 NMSUB c81, c81, b8, c51 1407 NMSUB c82, c82, b8, c52 1408 LD b8, 47 * SIZE(BO) 1409 1410 MUL c61, b6, c61 1411 MUL c62, b6, c62 1412 1413 NMSUB c71, c71, b7, c61 1414 NMSUB c72, c72, b7, c62 1415 LD 
b7, 54 * SIZE(BO) 1416 NMSUB c81, c81, b8, c61 1417 NMSUB c82, c82, b8, c62 1418 LD b8, 55 * SIZE(BO) 1419 1420 MUL c71, b7, c71 1421 MUL c72, b7, c72 1422 1423 NMSUB c81, c81, b8, c71 1424 NMSUB c82, c82, b8, c72 1425 LD b8, 63 * SIZE(BO) 1426 1427 MUL c81, b8, c81 1428 MUL c82, b8, c82 1429#endif 1430 1431#ifdef RT 1432 LD b1, 63 * SIZE(BO) 1433 LD b2, 62 * SIZE(BO) 1434 LD b3, 61 * SIZE(BO) 1435 LD b4, 60 * SIZE(BO) 1436 1437 MUL c81, b1, c81 1438 MUL c82, b1, c82 1439 LD b5, 59 * SIZE(BO) 1440 1441 NMSUB c71, c71, b2, c81 1442 NMSUB c72, c72, b2, c82 1443 LD b6, 58 * SIZE(BO) 1444 NMSUB c61, c61, b3, c81 1445 NMSUB c62, c62, b3, c82 1446 LD b7, 57 * SIZE(BO) 1447 NMSUB c51, c51, b4, c81 1448 NMSUB c52, c52, b4, c82 1449 LD b8, 56 * SIZE(BO) 1450 1451 NMSUB c41, c41, b5, c81 1452 NMSUB c42, c42, b5, c82 1453 LD b2, 54 * SIZE(BO) 1454 NMSUB c31, c31, b6, c81 1455 NMSUB c32, c32, b6, c82 1456 LD b3, 53 * SIZE(BO) 1457 NMSUB c21, c21, b7, c81 1458 NMSUB c22, c22, b7, c82 1459 LD b4, 52 * SIZE(BO) 1460 NMSUB c11, c11, b8, c81 1461 NMSUB c12, c12, b8, c82 1462 LD b5, 51 * SIZE(BO) 1463 1464 MUL c71, b2, c71 1465 MUL c72, b2, c72 1466 LD b6, 50 * SIZE(BO) 1467 1468 NMSUB c61, c61, b3, c71 1469 NMSUB c62, c62, b3, c72 1470 LD b7, 49 * SIZE(BO) 1471 NMSUB c51, c51, b4, c71 1472 NMSUB c52, c52, b4, c72 1473 LD b8, 48 * SIZE(BO) 1474 NMSUB c41, c41, b5, c71 1475 NMSUB c42, c42, b5, c72 1476 LD b3, 45 * SIZE(BO) 1477 NMSUB c31, c31, b6, c71 1478 NMSUB c32, c32, b6, c72 1479 LD b4, 44 * SIZE(BO) 1480 NMSUB c21, c21, b7, c71 1481 NMSUB c22, c22, b7, c72 1482 LD b5, 43 * SIZE(BO) 1483 NMSUB c11, c11, b8, c71 1484 NMSUB c12, c12, b8, c72 1485 LD b6, 42 * SIZE(BO) 1486 1487 MUL c61, b3, c61 1488 MUL c62, b3, c62 1489 LD b7, 41 * SIZE(BO) 1490 1491 NMSUB c51, c51, b4, c61 1492 NMSUB c52, c52, b4, c62 1493 LD b8, 40 * SIZE(BO) 1494 NMSUB c41, c41, b5, c61 1495 NMSUB c42, c42, b5, c62 1496 LD b4, 36 * SIZE(BO) 1497 NMSUB c31, c31, b6, c61 1498 NMSUB c32, c32, b6, c62 1499 LD b5, 
35 * SIZE(BO) 1500 NMSUB c21, c21, b7, c61 1501 NMSUB c22, c22, b7, c62 1502 LD b6, 34 * SIZE(BO) 1503 NMSUB c11, c11, b8, c61 1504 NMSUB c12, c12, b8, c62 1505 LD b7, 33 * SIZE(BO) 1506 1507 MUL c51, b4, c51 1508 MUL c52, b4, c52 1509 LD b8, 32 * SIZE(BO) 1510 1511 NMSUB c41, c41, b5, c51 1512 NMSUB c42, c42, b5, c52 1513 LD b5, 27 * SIZE(BO) 1514 NMSUB c31, c31, b6, c51 1515 NMSUB c32, c32, b6, c52 1516 LD b6, 26 * SIZE(BO) 1517 NMSUB c21, c21, b7, c51 1518 NMSUB c22, c22, b7, c52 1519 LD b7, 25 * SIZE(BO) 1520 NMSUB c11, c11, b8, c51 1521 NMSUB c12, c12, b8, c52 1522 LD b8, 24 * SIZE(BO) 1523 1524 MUL c41, b5, c41 1525 MUL c42, b5, c42 1526 1527 NMSUB c31, c31, b6, c41 1528 NMSUB c32, c32, b6, c42 1529 LD b6, 18 * SIZE(BO) 1530 NMSUB c21, c21, b7, c41 1531 NMSUB c22, c22, b7, c42 1532 LD b7, 17 * SIZE(BO) 1533 NMSUB c11, c11, b8, c41 1534 NMSUB c12, c12, b8, c42 1535 LD b8, 16 * SIZE(BO) 1536 1537 MUL c31, b6, c31 1538 MUL c32, b6, c32 1539 1540 NMSUB c21, c21, b7, c31 1541 NMSUB c22, c22, b7, c32 1542 LD b7, 9 * SIZE(BO) 1543 NMSUB c11, c11, b8, c31 1544 NMSUB c12, c12, b8, c32 1545 LD b8, 8 * SIZE(BO) 1546 1547 MUL c21, b7, c21 1548 MUL c22, b7, c22 1549 1550 NMSUB c11, c11, b8, c21 1551 NMSUB c12, c12, b8, c22 1552 LD b8, 0 * SIZE(BO) 1553 1554 MUL c11, b8, c11 1555 MUL c12, b8, c12 1556#endif 1557 1558#if defined(LN) || defined(LT) 1559 ST c11, 0 * SIZE(BO) 1560 ST c21, 1 * SIZE(BO) 1561 ST c31, 2 * SIZE(BO) 1562 ST c41, 3 * SIZE(BO) 1563 ST c51, 4 * SIZE(BO) 1564 ST c61, 5 * SIZE(BO) 1565 ST c71, 6 * SIZE(BO) 1566 ST c81, 7 * SIZE(BO) 1567 1568 ST c12, 8 * SIZE(BO) 1569 ST c22, 9 * SIZE(BO) 1570 ST c32, 10 * SIZE(BO) 1571 ST c42, 11 * SIZE(BO) 1572 ST c52, 12 * SIZE(BO) 1573 ST c62, 13 * SIZE(BO) 1574 ST c72, 14 * SIZE(BO) 1575 ST c82, 15 * SIZE(BO) 1576#else 1577 ST c11, 0 * SIZE(AO) 1578 ST c12, 1 * SIZE(AO) 1579 ST c21, 2 * SIZE(AO) 1580 ST c22, 3 * SIZE(AO) 1581 ST c31, 4 * SIZE(AO) 1582 ST c32, 5 * SIZE(AO) 1583 ST c41, 6 * SIZE(AO) 1584 ST c42, 7 * 
SIZE(AO) 1585 1586 ST c51, 8 * SIZE(AO) 1587 ST c52, 9 * SIZE(AO) 1588 ST c61, 10 * SIZE(AO) 1589 ST c62, 11 * SIZE(AO) 1590 ST c71, 12 * SIZE(AO) 1591 ST c72, 13 * SIZE(AO) 1592 ST c81, 14 * SIZE(AO) 1593 ST c82, 15 * SIZE(AO) 1594#endif 1595 1596 ST c11, 0 * SIZE(CO1) 1597 ST c12, 1 * SIZE(CO1) 1598 ST c21, 0 * SIZE(CO2) 1599 ST c22, 1 * SIZE(CO2) 1600 ST c31, 0 * SIZE(CO3) 1601 ST c32, 1 * SIZE(CO3) 1602 ST c41, 0 * SIZE(CO4) 1603 ST c42, 1 * SIZE(CO4) 1604 ST c51, 0 * SIZE(CO5) 1605 ST c52, 1 * SIZE(CO5) 1606 ST c61, 0 * SIZE(CO6) 1607 ST c62, 1 * SIZE(CO6) 1608 ST c71, 0 * SIZE(CO7) 1609 ST c72, 1 * SIZE(CO7) 1610 ST c81, 0 * SIZE(CO8) 1611 ST c82, 1 * SIZE(CO8) 1612 1613 MTC $0, a1 1614 1615#ifndef LN 1616 daddiu CO1, CO1, 2 * SIZE 1617 daddiu CO2, CO2, 2 * SIZE 1618 daddiu CO3, CO3, 2 * SIZE 1619 daddiu CO4, CO4, 2 * SIZE 1620 daddiu CO5, CO5, 2 * SIZE 1621 daddiu CO6, CO6, 2 * SIZE 1622 daddiu CO7, CO7, 2 * SIZE 1623 daddiu CO8, CO8, 2 * SIZE 1624#endif 1625 1626 MOV c11, a1 1627 MOV c21, a1 1628 1629#ifdef RT 1630 dsll TEMP, K, 1 + BASE_SHIFT 1631 daddu AORIG, AORIG, TEMP 1632#endif 1633 1634 MOV c31, a1 1635 MOV c41, a1 1636 1637#if defined(LT) || defined(RN) 1638 dsubu TEMP, K, KK 1639 dsll L, TEMP, 1 + BASE_SHIFT 1640 dsll TEMP, TEMP, 3 + BASE_SHIFT 1641 daddu AO, AO, L 1642 daddu BO, BO, TEMP 1643#endif 1644 1645#ifdef LT 1646 daddiu KK, KK, 2 1647#endif 1648 1649#ifdef LN 1650 daddiu KK, KK, -2 1651#endif 1652 1653 daddiu I, I, -1 1654 MOV c51, a1 1655 1656 bgtz I, .L11 1657 MOV c61, a1 1658 .align 3 1659 1660.L29: 1661#ifdef LN 1662 dsll TEMP, K, 3 + BASE_SHIFT 1663 daddu B, B, TEMP 1664#endif 1665 1666#if defined(LT) || defined(RN) 1667 move B, BO 1668#endif 1669 1670#ifdef RN 1671 daddiu KK, KK, 8 1672#endif 1673 1674#ifdef RT 1675 daddiu KK, KK, -8 1676#endif 1677 1678 bgtz J, .L10 1679 NOP 1680 .align 3 1681 1682.L30: 1683 andi J, N, 4 1684 blez J, .L50 1685 move AO, A 1686 1687#ifdef RT 1688 dsll TEMP, K, 2 + BASE_SHIFT 1689 dsubu B, B, TEMP 
1690 1691 dsll TEMP, LDC, 2 1692 dsubu C, C, TEMP 1693#endif 1694 1695 move CO1, C 1696 MTC $0, c11 1697 daddu CO2, C, LDC 1698 daddu CO3, CO2, LDC 1699 MOV c21, c11 1700 daddu CO4, CO3, LDC 1701 MOV c31, c11 1702 1703#ifdef LN 1704 daddu KK, M, OFFSET 1705#endif 1706 1707#ifdef LT 1708 move KK, OFFSET 1709#endif 1710 1711#if defined(LN) || defined(RT) 1712 move AORIG, A 1713#else 1714 move AO, A 1715#endif 1716#ifndef RT 1717 daddu C, CO4, LDC 1718#endif 1719 1720 andi I, M, 1 1721 blez I, .L40 1722 MOV c41, c11 1723 1724#if defined(LT) || defined(RN) 1725 LD a1, 0 * SIZE(AO) 1726 MOV c71, c11 1727 LD a2, 1 * SIZE(AO) 1728 MOV c81, c11 1729 1730 LD b1, 0 * SIZE(B) 1731 LD b2, 1 * SIZE(B) 1732 LD b3, 2 * SIZE(B) 1733 LD b4, 3 * SIZE(B) 1734 LD b5, 4 * SIZE(B) 1735 LD b6, 8 * SIZE(B) 1736 LD b7, 12 * SIZE(B) 1737 1738 dsra L, KK, 2 1739 1740 blez L, .L45 1741 move BO, B 1742#else 1743#ifdef LN 1744 dsll TEMP, K, BASE_SHIFT 1745 dsubu AORIG, AORIG, TEMP 1746#endif 1747 1748 dsll L, KK, 0 + BASE_SHIFT 1749 dsll TEMP, KK, 2 + BASE_SHIFT 1750 1751 daddu AO, AORIG, L 1752 daddu BO, B, TEMP 1753 1754 dsubu TEMP, K, KK 1755 1756 LD a1, 0 * SIZE(AO) 1757 MOV c71, c11 1758 LD a2, 1 * SIZE(AO) 1759 MOV c81, c11 1760 1761 LD b1, 0 * SIZE(BO) 1762 LD b2, 1 * SIZE(BO) 1763 LD b3, 2 * SIZE(BO) 1764 LD b4, 3 * SIZE(BO) 1765 LD b5, 4 * SIZE(BO) 1766 LD b6, 8 * SIZE(BO) 1767 LD b7, 12 * SIZE(BO) 1768 1769 dsra L, TEMP, 2 1770 1771 blez L, .L45 1772 NOP 1773#endif 1774 .align 3 1775 1776.L42: 1777 MADD c11, c11, a1, b1 1778 LD b1, 16 * SIZE(BO) 1779 MADD c21, c21, a1, b2 1780 LD b2, 5 * SIZE(BO) 1781 MADD c31, c31, a1, b3 1782 LD b3, 6 * SIZE(BO) 1783 MADD c41, c41, a1, b4 1784 LD b4, 7 * SIZE(BO) 1785 1786 LD a1, 4 * SIZE(AO) 1787 daddiu L, L, -1 1788 1789 MADD c11, c11, a2, b5 1790 LD b5, 20 * SIZE(BO) 1791 MADD c21, c21, a2, b2 1792 LD b2, 9 * SIZE(BO) 1793 MADD c31, c31, a2, b3 1794 LD b3, 10 * SIZE(BO) 1795 MADD c41, c41, a2, b4 1796 LD b4, 11 * SIZE(BO) 1797 1798 LD a2, 2 * 
SIZE(AO) 1799 daddiu AO, AO, 4 * SIZE 1800 1801 MADD c11, c11, a2, b6 1802 LD b6, 24 * SIZE(BO) 1803 MADD c21, c21, a2, b2 1804 LD b2, 13 * SIZE(BO) 1805 MADD c31, c31, a2, b3 1806 LD b3, 14 * SIZE(BO) 1807 MADD c41, c41, a2, b4 1808 LD b4, 15 * SIZE(BO) 1809 1810 LD a2, -1 * SIZE(AO) 1811 daddiu BO, BO, 16 * SIZE 1812 1813 MADD c11, c11, a2, b7 1814 LD b7, 12 * SIZE(BO) 1815 MADD c21, c21, a2, b2 1816 LD b2, 1 * SIZE(BO) 1817 MADD c31, c31, a2, b3 1818 LD b3, 2 * SIZE(BO) 1819 MADD c41, c41, a2, b4 1820 LD b4, 3 * SIZE(BO) 1821 1822 bgtz L, .L42 1823 LD a2, 1 * SIZE(AO) 1824 .align 3 1825 1826.L45: 1827#if defined(LT) || defined(RN) 1828 andi L, KK, 3 1829#else 1830 andi L, TEMP, 3 1831#endif 1832 NOP 1833 blez L, .L48 1834 NOP 1835 .align 3 1836 1837.L46: 1838 MADD c11, c11, a1, b1 1839 LD b1, 4 * SIZE(BO) 1840 MADD c21, c21, a1, b2 1841 LD b2, 5 * SIZE(BO) 1842 MADD c31, c31, a1, b3 1843 LD b3, 6 * SIZE(BO) 1844 MADD c41, c41, a1, b4 1845 LD a1, 1 * SIZE(AO) 1846 1847 LD b4, 7 * SIZE(BO) 1848 daddiu L, L, -1 1849 1850 daddiu AO, AO, 1 * SIZE 1851 MOV a2, a2 1852 bgtz L, .L46 1853 daddiu BO, BO, 4 * SIZE 1854 1855 1856.L48: 1857#if defined(LN) || defined(RT) 1858#ifdef LN 1859 daddiu TEMP, KK, -1 1860#else 1861 daddiu TEMP, KK, -4 1862#endif 1863 1864 dsll L, TEMP, 0 + BASE_SHIFT 1865 dsll TEMP, TEMP, 2 + BASE_SHIFT 1866 daddu AO, AORIG, L 1867 daddu BO, B, TEMP 1868#endif 1869 1870 1871#if defined(LN) || defined(LT) 1872 LD b1, 0 * SIZE(BO) 1873 LD b2, 1 * SIZE(BO) 1874 LD b3, 2 * SIZE(BO) 1875 LD b4, 3 * SIZE(BO) 1876 1877 SUB c11, b1, c11 1878 SUB c21, b2, c21 1879 SUB c31, b3, c31 1880 SUB c41, b4, c41 1881#else 1882 LD b1, 0 * SIZE(AO) 1883 LD b2, 1 * SIZE(AO) 1884 LD b3, 2 * SIZE(AO) 1885 LD b4, 3 * SIZE(AO) 1886 1887 SUB c11, b1, c11 1888 SUB c21, b2, c21 1889 SUB c31, b3, c31 1890 SUB c41, b4, c41 1891#endif 1892 1893#if defined(LN) || defined(LT) 1894 LD b1, 0 * SIZE(AO) 1895 1896 MUL c11, b1, c11 1897 MUL c21, b1, c21 1898 MUL c31, b1, c31 1899 MUL c41, 
b1, c41 1900#endif 1901 1902#ifdef RN 1903 LD b1, 0 * SIZE(BO) 1904 LD b2, 1 * SIZE(BO) 1905 LD b3, 2 * SIZE(BO) 1906 LD b4, 3 * SIZE(BO) 1907 1908 MUL c11, b1, c11 1909 1910 NMSUB c21, c21, b2, c11 1911 NMSUB c31, c31, b3, c11 1912 NMSUB c41, c41, b4, c11 1913 1914 LD b2, 5 * SIZE(BO) 1915 LD b3, 6 * SIZE(BO) 1916 LD b4, 7 * SIZE(BO) 1917 1918 MUL c21, b2, c21 1919 1920 NMSUB c31, c31, b3, c21 1921 NMSUB c41, c41, b4, c21 1922 1923 LD b3, 10 * SIZE(BO) 1924 LD b4, 11 * SIZE(BO) 1925 1926 MUL c31, b3, c31 1927 1928 NMSUB c41, c41, b4, c31 1929 1930 LD b4, 15 * SIZE(BO) 1931 1932 MUL c41, b4, c41 1933#endif 1934 1935#ifdef RT 1936 LD b5, 15 * SIZE(BO) 1937 LD b6, 14 * SIZE(BO) 1938 LD b7, 13 * SIZE(BO) 1939 LD b8, 12 * SIZE(BO) 1940 1941 MUL c41, b5, c41 1942 1943 NMSUB c31, c31, b6, c41 1944 NMSUB c21, c21, b7, c41 1945 NMSUB c11, c11, b8, c41 1946 1947 LD b6, 10 * SIZE(BO) 1948 LD b7, 9 * SIZE(BO) 1949 LD b8, 8 * SIZE(BO) 1950 1951 MUL c31, b6, c31 1952 1953 NMSUB c21, c21, b7, c31 1954 NMSUB c11, c11, b8, c31 1955 1956 LD b7, 5 * SIZE(BO) 1957 LD b8, 4 * SIZE(BO) 1958 1959 MUL c21, b7, c21 1960 1961 NMSUB c11, c11, b8, c21 1962 1963 LD b8, 0 * SIZE(BO) 1964 1965 MUL c11, b8, c11 1966#endif 1967 1968#ifdef LN 1969 daddiu CO1, CO1, -1 * SIZE 1970 daddiu CO2, CO2, -1 * SIZE 1971 daddiu CO3, CO3, -1 * SIZE 1972 daddiu CO4, CO4, -1 * SIZE 1973#endif 1974 1975#if defined(LN) || defined(LT) 1976 ST c11, 0 * SIZE(BO) 1977 ST c21, 1 * SIZE(BO) 1978 ST c31, 2 * SIZE(BO) 1979 ST c41, 3 * SIZE(BO) 1980#else 1981 ST c11, 0 * SIZE(AO) 1982 ST c21, 1 * SIZE(AO) 1983 ST c31, 2 * SIZE(AO) 1984 ST c41, 3 * SIZE(AO) 1985#endif 1986 1987 ST c11, 0 * SIZE(CO1) 1988 ST c21, 0 * SIZE(CO2) 1989 ST c31, 0 * SIZE(CO3) 1990 ST c41, 0 * SIZE(CO4) 1991 1992 MTC $0, c11 1993 1994#ifndef LN 1995 daddiu CO1, CO1, 1 * SIZE 1996 daddiu CO2, CO2, 1 * SIZE 1997 daddiu CO3, CO3, 1 * SIZE 1998 daddiu CO4, CO4, 1 * SIZE 1999#endif 2000 2001 MOV c21, c11 2002 2003#ifdef RT 2004 dsll TEMP, K, BASE_SHIFT 
2005 daddu AORIG, AORIG, TEMP 2006#endif 2007 2008#if defined(LT) || defined(RN) 2009 dsubu TEMP, K, KK 2010 dsll L, TEMP, 0 + BASE_SHIFT 2011 dsll TEMP, TEMP, 2 + BASE_SHIFT 2012 daddu AO, AO, L 2013 daddu BO, BO, TEMP 2014#endif 2015 2016 MOV c31, c11 2017 2018#ifdef LT 2019 daddiu KK, KK, 1 2020#endif 2021 2022#ifdef LN 2023 daddiu KK, KK, -1 2024#endif 2025 .align 3 2026 2027.L40: 2028 dsra I, M, 1 2029 MOV c61, c11 2030 blez I, .L49 2031 MOV c41, c11 2032 2033.L31: 2034#if defined(LT) || defined(RN) 2035 LD a1, 0 * SIZE(AO) 2036 LD a3, 4 * SIZE(AO) 2037 2038 LD b1, 0 * SIZE(B) 2039 MOV c12, c11 2040 LD b2, 1 * SIZE(B) 2041 MOV c22, c11 2042 LD b3, 2 * SIZE(B) 2043 MOV c32, c11 2044 LD b4, 3 * SIZE(B) 2045 MOV c42, c11 2046 2047 LD b5, 4 * SIZE(B) 2048 dsra L, KK, 2 2049 LD b6, 8 * SIZE(B) 2050 LD b7, 12 * SIZE(B) 2051 2052 blez L, .L35 2053 move BO, B 2054#else 2055#ifdef LN 2056 dsll TEMP, K, 1 + BASE_SHIFT 2057 dsubu AORIG, AORIG, TEMP 2058#endif 2059 2060 dsll L, KK, 1 + BASE_SHIFT 2061 dsll TEMP, KK, 2 + BASE_SHIFT 2062 2063 daddu AO, AORIG, L 2064 daddu BO, B, TEMP 2065 2066 dsubu TEMP, K, KK 2067 2068 LD a1, 0 * SIZE(AO) 2069 LD a3, 4 * SIZE(AO) 2070 2071 LD b1, 0 * SIZE(BO) 2072 MOV c12, c11 2073 LD b2, 1 * SIZE(BO) 2074 MOV c22, c11 2075 LD b3, 2 * SIZE(BO) 2076 MOV c32, c11 2077 LD b4, 3 * SIZE(BO) 2078 MOV c42, c11 2079 2080 LD b5, 4 * SIZE(BO) 2081 dsra L, TEMP, 2 2082 LD b6, 8 * SIZE(BO) 2083 LD b7, 12 * SIZE(BO) 2084 2085 blez L, .L35 2086 NOP 2087#endif 2088 .align 3 2089 2090.L32: 2091 MADD c11, c11, a1, b1 2092 LD a2, 1 * SIZE(AO) 2093 MADD c21, c21, a1, b2 2094 daddiu L, L, -1 2095 MADD c31, c31, a1, b3 2096 NOP 2097 MADD c41, c41, a1, b4 2098 LD a1, 2 * SIZE(AO) 2099 2100 MADD c12, c12, a2, b1 2101 LD b1, 16 * SIZE(BO) 2102 MADD c22, c22, a2, b2 2103 LD b2, 5 * SIZE(BO) 2104 MADD c32, c32, a2, b3 2105 LD b3, 6 * SIZE(BO) 2106 MADD c42, c42, a2, b4 2107 LD b4, 7 * SIZE(BO) 2108 2109 MADD c11, c11, a1, b5 2110 LD a2, 3 * SIZE(AO) 2111 MADD c21, 
c21, a1, b2 2112 NOP 2113 MADD c31, c31, a1, b3 2114 NOP 2115 MADD c41, c41, a1, b4 2116 LD a1, 8 * SIZE(AO) 2117 2118 MADD c12, c12, a2, b5 2119 LD b5, 20 * SIZE(BO) 2120 MADD c22, c22, a2, b2 2121 LD b2, 9 * SIZE(BO) 2122 MADD c32, c32, a2, b3 2123 LD b3, 10 * SIZE(BO) 2124 MADD c42, c42, a2, b4 2125 LD b4, 11 * SIZE(BO) 2126 2127 MADD c11, c11, a3, b6 2128 LD a2, 5 * SIZE(AO) 2129 MADD c21, c21, a3, b2 2130 NOP 2131 MADD c31, c31, a3, b3 2132 NOP 2133 MADD c41, c41, a3, b4 2134 LD a3, 6 * SIZE(AO) 2135 2136 MADD c12, c12, a2, b6 2137 LD b6, 24 * SIZE(BO) 2138 MADD c22, c22, a2, b2 2139 LD b2, 13 * SIZE(BO) 2140 MADD c32, c32, a2, b3 2141 LD b3, 14 * SIZE(BO) 2142 MADD c42, c42, a2, b4 2143 LD b4, 15 * SIZE(BO) 2144 2145 MADD c11, c11, a3, b7 2146 LD a2, 7 * SIZE(AO) 2147 MADD c21, c21, a3, b2 2148 daddiu AO, AO, 8 * SIZE 2149 MADD c31, c31, a3, b3 2150 daddiu BO, BO, 16 * SIZE 2151 MADD c41, c41, a3, b4 2152 LD a3, 4 * SIZE(AO) 2153 2154 MADD c12, c12, a2, b7 2155 LD b7, 12 * SIZE(BO) 2156 MADD c22, c22, a2, b2 2157 LD b2, 1 * SIZE(BO) 2158 MADD c32, c32, a2, b3 2159 LD b3, 2 * SIZE(BO) 2160 MADD c42, c42, a2, b4 2161 NOP 2162 2163 bgtz L, .L32 2164 LD b4, 3 * SIZE(BO) 2165 .align 3 2166 2167.L35: 2168#if defined(LT) || defined(RN) 2169 andi L, KK, 3 2170#else 2171 andi L, TEMP, 3 2172#endif 2173 NOP 2174 blez L, .L38 2175 NOP 2176 .align 3 2177 2178.L36: 2179 MADD c11, c11, a1, b1 2180 LD a2, 1 * SIZE(AO) 2181 MADD c21, c21, a1, b2 2182 daddiu L, L, -1 2183 MADD c31, c31, a1, b3 2184 daddiu AO, AO, 2 * SIZE 2185 MADD c41, c41, a1, b4 2186 LD a1, 0 * SIZE(AO) 2187 2188 MADD c12, c12, a2, b1 2189 LD b1, 4 * SIZE(BO) 2190 MADD c22, c22, a2, b2 2191 LD b2, 5 * SIZE(BO) 2192 MADD c32, c32, a2, b3 2193 LD b3, 6 * SIZE(BO) 2194 MADD c42, c42, a2, b4 2195 LD b4, 7 * SIZE(BO) 2196 2197 bgtz L, .L36 2198 daddiu BO, BO, 4 * SIZE 2199 2200.L38: 2201#if defined(LN) || defined(RT) 2202#ifdef LN 2203 daddiu TEMP, KK, -2 2204#else 2205 daddiu TEMP, KK, -4 2206#endif 2207 2208 
dsll L, TEMP, 1 + BASE_SHIFT 2209 dsll TEMP, TEMP, 2 + BASE_SHIFT 2210 daddu AO, AORIG, L 2211 daddu BO, B, TEMP 2212#endif 2213 2214 2215#if defined(LN) || defined(LT) 2216 LD b1, 0 * SIZE(BO) 2217 LD b2, 1 * SIZE(BO) 2218 LD b3, 2 * SIZE(BO) 2219 LD b4, 3 * SIZE(BO) 2220 LD b5, 4 * SIZE(BO) 2221 LD b6, 5 * SIZE(BO) 2222 LD b7, 6 * SIZE(BO) 2223 LD b8, 7 * SIZE(BO) 2224 2225 SUB c11, b1, c11 2226 SUB c21, b2, c21 2227 SUB c31, b3, c31 2228 SUB c41, b4, c41 2229 SUB c12, b5, c12 2230 SUB c22, b6, c22 2231 SUB c32, b7, c32 2232 SUB c42, b8, c42 2233#else 2234 LD b1, 0 * SIZE(AO) 2235 LD b2, 1 * SIZE(AO) 2236 LD b3, 2 * SIZE(AO) 2237 LD b4, 3 * SIZE(AO) 2238 LD b5, 4 * SIZE(AO) 2239 LD b6, 5 * SIZE(AO) 2240 LD b7, 6 * SIZE(AO) 2241 LD b8, 7 * SIZE(AO) 2242 2243 SUB c11, b1, c11 2244 SUB c12, b2, c12 2245 SUB c21, b3, c21 2246 SUB c22, b4, c22 2247 SUB c31, b5, c31 2248 SUB c32, b6, c32 2249 SUB c41, b7, c41 2250 SUB c42, b8, c42 2251#endif 2252 2253#ifdef LN 2254 LD b1, 3 * SIZE(AO) 2255 LD b2, 2 * SIZE(AO) 2256 LD b3, 0 * SIZE(AO) 2257 2258 MUL c12, b1, c12 2259 MUL c22, b1, c22 2260 MUL c32, b1, c32 2261 MUL c42, b1, c42 2262 2263 NMSUB c11, c11, b2, c12 2264 NMSUB c21, c21, b2, c22 2265 NMSUB c31, c31, b2, c32 2266 NMSUB c41, c41, b2, c42 2267 2268 MUL c11, b3, c11 2269 MUL c21, b3, c21 2270 MUL c31, b3, c31 2271 MUL c41, b3, c41 2272#endif 2273 2274#ifdef LT 2275 LD b1, 0 * SIZE(AO) 2276 LD b2, 1 * SIZE(AO) 2277 LD b3, 3 * SIZE(AO) 2278 2279 MUL c11, b1, c11 2280 MUL c21, b1, c21 2281 MUL c31, b1, c31 2282 MUL c41, b1, c41 2283 2284 NMSUB c12, c12, b2, c11 2285 NMSUB c22, c22, b2, c21 2286 NMSUB c32, c32, b2, c31 2287 NMSUB c42, c42, b2, c41 2288 2289 MUL c12, b3, c12 2290 MUL c22, b3, c22 2291 MUL c32, b3, c32 2292 MUL c42, b3, c42 2293#endif 2294 2295#ifdef RN 2296 LD b1, 0 * SIZE(BO) 2297 LD b2, 1 * SIZE(BO) 2298 LD b3, 2 * SIZE(BO) 2299 LD b4, 3 * SIZE(BO) 2300 2301 MUL c11, b1, c11 2302 MUL c12, b1, c12 2303 2304 NMSUB c21, c21, b2, c11 2305 NMSUB c22, c22, 
b2, c12 2306 NMSUB c31, c31, b3, c11 2307 NMSUB c32, c32, b3, c12 2308 NMSUB c41, c41, b4, c11 2309 NMSUB c42, c42, b4, c12 2310 2311 LD b2, 5 * SIZE(BO) 2312 LD b3, 6 * SIZE(BO) 2313 LD b4, 7 * SIZE(BO) 2314 2315 MUL c21, b2, c21 2316 MUL c22, b2, c22 2317 2318 NMSUB c31, c31, b3, c21 2319 NMSUB c32, c32, b3, c22 2320 NMSUB c41, c41, b4, c21 2321 NMSUB c42, c42, b4, c22 2322 2323 LD b3, 10 * SIZE(BO) 2324 LD b4, 11 * SIZE(BO) 2325 2326 MUL c31, b3, c31 2327 MUL c32, b3, c32 2328 2329 NMSUB c41, c41, b4, c31 2330 NMSUB c42, c42, b4, c32 2331 2332 LD b4, 15 * SIZE(BO) 2333 2334 MUL c41, b4, c41 2335 MUL c42, b4, c42 2336#endif 2337 2338#ifdef RT 2339 LD b5, 15 * SIZE(BO) 2340 LD b6, 14 * SIZE(BO) 2341 LD b7, 13 * SIZE(BO) 2342 LD b8, 12 * SIZE(BO) 2343 2344 MUL c41, b5, c41 2345 MUL c42, b5, c42 2346 2347 NMSUB c31, c31, b6, c41 2348 NMSUB c32, c32, b6, c42 2349 NMSUB c21, c21, b7, c41 2350 NMSUB c22, c22, b7, c42 2351 NMSUB c11, c11, b8, c41 2352 NMSUB c12, c12, b8, c42 2353 2354 LD b6, 10 * SIZE(BO) 2355 LD b7, 9 * SIZE(BO) 2356 LD b8, 8 * SIZE(BO) 2357 2358 MUL c31, b6, c31 2359 MUL c32, b6, c32 2360 2361 NMSUB c21, c21, b7, c31 2362 NMSUB c22, c22, b7, c32 2363 NMSUB c11, c11, b8, c31 2364 NMSUB c12, c12, b8, c32 2365 2366 LD b7, 5 * SIZE(BO) 2367 LD b8, 4 * SIZE(BO) 2368 2369 MUL c21, b7, c21 2370 MUL c22, b7, c22 2371 2372 NMSUB c11, c11, b8, c21 2373 NMSUB c12, c12, b8, c22 2374 2375 LD b8, 0 * SIZE(BO) 2376 2377 MUL c11, b8, c11 2378 MUL c12, b8, c12 2379#endif 2380 2381#ifdef LN 2382 daddiu CO1, CO1, -2 * SIZE 2383 daddiu CO2, CO2, -2 * SIZE 2384 daddiu CO3, CO3, -2 * SIZE 2385 daddiu CO4, CO4, -2 * SIZE 2386#endif 2387 2388#if defined(LN) || defined(LT) 2389 ST c11, 0 * SIZE(BO) 2390 ST c21, 1 * SIZE(BO) 2391 ST c31, 2 * SIZE(BO) 2392 ST c41, 3 * SIZE(BO) 2393 ST c12, 4 * SIZE(BO) 2394 ST c22, 5 * SIZE(BO) 2395 ST c32, 6 * SIZE(BO) 2396 ST c42, 7 * SIZE(BO) 2397#else 2398 ST c11, 0 * SIZE(AO) 2399 ST c12, 1 * SIZE(AO) 2400 ST c21, 2 * SIZE(AO) 2401 ST c22, 
3 * SIZE(AO) 2402 ST c31, 4 * SIZE(AO) 2403 ST c32, 5 * SIZE(AO) 2404 ST c41, 6 * SIZE(AO) 2405 ST c42, 7 * SIZE(AO) 2406#endif 2407 2408 ST c11, 0 * SIZE(CO1) 2409 ST c12, 1 * SIZE(CO1) 2410 ST c21, 0 * SIZE(CO2) 2411 ST c22, 1 * SIZE(CO2) 2412 ST c31, 0 * SIZE(CO3) 2413 ST c32, 1 * SIZE(CO3) 2414 ST c41, 0 * SIZE(CO4) 2415 ST c42, 1 * SIZE(CO4) 2416 2417#ifndef LN 2418 daddiu CO1, CO1, 2 * SIZE 2419 daddiu CO2, CO2, 2 * SIZE 2420 daddiu CO3, CO3, 2 * SIZE 2421 daddiu CO4, CO4, 2 * SIZE 2422#endif 2423 2424#ifdef RT 2425 dsll TEMP, K, 1 + BASE_SHIFT 2426 daddu AORIG, AORIG, TEMP 2427#endif 2428 2429#if defined(LT) || defined(RN) 2430 dsubu TEMP, K, KK 2431 dsll L, TEMP, 1 + BASE_SHIFT 2432 dsll TEMP, TEMP, 2 + BASE_SHIFT 2433 daddu AO, AO, L 2434 daddu BO, BO, TEMP 2435#endif 2436 2437#ifdef LT 2438 daddiu KK, KK, 2 2439#endif 2440 2441#ifdef LN 2442 daddiu KK, KK, -2 2443#endif 2444 2445 MTC $0, a1 2446 2447 MOV c11, a1 2448 MOV c21, a1 2449 MOV c31, a1 2450 2451 daddiu I, I, -1 2452 2453 bgtz I, .L31 2454 MOV c41, c11 2455 .align 3 2456 2457.L49: 2458#ifdef LN 2459 dsll TEMP, K, 2 + BASE_SHIFT 2460 daddu B, B, TEMP 2461#endif 2462 2463#if defined(LT) || defined(RN) 2464 move B, BO 2465#endif 2466 2467#ifdef RN 2468 daddiu KK, KK, 4 2469#endif 2470 2471#ifdef RT 2472 daddiu KK, KK, -4 2473#endif 2474 .align 3 2475 2476.L50: 2477 andi J, N, 2 2478 blez J, .L70 2479 2480#ifdef RT 2481 dsll TEMP, K, 1 + BASE_SHIFT 2482 dsubu B, B, TEMP 2483 2484 dsll TEMP, LDC, 1 2485 dsubu C, C, TEMP 2486#endif 2487 2488 move AO, A 2489 move CO1, C 2490 daddu CO2, C, LDC 2491 2492#ifdef LN 2493 daddu KK, M, OFFSET 2494#endif 2495 2496#ifdef LT 2497 move KK, OFFSET 2498#endif 2499 2500#if defined(LN) || defined(RT) 2501 move AORIG, A 2502#else 2503 move AO, A 2504#endif 2505#ifndef RT 2506 daddu C, CO2, LDC 2507#endif 2508 2509 andi I, M, 1 2510 blez I, .L60 2511 NOP 2512 2513#if defined(LT) || defined(RN) 2514 dsra L, KK, 2 2515 LD a1, 0 * SIZE(AO) 2516 MTC $0, c11 2517 LD a2, 1 * 
SIZE(AO) 2518 MOV c21, c11 2519 LD a3, 2 * SIZE(AO) 2520 MOV c31, c11 2521 LD a4, 3 * SIZE(AO) 2522 MOV c41, c11 2523 2524 LD b1, 0 * SIZE(B) 2525 LD b2, 1 * SIZE(B) 2526 LD b3, 2 * SIZE(B) 2527 LD b4, 3 * SIZE(B) 2528 LD b5, 4 * SIZE(B) 2529 LD b6, 8 * SIZE(B) 2530 LD b7, 12 * SIZE(B) 2531 2532 blez L, .L65 2533 move BO, B 2534#else 2535#ifdef LN 2536 dsll TEMP, K, BASE_SHIFT 2537 dsubu AORIG, AORIG, TEMP 2538#endif 2539 2540 dsll L, KK, 0 + BASE_SHIFT 2541 dsll TEMP, KK, 1 + BASE_SHIFT 2542 2543 daddu AO, AORIG, L 2544 daddu BO, B, TEMP 2545 2546 dsubu TEMP, K, KK 2547 2548 dsra L, TEMP, 2 2549 LD a1, 0 * SIZE(AO) 2550 MTC $0, c11 2551 LD a2, 1 * SIZE(AO) 2552 MOV c21, c11 2553 LD a3, 2 * SIZE(AO) 2554 MOV c31, c11 2555 LD a4, 3 * SIZE(AO) 2556 MOV c41, c11 2557 2558 LD b1, 0 * SIZE(BO) 2559 LD b2, 1 * SIZE(BO) 2560 LD b3, 2 * SIZE(BO) 2561 LD b4, 3 * SIZE(BO) 2562 LD b5, 4 * SIZE(BO) 2563 LD b6, 8 * SIZE(BO) 2564 LD b7, 12 * SIZE(BO) 2565 2566 blez L, .L65 2567 NOP 2568#endif 2569 .align 3 2570 2571.L62: 2572 MADD c11, c11, a1, b1 2573 LD b1, 4 * SIZE(BO) 2574 MADD c21, c21, a1, b2 2575 LD b2, 5 * SIZE(BO) 2576 MADD c31, c31, a2, b3 2577 LD b3, 6 * SIZE(BO) 2578 MADD c41, c41, a2, b4 2579 LD b4, 7 * SIZE(BO) 2580 2581 LD a1, 4 * SIZE(AO) 2582 LD a2, 5 * SIZE(AO) 2583 2584 MADD c11, c11, a3, b1 2585 LD b1, 8 * SIZE(BO) 2586 MADD c21, c21, a3, b2 2587 LD b2, 9 * SIZE(BO) 2588 MADD c31, c31, a4, b3 2589 LD b3, 10 * SIZE(BO) 2590 MADD c41, c41, a4, b4 2591 LD b4, 11 * SIZE(BO) 2592 2593 LD a3, 6 * SIZE(AO) 2594 LD a4, 7 * SIZE(AO) 2595 2596 daddiu L, L, -1 2597 daddiu AO, AO, 4 * SIZE 2598 2599 bgtz L, .L62 2600 daddiu BO, BO, 8 * SIZE 2601 .align 3 2602 2603.L65: 2604#if defined(LT) || defined(RN) 2605 andi L, KK, 3 2606#else 2607 andi L, TEMP, 3 2608#endif 2609 NOP 2610 blez L, .L68 2611 NOP 2612 .align 3 2613 2614.L66: 2615 MADD c11, c11, a1, b1 2616 LD b1, 2 * SIZE(BO) 2617 MADD c21, c21, a1, b2 2618 LD b2, 3 * SIZE(BO) 2619 2620 LD a1, 1 * SIZE(AO) 2621 daddiu 
L, L, -1 2622 2623 daddiu AO, AO, 1 * SIZE 2624 bgtz L, .L66 2625 daddiu BO, BO, 2 * SIZE 2626 2627 2628.L68: 2629 ADD c11, c11, c31 2630 ADD c21, c21, c41 2631 2632#if defined(LN) || defined(RT) 2633#ifdef LN 2634 daddiu TEMP, KK, -1 2635#else 2636 daddiu TEMP, KK, -2 2637#endif 2638 2639 dsll L, TEMP, 0 + BASE_SHIFT 2640 dsll TEMP, TEMP, 1 + BASE_SHIFT 2641 daddu AO, AORIG, L 2642 daddu BO, B, TEMP 2643#endif 2644 2645 2646#if defined(LN) || defined(LT) 2647 LD b1, 0 * SIZE(BO) 2648 LD b2, 1 * SIZE(BO) 2649 2650 SUB c11, b1, c11 2651 SUB c21, b2, c21 2652#else 2653 LD b1, 0 * SIZE(AO) 2654 LD b2, 1 * SIZE(AO) 2655 2656 SUB c11, b1, c11 2657 SUB c21, b2, c21 2658#endif 2659 2660#if defined(LN) || defined(LT) 2661 LD b3, 0 * SIZE(AO) 2662 2663 MUL c11, b3, c11 2664 MUL c21, b3, c21 2665#endif 2666 2667#ifdef RN 2668 LD b1, 0 * SIZE(BO) 2669 LD b2, 1 * SIZE(BO) 2670 LD b3, 3 * SIZE(BO) 2671 2672 MUL c11, b1, c11 2673 2674 NMSUB c21, c21, b2, c11 2675 2676 MUL c21, b3, c21 2677#endif 2678 2679#ifdef RT 2680 LD b1, 3 * SIZE(BO) 2681 LD b2, 2 * SIZE(BO) 2682 LD b3, 0 * SIZE(BO) 2683 2684 MUL c21, b1, c21 2685 2686 NMSUB c11, c11, b2, c21 2687 2688 MUL c11, b3, c11 2689#endif 2690 2691#ifdef LN 2692 daddiu CO1, CO1, -1 * SIZE 2693 daddiu CO2, CO2, -1 * SIZE 2694#endif 2695 2696#if defined(LN) || defined(LT) 2697 ST c11, 0 * SIZE(BO) 2698 ST c21, 1 * SIZE(BO) 2699#else 2700 ST c11, 0 * SIZE(AO) 2701 ST c21, 1 * SIZE(AO) 2702#endif 2703 2704 ST c11, 0 * SIZE(CO1) 2705 ST c21, 0 * SIZE(CO2) 2706 2707#ifndef LN 2708 daddiu CO1, CO1, 1 * SIZE 2709 daddiu CO2, CO2, 1 * SIZE 2710#endif 2711 2712#ifdef RT 2713 dsll TEMP, K, 0 + BASE_SHIFT 2714 daddu AORIG, AORIG, TEMP 2715#endif 2716 2717#if defined(LT) || defined(RN) 2718 dsubu TEMP, K, KK 2719 dsll L, TEMP, 0 + BASE_SHIFT 2720 dsll TEMP, TEMP, 1 + BASE_SHIFT 2721 daddu AO, AO, L 2722 daddu BO, BO, TEMP 2723#endif 2724 2725#ifdef LT 2726 daddiu KK, KK, 1 2727#endif 2728 2729#ifdef LN 2730 daddiu KK, KK, -1 2731#endif 2732 
.align 3 2733 2734.L60: 2735 dsra I, M, 1 2736 blez I, .L69 2737 NOP 2738 2739.L51: 2740#if defined(LT) || defined(RN) 2741 LD a1, 0 * SIZE(AO) 2742 MTC $0, c11 2743 LD a2, 1 * SIZE(AO) 2744 MOV c21, c11 2745 LD a5, 4 * SIZE(AO) 2746 2747 LD b1, 0 * SIZE(B) 2748 MOV c12, c11 2749 LD b2, 1 * SIZE(B) 2750 MOV c22, c11 2751 LD b3, 2 * SIZE(B) 2752 LD b5, 4 * SIZE(B) 2753 dsra L, KK, 2 2754 LD b6, 8 * SIZE(B) 2755 LD b7, 12 * SIZE(B) 2756 2757 blez L, .L55 2758 move BO, B 2759 2760#else 2761#ifdef LN 2762 dsll TEMP, K, 1 + BASE_SHIFT 2763 dsubu AORIG, AORIG, TEMP 2764#endif 2765 2766 dsll L, KK, 1 + BASE_SHIFT 2767 dsll TEMP, KK, 1 + BASE_SHIFT 2768 2769 daddu AO, AORIG, L 2770 daddu BO, B, TEMP 2771 2772 dsubu TEMP, K, KK 2773 2774 LD a1, 0 * SIZE(AO) 2775 MTC $0, c11 2776 LD a2, 1 * SIZE(AO) 2777 MOV c21, c11 2778 LD a5, 4 * SIZE(AO) 2779 2780 LD b1, 0 * SIZE(BO) 2781 MOV c12, c11 2782 LD b2, 1 * SIZE(BO) 2783 MOV c22, c11 2784 LD b3, 2 * SIZE(BO) 2785 LD b5, 4 * SIZE(BO) 2786 dsra L, TEMP, 2 2787 LD b6, 8 * SIZE(BO) 2788 LD b7, 12 * SIZE(BO) 2789 2790 blez L, .L55 2791 NOP 2792#endif 2793 .align 3 2794 2795.L52: 2796 MADD c11, c11, a1, b1 2797 LD a3, 2 * SIZE(AO) 2798 MADD c21, c21, a1, b2 2799 LD b4, 3 * SIZE(BO) 2800 MADD c12, c12, a2, b1 2801 LD a4, 3 * SIZE(AO) 2802 MADD c22, c22, a2, b2 2803 LD b1, 8 * SIZE(BO) 2804 2805 MADD c11, c11, a3, b3 2806 LD a1, 8 * SIZE(AO) 2807 MADD c21, c21, a3, b4 2808 LD b2, 5 * SIZE(BO) 2809 MADD c12, c12, a4, b3 2810 LD a2, 5 * SIZE(AO) 2811 MADD c22, c22, a4, b4 2812 LD b3, 6 * SIZE(BO) 2813 2814 MADD c11, c11, a5, b5 2815 LD a3, 6 * SIZE(AO) 2816 MADD c21, c21, a5, b2 2817 LD b4, 7 * SIZE(BO) 2818 MADD c12, c12, a2, b5 2819 LD a4, 7 * SIZE(AO) 2820 MADD c22, c22, a2, b2 2821 LD b5, 12 * SIZE(BO) 2822 2823 MADD c11, c11, a3, b3 2824 LD a5, 12 * SIZE(AO) 2825 MADD c21, c21, a3, b4 2826 LD b2, 9 * SIZE(BO) 2827 MADD c12, c12, a4, b3 2828 LD a2, 9 * SIZE(AO) 2829 MADD c22, c22, a4, b4 2830 LD b3, 10 * SIZE(BO) 2831 2832 daddiu AO, 
AO, 8 * SIZE 2833 daddiu L, L, -1 2834 bgtz L, .L52 2835 daddiu BO, BO, 8 * SIZE 2836 .align 3 2837 2838.L55: 2839#if defined(LT) || defined(RN) 2840 andi L, KK, 3 2841#else 2842 andi L, TEMP, 3 2843#endif 2844 NOP 2845 blez L, .L58 2846 NOP 2847 .align 3 2848 2849.L56: 2850 MADD c11, c11, a1, b1 2851 LD a2, 1 * SIZE(AO) 2852 MADD c21, c21, a1, b2 2853 LD a1, 2 * SIZE(AO) 2854 2855 MADD c12, c12, a2, b1 2856 LD b1, 2 * SIZE(BO) 2857 MADD c22, c22, a2, b2 2858 LD b2, 3 * SIZE(BO) 2859 2860 daddiu L, L, -1 2861 daddiu AO, AO, 2 * SIZE 2862 bgtz L, .L56 2863 daddiu BO, BO, 2 * SIZE 2864 2865.L58: 2866#if defined(LN) || defined(RT) 2867#ifdef LN 2868 daddiu TEMP, KK, -2 2869#else 2870 daddiu TEMP, KK, -2 2871#endif 2872 2873 dsll L, TEMP, 1 + BASE_SHIFT 2874 dsll TEMP, TEMP, 1 + BASE_SHIFT 2875 daddu AO, AORIG, L 2876 daddu BO, B, TEMP 2877#endif 2878 2879 2880#if defined(LN) || defined(LT) 2881 LD b1, 0 * SIZE(BO) 2882 LD b2, 1 * SIZE(BO) 2883 LD b3, 2 * SIZE(BO) 2884 LD b4, 3 * SIZE(BO) 2885 2886 SUB c11, b1, c11 2887 SUB c21, b2, c21 2888 SUB c12, b3, c12 2889 SUB c22, b4, c22 2890#else 2891 LD b1, 0 * SIZE(AO) 2892 LD b2, 1 * SIZE(AO) 2893 LD b3, 2 * SIZE(AO) 2894 LD b4, 3 * SIZE(AO) 2895 2896 SUB c11, b1, c11 2897 SUB c12, b2, c12 2898 SUB c21, b3, c21 2899 SUB c22, b4, c22 2900#endif 2901 2902#ifdef LN 2903 LD b1, 3 * SIZE(AO) 2904 LD b2, 2 * SIZE(AO) 2905 LD b3, 0 * SIZE(AO) 2906 2907 MUL c12, b1, c12 2908 MUL c22, b1, c22 2909 2910 NMSUB c11, c11, b2, c12 2911 NMSUB c21, c21, b2, c22 2912 2913 MUL c11, b3, c11 2914 MUL c21, b3, c21 2915#endif 2916 2917#ifdef LT 2918 LD b1, 0 * SIZE(AO) 2919 LD b2, 1 * SIZE(AO) 2920 LD b3, 3 * SIZE(AO) 2921 2922 MUL c11, b1, c11 2923 MUL c21, b1, c21 2924 2925 NMSUB c12, c12, b2, c11 2926 NMSUB c22, c22, b2, c21 2927 2928 MUL c12, b3, c12 2929 MUL c22, b3, c22 2930#endif 2931 2932#ifdef RN 2933 LD b1, 0 * SIZE(BO) 2934 LD b2, 1 * SIZE(BO) 2935 LD b3, 3 * SIZE(BO) 2936 2937 MUL c11, b1, c11 2938 MUL c12, b1, c12 2939 2940 NMSUB 
c21, c21, b2, c11 2941 NMSUB c22, c22, b2, c12 2942 2943 MUL c21, b3, c21 2944 MUL c22, b3, c22 2945#endif 2946 2947#ifdef RT 2948 LD b1, 3 * SIZE(BO) 2949 LD b2, 2 * SIZE(BO) 2950 LD b3, 0 * SIZE(BO) 2951 2952 MUL c21, b1, c21 2953 MUL c22, b1, c22 2954 2955 NMSUB c11, c11, b2, c21 2956 NMSUB c12, c12, b2, c22 2957 2958 MUL c11, b3, c11 2959 MUL c12, b3, c12 2960#endif 2961 2962#ifdef LN 2963 daddiu CO1, CO1, -2 * SIZE 2964 daddiu CO2, CO2, -2 * SIZE 2965#endif 2966 2967#if defined(LN) || defined(LT) 2968 ST c11, 0 * SIZE(BO) 2969 ST c21, 1 * SIZE(BO) 2970 ST c12, 2 * SIZE(BO) 2971 ST c22, 3 * SIZE(BO) 2972#else 2973 ST c11, 0 * SIZE(AO) 2974 ST c12, 1 * SIZE(AO) 2975 ST c21, 2 * SIZE(AO) 2976 ST c22, 3 * SIZE(AO) 2977#endif 2978 2979 ST c11, 0 * SIZE(CO1) 2980 ST c12, 1 * SIZE(CO1) 2981 ST c21, 0 * SIZE(CO2) 2982 ST c22, 1 * SIZE(CO2) 2983 2984#ifndef LN 2985 daddiu CO1, CO1, 2 * SIZE 2986 daddiu CO2, CO2, 2 * SIZE 2987#endif 2988 2989#ifdef RT 2990 dsll TEMP, K, 1 + BASE_SHIFT 2991 daddu AORIG, AORIG, TEMP 2992#endif 2993 2994#if defined(LT) || defined(RN) 2995 dsubu TEMP, K, KK 2996 dsll TEMP, TEMP, 1 + BASE_SHIFT 2997 daddu AO, AO, TEMP 2998 daddu BO, BO, TEMP 2999#endif 3000 3001#ifdef LT 3002 daddiu KK, KK, 2 3003#endif 3004 3005#ifdef LN 3006 daddiu KK, KK, -2 3007#endif 3008 3009 MTC $0, a1 3010 3011 MOV c11, a1 3012 MOV c21, a1 3013 MOV c31, a1 3014 3015 daddiu I, I, -1 3016 3017 bgtz I, .L51 3018 MOV c41, c11 3019 .align 3 3020 3021.L69: 3022#ifdef LN 3023 dsll TEMP, K, 1 + BASE_SHIFT 3024 daddu B, B, TEMP 3025#endif 3026 3027#if defined(LT) || defined(RN) 3028 move B, BO 3029#endif 3030 3031#ifdef RN 3032 daddiu KK, KK, 2 3033#endif 3034 3035#ifdef RT 3036 daddiu KK, KK, -2 3037#endif 3038 .align 3 3039 3040.L70: 3041 andi J, N, 1 3042 blez J, .L999 3043 NOP 3044 3045#ifdef RT 3046 dsll TEMP, K, BASE_SHIFT 3047 dsubu B, B, TEMP 3048 3049 dsubu C, C, LDC 3050#endif 3051 3052 move AO, A 3053 move CO1, C 3054 3055#ifdef LN 3056 daddu KK, M, OFFSET 
3057#endif 3058 3059#ifdef LT 3060 move KK, OFFSET 3061#endif 3062 3063#if defined(LN) || defined(RT) 3064 move AORIG, A 3065#else 3066 move AO, A 3067#endif 3068#ifndef RT 3069 daddu C, CO1, LDC 3070#endif 3071 3072 andi I, M, 1 3073 blez I, .L80 3074 NOP 3075 3076#if defined(LT) || defined(RN) 3077 LD a1, 0 * SIZE(AO) 3078 MTC $0, c11 3079 LD a2, 1 * SIZE(AO) 3080 MOV c21, c11 3081 LD a3, 2 * SIZE(AO) 3082 LD a4, 3 * SIZE(AO) 3083 3084 LD b1, 0 * SIZE(B) 3085 LD b2, 1 * SIZE(B) 3086 LD b3, 2 * SIZE(B) 3087 LD b4, 3 * SIZE(B) 3088 LD b5, 4 * SIZE(B) 3089 LD b6, 8 * SIZE(B) 3090 LD b7, 12 * SIZE(B) 3091 3092 dsra L, KK, 2 3093 blez L, .L85 3094 move BO, B 3095#else 3096#ifdef LN 3097 dsll TEMP, K, BASE_SHIFT 3098 dsubu AORIG, AORIG, TEMP 3099#endif 3100 3101 dsll TEMP, KK, BASE_SHIFT 3102 3103 daddu AO, AORIG, TEMP 3104 daddu BO, B, TEMP 3105 3106 dsubu TEMP, K, KK 3107 3108 LD a1, 0 * SIZE(AO) 3109 MTC $0, c11 3110 LD a2, 1 * SIZE(AO) 3111 MOV c21, c11 3112 LD a3, 2 * SIZE(AO) 3113 LD a4, 3 * SIZE(AO) 3114 3115 LD b1, 0 * SIZE(BO) 3116 LD b2, 1 * SIZE(BO) 3117 LD b3, 2 * SIZE(BO) 3118 LD b4, 3 * SIZE(BO) 3119 LD b5, 4 * SIZE(BO) 3120 LD b6, 8 * SIZE(BO) 3121 LD b7, 12 * SIZE(BO) 3122 3123 dsra L, TEMP, 2 3124 blez L, .L85 3125 NOP 3126#endif 3127 .align 3 3128 3129.L82: 3130 LD a1, 0 * SIZE(AO) 3131 LD b1, 0 * SIZE(BO) 3132 3133 MADD c11, c11, a1, b1 3134 3135 LD a1, 1 * SIZE(AO) 3136 LD b1, 1 * SIZE(BO) 3137 3138 MADD c21, c21, a1, b1 3139 3140 LD a1, 2 * SIZE(AO) 3141 LD b1, 2 * SIZE(BO) 3142 3143 MADD c11, c11, a1, b1 3144 3145 LD a1, 3 * SIZE(AO) 3146 LD b1, 3 * SIZE(BO) 3147 3148 MADD c21, c21, a1, b1 3149 3150 daddiu L, L, -1 3151 daddiu AO, AO, 4 * SIZE 3152 bgtz L, .L82 3153 daddiu BO, BO, 4 * SIZE 3154 .align 3 3155 3156.L85: 3157#if defined(LT) || defined(RN) 3158 andi L, KK, 3 3159#else 3160 andi L, TEMP, 3 3161#endif 3162 NOP 3163 blez L, .L88 3164 NOP 3165 .align 3 3166 3167.L86: 3168 LD a1, 0 * SIZE(AO) 3169 LD b1, 0 * SIZE(BO) 3170 3171 MADD c11, 
c11, a1, b1 3172 3173 daddiu L, L, -1 3174 daddiu AO, AO, 1 * SIZE 3175 bgtz L, .L86 3176 daddiu BO, BO, 1 * SIZE 3177 3178 3179.L88: 3180 ADD c11, c11, c21 3181 3182#if defined(LN) || defined(RT) 3183#ifdef LN 3184 daddiu TEMP, KK, -1 3185#else 3186 daddiu TEMP, KK, -1 3187#endif 3188 3189 dsll TEMP, TEMP, 0 + BASE_SHIFT 3190 daddu AO, AORIG, TEMP 3191 daddu BO, B, TEMP 3192#endif 3193 3194 3195#if defined(LN) || defined(LT) 3196 LD b1, 0 * SIZE(BO) 3197 3198 SUB c11, b1, c11 3199#else 3200 LD b1, 0 * SIZE(AO) 3201 3202 SUB c11, b1, c11 3203#endif 3204 3205#if defined(LN) || defined(LT) 3206 LD b1, 0 * SIZE(AO) 3207 3208 MUL c11, b1, c11 3209#endif 3210 3211#if defined(RN) || defined(RT) 3212 LD b1, 0 * SIZE(BO) 3213 3214 MUL c11, b1, c11 3215#endif 3216 3217#ifdef LN 3218 daddiu CO1, CO1, -1 * SIZE 3219#endif 3220 3221#if defined(LN) || defined(LT) 3222 ST c11, 0 * SIZE(BO) 3223#else 3224 ST c11, 0 * SIZE(AO) 3225#endif 3226 3227 ST c11, 0 * SIZE(CO1) 3228 3229#ifndef LN 3230 daddiu CO1, CO1, 1 * SIZE 3231#endif 3232 3233#ifdef RT 3234 dsll TEMP, K, BASE_SHIFT 3235 daddu AORIG, AORIG, TEMP 3236#endif 3237 3238#if defined(LT) || defined(RN) 3239 dsubu TEMP, K, KK 3240 dsll TEMP, TEMP, 0 + BASE_SHIFT 3241 daddu AO, AO, TEMP 3242 daddu BO, BO, TEMP 3243#endif 3244 3245#ifdef LT 3246 daddiu KK, KK, 1 3247#endif 3248 3249#ifdef LN 3250 daddiu KK, KK, -1 3251#endif 3252 .align 3 3253 3254.L80: 3255 dsra I, M, 1 3256 blez I, .L89 3257 NOP 3258 3259.L71: 3260#if defined(LT) || defined(RN) 3261 LD a1, 0 * SIZE(AO) 3262 MTC $0, c11 3263 LD a2, 1 * SIZE(AO) 3264 MOV c21, c11 3265 LD a5, 4 * SIZE(AO) 3266 3267 LD b1, 0 * SIZE(B) 3268 MOV c12, c11 3269 LD b2, 1 * SIZE(B) 3270 MOV c22, c11 3271 LD b3, 2 * SIZE(B) 3272 LD b5, 4 * SIZE(B) 3273 dsra L, KK, 2 3274 LD b6, 8 * SIZE(B) 3275 LD b7, 12 * SIZE(B) 3276 3277 blez L, .L75 3278 move BO, B 3279#else 3280#ifdef LN 3281 dsll TEMP, K, 1 + BASE_SHIFT 3282 dsubu AORIG, AORIG, TEMP 3283#endif 3284 3285 dsll L, KK, 1 + BASE_SHIFT 
/*
 * Reader's guide to the code from this point to EPILOGUE.
 *
 * Only what the instructions below show is stated as fact.  LD, ST, MADD,
 * NMSUB, MUL, ADD, SUB, MOV, MTC, LDARG, SIZE and BASE_SHIFT are macros
 * defined outside this view, so their arithmetic meaning is given as
 * "presumably".
 * NOTE(review): every branch is followed by an explicit NOP or pointer
 * bump, i.e. delay-slot style; this presumes the assembler runs in
 * noreorder mode - confirm in common.h.
 *
 * Setup fragment (tail of a conditional opened earlier, closed by the
 * first #endif below):
 *   TEMP = KK << BASE_SHIFT, AO = AORIG + L, BO = B + TEMP, then
 *   TEMP = K - KK.  c11 is set from integer register $0 (presumably 0.0)
 *   and copied into c21, c12 and c22.  L = TEMP >> 2 is the trip count of
 *   .L72; when it is <= 0 control goes straight to .L75.
 *   a1, a2, a5, b1, b2, b3, b5, b6 and b7 are preloaded here, but .L72 and
 *   .L76 reload a1/a2/b1 themselves and the solve step reloads b1..b3, so
 *   none of these preloaded values is consumed by the code visible here.
 *
 * .L72 - unrolled accumulation loop, L passes of four steps each.
 *   Step i (i = 0..3) loads a1 and a2 from AO at element offsets 2i and
 *   2i+1 and b1 from BO at element offset i, then issues
 *   MADD c11, c11, a1, b1 and MADD c12, c12, a2, b1
 *   (presumably c11 += a1 x b1 and c12 += a2 x b1).
 *   Each pass advances AO by 8 elements and BO by 4 elements.
 *
 * .L75 - remainder count: L = KK & 3 when LT or RN is defined, otherwise
 *   L = TEMP & 3.  Jumps to .L78 when that count is zero.
 *
 * .L76 - remainder loop: one step of the same form per pass, advancing AO
 *   by 2 elements and BO by 1 element.
 *
 * .L78 - combine, solve and store.
 *   c21 is added into c11 and c22 into c12 (neither c21 nor c22 is written
 *   by .L72 or .L76 above).
 *   LN or RT: TEMP = KK - 2 (LN) or KK - 1 (RT), then
 *     AO = AORIG + (TEMP << (1 + BASE_SHIFT)) and
 *     BO = B + (TEMP << BASE_SHIFT).
 *   LN or LT: b1, b2 come from BO[0], BO[1]; otherwise from AO[0], AO[1].
 *     SUB c11, b1, c11 and SUB c12, b2, c12 follow (presumably
 *     c11 = b1 - c11 and c12 = b2 - c12).
 *   LN: c12 is scaled by AO[3], NMSUB updates c11 with AO[2] and c12
 *     (presumably c11 -= AO[2] x c12), then c11 is scaled by AO[0].
 *   LT: c11 is scaled by AO[0], NMSUB updates c12 with AO[1] and c11
 *     (presumably c12 -= AO[1] x c11), then c12 is scaled by AO[3].
 *   RN or RT: c11 and c12 are both scaled by BO[0].
 *   NOTE(review): the scale factors are applied with MUL, not a divide;
 *     presumably they hold reciprocals prepared before this routine runs -
 *     confirm against the packing code.
 *   LN moves CO1 back by 2 elements before the stores.  c11 and c12 are
 *   written to BO[0..1] (LN or LT) or AO[0..1] (otherwise) and to
 *   CO1[0..1]; unless LN is defined CO1 then advances by 2 elements.
 *   RT: AORIG += K << (1 + BASE_SHIFT).
 *   LT or RN: TEMP = K - KK, AO += TEMP << (1 + BASE_SHIFT),
 *     BO += TEMP << BASE_SHIFT.
 *   LT: KK += 2.  LN: KK -= 2.
 *   I is decremented and control returns to .L71 (outside this view) while
 *   I > 0.
 *
 * .L89 - per-variant bookkeeping: LN adds K << BASE_SHIFT to B; LT or RN
 *   copies BO into B; RN increments KK; RT decrements KK.
 *
 * .L999 - exit: reloads $16-$21, $f24-$f28 and $22-$25 from the stack,
 *   plus $f20-$f23 when __64BIT__ is not defined, then jumps through $31;
 *   the instruction after the jump adds 144 back to $sp.
 *   NOTE(review): without __64BIT__, $25 and $f20 are both reloaded from
 *   112($sp), and the stores at the top of the routine use that same
 *   offset for both - confirm whether $25 needs to be preserved at all.
 */
3286 dsll TEMP, KK, 0 + BASE_SHIFT 3287 3288 daddu AO, AORIG, L 3289 daddu BO, B, TEMP 3290 3291 dsubu TEMP, K, KK 3292 3293 LD a1, 0 * SIZE(AO) 3294 MTC $0, c11 3295 LD a2, 1 * SIZE(AO) 3296 MOV c21, c11 3297 LD a5, 4 * SIZE(AO) 3298 3299 LD b1, 0 * SIZE(BO) 3300 MOV c12, c11 3301 LD b2, 1 * SIZE(BO) 3302 MOV c22, c11 3303 LD b3, 2 * SIZE(BO) 3304 LD b5, 4 * SIZE(BO) 3305 dsra L, TEMP, 2 3306 LD b6, 8 * SIZE(BO) 3307 LD b7, 12 * SIZE(BO) 3308 3309 blez L, .L75 3310 NOP 3311#endif 3312 .align 3 3313 3314.L72: 3315 LD a1, 0 * SIZE(AO) 3316 LD a2, 1 * SIZE(AO) 3317 LD b1, 0 * SIZE(BO) 3318 3319 MADD c11, c11, a1, b1 3320 MADD c12, c12, a2, b1 3321 3322 LD a1, 2 * SIZE(AO) 3323 LD a2, 3 * SIZE(AO) 3324 LD b1, 1 * SIZE(BO) 3325 3326 MADD c11, c11, a1, b1 3327 MADD c12, c12, a2, b1 3328 3329 LD a1, 4 * SIZE(AO) 3330 LD a2, 5 * SIZE(AO) 3331 LD b1, 2 * SIZE(BO) 3332 3333 MADD c11, c11, a1, b1 3334 MADD c12, c12, a2, b1 3335 3336 LD a1, 6 * SIZE(AO) 3337 LD a2, 7 * SIZE(AO) 3338 LD b1, 3 * SIZE(BO) 3339 3340 MADD c11, c11, a1, b1 3341 MADD c12, c12, a2, b1 3342 3343 daddiu L, L, -1 3344 daddiu AO, AO, 8 * SIZE 3345 bgtz L, .L72 3346 daddiu BO, BO, 4 * SIZE 3347 .align 3 3348 3349.L75: 3350#if defined(LT) || defined(RN) 3351 andi L, KK, 3 3352#else 3353 andi L, TEMP, 3 3354#endif 3355 NOP 3356 blez L, .L78 3357 NOP 3358 .align 3 3359 3360.L76: 3361 LD a1, 0 * SIZE(AO) 3362 LD a2, 1 * SIZE(AO) 3363 LD b1, 0 * SIZE(BO) 3364 3365 MADD c11, c11, a1, b1 3366 MADD c12, c12, a2, b1 3367 3368 daddiu L, L, -1 3369 daddiu AO, AO, 2 * SIZE 3370 bgtz L, .L76 3371 daddiu BO, BO, 1 * SIZE 3372 3373.L78: 3374 ADD c11, c11, c21 3375 ADD c12, c12, c22 3376 3377#if defined(LN) || defined(RT) 3378#ifdef LN 3379 daddiu TEMP, KK, -2 3380#else 3381 daddiu TEMP, KK, -1 3382#endif 3383 3384 dsll L, TEMP, 1 + BASE_SHIFT 3385 dsll TEMP, TEMP, 0 + BASE_SHIFT 3386 daddu AO, AORIG, L 3387 daddu BO, B, TEMP 3388#endif 3389 3390 3391#if defined(LN) || defined(LT) 3392 LD b1, 0 * SIZE(BO) 3393 LD b2, 1 * 
SIZE(BO) 3394 3395 SUB c11, b1, c11 3396 SUB c12, b2, c12 3397#else 3398 LD b1, 0 * SIZE(AO) 3399 LD b2, 1 * SIZE(AO) 3400 3401 SUB c11, b1, c11 3402 SUB c12, b2, c12 3403#endif 3404 3405#ifdef LN 3406 LD b1, 3 * SIZE(AO) 3407 LD b2, 2 * SIZE(AO) 3408 LD b3, 0 * SIZE(AO) 3409 3410 MUL c12, b1, c12 3411 NMSUB c11, c11, b2, c12 3412 MUL c11, b3, c11 3413#endif 3414 3415#ifdef LT 3416 LD b1, 0 * SIZE(AO) 3417 LD b2, 1 * SIZE(AO) 3418 LD b3, 3 * SIZE(AO) 3419 3420 MUL c11, b1, c11 3421 NMSUB c12, c12, b2, c11 3422 MUL c12, b3, c12 3423#endif 3424 3425#if defined(RN) || defined(RT) 3426 LD b1, 0 * SIZE(BO) 3427 3428 MUL c11, b1, c11 3429 MUL c12, b1, c12 3430#endif 3431 3432#ifdef LN 3433 daddiu CO1, CO1, -2 * SIZE 3434#endif 3435 3436#if defined(LN) || defined(LT) 3437 ST c11, 0 * SIZE(BO) 3438 ST c12, 1 * SIZE(BO) 3439#else 3440 ST c11, 0 * SIZE(AO) 3441 ST c12, 1 * SIZE(AO) 3442#endif 3443 3444 ST c11, 0 * SIZE(CO1) 3445 ST c12, 1 * SIZE(CO1) 3446 3447#ifndef LN 3448 daddiu CO1, CO1, 2 * SIZE 3449#endif 3450 3451#ifdef RT 3452 dsll TEMP, K, 1 + BASE_SHIFT 3453 daddu AORIG, AORIG, TEMP 3454#endif 3455 3456#if defined(LT) || defined(RN) 3457 dsubu TEMP, K, KK 3458 dsll L, TEMP, 1 + BASE_SHIFT 3459 dsll TEMP, TEMP, 0 + BASE_SHIFT 3460 daddu AO, AO, L 3461 daddu BO, BO, TEMP 3462#endif 3463 3464#ifdef LT 3465 daddiu KK, KK, 2 3466#endif 3467 3468#ifdef LN 3469 daddiu KK, KK, -2 3470#endif 3471 3472 daddiu I, I, -1 3473 3474 bgtz I, .L71 3475 NOP 3476 .align 3 3477 3478 3479.L89: 3480#ifdef LN 3481 dsll TEMP, K, BASE_SHIFT 3482 daddu B, B, TEMP 3483#endif 3484 3485#if defined(LT) || defined(RN) 3486 move B, BO 3487#endif 3488 3489#ifdef RN 3490 daddiu KK, KK, 1 3491#endif 3492 3493#ifdef RT 3494 daddiu KK, KK, -1 3495#endif 3496 .align 3 3497 3498 3499.L999: 3500 LDARG $16, 0($sp) 3501 LDARG $17, 8($sp) 3502 LDARG $18, 16($sp) 3503 LDARG $19, 24($sp) 3504 LDARG $20, 32($sp) 3505 LDARG $21, 40($sp) 3506 ldc1 $f24, 48($sp) 3507 ldc1 $f25, 56($sp) 3508 ldc1 $f26, 64($sp) 
3509 ldc1 $f27, 72($sp) 3510 ldc1 $f28, 80($sp) 3511 3512 LDARG $22, 88($sp) 3513 LDARG $23, 96($sp) 3514 LDARG $24, 104($sp) 3515 LDARG $25, 112($sp) 3516 3517#ifndef __64BIT__ 3518 ldc1 $f20,112($sp) 3519 ldc1 $f21,120($sp) 3520 ldc1 $f22,128($sp) 3521 ldc1 $f23,136($sp) 3522#endif 3523 3524 j $31 3525 daddiu $sp, $sp, 144 3526 3527 EPILOGUE 3528