/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#define M $4 26#define N $5 27#define K $6 28#define A $8 29#define B $9 30#define C $10 31#define LDC $11 32 33#define AO $12 34#define BO $13 35 36#define I $2 37#define J $3 38#define L $7 39 40#define PREFETCHSIZE (4 * 10) 41 42#define CO1 $14 43#define CO2 $15 44#define CO3 $16 45#define CO4 $17 46#define CO5 $18 47#define CO6 $19 48#define CO7 $20 49#define CO8 $21 50 51#define BB $22 52 53#if defined(TRMMKERNEL) 54#define OFFSET $23 55#define KK $24 56#define TEMP $25 57#endif 58 59#define a1 $f0 60#define a2 $f1 61#define a3 $f27 62#define a4 $f28 63 64#define b1 $f2 65#define b2 $f3 66#define b3 $f4 67#define b4 $f5 68#define b5 $f6 69#define b6 $f7 70#define b7 $f8 71#define b8 $f9 72 73#define a5 b8 74 75#define c11 $f10 76#define c12 $f11 77#define c21 $f12 78#define c22 $f13 79#define c31 $f14 80#define c32 $f16 81#define c41 $f17 82#define c42 $f18 83#define c51 $f19 84#define c52 $f20 85#define c61 $f21 86#define c62 $f22 87#define c71 $f23 88#define c72 $f24 89#define c81 $f25 90#define c82 $f26 91 92#define ALPHA $f15 93 94 PROLOGUE 95 96 daddiu $sp, $sp, -160 97 98 SDARG $16, 0($sp) 99 SDARG $17, 8($sp) 100 SDARG $18, 16($sp) 101 SDARG $19, 24($sp) 102 SDARG $20, 32($sp) 103 SDARG $21, 40($sp) 104 SDARG $22, 48($sp) 105 106 sdc1 $f24, 56($sp) 107 sdc1 $f25, 64($sp) 108 sdc1 $f26, 72($sp) 109 sdc1 $f27, 80($sp) 110 sdc1 $f28, 88($sp) 111 112#if defined(TRMMKERNEL) 113 SDARG $23, 96($sp) 114 SDARG $24, 104($sp) 115 SDARG $25, 112($sp) 116 117 LDARG OFFSET, 160($sp) 118#endif 119 120#ifndef __64BIT__ 121 sdc1 $f20,120($sp) 122 sdc1 $f21,128($sp) 123 sdc1 $f22,136($sp) 124 sdc1 $f23,144($sp) 125#endif 126 127 dsll LDC, LDC, BASE_SHIFT 128 129#if defined(TRMMKERNEL) && !defined(LEFT) 130 neg KK, OFFSET 131#endif 132 133 dsra J, N, 3 134 blez J, .L30 135 nop 136 137.L10: 138 move CO1, C 139 MTC $0, c11 140 daddu CO2, C, LDC 141 
move AO, A 142 daddu CO3, CO2, LDC 143 daddiu J, J, -1 144 daddu CO4, CO3, LDC 145 MOV c21, c11 146 daddu CO5, CO4, LDC 147 MOV c31, c11 148 daddu CO6, CO5, LDC 149 MOV c41, c11 150 daddu CO7, CO6, LDC 151 MOV c51, c11 152 daddu CO8, CO7, LDC 153 dsra I, M, 1 154 daddu C, CO8, LDC 155 156 dsll BB, K, 2 + BASE_SHIFT 157 daddu BB, B, BB 158 159#if defined(TRMMKERNEL) && defined(LEFT) 160 move KK, OFFSET 161#endif 162 163 blez I, .L20 164 MOV c61, c11 165 166.L11: 167#if defined(TRMMKERNEL) 168#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 169 move BO, B 170#else 171 dsll L, KK, 1 + BASE_SHIFT 172 dsll TEMP, KK, 3 + BASE_SHIFT 173 174 daddu AO, AO, L 175 daddu BO, B, TEMP 176#endif 177 178 LD a1, 0 * SIZE(AO) 179 MOV c71, c11 180 LD b1, 0 * SIZE(BO) 181 MOV c81, c11 182 183 LD a3, 4 * SIZE(AO) 184 MOV c12, c11 185 LD b2, 1 * SIZE(BO) 186 MOV c22, c11 187 188 MOV c32, c11 189 LD b3, 2 * SIZE(BO) 190 MOV c42, c11 191 192 LD b4, 3 * SIZE(BO) 193 MOV c52, c11 194 LD b5, 4 * SIZE(BO) 195 MOV c62, c11 196 197 LD b6, 8 * SIZE(BO) 198 MOV c72, c11 199 LD b7, 12 * SIZE(BO) 200 MOV c82, c11 201 202#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 203 dsubu TEMP, K, KK 204#elif defined(LEFT) 205 daddiu TEMP, KK, 2 206#else 207 daddiu TEMP, KK, 8 208#endif 209 dsra L, TEMP, 2 210 211 blez L, .L15 212 NOP 213#else 214 LD a1, 0 * SIZE(AO) 215 MOV c71, c11 216 LD b1, 0 * SIZE(B) 217 MOV c81, c11 218 219 pref 1, 3 * SIZE(CO1) 220 pref 1, 3 * SIZE(CO2) 221 222 LD a3, 4 * SIZE(AO) 223 MOV c12, c11 224 LD b2, 1 * SIZE(B) 225 MOV c22, c11 226 227 dsra L, K, 2 228 MOV c32, c11 229 LD b3, 2 * SIZE(B) 230 MOV c42, c11 231 232 LD b4, 3 * SIZE(B) 233 MOV c52, c11 234 LD b5, 4 * SIZE(B) 235 MOV c62, c11 236 237 LD b6, 8 * SIZE(B) 238 MOV c72, c11 239 LD b7, 12 * SIZE(B) 240 MOV c82, c11 241 242 blez L, .L15 243 move BO, B 244#endif 245 246 MADD c11, c11, a1, b1 247 LD a2, 1 * SIZE(AO) 248 MADD c21, c21, a1, b2 249 daddiu L, L, -1 250 
MADD c31, c31, a1, b3 251 blez L, .L13 252 MADD c41, c41, a1, b4 253 pref 1, 2 * SIZE(CO3) 254 .align 3 255 256.L12: 257 MADD c12, c12, a2, b1 258 LD b1, 16 * SIZE(BO) 259 MADD c22, c22, a2, b2 260 LD b2, 5 * SIZE(BO) 261 MADD c32, c32, a2, b3 262 LD b3, 6 * SIZE(BO) 263 MADD c42, c42, a2, b4 264 LD b4, 7 * SIZE(BO) 265 266 MADD c51, c51, a1, b5 267 LD a4, 2 * SIZE(AO) 268 MADD c61, c61, a1, b2 269 NOP 270 MADD c71, c71, a1, b3 271 NOP 272 MADD c81, c81, a1, b4 273 LD a1, 8 * SIZE(AO) 274 275 MADD c52, c52, a2, b5 276 LD b5, 20 * SIZE(BO) 277 MADD c62, c62, a2, b2 278 LD b2, 9 * SIZE(BO) 279 MADD c72, c72, a2, b3 280 LD b3, 10 * SIZE(BO) 281 MADD c82, c82, a2, b4 282 LD b4, 11 * SIZE(BO) 283 284 MADD c11, c11, a4, b6 285 LD a2, 3 * SIZE(AO) 286 MADD c21, c21, a4, b2 287 NOP 288 MADD c31, c31, a4, b3 289 NOP 290 MADD c41, c41, a4, b4 291 NOP 292 293 MADD c12, c12, a2, b6 294 LD b6, 24 * SIZE(BO) 295 MADD c22, c22, a2, b2 296 LD b2, 13 * SIZE(BO) 297 MADD c32, c32, a2, b3 298 LD b3, 14 * SIZE(BO) 299 MADD c42, c42, a2, b4 300 LD b4, 15 * SIZE(BO) 301 302 MADD c51, c51, a4, b7 303 NOP 304 MADD c61, c61, a4, b2 305 NOP 306 MADD c71, c71, a4, b3 307 NOP 308 MADD c81, c81, a4, b4 309 NOP 310 311 MADD c52, c52, a2, b7 312 LD b7, 28 * SIZE(BO) 313 MADD c62, c62, a2, b2 314 LD b2, 17 * SIZE(BO) 315 MADD c72, c72, a2, b3 316 LD b3, 18 * SIZE(BO) 317 MADD c82, c82, a2, b4 318 LD b4, 19 * SIZE(BO) 319 320 MADD c11, c11, a3, b1 321 LD a2, 5 * SIZE(AO) 322 MADD c21, c21, a3, b2 323 NOP 324 MADD c31, c31, a3, b3 325 NOP 326 MADD c41, c41, a3, b4 327 NOP 328 329 MADD c12, c12, a2, b1 330 LD b1, 32 * SIZE(BO) 331 MADD c22, c22, a2, b2 332 LD b2, 21 * SIZE(BO) 333 MADD c32, c32, a2, b3 334 LD b3, 22 * SIZE(BO) 335 MADD c42, c42, a2, b4 336 LD b4, 23 * SIZE(BO) 337 338 MADD c51, c51, a3, b5 339 LD a4, 6 * SIZE(AO) 340 MADD c61, c61, a3, b2 341 NOP 342 MADD c71, c71, a3, b3 343 NOP 344 MADD c81, c81, a3, b4 345 LD a3, 12 * SIZE(AO) 346 347 MADD c52, c52, a2, b5 348 LD b5, 36 * 
SIZE(BO) 349 MADD c62, c62, a2, b2 350 LD b2, 25 * SIZE(BO) 351 MADD c72, c72, a2, b3 352 LD b3, 26 * SIZE(BO) 353 MADD c82, c82, a2, b4 354 LD b4, 27 * SIZE(BO) 355 356 MADD c11, c11, a4, b6 357 LD a2, 7 * SIZE(AO) 358 MADD c21, c21, a4, b2 359 NOP 360 MADD c31, c31, a4, b3 361 NOP 362 MADD c41, c41, a4, b4 363 daddiu L, L, -1 364 365 MADD c12, c12, a2, b6 366 LD b6, 40 * SIZE(BO) 367 MADD c22, c22, a2, b2 368 LD b2, 29 * SIZE(BO) 369 MADD c32, c32, a2, b3 370 LD b3, 30 * SIZE(BO) 371 MADD c42, c42, a2, b4 372 LD b4, 31 * SIZE(BO) 373 374 MADD c51, c51, a4, b7 375 daddiu BO, BO, 32 * SIZE 376 MADD c61, c61, a4, b2 377 daddiu AO, AO, 8 * SIZE 378 MADD c71, c71, a4, b3 379 NOP 380 MADD c81, c81, a4, b4 381 NOP 382 383 MADD c52, c52, a2, b7 384 LD b7, 12 * SIZE(BO) 385 MADD c62, c62, a2, b2 386 LD b2, 1 * SIZE(BO) 387 MADD c72, c72, a2, b3 388 LD b3, 2 * SIZE(BO) 389 MADD c82, c82, a2, b4 390 LD b4, 3 * SIZE(BO) 391 392 MADD c11, c11, a1, b1 393 LD a2, 1 * SIZE(AO) 394 MADD c21, c21, a1, b2 395 NOP 396 MADD c31, c31, a1, b3 397 bgtz L, .L12 398 MADD c41, c41, a1, b4 399 NOP 400 .align 3 401 402.L13: 403 MADD c12, c12, a2, b1 404 LD b1, 16 * SIZE(BO) 405 MADD c22, c22, a2, b2 406 LD b2, 5 * SIZE(BO) 407 MADD c32, c32, a2, b3 408 LD b3, 6 * SIZE(BO) 409 MADD c42, c42, a2, b4 410 LD b4, 7 * SIZE(BO) 411 412 MADD c51, c51, a1, b5 413 NOP 414 MADD c61, c61, a1, b2 415 LD a4, 2 * SIZE(AO) 416 MADD c71, c71, a1, b3 417 NOP 418 MADD c81, c81, a1, b4 419 LD a1, 8 * SIZE(AO) 420 421 MADD c52, c52, a2, b5 422 LD b5, 20 * SIZE(BO) 423 MADD c62, c62, a2, b2 424 LD b2, 9 * SIZE(BO) 425 MADD c72, c72, a2, b3 426 LD b3, 10 * SIZE(BO) 427 MADD c82, c82, a2, b4 428 LD b4, 11 * SIZE(BO) 429 430 MADD c11, c11, a4, b6 431 LD a2, 3 * SIZE(AO) 432 MADD c21, c21, a4, b2 433 NOP 434 MADD c31, c31, a4, b3 435 pref 1, 3 * SIZE(CO4) 436 MADD c41, c41, a4, b4 437 NOP 438 439 MADD c12, c12, a2, b6 440 LD b6, 24 * SIZE(BO) 441 MADD c22, c22, a2, b2 442 LD b2, 13 * SIZE(BO) 443 MADD c32, c32, a2, 
b3 444 LD b3, 14 * SIZE(BO) 445 MADD c42, c42, a2, b4 446 LD b4, 15 * SIZE(BO) 447 448 MADD c51, c51, a4, b7 449 pref 1, 3 * SIZE(CO5) 450 MADD c61, c61, a4, b2 451 NOP 452 MADD c71, c71, a4, b3 453 pref 1, 3 * SIZE(CO6) 454 MADD c81, c81, a4, b4 455 NOP 456 457 MADD c52, c52, a2, b7 458 LD b7, 28 * SIZE(BO) 459 MADD c62, c62, a2, b2 460 LD b2, 17 * SIZE(BO) 461 MADD c72, c72, a2, b3 462 LD b3, 18 * SIZE(BO) 463 MADD c82, c82, a2, b4 464 LD b4, 19 * SIZE(BO) 465 466 MADD c11, c11, a3, b1 467 LD a2, 5 * SIZE(AO) 468 MADD c21, c21, a3, b2 469 NOP 470 MADD c31, c31, a3, b3 471 pref 1, 3 * SIZE(CO7) 472 MADD c41, c41, a3, b4 473 NOP 474 475 MADD c12, c12, a2, b1 476 LD b1, 32 * SIZE(BO) 477 MADD c22, c22, a2, b2 478 LD b2, 21 * SIZE(BO) 479 MADD c32, c32, a2, b3 480 LD b3, 22 * SIZE(BO) 481 MADD c42, c42, a2, b4 482 LD b4, 23 * SIZE(BO) 483 484 MADD c51, c51, a3, b5 485 NOP 486 MADD c61, c61, a3, b2 487 LD a4, 6 * SIZE(AO) 488 MADD c71, c71, a3, b3 489 NOP 490 MADD c81, c81, a3, b4 491 NOP 492 493 MADD c52, c52, a2, b5 494 LD b5, 36 * SIZE(BO) 495 MADD c62, c62, a2, b2 496 LD b2, 25 * SIZE(BO) 497 MADD c72, c72, a2, b3 498 LD b3, 26 * SIZE(BO) 499 MADD c82, c82, a2, b4 500 LD b4, 27 * SIZE(BO) 501 502 MADD c11, c11, a4, b6 503 LD a2, 7 * SIZE(AO) 504 MADD c21, c21, a4, b2 505 NOP 506 MADD c31, c31, a4, b3 507 NOP 508 MADD c41, c41, a4, b4 509 NOP 510 511 MADD c12, c12, a2, b6 512 LD b6, 40 * SIZE(BO) 513 MADD c22, c22, a2, b2 514 LD b2, 29 * SIZE(BO) 515 MADD c32, c32, a2, b3 516 LD b3, 30 * SIZE(BO) 517 MADD c42, c42, a2, b4 518 LD b4, 31 * SIZE(BO) 519 520 MADD c51, c51, a4, b7 521 daddiu BO, BO, 32 * SIZE 522 MADD c61, c61, a4, b2 523 daddiu AO, AO, 8 * SIZE 524 MADD c71, c71, a4, b3 525 NOP 526 MADD c81, c81, a4, b4 527 NOP 528 529 MADD c52, c52, a2, b7 530 LD b7, 12 * SIZE(BO) 531 MADD c62, c62, a2, b2 532 LD b2, 1 * SIZE(BO) 533 MADD c72, c72, a2, b3 534 LD b3, 2 * SIZE(BO) 535 MADD c82, c82, a2, b4 536 LD b4, 3 * SIZE(BO) 537 .align 3 538 539.L15: 540#ifndef 
TRMMKERNEL 541 andi L, K, 3 542#else 543 andi L, TEMP, 3 544#endif 545 NOP 546 blez L, .L18 547 pref 1, 3 * SIZE(CO8) 548 .align 3 549 550.L16: 551 MADD c11, c11, a1, b1 552 LD a2, 1 * SIZE(AO) 553 MADD c21, c21, a1, b2 554 NOP 555 MADD c31, c31, a1, b3 556 NOP 557 MADD c41, c41, a1, b4 558 NOP 559 560 MADD c12, c12, a2, b1 561 LD b1, 8 * SIZE(BO) 562 MADD c22, c22, a2, b2 563 LD b2, 5 * SIZE(BO) 564 MADD c32, c32, a2, b3 565 LD b3, 6 * SIZE(BO) 566 MADD c42, c42, a2, b4 567 LD b4, 7 * SIZE(BO) 568 569 MADD c51, c51, a1, b5 570 daddiu L, L, -1 571 MADD c61, c61, a1, b2 572 daddiu AO, AO, 2 * SIZE 573 MADD c71, c71, a1, b3 574 daddiu BO, BO, 8 * SIZE 575 MADD c81, c81, a1, b4 576 LD a1, 0 * SIZE(AO) 577 578 MADD c52, c52, a2, b5 579 LD b5, 4 * SIZE(BO) 580 MADD c62, c62, a2, b2 581 LD b2, 1 * SIZE(BO) 582 MADD c72, c72, a2, b3 583 LD b3, 2 * SIZE(BO) 584 MADD c82, c82, a2, b4 585 bgtz L, .L16 586 LD b4, 3 * SIZE(BO) 587 588.L18: 589#ifndef TRMMKERNEL 590 LD $f0, 0 * SIZE(CO1) 591 daddiu CO3,CO3, 2 * SIZE 592 LD $f1, 1 * SIZE(CO1) 593 daddiu CO1,CO1, 2 * SIZE 594 LD $f2, 0 * SIZE(CO2) 595 daddiu CO4,CO4, 2 * SIZE 596 LD $f3, 1 * SIZE(CO2) 597 daddiu CO2,CO2, 2 * SIZE 598 599 LD $f4, -2 * SIZE(CO3) 600 daddiu CO5,CO5, 2 * SIZE 601 LD $f5, -1 * SIZE(CO3) 602 daddiu CO6,CO6, 2 * SIZE 603 LD $f6, -2 * SIZE(CO4) 604 daddiu CO7,CO7, 2 * SIZE 605 LD $f7, -1 * SIZE(CO4) 606 daddiu I, I, -1 607 608 MADD c11, $f0, ALPHA, c11 609 LD $f0,-2 * SIZE(CO5) 610 MADD c12, $f1, ALPHA, c12 611 LD $f1,-1 * SIZE(CO5) 612 MADD c21, $f2, ALPHA, c21 613 LD $f2,-2 * SIZE(CO6) 614 MADD c22, $f3, ALPHA, c22 615 LD $f3,-1 * SIZE(CO6) 616 617 MADD c31, $f4, ALPHA, c31 618 LD $f4,-2 * SIZE(CO7) 619 MADD c32, $f5, ALPHA, c32 620 LD $f5,-1 * SIZE(CO7) 621 MADD c41, $f6, ALPHA, c41 622 LD $f6, 0 * SIZE(CO8) 623 MADD c42, $f7, ALPHA, c42 624 LD $f7, 1 * SIZE(CO8) 625 626 pref 0, 0 * SIZE(BB) 627 pref 0, 8 * SIZE(BB) 628 629 ST c11, -2 * SIZE(CO1) 630 MTC $0, c11 631 ST c12, -1 * SIZE(CO1) 632 daddiu 
CO8,CO8, 2 * SIZE 633 ST c21, -2 * SIZE(CO2) 634 MOV c21, c11 635 ST c22, -1 * SIZE(CO2) 636 daddiu BB, BB, 16 * SIZE 637 638 MADD c51, $f0, ALPHA, c51 639 ST c31, -2 * SIZE(CO3) 640 MADD c52, $f1, ALPHA, c52 641 ST c32, -1 * SIZE(CO3) 642 MADD c61, $f2, ALPHA, c61 643 ST c41, -2 * SIZE(CO4) 644 MADD c62, $f3, ALPHA, c62 645 ST c42, -1 * SIZE(CO4) 646 647 MADD c71, $f4, ALPHA, c71 648 ST c51, -2 * SIZE(CO5) 649 MADD c72, $f5, ALPHA, c72 650 ST c52, -1 * SIZE(CO5) 651 MADD c81, $f6, ALPHA, c81 652 ST c61, -2 * SIZE(CO6) 653 MADD c82, $f7, ALPHA, c82 654 ST c62, -1 * SIZE(CO6) 655 656 ST c71, -2 * SIZE(CO7) 657 MOV c31, c11 658 ST c72, -1 * SIZE(CO7) 659 MOV c41, c11 660 661 ST c81, -2 * SIZE(CO8) 662 MOV c51, c11 663 ST c82, -1 * SIZE(CO8) 664 bgtz I, .L11 665 MOV c61, c11 666#else 667 daddiu CO4,CO4, 2 * SIZE 668 daddiu CO5,CO5, 2 * SIZE 669 daddiu CO6,CO6, 2 * SIZE 670 daddiu CO7,CO7, 2 * SIZE 671 672 pref 0, 0 * SIZE(BB) 673 pref 0, 8 * SIZE(BB) 674 675 MUL c11, ALPHA, c11 676 daddiu CO1,CO1, 2 * SIZE 677 MUL c12, ALPHA, c12 678 MTC $0, a1 679 MUL c21, ALPHA, c21 680 daddiu CO2,CO2, 2 * SIZE 681 MUL c22, ALPHA, c22 682 daddiu CO3,CO3, 2 * SIZE 683 684 ST c11, -2 * SIZE(CO1) 685 MUL c31, ALPHA, c31 686 ST c12, -1 * SIZE(CO1) 687 MUL c32, ALPHA, c32 688 ST c21, -2 * SIZE(CO2) 689 MUL c41, ALPHA, c41 690 ST c22, -1 * SIZE(CO2) 691 MUL c42, ALPHA, c42 692 693 ST c31, -2 * SIZE(CO3) 694 MUL c51, ALPHA, c51 695 ST c32, -1 * SIZE(CO3) 696 MUL c52, ALPHA, c52 697 ST c41, -2 * SIZE(CO4) 698 MUL c61, ALPHA, c61 699 ST c42, -1 * SIZE(CO4) 700 MUL c62, ALPHA, c62 701 702 ST c51, -2 * SIZE(CO5) 703 MUL c71, ALPHA, c71 704 ST c52, -1 * SIZE(CO5) 705 MUL c72, ALPHA, c72 706 ST c61, -2 * SIZE(CO6) 707 MUL c81, ALPHA, c81 708 ST c62, -1 * SIZE(CO6) 709 MUL c82, ALPHA, c82 710 711 ST c71, -2 * SIZE(CO7) 712 MOV c11, a1 713 ST c72, -1 * SIZE(CO7) 714 MOV c21, a1 715 716 daddiu CO8,CO8, 2 * SIZE 717 daddiu BB, BB, 16 * SIZE 718 719 ST c81, -2 * SIZE(CO8) 720 MOV c31, a1 721 ST c82, 
-1 * SIZE(CO8) 722 MOV c41, a1 723 724 daddiu I, I, -1 725 MOV c51, a1 726 727#if ( defined(LEFT) && defined(TRANSA)) || \ 728 (!defined(LEFT) && !defined(TRANSA)) 729 dsubu TEMP, K, KK 730#ifdef LEFT 731 daddiu TEMP, TEMP, -2 732#else 733 daddiu TEMP, TEMP, -8 734#endif 735 736 dsll L, TEMP, 1 + BASE_SHIFT 737 dsll TEMP, TEMP, 3 + BASE_SHIFT 738 739 daddu AO, AO, L 740 daddu BO, BO, TEMP 741#endif 742 743#ifdef LEFT 744 daddiu KK, KK, 2 745#endif 746 747 bgtz I, .L11 748 MOV c61, a1 749#endif 750 .align 3 751 752.L20: 753 andi I, M, 1 754 MOV c61, c11 755 blez I, .L29 756 MOV c71, c11 757 758#if defined(TRMMKERNEL) 759#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 760 move BO, B 761#else 762 dsll L, KK, 0 + BASE_SHIFT 763 dsll TEMP, KK, 3 + BASE_SHIFT 764 765 daddu AO, AO, L 766 daddu BO, B, TEMP 767#endif 768 769 LD a1, 0 * SIZE(AO) 770 LD a2, 1 * SIZE(AO) 771 LD a3, 2 * SIZE(AO) 772 LD a4, 3 * SIZE(AO) 773 774 LD b1, 0 * SIZE(BO) 775 LD b2, 1 * SIZE(BO) 776 LD b3, 2 * SIZE(BO) 777 LD b4, 3 * SIZE(BO) 778 LD b5, 4 * SIZE(BO) 779 LD b6, 8 * SIZE(BO) 780 LD b7, 12 * SIZE(BO) 781 782#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 783 dsubu TEMP, K, KK 784#elif defined(LEFT) 785 daddiu TEMP, KK, 1 786#else 787 daddiu TEMP, KK, 8 788#endif 789 dsra L, TEMP, 2 790 791 blez L, .L25 792 MOV c81, c11 793#else 794 LD a1, 0 * SIZE(AO) 795 LD a2, 1 * SIZE(AO) 796 LD a3, 2 * SIZE(AO) 797 LD a4, 3 * SIZE(AO) 798 799 LD b1, 0 * SIZE(B) 800 LD b2, 1 * SIZE(B) 801 LD b3, 2 * SIZE(B) 802 LD b4, 3 * SIZE(B) 803 LD b5, 4 * SIZE(B) 804 LD b6, 8 * SIZE(B) 805 LD b7, 12 * SIZE(B) 806 807 dsra L, K, 2 808 MOV c81, c11 809 810 blez L, .L25 811 move BO, B 812#endif 813 .align 3 814 815.L22: 816 MADD c11, c11, a1, b1 817 LD b1, 16 * SIZE(BO) 818 MADD c21, c21, a1, b2 819 LD b2, 5 * SIZE(BO) 820 MADD c31, c31, a1, b3 821 LD b3, 6 * SIZE(BO) 822 MADD c41, c41, a1, b4 823 LD b4, 7 * SIZE(BO) 824 825 MADD c51, c51, a1, b5 826 LD 
b5, 20 * SIZE(BO) 827 MADD c61, c61, a1, b2 828 LD b2, 9 * SIZE(BO) 829 MADD c71, c71, a1, b3 830 LD b3, 10 * SIZE(BO) 831 MADD c81, c81, a1, b4 832 LD b4, 11 * SIZE(BO) 833 834 LD a1, 4 * SIZE(AO) 835 daddiu L, L, -1 836 837 MADD c11, c11, a2, b6 838 LD b6, 24 * SIZE(BO) 839 MADD c21, c21, a2, b2 840 LD b2, 13 * SIZE(BO) 841 MADD c31, c31, a2, b3 842 LD b3, 14 * SIZE(BO) 843 MADD c41, c41, a2, b4 844 LD b4, 15 * SIZE(BO) 845 846 MADD c51, c51, a2, b7 847 LD b7, 28 * SIZE(BO) 848 MADD c61, c61, a2, b2 849 LD b2, 17 * SIZE(BO) 850 MADD c71, c71, a2, b3 851 LD b3, 18 * SIZE(BO) 852 MADD c81, c81, a2, b4 853 LD b4, 19 * SIZE(BO) 854 855 LD a2, 5 * SIZE(AO) 856 daddiu AO, AO, 4 * SIZE 857 858 MADD c11, c11, a3, b1 859 LD b1, 32 * SIZE(BO) 860 MADD c21, c21, a3, b2 861 LD b2, 21 * SIZE(BO) 862 MADD c31, c31, a3, b3 863 LD b3, 22 * SIZE(BO) 864 MADD c41, c41, a3, b4 865 LD b4, 23 * SIZE(BO) 866 867 MADD c51, c51, a3, b5 868 LD b5, 36 * SIZE(BO) 869 MADD c61, c61, a3, b2 870 LD b2, 25 * SIZE(BO) 871 MADD c71, c71, a3, b3 872 LD b3, 26 * SIZE(BO) 873 MADD c81, c81, a3, b4 874 LD b4, 27 * SIZE(BO) 875 876 LD a3, 2 * SIZE(AO) 877 daddiu BO, BO, 32 * SIZE 878 879 MADD c11, c11, a4, b6 880 LD b6, 8 * SIZE(BO) 881 MADD c21, c21, a4, b2 882 LD b2, -3 * SIZE(BO) 883 MADD c31, c31, a4, b3 884 LD b3, -2 * SIZE(BO) 885 MADD c41, c41, a4, b4 886 LD b4, -1 * SIZE(BO) 887 888 MADD c51, c51, a4, b7 889 LD b7, 12 * SIZE(BO) 890 MADD c61, c61, a4, b2 891 LD b2, 1 * SIZE(BO) 892 MADD c71, c71, a4, b3 893 LD b3, 2 * SIZE(BO) 894 MADD c81, c81, a4, b4 895 LD b4, 3 * SIZE(BO) 896 bgtz L, .L22 897 LD a4, 3 * SIZE(AO) 898 .align 3 899 900.L25: 901#ifndef TRMMKERNEL 902 andi L, K, 3 903#else 904 andi L, TEMP, 3 905#endif 906 NOP 907 blez L, .L28 908 NOP 909 .align 3 910 911.L26: 912 MADD c11, c11, a1, b1 913 LD b1, 8 * SIZE(BO) 914 MADD c21, c21, a1, b2 915 LD b2, 5 * SIZE(BO) 916 MADD c31, c31, a1, b3 917 LD b3, 6 * SIZE(BO) 918 MADD c41, c41, a1, b4 919 LD b4, 7 * SIZE(BO) 920 921 daddiu L, L, 
-1 922 MOV a2, a2 923 daddiu AO, AO, 1 * SIZE 924 daddiu BO, BO, 8 * SIZE 925 926 MADD c51, c51, a1, b5 927 LD b5, 4 * SIZE(BO) 928 MADD c61, c61, a1, b2 929 LD b2, 1 * SIZE(BO) 930 MADD c71, c71, a1, b3 931 LD b3, 2 * SIZE(BO) 932 MADD c81, c81, a1, b4 933 LD a1, 0 * SIZE(AO) 934 935 bgtz L, .L26 936 LD b4, 3 * SIZE(BO) 937 938.L28: 939#ifndef TRMMKERNEL 940 LD $f0, 0 * SIZE(CO1) 941 LD $f1, 0 * SIZE(CO2) 942 LD $f2, 0 * SIZE(CO3) 943 LD $f3, 0 * SIZE(CO4) 944 MADD c11, $f0, ALPHA, c11 945 LD $f4, 0 * SIZE(CO5) 946 MADD c21, $f1, ALPHA, c21 947 LD $f5, 0 * SIZE(CO6) 948 MADD c31, $f2, ALPHA, c31 949 LD $f6, 0 * SIZE(CO7) 950 MADD c41, $f3, ALPHA, c41 951 LD $f7, 0 * SIZE(CO8) 952 MADD c51, $f4, ALPHA, c51 953 ST c11, 0 * SIZE(CO1) 954 MADD c61, $f5, ALPHA, c61 955 ST c21, 0 * SIZE(CO2) 956 MADD c71, $f6, ALPHA, c71 957 ST c31, 0 * SIZE(CO3) 958 MADD c81, $f7, ALPHA, c81 959 ST c41, 0 * SIZE(CO4) 960 ST c51, 0 * SIZE(CO5) 961 ST c61, 0 * SIZE(CO6) 962 ST c71, 0 * SIZE(CO7) 963 ST c81, 0 * SIZE(CO8) 964#else 965 MUL c11, ALPHA, c11 966 MUL c21, ALPHA, c21 967 MUL c31, ALPHA, c31 968 MUL c41, ALPHA, c41 969 970 ST c11, 0 * SIZE(CO1) 971 MUL c51, ALPHA, c51 972 ST c21, 0 * SIZE(CO2) 973 MUL c61, ALPHA, c61 974 ST c31, 0 * SIZE(CO3) 975 MUL c71, ALPHA, c71 976 ST c41, 0 * SIZE(CO4) 977 MUL c81, ALPHA, c81 978 979 ST c51, 0 * SIZE(CO5) 980 ST c61, 0 * SIZE(CO6) 981 ST c71, 0 * SIZE(CO7) 982 ST c81, 0 * SIZE(CO8) 983 984#if ( defined(LEFT) && defined(TRANSA)) || \ 985 (!defined(LEFT) && !defined(TRANSA)) 986 dsubu TEMP, K, KK 987#ifdef LEFT 988 daddiu TEMP, TEMP, -1 989#else 990 daddiu TEMP, TEMP, -8 991#endif 992 993 dsll L, TEMP, 0 + BASE_SHIFT 994 dsll TEMP, TEMP, 3 + BASE_SHIFT 995 996 daddu AO, AO, L 997 daddu BO, BO, TEMP 998#endif 999 1000#ifdef LEFT 1001 daddiu KK, KK, 1 1002#endif 1003#endif 1004 .align 3 1005 1006.L29: 1007#if defined(TRMMKERNEL) && !defined(LEFT) 1008 daddiu KK, KK, 8 1009#endif 1010 1011 bgtz J, .L10 1012 move B, BO 1013 .align 3 1014 
1015.L30: 1016 andi J, N, 4 1017 blez J, .L50 1018 move AO, A 1019 1020 move CO1, C 1021 MTC $0, c11 1022 daddu CO2, C, LDC 1023 daddu CO3, CO2, LDC 1024 daddu CO4, CO3, LDC 1025 MOV c21, c11 1026 daddu C, CO4, LDC 1027 MOV c31, c11 1028 1029#if defined(TRMMKERNEL) && defined(LEFT) 1030 move KK, OFFSET 1031#endif 1032 1033 dsra I, M, 1 1034 blez I, .L40 1035 MOV c41, c11 1036 1037.L31: 1038#if defined(TRMMKERNEL) 1039#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1040 move BO, B 1041#else 1042 dsll L, KK, 1 + BASE_SHIFT 1043 dsll TEMP, KK, 2 + BASE_SHIFT 1044 1045 daddu AO, AO, L 1046 daddu BO, B, TEMP 1047#endif 1048 1049 LD a1, 0 * SIZE(AO) 1050 LD a3, 4 * SIZE(AO) 1051 1052 LD b1, 0 * SIZE(BO) 1053 MOV c12, c11 1054 LD b2, 1 * SIZE(BO) 1055 MOV c22, c11 1056 LD b3, 2 * SIZE(BO) 1057 MOV c32, c11 1058 LD b4, 3 * SIZE(BO) 1059 MOV c42, c11 1060 1061 LD b5, 4 * SIZE(BO) 1062 LD b6, 8 * SIZE(BO) 1063 LD b7, 12 * SIZE(BO) 1064 1065#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1066 dsubu TEMP, K, KK 1067#elif defined(LEFT) 1068 daddiu TEMP, KK, 2 1069#else 1070 daddiu TEMP, KK, 4 1071#endif 1072 dsra L, TEMP, 2 1073 blez L, .L35 1074 NOP 1075#else 1076 LD a1, 0 * SIZE(AO) 1077 LD a3, 4 * SIZE(AO) 1078 1079 LD b1, 0 * SIZE(B) 1080 MOV c12, c11 1081 LD b2, 1 * SIZE(B) 1082 MOV c22, c11 1083 LD b3, 2 * SIZE(B) 1084 MOV c32, c11 1085 LD b4, 3 * SIZE(B) 1086 MOV c42, c11 1087 1088 LD b5, 4 * SIZE(B) 1089 dsra L, K, 2 1090 LD b6, 8 * SIZE(B) 1091 LD b7, 12 * SIZE(B) 1092 1093 blez L, .L35 1094 move BO, B 1095#endif 1096 .align 3 1097 1098.L32: 1099 MADD c11, c11, a1, b1 1100 LD a2, 1 * SIZE(AO) 1101 MADD c21, c21, a1, b2 1102 daddiu L, L, -1 1103 MADD c31, c31, a1, b3 1104 NOP 1105 MADD c41, c41, a1, b4 1106 LD a1, 2 * SIZE(AO) 1107 1108 MADD c12, c12, a2, b1 1109 LD b1, 16 * SIZE(BO) 1110 MADD c22, c22, a2, b2 1111 LD b2, 5 * SIZE(BO) 1112 MADD c32, c32, a2, b3 1113 LD b3, 6 * SIZE(BO) 1114 MADD c42, c42, a2, 
b4 1115 LD b4, 7 * SIZE(BO) 1116 1117 MADD c11, c11, a1, b5 1118 LD a2, 3 * SIZE(AO) 1119 MADD c21, c21, a1, b2 1120 NOP 1121 MADD c31, c31, a1, b3 1122 NOP 1123 MADD c41, c41, a1, b4 1124 LD a1, 8 * SIZE(AO) 1125 1126 MADD c12, c12, a2, b5 1127 LD b5, 20 * SIZE(BO) 1128 MADD c22, c22, a2, b2 1129 LD b2, 9 * SIZE(BO) 1130 MADD c32, c32, a2, b3 1131 LD b3, 10 * SIZE(BO) 1132 MADD c42, c42, a2, b4 1133 LD b4, 11 * SIZE(BO) 1134 1135 MADD c11, c11, a3, b6 1136 LD a2, 5 * SIZE(AO) 1137 MADD c21, c21, a3, b2 1138 NOP 1139 MADD c31, c31, a3, b3 1140 NOP 1141 MADD c41, c41, a3, b4 1142 LD a3, 6 * SIZE(AO) 1143 1144 MADD c12, c12, a2, b6 1145 LD b6, 24 * SIZE(BO) 1146 MADD c22, c22, a2, b2 1147 LD b2, 13 * SIZE(BO) 1148 MADD c32, c32, a2, b3 1149 LD b3, 14 * SIZE(BO) 1150 MADD c42, c42, a2, b4 1151 LD b4, 15 * SIZE(BO) 1152 1153 MADD c11, c11, a3, b7 1154 LD a2, 7 * SIZE(AO) 1155 MADD c21, c21, a3, b2 1156 daddiu AO, AO, 8 * SIZE 1157 MADD c31, c31, a3, b3 1158 daddiu BO, BO, 16 * SIZE 1159 MADD c41, c41, a3, b4 1160 LD a3, 4 * SIZE(AO) 1161 1162 MADD c12, c12, a2, b7 1163 LD b7, 12 * SIZE(BO) 1164 MADD c22, c22, a2, b2 1165 LD b2, 1 * SIZE(BO) 1166 MADD c32, c32, a2, b3 1167 LD b3, 2 * SIZE(BO) 1168 MADD c42, c42, a2, b4 1169 NOP 1170 1171 bgtz L, .L32 1172 LD b4, 3 * SIZE(BO) 1173 .align 3 1174 1175.L35: 1176#ifndef TRMMKERNEL 1177 andi L, K, 3 1178#else 1179 andi L, TEMP, 3 1180#endif 1181 NOP 1182 blez L, .L38 1183 NOP 1184 .align 3 1185 1186.L36: 1187 MADD c11, c11, a1, b1 1188 LD a2, 1 * SIZE(AO) 1189 MADD c21, c21, a1, b2 1190 daddiu L, L, -1 1191 MADD c31, c31, a1, b3 1192 daddiu AO, AO, 2 * SIZE 1193 MADD c41, c41, a1, b4 1194 LD a1, 0 * SIZE(AO) 1195 1196 MADD c12, c12, a2, b1 1197 LD b1, 4 * SIZE(BO) 1198 MADD c22, c22, a2, b2 1199 LD b2, 5 * SIZE(BO) 1200 MADD c32, c32, a2, b3 1201 LD b3, 6 * SIZE(BO) 1202 MADD c42, c42, a2, b4 1203 LD b4, 7 * SIZE(BO) 1204 1205 bgtz L, .L36 1206 daddiu BO, BO, 4 * SIZE 1207 1208.L38: 1209#ifndef TRMMKERNEL 1210 LD $f0, 0 * 
SIZE(CO1) 1211 daddiu CO3,CO3, 2 * SIZE 1212 LD $f1, 1 * SIZE(CO1) 1213 daddiu CO1,CO1, 2 * SIZE 1214 LD $f2, 0 * SIZE(CO2) 1215 daddiu CO4,CO4, 2 * SIZE 1216 LD $f3, 1 * SIZE(CO2) 1217 daddiu CO2,CO2, 2 * SIZE 1218 1219 LD $f4, -2 * SIZE(CO3) 1220 MADD c11, $f0, ALPHA, c11 1221 LD $f5, -1 * SIZE(CO3) 1222 MADD c12, $f1, ALPHA, c12 1223 LD $f6, -2 * SIZE(CO4) 1224 MADD c21, $f2, ALPHA, c21 1225 LD $f7, -1 * SIZE(CO4) 1226 MADD c22, $f3, ALPHA, c22 1227 1228 MADD c31, $f4, ALPHA, c31 1229 ST c11, -2 * SIZE(CO1) 1230 MADD c32, $f5, ALPHA, c32 1231 ST c12, -1 * SIZE(CO1) 1232 MADD c41, $f6, ALPHA, c41 1233 ST c21, -2 * SIZE(CO2) 1234 MADD c42, $f7, ALPHA, c42 1235 ST c22, -1 * SIZE(CO2) 1236 1237 ST c31, -2 * SIZE(CO3) 1238 MTC $0, c11 1239 ST c32, -1 * SIZE(CO3) 1240 daddiu I, I, -1 1241 ST c41, -2 * SIZE(CO4) 1242 MOV c21, c11 1243 ST c42, -1 * SIZE(CO4) 1244 MOV c31, c11 1245#else 1246 MUL c11, ALPHA, c11 1247 daddiu CO3,CO3, 2 * SIZE 1248 MUL c12, ALPHA, c12 1249 daddiu CO1,CO1, 2 * SIZE 1250 MUL c21, ALPHA, c21 1251 daddiu CO4,CO4, 2 * SIZE 1252 MUL c22, ALPHA, c22 1253 daddiu CO2,CO2, 2 * SIZE 1254 1255 ST c11, -2 * SIZE(CO1) 1256 MUL c31, ALPHA, c31 1257 ST c12, -1 * SIZE(CO1) 1258 MUL c32, ALPHA, c32 1259 ST c21, -2 * SIZE(CO2) 1260 MUL c41, ALPHA, c41 1261 ST c22, -1 * SIZE(CO2) 1262 MUL c42, ALPHA, c42 1263 1264 ST c31, -2 * SIZE(CO3) 1265 MTC $0, c11 1266 ST c32, -1 * SIZE(CO3) 1267 daddiu I, I, -1 1268 ST c41, -2 * SIZE(CO4) 1269 MOV c21, c11 1270 ST c42, -1 * SIZE(CO4) 1271 MOV c31, c11 1272 1273#if ( defined(LEFT) && defined(TRANSA)) || \ 1274 (!defined(LEFT) && !defined(TRANSA)) 1275 dsubu TEMP, K, KK 1276#ifdef LEFT 1277 daddiu TEMP, TEMP, -2 1278#else 1279 daddiu TEMP, TEMP, -4 1280#endif 1281 1282 dsll L, TEMP, 1 + BASE_SHIFT 1283 dsll TEMP, TEMP, 2 + BASE_SHIFT 1284 1285 daddu AO, AO, L 1286 daddu BO, BO, TEMP 1287#endif 1288 1289#ifdef LEFT 1290 daddiu KK, KK, 2 1291#endif 1292#endif 1293 1294 bgtz I, .L31 1295 MOV c41, c11 1296 .align 3 1297 
1298.L40: 1299 andi I, M, 1 1300 blez I, .L49 1301 MOV c61, c11 1302 1303#if defined(TRMMKERNEL) 1304#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1305 move BO, B 1306#else 1307 dsll L, KK, 0 + BASE_SHIFT 1308 dsll TEMP, KK, 2 + BASE_SHIFT 1309 1310 daddu AO, AO, L 1311 daddu BO, B, TEMP 1312#endif 1313 1314 LD a1, 0 * SIZE(AO) 1315 MOV c71, c11 1316 LD a2, 1 * SIZE(AO) 1317 MOV c81, c11 1318 1319 LD b1, 0 * SIZE(BO) 1320 LD b2, 1 * SIZE(BO) 1321 LD b3, 2 * SIZE(BO) 1322 LD b4, 3 * SIZE(BO) 1323 LD b5, 4 * SIZE(BO) 1324 LD b6, 8 * SIZE(BO) 1325 LD b7, 12 * SIZE(BO) 1326 1327#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1328 dsubu TEMP, K, KK 1329#elif defined(LEFT) 1330 daddiu TEMP, KK, 1 1331#else 1332 daddiu TEMP, KK, 4 1333#endif 1334 dsra L, TEMP, 2 1335 1336 blez L, .L45 1337 NOP 1338#else 1339 LD a1, 0 * SIZE(AO) 1340 MOV c71, c11 1341 LD a2, 1 * SIZE(AO) 1342 MOV c81, c11 1343 1344 LD b1, 0 * SIZE(B) 1345 LD b2, 1 * SIZE(B) 1346 LD b3, 2 * SIZE(B) 1347 LD b4, 3 * SIZE(B) 1348 LD b5, 4 * SIZE(B) 1349 LD b6, 8 * SIZE(B) 1350 LD b7, 12 * SIZE(B) 1351 1352 dsra L, K, 2 1353 1354 blez L, .L45 1355 move BO, B 1356#endif 1357 .align 3 1358 1359.L42: 1360 MADD c11, c11, a1, b1 1361 LD b1, 16 * SIZE(BO) 1362 MADD c21, c21, a1, b2 1363 LD b2, 5 * SIZE(BO) 1364 MADD c31, c31, a1, b3 1365 LD b3, 6 * SIZE(BO) 1366 MADD c41, c41, a1, b4 1367 LD b4, 7 * SIZE(BO) 1368 1369 LD a1, 4 * SIZE(AO) 1370 daddiu L, L, -1 1371 1372 MADD c11, c11, a2, b5 1373 LD b5, 20 * SIZE(BO) 1374 MADD c21, c21, a2, b2 1375 LD b2, 9 * SIZE(BO) 1376 MADD c31, c31, a2, b3 1377 LD b3, 10 * SIZE(BO) 1378 MADD c41, c41, a2, b4 1379 LD b4, 11 * SIZE(BO) 1380 1381 LD a2, 2 * SIZE(AO) 1382 daddiu AO, AO, 4 * SIZE 1383 1384 MADD c11, c11, a2, b6 1385 LD b6, 24 * SIZE(BO) 1386 MADD c21, c21, a2, b2 1387 LD b2, 13 * SIZE(BO) 1388 MADD c31, c31, a2, b3 1389 LD b3, 14 * SIZE(BO) 1390 MADD c41, c41, a2, b4 1391 LD b4, 15 * SIZE(BO) 1392 1393 LD 
a2, -1 * SIZE(AO) 1394 daddiu BO, BO, 16 * SIZE 1395 1396 MADD c11, c11, a2, b7 1397 LD b7, 12 * SIZE(BO) 1398 MADD c21, c21, a2, b2 1399 LD b2, 1 * SIZE(BO) 1400 MADD c31, c31, a2, b3 1401 LD b3, 2 * SIZE(BO) 1402 MADD c41, c41, a2, b4 1403 LD b4, 3 * SIZE(BO) 1404 1405 bgtz L, .L42 1406 LD a2, 1 * SIZE(AO) 1407 .align 3 1408 1409.L45: 1410#ifndef TRMMKERNEL 1411 andi L, K, 3 1412#else 1413 andi L, TEMP, 3 1414#endif 1415 NOP 1416 blez L, .L48 1417 NOP 1418 .align 3 1419 1420.L46: 1421 MADD c11, c11, a1, b1 1422 LD b1, 4 * SIZE(BO) 1423 MADD c21, c21, a1, b2 1424 LD b2, 5 * SIZE(BO) 1425 MADD c31, c31, a1, b3 1426 LD b3, 6 * SIZE(BO) 1427 MADD c41, c41, a1, b4 1428 LD a1, 1 * SIZE(AO) 1429 1430 LD b4, 7 * SIZE(BO) 1431 daddiu L, L, -1 1432 1433 daddiu AO, AO, 1 * SIZE 1434 MOV a2, a2 1435 bgtz L, .L46 1436 daddiu BO, BO, 4 * SIZE 1437 1438 1439.L48: 1440#ifndef TRMMKERNEL 1441 LD $f0, 0 * SIZE(CO1) 1442 LD $f1, 0 * SIZE(CO2) 1443 LD $f2, 0 * SIZE(CO3) 1444 LD $f3, 0 * SIZE(CO4) 1445 1446 MADD c11, $f0, ALPHA, c11 1447 MADD c21, $f1, ALPHA, c21 1448 MADD c31, $f2, ALPHA, c31 1449 MADD c41, $f3, ALPHA, c41 1450 1451 ST c11, 0 * SIZE(CO1) 1452 ST c21, 0 * SIZE(CO2) 1453 ST c31, 0 * SIZE(CO3) 1454 ST c41, 0 * SIZE(CO4) 1455#else 1456 MUL c11, ALPHA, c11 1457 MUL c21, ALPHA, c21 1458 MUL c31, ALPHA, c31 1459 MUL c41, ALPHA, c41 1460 1461 ST c11, 0 * SIZE(CO1) 1462 ST c21, 0 * SIZE(CO2) 1463 ST c31, 0 * SIZE(CO3) 1464 ST c41, 0 * SIZE(CO4) 1465 1466#if ( defined(LEFT) && defined(TRANSA)) || \ 1467 (!defined(LEFT) && !defined(TRANSA)) 1468 dsubu TEMP, K, KK 1469#ifdef LEFT 1470 daddiu TEMP, TEMP, -1 1471#else 1472 daddiu TEMP, TEMP, -4 1473#endif 1474 1475 dsll L, TEMP, 0 + BASE_SHIFT 1476 dsll TEMP, TEMP, 2 + BASE_SHIFT 1477 1478 daddu AO, AO, L 1479 daddu BO, BO, TEMP 1480#endif 1481 1482#ifdef LEFT 1483 daddiu KK, KK, 1 1484#endif 1485#endif 1486 .align 3 1487 1488.L49: 1489#if defined(TRMMKERNEL) && !defined(LEFT) 1490 daddiu KK, KK, 4 1491#endif 1492 move B, BO 1493 
.align 3 1494 1495.L50: 1496 andi J, N, 2 1497 blez J, .L70 1498 1499 move AO, A 1500 move CO1, C 1501 daddu CO2, C, LDC 1502 1503#if defined(TRMMKERNEL) && defined(LEFT) 1504 move KK, OFFSET 1505#endif 1506 1507 dsra I, M, 1 1508 blez I, .L60 1509 daddu C, CO2, LDC 1510 1511.L51: 1512#if defined(TRMMKERNEL) 1513#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1514 move BO, B 1515#else 1516 dsll L, KK, 1 + BASE_SHIFT 1517 dsll TEMP, KK, 1 + BASE_SHIFT 1518 1519 daddu AO, AO, L 1520 daddu BO, B, TEMP 1521#endif 1522 1523 LD a1, 0 * SIZE(AO) 1524 MTC $0, c11 1525 LD a2, 1 * SIZE(AO) 1526 MOV c21, c11 1527 LD a5, 4 * SIZE(AO) 1528 1529 LD b1, 0 * SIZE(BO) 1530 MOV c12, c11 1531 LD b2, 1 * SIZE(BO) 1532 MOV c22, c11 1533 LD b3, 2 * SIZE(BO) 1534 LD b5, 4 * SIZE(BO) 1535 LD b6, 8 * SIZE(BO) 1536 LD b7, 12 * SIZE(BO) 1537 1538#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1539 dsubu TEMP, K, KK 1540#elif defined(LEFT) 1541 daddiu TEMP, KK, 2 1542#else 1543 daddiu TEMP, KK, 2 1544#endif 1545 dsra L, TEMP, 2 1546 blez L, .L55 1547 NOP 1548#else 1549 LD a1, 0 * SIZE(AO) 1550 MTC $0, c11 1551 LD a2, 1 * SIZE(AO) 1552 MOV c21, c11 1553 LD a5, 4 * SIZE(AO) 1554 1555 LD b1, 0 * SIZE(B) 1556 MOV c12, c11 1557 LD b2, 1 * SIZE(B) 1558 MOV c22, c11 1559 LD b3, 2 * SIZE(B) 1560 LD b5, 4 * SIZE(B) 1561 dsra L, K, 2 1562 LD b6, 8 * SIZE(B) 1563 LD b7, 12 * SIZE(B) 1564 1565 blez L, .L55 1566 move BO, B 1567#endif 1568 .align 3 1569 1570.L52: 1571 MADD c11, c11, a1, b1 1572 LD a3, 2 * SIZE(AO) 1573 MADD c21, c21, a1, b2 1574 LD b4, 3 * SIZE(BO) 1575 MADD c12, c12, a2, b1 1576 LD a4, 3 * SIZE(AO) 1577 MADD c22, c22, a2, b2 1578 LD b1, 8 * SIZE(BO) 1579 1580 MADD c11, c11, a3, b3 1581 LD a1, 8 * SIZE(AO) 1582 MADD c21, c21, a3, b4 1583 LD b2, 5 * SIZE(BO) 1584 MADD c12, c12, a4, b3 1585 LD a2, 5 * SIZE(AO) 1586 MADD c22, c22, a4, b4 1587 LD b3, 6 * SIZE(BO) 1588 1589 MADD c11, c11, a5, b5 1590 LD a3, 6 * SIZE(AO) 1591 MADD 
c21, c21, a5, b2 1592 LD b4, 7 * SIZE(BO) 1593 MADD c12, c12, a2, b5 1594 LD a4, 7 * SIZE(AO) 1595 MADD c22, c22, a2, b2 1596 LD b5, 12 * SIZE(BO) 1597 1598 MADD c11, c11, a3, b3 1599 LD a5, 12 * SIZE(AO) 1600 MADD c21, c21, a3, b4 1601 LD b2, 9 * SIZE(BO) 1602 MADD c12, c12, a4, b3 1603 LD a2, 9 * SIZE(AO) 1604 MADD c22, c22, a4, b4 1605 LD b3, 10 * SIZE(BO) 1606 1607 daddiu AO, AO, 8 * SIZE 1608 daddiu L, L, -1 1609 bgtz L, .L52 1610 daddiu BO, BO, 8 * SIZE 1611 .align 3 1612 1613.L55: 1614#ifndef TRMMKERNEL 1615 andi L, K, 3 1616#else 1617 andi L, TEMP, 3 1618#endif 1619 NOP 1620 blez L, .L58 1621 NOP 1622 .align 3 1623 1624.L56: 1625 MADD c11, c11, a1, b1 1626 LD a2, 1 * SIZE(AO) 1627 MADD c21, c21, a1, b2 1628 LD a1, 2 * SIZE(AO) 1629 1630 MADD c12, c12, a2, b1 1631 LD b1, 2 * SIZE(BO) 1632 MADD c22, c22, a2, b2 1633 LD b2, 3 * SIZE(BO) 1634 1635 daddiu L, L, -1 1636 daddiu AO, AO, 2 * SIZE 1637 bgtz L, .L56 1638 daddiu BO, BO, 2 * SIZE 1639 1640.L58: 1641#ifndef TRMMKERNEL 1642 LD $f0, 0 * SIZE(CO1) 1643 daddiu I, I, -1 1644 LD $f1, 1 * SIZE(CO1) 1645 daddiu CO1,CO1, 2 * SIZE 1646 LD $f2, 0 * SIZE(CO2) 1647 NOP 1648 LD $f3, 1 * SIZE(CO2) 1649 daddiu CO2,CO2, 2 * SIZE 1650 1651 MADD c11, $f0, ALPHA, c11 1652 MADD c12, $f1, ALPHA, c12 1653 MADD c21, $f2, ALPHA, c21 1654 MADD c22, $f3, ALPHA, c22 1655 1656 ST c11, -2 * SIZE(CO1) 1657 ST c12, -1 * SIZE(CO1) 1658 ST c21, -2 * SIZE(CO2) 1659 NOP 1660 bgtz I, .L51 1661 ST c22, -1 * SIZE(CO2) 1662#else 1663 daddiu I, I, -1 1664 1665 daddiu CO1,CO1, 2 * SIZE 1666 daddiu CO2,CO2, 2 * SIZE 1667 1668 MUL c11, ALPHA, c11 1669 MUL c12, ALPHA, c12 1670 MUL c21, ALPHA, c21 1671 MUL c22, ALPHA, c22 1672 1673 ST c11, -2 * SIZE(CO1) 1674 ST c12, -1 * SIZE(CO1) 1675 ST c21, -2 * SIZE(CO2) 1676 ST c22, -1 * SIZE(CO2) 1677 1678#if ( defined(LEFT) && defined(TRANSA)) || \ 1679 (!defined(LEFT) && !defined(TRANSA)) 1680 dsubu TEMP, K, KK 1681#ifdef LEFT 1682 daddiu TEMP, TEMP, -2 1683#else 1684 daddiu TEMP, TEMP, -2 1685#endif 1686 
1687 dsll L, TEMP, 1 + BASE_SHIFT 1688 dsll TEMP, TEMP, 1 + BASE_SHIFT 1689 1690 daddu AO, AO, L 1691 daddu BO, BO, TEMP 1692#endif 1693 1694#ifdef LEFT 1695 daddiu KK, KK, 2 1696#endif 1697 1698 bgtz I, .L51 1699 NOP 1700#endif 1701 .align 3 1702 1703.L60: 1704 andi I, M, 1 1705 blez I, .L69 1706 NOP 1707 1708#if defined(TRMMKERNEL) 1709#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1710 move BO, B 1711#else 1712 dsll L, KK, 0 + BASE_SHIFT 1713 dsll TEMP, KK, 1 + BASE_SHIFT 1714 1715 daddu AO, AO, L 1716 daddu BO, B, TEMP 1717#endif 1718 1719 LD a1, 0 * SIZE(AO) 1720 MTC $0, c11 1721 LD a2, 1 * SIZE(AO) 1722 MOV c21, c11 1723 LD a3, 2 * SIZE(AO) 1724 MOV c31, c11 1725 LD a4, 3 * SIZE(AO) 1726 MOV c41, c11 1727 1728 LD b1, 0 * SIZE(BO) 1729 LD b2, 1 * SIZE(BO) 1730 LD b3, 2 * SIZE(BO) 1731 LD b4, 3 * SIZE(BO) 1732 LD b5, 4 * SIZE(BO) 1733 LD b6, 8 * SIZE(BO) 1734 LD b7, 12 * SIZE(BO) 1735 1736#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1737 dsubu TEMP, K, KK 1738#elif defined(LEFT) 1739 daddiu TEMP, KK, 1 1740#else 1741 daddiu TEMP, KK, 2 1742#endif 1743 dsra L, TEMP, 2 1744 blez L, .L65 1745 NOP 1746#else 1747 dsra L, K, 2 1748 LD a1, 0 * SIZE(AO) 1749 MTC $0, c11 1750 LD a2, 1 * SIZE(AO) 1751 MOV c21, c11 1752 LD a3, 2 * SIZE(AO) 1753 MOV c31, c11 1754 LD a4, 3 * SIZE(AO) 1755 MOV c41, c11 1756 1757 LD b1, 0 * SIZE(B) 1758 LD b2, 1 * SIZE(B) 1759 LD b3, 2 * SIZE(B) 1760 LD b4, 3 * SIZE(B) 1761 LD b5, 4 * SIZE(B) 1762 LD b6, 8 * SIZE(B) 1763 LD b7, 12 * SIZE(B) 1764 1765 blez L, .L65 1766 move BO, B 1767#endif 1768 .align 3 1769 1770.L62: 1771 MADD c11, c11, a1, b1 1772 LD b1, 4 * SIZE(BO) 1773 MADD c21, c21, a1, b2 1774 LD b2, 5 * SIZE(BO) 1775 MADD c31, c31, a2, b3 1776 LD b3, 6 * SIZE(BO) 1777 MADD c41, c41, a2, b4 1778 LD b4, 7 * SIZE(BO) 1779 1780 LD a1, 4 * SIZE(AO) 1781 LD a2, 5 * SIZE(AO) 1782 1783 MADD c11, c11, a3, b1 1784 LD b1, 8 * SIZE(BO) 1785 MADD c21, c21, a3, b2 1786 LD b2, 9 * 
SIZE(BO) 1787 MADD c31, c31, a4, b3 1788 LD b3, 10 * SIZE(BO) 1789 MADD c41, c41, a4, b4 1790 LD b4, 11 * SIZE(BO) 1791 1792 LD a3, 6 * SIZE(AO) 1793 LD a4, 7 * SIZE(AO) 1794 1795 daddiu L, L, -1 1796 daddiu AO, AO, 4 * SIZE 1797 1798 bgtz L, .L62 1799 daddiu BO, BO, 8 * SIZE 1800 .align 3 1801 1802.L65: 1803#ifndef TRMMKERNEL 1804 andi L, K, 3 1805#else 1806 andi L, TEMP, 3 1807#endif 1808 NOP 1809 blez L, .L68 1810 NOP 1811 .align 3 1812 1813.L66: 1814 MADD c11, c11, a1, b1 1815 LD b1, 2 * SIZE(BO) 1816 MADD c21, c21, a1, b2 1817 LD b2, 3 * SIZE(BO) 1818 1819 LD a1, 1 * SIZE(AO) 1820 daddiu L, L, -1 1821 1822 daddiu AO, AO, 1 * SIZE 1823 bgtz L, .L66 1824 daddiu BO, BO, 2 * SIZE 1825 1826 1827.L68: 1828#ifndef TRMMKERNEL 1829 LD $f0, 0 * SIZE(CO1) 1830 LD $f1, 0 * SIZE(CO2) 1831 1832 ADD c11, c11, c31 1833 ADD c21, c21, c41 1834 1835 MADD c11, $f0, ALPHA, c11 1836 MADD c21, $f1, ALPHA, c21 1837 1838 ST c11, 0 * SIZE(CO1) 1839 ST c21, 0 * SIZE(CO2) 1840#else 1841 ADD c11, c11, c31 1842 ADD c21, c21, c41 1843 1844 MUL c11, ALPHA, c11 1845 MUL c21, ALPHA, c21 1846 1847 ST c11, 0 * SIZE(CO1) 1848 ST c21, 0 * SIZE(CO2) 1849 1850#if ( defined(LEFT) && defined(TRANSA)) || \ 1851 (!defined(LEFT) && !defined(TRANSA)) 1852 dsubu TEMP, K, KK 1853#ifdef LEFT 1854 daddiu TEMP, TEMP, -1 1855#else 1856 daddiu TEMP, TEMP, -2 1857#endif 1858 1859 dsll L, TEMP, 0 + BASE_SHIFT 1860 dsll TEMP, TEMP, 1 + BASE_SHIFT 1861 1862 daddu AO, AO, L 1863 daddu BO, BO, TEMP 1864#endif 1865 1866#ifdef LEFT 1867 daddiu KK, KK, 1 1868#endif 1869#endif 1870 .align 3 1871 1872.L69: 1873#if defined(TRMMKERNEL) && !defined(LEFT) 1874 daddiu KK, KK, 2 1875#endif 1876 move B, BO 1877 .align 3 1878 1879.L70: 1880 andi J, N, 1 1881 blez J, .L999 1882 1883 move AO, A 1884 move CO1, C 1885 1886#if defined(TRMMKERNEL) && defined(LEFT) 1887 move KK, OFFSET 1888#endif 1889 1890 dsra I, M, 1 1891 blez I, .L80 1892 daddu C, CO1, LDC 1893 1894.L71: 1895#if defined(TRMMKERNEL) 1896#if (defined(LEFT) && 
defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1897 move BO, B 1898#else 1899 dsll L, KK, 1 + BASE_SHIFT 1900 dsll TEMP, KK, 0 + BASE_SHIFT 1901 1902 daddu AO, AO, L 1903 daddu BO, B, TEMP 1904#endif 1905 1906 LD a1, 0 * SIZE(AO) 1907 MTC $0, c11 1908 LD a2, 1 * SIZE(AO) 1909 MOV c21, c11 1910 LD a5, 4 * SIZE(AO) 1911 1912 LD b1, 0 * SIZE(BO) 1913 MOV c12, c11 1914 LD b2, 1 * SIZE(BO) 1915 MOV c22, c11 1916 LD b3, 2 * SIZE(BO) 1917 LD b5, 4 * SIZE(BO) 1918 LD b6, 8 * SIZE(BO) 1919 LD b7, 12 * SIZE(BO) 1920 1921#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1922 dsubu TEMP, K, KK 1923#elif defined(LEFT) 1924 daddiu TEMP, KK, 2 1925#else 1926 daddiu TEMP, KK, 1 1927#endif 1928 dsra L, TEMP, 2 1929 blez L, .L75 1930 NOP 1931#else 1932 LD a1, 0 * SIZE(AO) 1933 MTC $0, c11 1934 LD a2, 1 * SIZE(AO) 1935 MOV c21, c11 1936 LD a5, 4 * SIZE(AO) 1937 1938 LD b1, 0 * SIZE(B) 1939 MOV c12, c11 1940 LD b2, 1 * SIZE(B) 1941 MOV c22, c11 1942 LD b3, 2 * SIZE(B) 1943 LD b5, 4 * SIZE(B) 1944 dsra L, K, 2 1945 LD b6, 8 * SIZE(B) 1946 LD b7, 12 * SIZE(B) 1947 1948 blez L, .L75 1949 move BO, B 1950#endif 1951 .align 3 1952 1953.L72: 1954 LD a1, 0 * SIZE(AO) 1955 LD a2, 1 * SIZE(AO) 1956 LD b1, 0 * SIZE(BO) 1957 1958 MADD c11, c11, a1, b1 1959 MADD c12, c12, a2, b1 1960 1961 LD a1, 2 * SIZE(AO) 1962 LD a2, 3 * SIZE(AO) 1963 LD b1, 1 * SIZE(BO) 1964 1965 MADD c11, c11, a1, b1 1966 MADD c12, c12, a2, b1 1967 1968 LD a1, 4 * SIZE(AO) 1969 LD a2, 5 * SIZE(AO) 1970 LD b1, 2 * SIZE(BO) 1971 1972 MADD c11, c11, a1, b1 1973 MADD c12, c12, a2, b1 1974 1975 LD a1, 6 * SIZE(AO) 1976 LD a2, 7 * SIZE(AO) 1977 LD b1, 3 * SIZE(BO) 1978 1979 MADD c11, c11, a1, b1 1980 MADD c12, c12, a2, b1 1981 1982 daddiu L, L, -1 1983 daddiu AO, AO, 8 * SIZE 1984 bgtz L, .L72 1985 daddiu BO, BO, 4 * SIZE 1986 .align 3 1987 1988.L75: 1989#ifndef TRMMKERNEL 1990 andi L, K, 3 1991#else 1992 andi L, TEMP, 3 1993#endif 1994 NOP 1995 blez L, .L78 1996 NOP 1997 .align 3 1998 
1999.L76: 2000 LD a1, 0 * SIZE(AO) 2001 LD a2, 1 * SIZE(AO) 2002 LD b1, 0 * SIZE(BO) 2003 2004 MADD c11, c11, a1, b1 2005 MADD c12, c12, a2, b1 2006 2007 daddiu L, L, -1 2008 daddiu AO, AO, 2 * SIZE 2009 bgtz L, .L76 2010 daddiu BO, BO, 1 * SIZE 2011 2012.L78: 2013#ifndef TRMMKERNEL 2014 LD $f0, 0 * SIZE(CO1) 2015 daddiu I, I, -1 2016 LD $f1, 1 * SIZE(CO1) 2017 daddiu CO1,CO1, 2 * SIZE 2018 2019 ADD c11, c11, c21 2020 ADD c12, c12, c22 2021 2022 MADD c11, $f0, ALPHA, c11 2023 MADD c12, $f1, ALPHA, c12 2024 2025 ST c11, -2 * SIZE(CO1) 2026 bgtz I, .L71 2027 ST c12, -1 * SIZE(CO1) 2028#else 2029 ADD c11, c11, c21 2030 daddiu I, I, -1 2031 ADD c12, c12, c22 2032 daddiu CO1,CO1, 2 * SIZE 2033 2034 MUL c11, ALPHA, c11 2035 MUL c12, ALPHA, c12 2036 2037 ST c11, -2 * SIZE(CO1) 2038 ST c12, -1 * SIZE(CO1) 2039 2040#if ( defined(LEFT) && defined(TRANSA)) || \ 2041 (!defined(LEFT) && !defined(TRANSA)) 2042 dsubu TEMP, K, KK 2043#ifdef LEFT 2044 daddiu TEMP, TEMP, -2 2045#else 2046 daddiu TEMP, TEMP, -1 2047#endif 2048 2049 dsll L, TEMP, 1 + BASE_SHIFT 2050 dsll TEMP, TEMP, 0 + BASE_SHIFT 2051 2052 daddu AO, AO, L 2053 daddu BO, BO, TEMP 2054#endif 2055 2056#ifdef LEFT 2057 daddiu KK, KK, 2 2058#endif 2059 2060 bgtz I, .L71 2061 NOP 2062#endif 2063 .align 3 2064 2065.L80: 2066 andi I, M, 1 2067 blez I, .L89 2068 NOP 2069 2070#if defined(TRMMKERNEL) 2071#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 2072 move BO, B 2073#else 2074 dsll L, KK, 0 + BASE_SHIFT 2075 dsll TEMP, KK, 0 + BASE_SHIFT 2076 2077 daddu AO, AO, L 2078 daddu BO, B, TEMP 2079#endif 2080 2081 LD a1, 0 * SIZE(AO) 2082 MTC $0, c11 2083 LD a2, 1 * SIZE(AO) 2084 MOV c21, c11 2085 LD a3, 2 * SIZE(AO) 2086 LD a4, 3 * SIZE(AO) 2087 2088 LD b1, 0 * SIZE(BO) 2089 LD b2, 1 * SIZE(BO) 2090 LD b3, 2 * SIZE(BO) 2091 LD b4, 3 * SIZE(BO) 2092 LD b5, 4 * SIZE(BO) 2093 LD b6, 8 * SIZE(BO) 2094 LD b7, 12 * SIZE(BO) 2095 2096#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && 
defined(TRANSA)) 2097 dsubu TEMP, K, KK 2098#elif defined(LEFT) 2099 daddiu TEMP, KK, 1 2100#else 2101 daddiu TEMP, KK, 1 2102#endif 2103 dsra L, TEMP, 2 2104 blez L, .L85 2105 NOP 2106#else 2107 LD a1, 0 * SIZE(AO) 2108 MTC $0, c11 2109 LD a2, 1 * SIZE(AO) 2110 MOV c21, c11 2111 LD a3, 2 * SIZE(AO) 2112 LD a4, 3 * SIZE(AO) 2113 2114 LD b1, 0 * SIZE(B) 2115 LD b2, 1 * SIZE(B) 2116 LD b3, 2 * SIZE(B) 2117 LD b4, 3 * SIZE(B) 2118 LD b5, 4 * SIZE(B) 2119 LD b6, 8 * SIZE(B) 2120 LD b7, 12 * SIZE(B) 2121 2122 dsra L, K, 2 2123 blez L, .L85 2124 move BO, B 2125#endif 2126 .align 3 2127 2128.L82: 2129 LD a1, 0 * SIZE(AO) 2130 LD b1, 0 * SIZE(BO) 2131 2132 MADD c11, c11, a1, b1 2133 2134 LD a1, 1 * SIZE(AO) 2135 LD b1, 1 * SIZE(BO) 2136 2137 MADD c21, c21, a1, b1 2138 2139 LD a1, 2 * SIZE(AO) 2140 LD b1, 2 * SIZE(BO) 2141 2142 MADD c11, c11, a1, b1 2143 2144 LD a1, 3 * SIZE(AO) 2145 LD b1, 3 * SIZE(BO) 2146 2147 MADD c21, c21, a1, b1 2148 2149 daddiu L, L, -1 2150 daddiu AO, AO, 4 * SIZE 2151 bgtz L, .L82 2152 daddiu BO, BO, 4 * SIZE 2153 .align 3 2154 2155.L85: 2156#ifndef TRMMKERNEL 2157 andi L, K, 3 2158#else 2159 andi L, TEMP, 3 2160#endif 2161 NOP 2162 blez L, .L88 2163 NOP 2164 .align 3 2165 2166.L86: 2167 LD a1, 0 * SIZE(AO) 2168 LD b1, 0 * SIZE(BO) 2169 2170 MADD c11, c11, a1, b1 2171 2172 daddiu L, L, -1 2173 daddiu AO, AO, 1 * SIZE 2174 bgtz L, .L86 2175 daddiu BO, BO, 1 * SIZE 2176 2177 2178.L88: 2179#ifndef TRMMKERNEL 2180 LD $f0, 0 * SIZE(CO1) 2181 2182 ADD c11, c11, c21 2183 MADD c11, $f0, ALPHA, c11 2184 2185 ST c11, 0 * SIZE(CO1) 2186#else 2187 ADD c11, c11, c21 2188 MUL c11, ALPHA, c11 2189 2190 ST c11, 0 * SIZE(CO1) 2191#endif 2192 .align 3 2193 2194.L89: 2195#if defined(TRMMKERNEL) && !defined(LEFT) 2196 daddiu KK, KK, 1 2197#endif 2198 move B, BO 2199 .align 3 2200 2201 2202.L999: 2203 LDARG $16, 0($sp) 2204 LDARG $17, 8($sp) 2205 LDARG $18, 16($sp) 2206 LDARG $19, 24($sp) 2207 LDARG $20, 32($sp) 2208 LDARG $21, 40($sp) 2209 LDARG $22, 48($sp) 2210 2211 
ldc1 $f24, 56($sp) 2212 ldc1 $f25, 64($sp) 2213 ldc1 $f26, 72($sp) 2214 ldc1 $f27, 80($sp) 2215 ldc1 $f28, 88($sp) 2216 2217#if defined(TRMMKERNEL) 2218 LDARG $23, 96($sp) 2219 LDARG $24, 104($sp) 2220 LDARG $25, 112($sp) 2221#endif 2222 2223#ifndef __64BIT__ 2224 ldc1 $f20,120($sp) 2225 ldc1 $f21,128($sp) 2226 ldc1 $f22,136($sp) 2227 ldc1 $f23,144($sp) 2228#endif 2229 2230 j $31 2231 daddiu $sp, $sp, 160 2232 2233 EPILOGUE 2234