/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Matrix-multiply micro-kernel (also built as a TRMM kernel when
 * TRMMKERNEL is defined) for MIPS64.
 *
 * Structure, as visible in this file:
 *   - N is processed in groups of 4 columns (.L10), then an optional
 *     group of 2 (.L20, N & 2), then an optional single column
 *     (.L30, N & 1).
 *   - Inside each group, I counts down from M one step at a time; each
 *     step consumes 2*SIZE of A per k-step and 2*SIZE of B per column
 *     per k-step.
 *   - The k loop is unrolled by 4 (K >> 2) with a scalar remainder
 *     loop (K & 3).
 *   - Each element pair is finally updated with ALPHA_R / ALPHA_I and
 *     stored to C (added to the existing C value unless TRMMKERNEL).
 *
 * NOTE(review): the 2*SIZE pairs are presumably (real, imag) parts of
 * complex values -- consistent with ALPHA_R/ALPHA_I and ZBASE_SHIFT, but
 * the data layout itself is defined by the caller; confirm there.
 *
 * NOTE(review): the comments below assume the macros from common.h behave
 * as  MADD d,r,s,t => d = r + s*t,  NMSUB d,r,s,t => d = r - s*t,
 * MUL d,s,t => d = s*t, and that the instruction following every branch
 * executes as its delay slot (i.e. PROLOGUE selects noreorder mode).
 * None of these definitions are visible in this file -- confirm.
 */

#define ASSEMBLER
#include "common.h"

/* Integer register aliases: problem sizes and matrix pointers. */
#define M	$4
#define	N	$5
#define	K	$6
#define A	$9
#define B	$10
#define C	$11
#define LDC	$8		/* loaded from the stack in the prologue */

/* Running pointers into A and B for the current tile. */
#define AO	$12
#define BO	$13

/* Loop counters: I counts down from M, J over column groups, L over k. */
#define I	$2
#define J	$3
#define L	$7

/* Output pointers, one per C column of the current group. */
#define CO1	$14
#define CO2	$15
#define CO3	$16		/* $16/$17 are saved and restored by this routine */
#define CO4	$17

#if defined(TRMMKERNEL)
#define OFFSET	$18		/* loaded from the stack in the prologue */
#define KK	$19
#define TEMP	$20
#endif

/* Floating-point aliases: values loaded from A. */
#define a1	$f0
#define a2	$f1
#define a3	$f28
#define a4	$f29

/* Values loaded from B; reused as C temporaries in the store phase. */
#define b1	$f2
#define b2	$f3
#define b3	$f4
#define b4	$f5
#define b5	$f6
#define b6	$f7
#define b7	$f8
#define b8	$f9

#define a5	b8		/* alias of b8 ($f9) */

/* Accumulators: cXY, X = 1..8 product slot, Y = 1 (from a1/a3/a4 with */
/* MADD1/MADD3) or 2 (from a2 with MADD2/MADD4).                       */
#define c11	$f10
#define c12	$f11
#define c21	$f12
#define c22	$f13
#define c31	$f14
#define c32	$f17
#define c41	$f18
#define c42	$f19
#define c51	$f20
#define c52	$f21
#define c61	$f22
#define c62	$f23
#define c71	$f24
#define c72	$f25
#define c81	$f26
#define c82	$f27

#define ALPHA_R	$f15
#define ALPHA_I	$f16

/*
 * MADD1..MADD4 select, per build variant, whether each of the four partial
 * products is accumulated with MADD or with NMSUB.  Exactly one of the
 * four variant groups below is expected to be defined by the build.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD1	  MADD
#define MADD2	  MADD
#define MADD3	  MADD
#define MADD4	  NMSUB
#endif

#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD1	  MADD
#define MADD2	  MADD
#define MADD3	  NMSUB
#define MADD4	  MADD
#endif

#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define MADD1	  MADD
#define MADD2	  NMSUB
#define MADD3	  MADD
#define MADD4	  MADD
#endif

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define MADD1	  MADD
#define MADD2	  NMSUB
#define MADD3	  NMSUB
#define MADD4	  NMSUB
#endif

	PROLOGUE

	/* LDC is read from the caller's stack before the frame is opened. */
	LDARG	LDC,    0($sp)
	daddiu	$sp, $sp, -128		/* 128-byte save area */

	/* Save the registers this routine overwrites (restored at .L999). */
	SDARG	$16,   0($sp)
	SDARG	$17,   8($sp)
	sdc1	$f24, 16($sp)
	sdc1	$f25, 24($sp)
	sdc1	$f26, 32($sp)
	sdc1	$f27, 40($sp)
	sdc1	$f28, 48($sp)
	sdc1	$f29, 56($sp)

#if defined(TRMMKERNEL)
	SDARG	$18,  64($sp)
	SDARG	$19,  72($sp)
	SDARG	$20,  80($sp)

	/* 128 + 8: second stack slot relative to the entry $sp. */
	LDARG	OFFSET, 128 +  8($sp)
#endif

#ifndef __64BIT__
	sdc1	$f20, 88($sp)
	sdc1	$f21, 96($sp)
	sdc1	$f22,104($sp)
	sdc1	$f23,112($sp)
#endif

	/* Scale LDC by 2^ZBASE_SHIFT (presumably elements -> bytes). */
	dsll	LDC, LDC, ZBASE_SHIFT

#if defined(TRMMKERNEL) && !defined(LEFT)
	neg	KK, OFFSET
#endif

	/* J = N / 4: number of 4-column groups; skip to .L20 if none. */
	dsra	J,  N, 2
	blez	J, .L20
	nop

/* ------------------------------------------------------------------ */
/* 4-column group: set up CO1..CO4, advance C by 4*LDC, clear c11..c61 */
/* ------------------------------------------------------------------ */
.L10:
	move	CO1, C
	MTC	$0,  c11		/* c11 = 0; copied into the other accumulators */
	daddu	CO2, C,   LDC
	move	AO, A
	daddu	CO3, CO2, LDC
	daddiu	J, J, -1
	daddu	CO4, CO3, LDC
	MOV	c21, c11
	MOV	c31, c11
#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif
	MOV	c41, c11
	MOV	c51, c11
	move	I,  M
	daddu	C,  CO4, LDC

	blez	I, .L19
	MOV	c61, c11		/* follows the branch: executed either way */

/* One step of the M loop: preload A/B, clear remaining accumulators. */
.L11:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip the first KK k-steps: 1 pair of A and 4 pairs of B per step. */
	dsll	L,    KK,     ZBASE_SHIFT
	dsll	TEMP, KK, 2 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c81, c11

	LD	a3,  4 * SIZE(AO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c22, c11

	MOV	c32, c11
	LD	b3,  2 * SIZE(BO)
	MOV	c42, c11

	LD	b4,  3 * SIZE(BO)
	MOV	c52, c11
	LD	b5,  4 * SIZE(BO)
	MOV	c62, c11

	LD	b6,  8 * SIZE(BO)
	MOV	c72, c11
	LD	b7, 12 * SIZE(BO)
	MOV	c82, c11

	/* TEMP = number of k-steps to run for this tile. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 4
#endif
	dsra	L,  TEMP, 2		/* L = TEMP / 4 unrolled iterations */

	blez	L, .L15
	NOP
#else
	LD	a1,  0 * SIZE(AO)
	MOV	c71, c11
	LD	b1,  0 * SIZE(B)
	MOV	c81, c11

	LD	a3,  4 * SIZE(AO)
	MOV	c12, c11
	LD	b2,  1 * SIZE(B)
	MOV	c22, c11

	dsra	L,  K, 2		/* L = K / 4 unrolled iterations */
	MOV	c32, c11
	LD	b3,  2 * SIZE(B)
	MOV	c42, c11

	LD	b4,  3 * SIZE(B)
	MOV	c52, c11
	LD	b5,  4 * SIZE(B)
	MOV	c62, c11

	LD	b6,  8 * SIZE(B)
	MOV	c72, c11
	LD	b7, 12 * SIZE(B)
	MOV	c82, c11

	blez	L, .L15
	move	BO,  B			/* follows the branch: executed either way */
#endif

	/* Pipeline priming: issue the first four products of k-step 0 here; */
	/* .L12 re-issues the same four at its bottom for the next pass.     */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD1	c31, c31, a1, b3
	NOP
	blez	L, .L13			/* only one unrolled group: go drain it */
	MADD3	c41, c41, a1, b4
	.align	3

/* Main k loop, 4 k-steps per pass: AO += 8*SIZE, BO += 32*SIZE. */
.L12:
	MADD2	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD1	c51, c51, a1, b5
	NOP
	MADD3	c61, c61, a1, b2
	LD	a4,  2 * SIZE(AO)
	MADD1	c71, c71, a1, b3
	NOP
	MADD3	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	MADD1	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	NOP

	MADD2	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD1	c51, c51, a4, b7
	NOP
	MADD3	c61, c61, a4, b2
	NOP
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	MADD2	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	MADD1	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	NOP
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	NOP

	MADD2	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	MADD1	c51, c51, a3, b5
	NOP
	MADD3	c61, c61, a3, b2
	LD	a4,  6 * SIZE(AO)
	MADD1	c71, c71, a3, b3
	NOP
	MADD3	c81, c81, a3, b4
	LD	a3, 12 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	MADD1	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	daddiu	L, L, -1

	MADD2	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	MADD1	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE	/* offsets below are relative to the new BO */
	MADD3	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE	/* offsets below are relative to the new AO */
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	MADD2	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)

	/* Start the next pass's first four products before looping back. */
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	NOP
	MADD1	c31, c31, a1, b3
	NOP
	bgtz	L, .L12
	MADD3	c41, c41, a1, b4
	.align	3

/* Drain: the final unrolled group.  Same sequence as .L12 but without  */
/* the counter update and without starting another pass; on exit a1 and */
/* b1..b7 hold the operands for the next k-step at the advanced AO/BO.  */
.L13:
	MADD2	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD1	c51, c51, a1, b5
	NOP
	MADD3	c61, c61, a1, b2
	LD	a4,  2 * SIZE(AO)
	MADD1	c71, c71, a1, b3
	NOP
	MADD3	c81, c81, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 11 * SIZE(BO)

	MADD1	c11, c11, a4, b6
	LD	a2,  3 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	NOP

	MADD2	c12, c12, a2, b6
	LD	b6, 24 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD1	c51, c51, a4, b7
	NOP
	MADD3	c61, c61, a4, b2
	NOP
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	MADD2	c52, c52, a2, b7
	LD	b7, 28 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 19 * SIZE(BO)

	MADD1	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	NOP
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	NOP

	MADD2	c12, c12, a2, b1
	LD	b1, 32 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 21 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 22 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 23 * SIZE(BO)

	MADD1	c51, c51, a3, b5
	NOP
	MADD3	c61, c61, a3, b2
	LD	a4,  6 * SIZE(AO)
	MADD1	c71, c71, a3, b3
	NOP
	MADD3	c81, c81, a3, b4
	LD	a3, 12 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5, 36 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2, 25 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3, 26 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4, 27 * SIZE(BO)

	MADD1	c11, c11, a4, b6
	LD	a2,  7 * SIZE(AO)
	MADD3	c21, c21, a4, b2
	NOP
	MADD1	c31, c31, a4, b3
	NOP
	MADD3	c41, c41, a4, b4
	NOP

	MADD2	c12, c12, a2, b6
	LD	b6, 40 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 29 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 30 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 31 * SIZE(BO)

	MADD1	c51, c51, a4, b7
	daddiu	BO, BO, 32 * SIZE
	MADD3	c61, c61, a4, b2
	daddiu	AO, AO,  8 * SIZE
	MADD1	c71, c71, a4, b3
	NOP
	MADD3	c81, c81, a4, b4
	NOP

	MADD2	c52, c52, a2, b7
	LD	b7, 12 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	LD	b4,  3 * SIZE(BO)
	.align	3

/* Remainder: L = (k-steps) & 3 single steps. */
.L15:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L18
	NOP
	.align	3

/* One k-step per pass: AO += 2*SIZE, BO += 8*SIZE. */
.L16:
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	NOP
	MADD1	c31, c31, a1, b3
	NOP
	MADD3	c41, c41, a1, b4
	NOP

	MADD2	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD1	c51, c51, a1, b5
	daddiu	L, L, -1
	MADD3	c61, c61, a1, b2
	daddiu	AO, AO,  2 * SIZE
	MADD1	c71, c71, a1, b3
	daddiu	BO, BO,  8 * SIZE
	MADD3	c81, c81, a1, b4
	LD	a1,  0 * SIZE(AO)

	MADD2	c52, c52, a2, b5
	LD	b5,  4 * SIZE(BO)
	MADD4	c62, c62, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c72, c72, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c82, c82, a2, b4
	bgtz	L, .L16
	LD	b4,  3 * SIZE(BO)

/*
 * Write-back for the 4-column group.  For each column the two partial
 * sums are first merged (cX1 += c(X+1)2, cX2 += c(X+1)1), then combined
 * with alpha:
 *     out0 = [C0 +] ALPHA_R * s1 - ALPHA_I * s2
 *     out1 = [C1 +] ALPHA_R * s2 + ALPHA_I * s1
 * where the bracketed C term is present only in the non-TRMM build.
 * b1..b8 are reused as temporaries; c11..c51 are re-zeroed on the way out.
 */
.L18:
#ifndef TRMMKERNEL
	LD	b1,  0 * SIZE(CO1)
	ADD	c11, c11, c22
	LD	b2,  1 * SIZE(CO1)
	ADD	c12, c12, c21
	LD	b3,  0 * SIZE(CO2)
	ADD	c31, c31, c42
	LD	b4,  1 * SIZE(CO2)
	ADD	c32, c32, c41

	LD	b5,  0 * SIZE(CO3)
	ADD	c51, c51, c62
	LD	b6,  1 * SIZE(CO3)
	ADD	c52, c52, c61
	LD	b7,  0 * SIZE(CO4)
	ADD	c71, c71, c82
	LD	b8,  1 * SIZE(CO4)
	ADD	c72, c72, c81

	MADD	b1, b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE	/* stores below use negative offsets */
	MADD	b2, b2, ALPHA_R, c12
	daddiu	CO2,CO2, 2 * SIZE
	MADD	b3, b3, ALPHA_R, c31
	daddiu	CO3,CO3, 2 * SIZE
	MADD	b4, b4, ALPHA_R, c32
	daddiu	CO4,CO4, 2 * SIZE

	MADD	b5, b5, ALPHA_R, c51
	daddiu	I, I, -1
	MADD	b6, b6, ALPHA_R, c52
	NOP
	MADD	b7, b7, ALPHA_R, c71
	NOP
	MADD	b8, b8, ALPHA_R, c72
	NOP

	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11		/* last read of c11 was just above; reset to 0 */
	NMSUB	b3, b3, ALPHA_I, c32
	NOP
	MADD	b4, b4, ALPHA_I, c31
	NOP

	ST	b1, -2 * SIZE(CO1)
	NMSUB	b5, b5, ALPHA_I, c52
	ST	b2, -1 * SIZE(CO1)
	MADD	b6, b6, ALPHA_I, c51
	ST	b3, -2 * SIZE(CO2)
	NMSUB	b7, b7, ALPHA_I, c72
	ST	b4, -1 * SIZE(CO2)
	MADD	b8, b8, ALPHA_I, c71

	ST	b5, -2 * SIZE(CO3)
	MOV	c21, c11
	ST	b6, -1 * SIZE(CO3)
	MOV	c31, c11
	ST	b7, -2 * SIZE(CO4)
	MOV	c41, c11
	ST	b8, -1 * SIZE(CO4)
	MOV	c51, c11

#else

	ADD	c11, c11, c22
	daddiu	CO1,CO1, 2 * SIZE
	ADD	c12, c12, c21
	daddiu	CO2,CO2, 2 * SIZE
	ADD	c31, c31, c42
	daddiu	CO3,CO3, 2 * SIZE
	ADD	c32, c32, c41
	daddiu	CO4,CO4, 2 * SIZE

	ADD	c51, c51, c62
	daddiu	I, I, -1
	ADD	c52, c52, c61
	ADD	c71, c71, c82
	ADD	c72, c72, c81

	/* TRMM: C is overwritten, so start from ALPHA_R * sum (no C load). */
	MUL	b1, ALPHA_R, c11
	MUL	b2, ALPHA_R, c12
	MUL	b3, ALPHA_R, c31
	MUL	b4, ALPHA_R, c32

	MUL	b5, ALPHA_R, c51
	MUL	b6, ALPHA_R, c52
	MUL	b7, ALPHA_R, c71
	MUL	b8, ALPHA_R, c72

	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11
	NMSUB	b3, b3, ALPHA_I, c32
	NOP
	MADD	b4, b4, ALPHA_I, c31
	NOP

	ST	b1, -2 * SIZE(CO1)
	NMSUB	b5, b5, ALPHA_I, c52
	ST	b2, -1 * SIZE(CO1)
	MADD	b6, b6, ALPHA_I, c51
	ST	b3, -2 * SIZE(CO2)
	NMSUB	b7, b7, ALPHA_I, c72
	ST	b4, -1 * SIZE(CO2)
	MADD	b8, b8, ALPHA_I, c71

	ST	b5, -2 * SIZE(CO3)
	MOV	c21, c11
	ST	b6, -1 * SIZE(CO3)
	MOV	c31, c11
	ST	b7, -2 * SIZE(CO4)
	MOV	c41, c11
	ST	b8, -1 * SIZE(CO4)
	MOV	c51, c11

	/* Advance AO/BO past the k-steps this tile did not consume. */
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -4
#endif

	dsll	L,    TEMP,     ZBASE_SHIFT
	dsll	TEMP, TEMP, 2 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif
	bgtz	I, .L11
	MOV	c61, c11		/* follows the branch: executed either way */
	.align	3

/* End of one 4-column group: B advances to where BO stopped. */
.L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 4
#endif

	bgtz	J, .L10
	move	B, BO			/* follows the branch: executed either way */
	.align	3

/* ------------------------------------------------------------------ */
/* 2-column group, taken when bit 1 of N is set.                       */
/* ------------------------------------------------------------------ */
.L20:
	andi	J,  N, 2
	MTC	$0,  c11
	blez	J, .L30
	move	CO1, C

	daddu	CO2, C,   LDC
	daddu	C,   CO2, LDC

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	move	I,  M
	blez	I, .L29
	move	AO, A
	.align	3

/* One step of the M loop for two columns. */
.L21:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip the first KK k-steps: 1 pair of A and 2 pairs of B per step. */
	dsll	L,    KK,     ZBASE_SHIFT
	dsll	TEMP, KK, 1 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c31, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c41, c11
	LD	b2,  1 * SIZE(BO)

	LD	b3,  2 * SIZE(BO)
	MOV	c12, c11
	LD	b4,  3 * SIZE(BO)
	MOV	c22, c11
	LD	b5,  4 * SIZE(BO)
	MOV	c32, c11

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 2
#endif
	dsra	L,  TEMP, 2
	blez	L, .L25
	MOV	c42, c11

#else
	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(B)
	MOV	c31, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c41, c11
	LD	b2,  1 * SIZE(B)
	dsra	L,  K, 2

	LD	b3,  2 * SIZE(B)
	MOV	c12, c11
	LD	b4,  3 * SIZE(B)
	MOV	c22, c11
	LD	b5,  4 * SIZE(B)
	MOV	c32, c11

	NOP
	MOV	c42, c11
	blez	L, .L25
	move	BO,  B
#endif
	.align	3

/* Main k loop, 4 k-steps per pass: AO += 8*SIZE, BO += 16*SIZE. */
.L22:
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD1	c31, c31, a1, b3
	NOP
	MADD3	c41, c41, a1, b4
	LD	a1,  2 * SIZE(AO)

	MADD2	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  5 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  7 * SIZE(BO)

	MADD1	c11, c11, a1, b5
	LD	a2,  3 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	NOP
	MADD1	c31, c31, a1, b3
	NOP
	MADD3	c41, c41, a1, b4
	LD	a1,  8 * SIZE(AO)

	MADD2	c12, c12, a2, b5
	LD	b5, 12 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  9 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 10 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 11 * SIZE(BO)

	MADD1	c11, c11, a3, b1
	LD	a2,  5 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	NOP
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	LD	a3,  6 * SIZE(AO)

	MADD2	c12, c12, a2, b1
	LD	b1, 16 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 13 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 14 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 15 * SIZE(BO)

	MADD1	c11, c11, a3, b5
	LD	a2,  7 * SIZE(AO)
	MADD3	c21, c21, a3, b2
	daddiu	AO, AO,  8 * SIZE	/* a3 reload below is relative to the new AO */
	MADD1	c31, c31, a3, b3
	NOP
	MADD3	c41, c41, a3, b4
	LD	a3,  4 * SIZE(AO)

	MADD2	c12, c12, a2, b5
	LD	b5, 20 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2, 17 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3, 18 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4, 19 * SIZE(BO)

	bgtz	L, .L22
	daddiu	BO, BO, 16 * SIZE	/* follows the branch: executed either way */
	.align	3

/* Remainder: L = (k-steps) & 3 single steps. */
.L25:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L28
	NOP
	.align	3

/* One k-step per pass: AO += 2*SIZE, BO += 4*SIZE. */
.L26:
	MADD1	c11, c11, a1, b1
	LD	a2,  1 * SIZE(AO)
	MADD3	c21, c21, a1, b2
	daddiu	L, L, -1
	MADD1	c31, c31, a1, b3
	daddiu	BO, BO,  4 * SIZE
	MADD3	c41, c41, a1, b4
	LD	a1,  2 * SIZE(AO)

	MADD2	c12, c12, a2, b1
	LD	b1,  0 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	b2,  1 * SIZE(BO)
	MADD2	c32, c32, a2, b3
	LD	b3,  2 * SIZE(BO)
	MADD4	c42, c42, a2, b4
	LD	b4,  3 * SIZE(BO)

	bgtz	L, .L26
	daddiu	AO, AO,  2 * SIZE

/* Write-back for the 2-column group (same formulas as .L18).  The last */
/* store (b4) is issued right after the loop-back branch at the bottom. */
.L28:
#ifndef TRMMKERNEL
	LD	b1,  0 * SIZE(CO1)
	ADD	c11, c11, c22
	LD	b2,  1 * SIZE(CO1)
	ADD	c12, c12, c21
	LD	b3,  0 * SIZE(CO2)
	ADD	c31, c31, c42
	LD	b4,  1 * SIZE(CO2)
	ADD	c32, c32, c41

	MADD	b1, b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE
	MADD	b2, b2, ALPHA_R, c12
	daddiu	CO2,CO2, 2 * SIZE
	MADD	b3, b3, ALPHA_R, c31
	daddiu	I, I, -1
	MADD	b4, b4, ALPHA_R, c32

	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11
	NMSUB	b3, b3, ALPHA_I, c32
	NOP
	MADD	b4, b4, ALPHA_I, c31
	NOP

	ST	b1, -2 * SIZE(CO1)
	ST	b2, -1 * SIZE(CO1)
	ST	b3, -2 * SIZE(CO2)
#else
	ADD	c11, c11, c22
	ADD	c12, c12, c21
	ADD	c31, c31, c42
	ADD	c32, c32, c41

	MUL	b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE
	MUL	b2, ALPHA_R, c12
	daddiu	CO2,CO2, 2 * SIZE
	MUL	b3, ALPHA_R, c31
	daddiu	I, I, -1
	MUL	b4, ALPHA_R, c32

	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11
	NMSUB	b3, b3, ALPHA_I, c32
	NOP
	MADD	b4, b4, ALPHA_I, c31
	NOP

	ST	b1, -2 * SIZE(CO1)
	ST	b2, -1 * SIZE(CO1)
	ST	b3, -2 * SIZE(CO2)

	/* Advance AO/BO past the k-steps this tile did not consume. */
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -2
#endif

	dsll	L,    TEMP,     ZBASE_SHIFT
	dsll	TEMP, TEMP, 1 + ZBASE_SHIFT

	daddu	AO, AO, L
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif
#endif
	bgtz	I, .L21
	ST	b4, -1 * SIZE(CO2)	/* follows the branch: executed either way */
	.align	3

.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 2
#endif

	move	B, BO
	.align	3

/* ------------------------------------------------------------------ */
/* Single column, taken when bit 0 of N is set.                        */
/* ------------------------------------------------------------------ */
.L30:
	andi	J,  N, 1
	MTC	$0,  c11
	blez	J, .L999
	move	CO1, C

#if defined(TRMMKERNEL) &&  defined(LEFT)
	move	KK, OFFSET
#endif

	move	I,  M
	daddu	C,   CO1, LDC
	blez	I, .L39
	move	AO, A
	.align	3

/* One step of the M loop for one column. */
.L31:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	move	BO,  B
#else
	/* Skip the first KK k-steps: 1 pair of A and 1 pair of B per step. */
	dsll	TEMP, KK, ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, B,  TEMP
#endif

	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(BO)
	MOV	c31, c11
	LD	a2,  1 * SIZE(AO)

	MOV	c41, c11
	LD	b2,  1 * SIZE(BO)
	MOV	c12, c11
	NOP

	MOV	c22, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c32, c11
	LD	b3,  4 * SIZE(BO)

	/* Both non-subtracting arms add 1 here (the tile is 1 wide both ways). */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	dsubu	TEMP, K, KK
#elif defined(LEFT)
	daddiu	TEMP, KK, 1
#else
	daddiu	TEMP, KK, 1
#endif
	dsra	L,  TEMP, 2

	blez	L, .L35
	MOV	c42, c11
#else
	LD	a1,  0 * SIZE(AO)
	MOV	c21, c11
	LD	b1,  0 * SIZE(B)
	MOV	c31, c11
	LD	a2,  1 * SIZE(AO)

	MOV	c41, c11
	LD	b2,  1 * SIZE(B)
	MOV	c12, c11
	dsra	L,  K, 2

	MOV	c22, c11
	LD	a3,  4 * SIZE(AO)
	MOV	c32, c11
	LD	b3,  4 * SIZE(B)

	NOP
	MOV	c42, c11
	blez	L, .L35
	move	BO,  B
#endif
	.align	3

/* Main k loop, 4 k-steps per pass: AO += 8*SIZE, BO += 8*SIZE. */
.L32:
	MADD1	c11, c11, a1, b1
	LD	b4,  3 * SIZE(BO)
	MADD3	c21, c21, a1, b2
	LD	a1,  2 * SIZE(AO)
	MADD2	c12, c12, a2, b1
	LD	b1,  2 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	a2,  3 * SIZE(AO)

	MADD1	c11, c11, a1, b1
	LD	b2,  5 * SIZE(BO)
	MADD3	c21, c21, a1, b4
	LD	a1,  8 * SIZE(AO)
	MADD2	c12, c12, a2, b1
	LD	b1,  8 * SIZE(BO)
	MADD4	c22, c22, a2, b4
	LD	a2,  5 * SIZE(AO)

	MADD1	c11, c11, a3, b3
	LD	b4,  7 * SIZE(BO)
	MADD3	c21, c21, a3, b2
	LD	a3,  6 * SIZE(AO)
	MADD2	c12, c12, a2, b3
	LD	b3,  6 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	a2,  7 * SIZE(AO)

	MADD1	c11, c11, a3, b3
	LD	b2,  9 * SIZE(BO)
	MADD3	c21, c21, a3, b4
	LD	a3, 12 * SIZE(AO)
	MADD2	c12, c12, a2, b3
	LD	b3, 12 * SIZE(BO)
	MADD4	c22, c22, a2, b4
	LD	a2,  9 * SIZE(AO)

	daddiu	AO, AO,  8 * SIZE
	daddiu	L, L, -1

	bgtz	L, .L32
	daddiu	BO, BO,  8 * SIZE	/* follows the branch: executed either way */
	.align	3

/* Remainder: L = (k-steps) & 3 single steps. */
.L35:
#ifndef TRMMKERNEL
	andi	L,  K, 3
#else
	andi	L,  TEMP, 3
#endif
	NOP
	blez	L, .L38
	NOP
	.align	3

/* One k-step per pass: AO += 2*SIZE, BO += 2*SIZE. */
.L36:
	MADD1	c11, c11, a1, b1
	daddiu	L, L, -1
	MADD3	c21, c21, a1, b2
	LD	a1,  2 * SIZE(AO)
	MADD2	c12, c12, a2, b1
	LD	b1,  2 * SIZE(BO)
	MADD4	c22, c22, a2, b2
	LD	a2,  3 * SIZE(AO)

	LD	b2,  3 * SIZE(BO)
	daddiu	BO, BO,  2 * SIZE
	bgtz	L, .L36
	daddiu	AO, AO,  2 * SIZE

/* Write-back for the single column (same formulas as .L18). */
.L38:
#ifndef TRMMKERNEL
	LD	b1,  0 * SIZE(CO1)
	ADD	c11, c11, c22
	LD	b2,  1 * SIZE(CO1)
	ADD	c12, c12, c21

	MADD	b1, b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE
	MADD	b2, b2, ALPHA_R, c12
	daddiu	I, I, -1

	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11

	ST	b1, -2 * SIZE(CO1)
	NOP
	bgtz	I, .L31
	ST	b2, -1 * SIZE(CO1)	/* follows the branch: executed either way */
#else
	ADD	c11, c11, c22
	ADD	c12, c12, c21

	MUL	b1, ALPHA_R, c11
	daddiu	CO1,CO1, 2 * SIZE
	MUL	b2, ALPHA_R, c12
	daddiu	I, I, -1

	NMSUB	b1, b1, ALPHA_I, c12
	NOP
	MADD	b2, b2, ALPHA_I, c11
	MTC	$0,  c11

	/* Advance AO/BO past the k-steps this tile did not consume. */
#if ( defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	dsubu	TEMP, K, KK
#ifdef LEFT
	daddiu	TEMP, TEMP, -1
#else
	daddiu	TEMP, TEMP, -1
#endif

	dsll	TEMP, TEMP, ZBASE_SHIFT

	daddu	AO, AO, TEMP
	daddu	BO, BO, TEMP
#endif

#ifdef LEFT
	daddiu	KK, KK, 1
#endif

	ST	b1, -2 * SIZE(CO1)
	NOP
	bgtz	I, .L31
	ST	b2, -1 * SIZE(CO1)	/* follows the branch: executed either way */
#endif
	.align	3

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	daddiu	KK, KK, 1
#endif
	move	B, BO
	.align	3


/* Epilogue: restore everything saved in the prologue and return. */
.L999:
	LDARG	$16,   0($sp)
	LDARG	$17,   8($sp)
	ldc1	$f24, 16($sp)
	ldc1	$f25, 24($sp)
	ldc1	$f26, 32($sp)
	ldc1	$f27, 40($sp)
	ldc1	$f28, 48($sp)
	ldc1	$f29, 56($sp)

#if defined(TRMMKERNEL)
	LDARG	$18,  64($sp)
	LDARG	$19,  72($sp)
	LDARG	$20,  80($sp)
#endif

#ifndef __64BIT__
	ldc1	$f20, 88($sp)
	ldc1	$f21, 96($sp)
	ldc1	$f22,104($sp)
	ldc1	$f23,112($sp)
#endif

	j	$31
	daddiu	$sp, $sp, 128		/* release the frame right after the jump */

	EPILOGUE