/*
 * Preprocessor setup and register map for the kernel that follows.
 *
 * Every name below is a plain textual alias for a MIPS register or a small
 * integer; nothing here emits code by itself except the two gs* encoder
 * macros, which expand to a raw .word.
 */
#define REALNAME ASMNAME
#define ASSEMBLER
#include "common.h"

/*
 * FETCH is used by the kernel only with $0 as the destination register
 * (e.g. FETCH $0,(PREB)), i.e. a load whose result is discarded.
 * NOTE(review): presumably acts as a prefetch on the target core - confirm.
 */
#define FETCH ld

/*
 * Hand-assembled instruction words emitted through .word.
 *   bits 31..26  opcode (0x32 for gsLQC1, 0x3A for gsSQC1)
 *   bits 25..21  base
 *   bits 20..16  ft
 *   bit  15      constant 1
 *   bits 14..6   offset (placed by <<6; not masked here)
 *   bit  5       constant 1
 *   bits  4..0   fq
 * Arguments are numeric register indices (see the Rn / Fn constants), not
 * $-prefixed register names.  Each parameter is parenthesized so that an
 * argument which is itself an expression (for example an offset written as
 * a sum) is evaluated as a unit before it is shifted and or-ed in.
 * NOTE(review): presumably the Loongson 128-bit FP load/store pair - verify
 * the encoding against the ISA manual before relying on these.
 */
#define gsLQC1(base,fq,ft,offset) .word(0x32<<26|(base)<<21|(ft)<<16|0x1<<15|(offset)<<6|0x1<<5|(fq))
#define gsSQC1(base,fq,ft,offset) .word(0x3A<<26|(base)<<21|(ft)<<16|0x1<<15|(offset)<<6|0x1<<5|(fq))

/*
 * Incoming operands: the kernel reads all of these before writing them
 * (M/N/K are copied to MCO/NCO/KCO, A to AO, B to BO, C to CO1, and LDC is
 * scaled by BASE_SHIFT).  M, N, K, A and B are afterwards reused as loop
 * counters and running pointers.
 */
#define M   $4
#define N   $5
#define K   $6
#define A   $8
#define B   $9
#define C   $10
#define LDC $11

/* Saved start of A (AO) and start of the current B panel (BO). */
#define AO $12
#define BO $13

/* Column pointers into C: CO1 = C, and each next one is LDC further on. */
#define CO1 $14
#define CO2 $15
#define CO3 $16
#define CO4 $17

/* Backups of the original K, M and N. */
#define KCO $18
#define MCO $19
#define NCO $20

/*
 * SPANB / SPANA are byte spans derived from KCO by a left shift;
 * PREB / PREA are the addresses handed to FETCH inside the loops.
 */
#define SPANB $21
#define PREB  $23
#define PREA  $24
#define SPANA $25

/*
 * ALPHA lives in $f15, the same register as b7.  The kernel stores it to
 * 152($sp) on entry and reloads it from there before each write-back,
 * because the inner loops overwrite b7.
 */
#define ALPHA $f15

#if defined(TRMMKERNEL)
/* TRMM only: OFFSET is loaded from 160($sp); KK and TEMP are scratch. */
#define OFFSET $2
#define KK     $3
#define TEMP   $7
#endif

/*
 * Numeric general-register indices.
 * NOTE(review): presumably operands for gsLQC1/gsSQC1 - confirm at the
 * call sites.
 */
#define R8  8
#define R9  9
#define R14 14
#define R15 15
#define R16 16
#define R17 17

/*
 * Accumulators.  tij collects the products a(i-1)*b(j-1) and is written
 * back to element i-1 of column COj.
 */
#define t11 $f30
#define t21 $f31
#define t31 $f28
#define t41 $f29

#define t12 $f26
#define t22 $f27
#define t32 $f24
#define t42 $f25

#define t13 $f22
#define t23 $f23
#define t33 $f20
#define t43 $f21

#define t14 $f18
#define t24 $f19
#define t34 $f16
#define t44 $f17

/*
 * Values of C loaded during the GEMM write-back.  These share $f0-$f14
 * with the a/b operand registers below, and c44 shares $f0 with c11
 * ($f15 is taken by ALPHA), so c11 must be consumed before c44 is loaded.
 */
#define c11 $f0
#define c21 $f1
#define c31 $f2
#define c41 $f3

#define c12 $f4
#define c22 $f5
#define c32 $f6
#define c42 $f7

#define c13 $f8
#define c23 $f9
#define c33 $f10
#define c43 $f11

#define c14 $f12
#define c24 $f13
#define c34 $f14
#define c44 $f0

/* Operand registers: a0-a7 hold values loaded from A, b0-b7 from B. */
#define a0 $f0
#define a1 $f1
#define a2 $f2
#define a3 $f3
#define a4 $f4
#define a5 $f5
#define a6 $f6
#define a7 $f7
#define b0 $f8
#define b1 $f9
#define b2 $f10
#define b3 $f11
#define b4 $f12
#define b5 $f13
#define b6 $f14
#define b7 $f15

/* Numeric floating-point register indices, in descending order. */
#define F31 31
#define F30 30
#define F29 29
#define F28 28
#define F27 27
#define F26 26
#define F25 25
#define F24 24
#define F23 23
#define F22 22
#define F21 21
#define F20 20
#define F19 19
119#define F18 18 120#define F17 17 121#define F16 16 122#define F15 15 123#define F14 14 124#define F13 13 125#define F12 12 126#define F11 11 127#define F10 10 128#define F9 9 129#define F8 8 130#define F7 7 131#define F6 6 132#define F5 5 133#define F4 4 134#define F3 3 135#define F2 2 136#define F1 1 137#define F0 0 138 139 PROLOGUE 140 141 daddiu $sp, $sp, -160 142 sd $16, 0($sp) 143 sd $17, 8($sp) 144 sd $18, 16($sp) 145 sd $19, 24($sp) 146 sd $20, 32($sp) 147 sd $21, 40($sp) 148 sd $22, 48($sp) 149 ST $f24, 56($sp) 150 ST $f25, 64($sp) 151 ST $f26, 72($sp) 152 ST $f27, 80($sp) 153 ST $f28, 88($sp) 154 sd $23, 96($sp) 155 sd $24, 104($sp) 156 sd $25, 112($sp) 157 ST $f20,120($sp) 158 ST $f21,128($sp) 159 ST $f22,136($sp) 160 ST $f23,144($sp) 161 162 163 .align 5 164.L0_N4: # Loop N 165 ST ALPHA,152($sp) # Backup ALPHA 166 move MCO,M # Backup M 167 168 move NCO,N # Backup N 169 move KCO,K # Backup K 170 171 move AO,A # Backup A_addr 172 dsra N,NCO,2 # N=NCO/2 173 174 dsll LDC,LDC,BASE_SHIFT # LDC*8Byte 175 dsll SPANB,KCO,2+BASE_SHIFT # SPANB=KC*4nr*8Byte=KC*2^5 176 177#if defined(TRMMKERNEL) 178 LDARG OFFSET,160($sp) # OFFSET is relate to the data part 179#endif 180 181#if defined(TRMMKERNEL) && !defined(LEFT) 182 neg KK,OFFSET 183#endif 184 185 move BO,B # Backup B_addr 186 beq N,$0,.L0_N2 # N=0,NCO<4 187 dsll SPANA,KCO,1+BASE_SHIFT # SPANA = KCO*2mr*8Byte 188 189.L0_N4_Lb: # mr=4,nr=4 190 move CO1,C 191 dsra M,MCO,2 # M=MCO/2 192 193 move A,AO # Reset A 194 daddu CO2,C,LDC 195 196 daddu PREB,BO,SPANB # PreB point next panelB 197 daddu CO3,CO2,LDC 198 199 daddu PREA,AO,SPANA 200 daddu CO4,CO3,LDC 201 202#if defined(TRMMKERNEL) && defined(LEFT) 203 move KK,OFFSET 204#endif 205 beqz M,.L14_M2 206 daddu C,CO4,LDC # move C to next panel Cj 207 208.L10: 209#if defined(TRMMKERNEL) 210#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 211 move B,BO # (SIDE=L and UPLO=L) or (SIZE=R and UPLO=U) 212#else 213 dsll K,KK,2 + BASE_SHIFT # KK is 
the length that needs to span to the data part 214 dsll TEMP,KK,2 + BASE_SHIFT 215 216 daddu A,A,K # move A B to data part 217 daddu B,BO,TEMP 218#endif 219 220 MTC $0,t11 # GEMM part NR=4,MR=4 221 LD a0,0(A) 222 223 MOV t21,t11 224 MOV t31,t11 225 LD a1,1*SIZE(A) 226 227 MOV t41,t11 228 MOV t12,t11 229 LD b0,0(B) 230 231 MOV t22,t11 232 MOV t32,t11 233 LD b1,1*SIZE(B) 234 235 MOV t42,t11 236 LD a2,2*SIZE(A) 237 238 MOV t13,t11 239 MOV t23,t11 240 LD b2,2*SIZE(B) 241 242 MOV t33,t11 243 MOV t43,t11 244 LD a3,3*SIZE(A) 245 246 MOV t14,t11 247 MOV t24,t11 248 LD b3,3*SIZE(B) 249 250#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 251 dsubu TEMP,KCO,KK # temp is the length of the data part 252#elif defined(LEFT) 253 daddiu TEMP, KK, 4 # S=L,U=L 254#else 255 daddiu TEMP, KK, 4 # S=R,U=U,for this two situation KK is the length of the data part 256#endif 257 dsra K,TEMP,2 # K=KCO/2 258 MOV t34,t11 259 beqz K,.L15 260 MOV t44,t11 261 262#else 263 move B,BO # Reset B 264 MTC $0,t11 # GEMM part NR=4,MR=4 265 LD a0,0(A) 266 267 MOV t21,t11 268 MOV t31,t11 269 LD a1,1*SIZE(A) 270 271 MOV t41,t11 272 MOV t12,t11 273 LD b0,0(B) 274 275 MOV t22,t11 276 MOV t32,t11 277 LD b1,1*SIZE(B) 278 279 MOV t42,t11 280 dsra K,KCO,2 # K=KCO/2 281 LD a2,2*SIZE(A) 282 283 MOV t13,t11 284 MOV t23,t11 285 LD b2,2*SIZE(B) 286 287 MOV t33,t11 288 MOV t43,t11 289 LD a3,3*SIZE(A) 290 291 MOV t14,t11 292 MOV t24,t11 293 LD b3,3*SIZE(B) 294 295 MOV t34,t11 296 beqz K,.L15 297 MOV t44,t11 # clear 16 results registers 298#endif 299 300 .align 5 301.L11: # kr=4 302 MADD t11,t11,a0,b0 303 MADD t21,t21,a1,b0 304 LD a4,4*SIZE(A) 305 306 MADD t12,t12,a0,b1 307 MADD t22,t22,a1,b1 308 LD a5,5*SIZE(A) 309 310 MADD t31,t31,a2,b0 311 MADD t41,t41,a3,b0 312 LD b4,4*SIZE(B) 313 314 MADD t32,t32,a2,b1 315 MADD t42,t42,a3,b1 316 LD b5,5*SIZE(B) 317 FETCH $0,(PREB) 318 319 MADD t13,t13,a0,b2 320 MADD t23,t23,a1,b2 321 LD a6,6*SIZE(A) 322 323 MADD t14,t14,a0,b3 324 MADD t24,t24,a1,b3 325 
LD b6,6*SIZE(B) 326 FETCH $0,(PREA) 327 328 MADD t33,t33,a2,b2 329 MADD t43,t43,a3,b2 330 LD a7,7*SIZE(A) 331 332 MADD t34,t34,a2,b3 333 MADD t44,t44,a3,b3 334 LD b7,7*SIZE(B) 335 336.L12: 337 MADD t11,t11,a4,b4 338 MADD t21,t21,a5,b4 339 LD a0,8*SIZE(A) 340 341 MADD t12,t12,a4,b5 342 MADD t22,t22,a5,b5 343 LD a1,9*SIZE(A) 344 345 MADD t31,t31,a6,b4 346 MADD t41,t41,a7,b4 347 LD b0,8*SIZE(B) 348 349 MADD t32,t32,a6,b5 350 MADD t42,t42,a7,b5 351 LD b1,9*SIZE(B) 352 353 FETCH $0,4*SIZE(PREB) 354 MADD t13,t13,a4,b6 355 MADD t23,t23,a5,b6 356 LD a2,10*SIZE(A) 357 358 MADD t14,t14,a4,b7 359 MADD t24,t24,a5,b7 360 LD b2,10*SIZE(B) 361 362 FETCH $0,4*SIZE(PREA) 363 MADD t33,t33,a6,b6 364 MADD t43,t43,a7,b6 365 LD a3,11*SIZE(A) 366 367 MADD t34,t34,a6,b7 368 MADD t44,t44,a7,b7 369 LD b3,11*SIZE(B) 370 371.L13: 372 MADD t11,t11,a0,b0 373 MADD t21,t21,a1,b0 374 LD a4,12*SIZE(A) 375 376 MADD t12,t12,a0,b1 377 MADD t22,t22,a1,b1 378 LD a5,13*SIZE(A) 379 380 MADD t31,t31,a2,b0 381 MADD t41,t41,a3,b0 382 LD b4,12*SIZE(B) 383 384 FETCH $0,8*SIZE(PREA) 385 MADD t32,t32,a2,b1 386 MADD t42,t42,a3,b1 387 LD b5,13*SIZE(B) 388 389 FETCH $0,8*SIZE(PREB) 390 MADD t13,t13,a0,b2 391 MADD t23,t23,a1,b2 392 LD a6,14*SIZE(A) 393 394 MADD t14,t14,a0,b3 395 MADD t24,t24,a1,b3 396 daddu A,A,16*SIZE # 4mr*4kr 397 LD b6,14*SIZE(B) 398 399 MADD t33,t33,a2,b2 400 MADD t43,t43,a3,b2 401 daddu B,B,16*SIZE # 4nr*4kr 402 LD a7,-1*SIZE(A) 403 404 MADD t34,t34,a2,b3 405 MADD t44,t44,a3,b3 406 LD b7,-1*SIZE(B) 407 408.L14: 409 MADD t11,t11,a4,b4 410 MADD t21,t21,a5,b4 411 LD a0,0(A) 412 413 MADD t12,t12,a4,b5 414 MADD t22,t22,a5,b5 415 LD a1,1*SIZE(A) 416 417 MADD t31,t31,a6,b4 418 MADD t41,t41,a7,b4 419 daddiu K,K,-1 420 LD b0,0(B) 421 422 MADD t32,t32,a6,b5 423 MADD t42,t42,a7,b5 424 daddu PREA,PREA,16*SIZE 425 LD b1,1*SIZE(B) 426 427 FETCH $0,12*SIZE(PREB) 428 MADD t13,t13,a4,b6 429 MADD t23,t23,a5,b6 430 LD a2,2*SIZE(A) 431 432 FETCH $0,-4*SIZE(PREA) 433 MADD t14,t14,a4,b7 434 MADD t24,t24,a5,b7 435 LD 
b2,2*SIZE(B) 436 437 MADD t33,t33,a6,b6 438 MADD t43,t43,a7,b6 439 daddu PREB,PREB,16*SIZE 440 LD a3,3*SIZE(A) 441 442 MADD t34,t34,a6,b7 443 MADD t44,t44,a7,b7 444 bnez K,.L11 445 LD b3,3*SIZE(B) 446 447 448.L15: # kr=2 449#ifndef TRMMKERNEL 450 andi K,KCO,2 451#else 452 andi K,TEMP, 2 453#endif 454 beqz K,.L18 455 nop 456 457.L16: 458 MADD t11,t11,a0,b0 459 MADD t21,t21,a1,b0 460 LD a4,4*SIZE(A) 461 462 MADD t12,t12,a0,b1 463 MADD t22,t22,a1,b1 464 LD a5,5*SIZE(A) 465 466 MADD t31,t31,a2,b0 467 MADD t41,t41,a3,b0 468 LD b4,4*SIZE(B) 469 470 FETCH $0,0(PREA) 471 MADD t32,t32,a2,b1 472 MADD t42,t42,a3,b1 473 LD b5,5*SIZE(B) 474 475 FETCH $0,0(PREB) 476 MADD t13,t13,a0,b2 477 MADD t23,t23,a1,b2 478 LD a6,6*SIZE(A) 479 480 MADD t14,t14,a0,b3 481 MADD t24,t24,a1,b3 482 daddu A,A,8*SIZE # 4mr*2kr 483 LD b6,6*SIZE(B) 484 485 MADD t33,t33,a2,b2 486 MADD t43,t43,a3,b2 487 daddu B,B,8*SIZE # 4nr*2kr 488 LD a7,-1*SIZE(A) 489 490 MADD t34,t34,a2,b3 491 MADD t44,t44,a3,b3 492 LD b7,-1*SIZE(B) 493 494.L17: 495 MADD t11,t11,a4,b4 496 MADD t21,t21,a5,b4 497 LD a0,0*SIZE(A) 498 499 MADD t12,t12,a4,b5 500 MADD t22,t22,a5,b5 501 LD a1,1*SIZE(A) 502 503 MADD t31,t31,a6,b4 504 MADD t41,t41,a7,b4 505 LD b0,0*SIZE(B) 506 507 MADD t32,t32,a6,b5 508 MADD t42,t42,a7,b5 509 LD b1,1*SIZE(B) 510 511 FETCH $0,4*SIZE(PREB) 512 MADD t13,t13,a4,b6 513 MADD t23,t23,a5,b6 514 LD a2,2*SIZE(A) 515 516 FETCH $0,4*SIZE(PREA) 517 MADD t14,t14,a4,b7 518 MADD t24,t24,a5,b7 519 LD b2,2*SIZE(B) 520 521 MADD t33,t33,a6,b6 522 MADD t43,t43,a7,b6 523 daddu PREA,PREA,8*SIZE 524 LD a3,3*SIZE(A) 525 526 MADD t34,t34,a6,b7 527 MADD t44,t44,a7,b7 528 daddu PREB,PREB,8*SIZE 529 LD b3,3*SIZE(B) 530 531 532.L18: # kr=1 533#ifndef TRMMKERNEL 534 andi K,KCO,1 535#else 536 andi K,TEMP,1 537#endif 538 beqz K,.L19 539 LD ALPHA,152($sp) # Get ALPHA 540 541 FETCH $0,0(PREB) 542 MADD t11,t11,a0,b0 543 MADD t21,t21,a1,b0 544 daddu A,A,4*SIZE # 4mr*kr 545 546 MADD t12,t12,a0,b1 547 MADD t22,t22,a1,b1 548 daddu B,B,4*SIZE # 
4nr*kr 549 550 FETCH $0,0(PREA) 551 MADD t31,t31,a2,b0 552 MADD t41,t41,a3,b0 553 daddu PREB,PREB,4*SIZE 554 555 MADD t32,t32,a2,b1 556 MADD t42,t42,a3,b1 557 daddu PREA,PREA,4*SIZE 558 559 MADD t13,t13,a0,b2 560 MADD t23,t23,a1,b2 561 562 MADD t14,t14,a0,b3 563 MADD t24,t24,a1,b3 564 565 MADD t33,t33,a2,b2 566 MADD t43,t43,a3,b2 567 568 MADD t34,t34,a2,b3 569 MADD t44,t44,a3,b3 570 571.L19: # Write Back to C 572#ifndef TRMMKERNEL 573 LD c11,0(CO1) # GEMM write part 574 LD c21,1*SIZE(CO1) # get 16 C 575 LD c31,2*SIZE(CO1) 576 LD c41,3*SIZE(CO1) 577 578 LD c12,0(CO2) 579 MADD t11,c11,t11,ALPHA 580 LD c22,1*SIZE(CO2) 581 MADD t21,c21,t21,ALPHA 582 LD c32,2*SIZE(CO2) 583 MADD t31,c31,t31,ALPHA 584 LD c42,3*SIZE(CO2) 585 MADD t41,c41,t41,ALPHA 586 587 LD c13,0(CO3) 588 MADD t12,c12,t12,ALPHA 589 LD c23,1*SIZE(CO3) 590 MADD t22,c22,t22,ALPHA 591 LD c33,2*SIZE(CO3) 592 MADD t32,c32,t32,ALPHA 593 LD c43,3*SIZE(CO3) 594 MADD t42,c42,t42,ALPHA 595 596 LD c14,0(CO4) 597 MADD t13,c13,t13,ALPHA 598 LD c24,1*SIZE(CO4) 599 MADD t23,c23,t23,ALPHA 600 LD c34,2*SIZE(CO4) 601 MADD t33,c33,t33,ALPHA 602 LD c44,3*SIZE(CO4) 603 MADD t43,c43,t43,ALPHA 604 605 ST t11,0(CO1) 606 MADD t14,c14,t14,ALPHA 607 ST t21,1*SIZE(CO1) 608 MADD t24,c24,t24,ALPHA 609 ST t31,2*SIZE(CO1) 610 MADD t34,c34,t34,ALPHA 611 ST t41,3*SIZE(CO1) 612 MADD t44,c44,t44,ALPHA 613 daddiu M,M,-1 # M-- 614 615 ST t12,0(CO2) 616 ST t22,1*SIZE(CO2) 617 ST t32,2*SIZE(CO2) 618 ST t42,3*SIZE(CO2) 619 620 ST t13,0(CO3) 621 ST t23,1*SIZE(CO3) 622 ST t33,2*SIZE(CO3) 623 ST t43,3*SIZE(CO3) 624 625 FETCH $0,4*SIZE(CO1) 626 FETCH $0,4*SIZE(CO2) 627 FETCH $0,4*SIZE(CO3) 628 FETCH $0,4*SIZE(CO4) 629 630 FETCH $0,8*SIZE(CO1) 631 FETCH $0,8*SIZE(CO2) 632 FETCH $0,8*SIZE(CO3) 633 FETCH $0,8*SIZE(CO4) 634 635 ST t14,0(CO4) 636 daddu CO1,CO1,4*SIZE # COi += 4 637 ST t24,1*SIZE(CO4) 638 daddu CO2,CO2,4*SIZE 639 ST t34,2*SIZE(CO4) 640 daddu CO3,CO3,4*SIZE 641 ST t44,3*SIZE(CO4) 642 daddu PREB,BO,SPANB 643 644 bnez M,.L10 645 daddu 
CO4,CO4,4*SIZE 646 647#else 648 MUL t11, ALPHA, t11 # TRMM write back part 649 MUL t21, ALPHA, t21 650 MUL t31, ALPHA, t31 651 MUL t41, ALPHA, t41 652 653 ST t11, 0 * SIZE(CO1) 654 MUL t12, ALPHA, t12 655 ST t21, 1 * SIZE(CO1) 656 MUL t22, ALPHA, t22 657 ST t31, 2 * SIZE(CO1) 658 MUL t32, ALPHA, t32 659 ST t41, 3 * SIZE(CO1) 660 MUL t42, ALPHA, t42 661 662 ST t12, 0 * SIZE(CO2) 663 MUL t13, ALPHA, t13 664 ST t22, 1 * SIZE(CO2) 665 MUL t23, ALPHA, t23 666 ST t32, 2 * SIZE(CO2) 667 MUL t33, ALPHA, t33 668 ST t42, 3 * SIZE(CO2) 669 MUL t43, ALPHA, t43 670 671 ST t13, 0 * SIZE(CO3) 672 MUL t14, ALPHA, t14 673 ST t23, 1 * SIZE(CO3) 674 MUL t24, ALPHA, t24 675 ST t33, 2 * SIZE(CO3) 676 MUL t34, ALPHA, t34 677 ST t43, 3 * SIZE(CO3) 678 MUL t44, ALPHA, t44 679 680 ST t14, 0 * SIZE(CO4) 681 daddiu M,M,-1 # M-- 682 ST t24, 1 * SIZE(CO4) 683 ST t34, 2 * SIZE(CO4) 684 ST t44, 3 * SIZE(CO4) 685 daddiu CO1,CO1, 4 * SIZE 686 daddiu CO2,CO2, 4 * SIZE 687 daddiu CO3,CO3, 4 * SIZE 688 daddiu CO4,CO4, 4 * SIZE 689 690 FETCH $0,4*SIZE(CO1) 691 FETCH $0,4*SIZE(CO2) 692 FETCH $0,4*SIZE(CO3) 693 FETCH $0,4*SIZE(CO4) 694 695 FETCH $0,0(CO1) 696 FETCH $0,0(CO2) 697 FETCH $0,0(CO3) 698 FETCH $0,0(CO4) 699 700#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 701 dsubu TEMP,KCO,KK 702#ifdef LEFT 703 daddiu TEMP,TEMP, -4 704#else 705 daddiu TEMP,TEMP, -4 706#endif 707 dsll K,TEMP,2 + BASE_SHIFT 708 dsll TEMP,TEMP,2 + BASE_SHIFT 709 daddu A,A,K # mov A to the end of panel Ai 710 daddu B,B,TEMP # mov B to the end of panel Bj 711#endif 712 713#ifdef LEFT 714 daddiu KK, KK,4 715#endif 716 bnez M,.L10 717 nop 718#endif 719 720 721 .align 3 722.L14_M2: 723 andi M, MCO, 2 # nr=4,mr=2 724 beqz M,.L14_M1 725 nop 726 727.L20: 728#if defined(TRMMKERNEL) 729#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 730 move B,BO # Reset B 731#else 732 dsll K,KK,1 + BASE_SHIFT # mr=2 733 dsll TEMP,KK,2 + BASE_SHIFT # nr=4 734 daddu A,A,K 735 daddu B,BO,TEMP 
736#endif 737 738 LD a0,0*SIZE(A) 739 MTC $0,t11 740 LD a1,1*SIZE(A) 741 742 MOV t21,t11 743 LD b0,0*SIZE(B) 744 MOV t12,t11 745 LD b1,1*SIZE(B) 746 747 MOV t22,t11 748 LD b2,2*SIZE(B) 749 750 MOV t13,t11 751 MOV t23,t11 752 LD b3,3*SIZE(B) 753 754#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 755 dsubu TEMP,KCO,KK 756#elif defined(LEFT) 757 daddiu TEMP,KK,2 # left part,controlled by mr, mr=2 758#else 759 daddiu TEMP,KK,4 # right part,controlled by nr,nr=4 760#endif 761 dsra K,TEMP,2 762 MOV t14,t11 763 beqz K,.L25 764 MOV t24,t11 # clear 2*4=8 results registers 765 766#else 767 move B,BO # Reset B 768 LD a0,0*SIZE(A) 769 MTC $0,t11 770 LD a1,1*SIZE(A) 771 772 MOV t21,t11 773 LD b0,0*SIZE(B) 774 MOV t12,t11 775 LD b1,1*SIZE(B) 776 777 MOV t22,t11 778 dsra K,KCO,2 779 LD b2,2*SIZE(B) 780 781 MOV t13,t11 782 MOV t23,t11 783 LD b3,3*SIZE(B) 784 785 MOV t14,t11 786 beqz K,.L25 787 MOV t24,t11 788 789#endif 790 791.L21: # nr=4,mr=2,kr=4 792 MADD t11,t11,a0,b0 793 LD a4,2*SIZE(A) 794 MADD t21,t21,a1,b0 795 LD a5,3*SIZE(A) 796 797 MADD t12,t12,a0,b1 798 LD b4,4*SIZE(B) 799 MADD t22,t22,a1,b1 800 LD b5,5*SIZE(B) 801 802 MADD t13,t13,a0,b2 803 LD b6,6*SIZE(B) 804 MADD t23,t23,a1,b2 805 LD b7,7*SIZE(B) 806 807 MADD t14,t14,a0,b3 808 MADD t24,t24,a1,b3 809 810 MADD t11,t11,a4,b4 811 LD a2,4*SIZE(A) 812 MADD t21,t21,a5,b4 813 LD a3,5*SIZE(A) 814 815 MADD t12,t12,a4,b5 816 LD b0,8*SIZE(B) 817 MADD t22,t22,a5,b5 818 LD b1,9*SIZE(B) 819 820 MADD t13,t13,a4,b6 821 LD b2,10*SIZE(B) 822 MADD t23,t23,a5,b6 823 LD b3,11*SIZE(B) 824 825 MADD t14,t14,a4,b7 826 MADD t24,t24,a5,b7 827 daddiu K,K,-1 828 829 MADD t11,t11,a2,b0 830 LD a6,6*SIZE(A) 831 MADD t21,t21,a3,b0 832 LD a7,7*SIZE(A) 833 834 MADD t12,t12,a2,b1 835 LD b4,12*SIZE(B) 836 MADD t22,t22,a3,b1 837 LD b5,13*SIZE(B) 838 839 MADD t13,t13,a2,b2 840 LD b6,14*SIZE(B) 841 MADD t23,t23,a3,b2 842 LD b7,15*SIZE(B) 843 844 MADD t14,t14,a2,b3 845 MADD t24,t24,a3,b3 846 daddu A,A,8*SIZE # 2mr*4kr 847 daddu 
B,B,16*SIZE # 4nr*4kr 848 849 MADD t11,t11,a6,b4 850 LD a0,0*SIZE(A) 851 MADD t21,t21,a7,b4 852 LD a1,1*SIZE(A) 853 854 MADD t12,t12,a6,b5 855 LD b0,0*SIZE(B) 856 MADD t22,t22,a7,b5 857 LD b1,1*SIZE(B) 858 859 MADD t13,t13,a6,b6 860 LD b2,2*SIZE(B) 861 MADD t23,t23,a7,b6 862 LD b3,3*SIZE(B) 863 864 MADD t14,t14,a6,b7 865 bnez K,.L21 866 MADD t24,t24,a7,b7 867 868 869.L25: 870#ifndef TRMMKERNEL 871 andi K,KCO,2 # kr=2 872#else 873 andi K,TEMP,2 874#endif 875 beqz K,.L28 876 nop 877 878.L26: 879 MADD t11,t11,a0,b0 880 LD a4,2*SIZE(A) 881 MADD t21,t21,a1,b0 882 LD a5,3*SIZE(A) 883 884 MADD t12,t12,a0,b1 885 LD b4,4*SIZE(B) 886 MADD t22,t22,a1,b1 887 LD b5,5*SIZE(B) 888 889 MADD t13,t13,a0,b2 890 LD b6,6*SIZE(B) 891 MADD t23,t23,a1,b2 892 LD b7,7*SIZE(B) 893 894 MADD t14,t14,a0,b3 895 MADD t24,t24,a1,b3 896 daddu A,A,4*SIZE # 2mr*2kr 897 daddu B,B,8*SIZE # 4nr*2kr 898 899.L27: 900 MADD t11,t11,a4,b4 901 LD a0,0*SIZE(A) 902 MADD t21,t21,a5,b4 903 LD a1,1*SIZE(A) 904 905 MADD t12,t12,a4,b5 906 LD b0,0*SIZE(B) 907 MADD t22,t22,a5,b5 908 LD b1,1*SIZE(B) 909 910 MADD t13,t13,a4,b6 911 LD b2,2*SIZE(B) 912 MADD t23,t23,a5,b6 913 LD b3,3*SIZE(B) 914 915 MADD t14,t14,a4,b7 916 MADD t24,t24,a5,b7 917 918 919.L28: # kr=1 920#ifndef TRMMKERNEL 921 andi K,KCO,1 922#else 923 andi K,TEMP,1 924#endif 925 beqz K,.L29 926 LD ALPHA,152($sp) # Get ALPHA 927 928 MADD t11,t11,a0,b0 929 MADD t21,t21,a1,b0 930 daddu A,A,2*SIZE # 2mr*kr 931 daddu B,B,4*SIZE # 4nr*kr 932 933 MADD t12,t12,a0,b1 934 MADD t22,t22,a1,b1 935 936 MADD t13,t13,a0,b2 937 MADD t23,t23,a1,b2 938 939 MADD t14,t14,a0,b3 940 MADD t24,t24,a1,b3 941 942.L29: # Write Back to C 943#ifndef TRMMKERNEL 944 LD c11,0(CO1) # GEMM write back part 945 LD c21,1*SIZE(CO1) 946 947 LD c12,0(CO2) 948 LD c22,1*SIZE(CO2) 949 950 LD c13,0(CO3) 951 MADD t11,c11,t11,ALPHA 952 LD c23,1*SIZE(CO3) 953 MADD t21,c21,t21,ALPHA 954 955 LD c14,0(CO4) 956 MADD t12,c12,t12,ALPHA 957 LD c24,1*SIZE(CO4) 958 MADD t22,c22,t22,ALPHA 959 960 ST t11,0(CO1) 961 
MADD t13,c13,t13,ALPHA 962 ST t21,1*SIZE(CO1) 963 MADD t23,c23,t23,ALPHA 964 965 ST t12,0(CO2) 966 MADD t14,c14,t14,ALPHA 967 ST t22,1*SIZE(CO2) 968 MADD t24,c24,t24,ALPHA 969 970 ST t13,0(CO3) 971 daddu CO1,CO1,2*SIZE # COi += 2 972 ST t23,1*SIZE(CO3) 973 daddu CO2,CO2,2*SIZE 974 975 ST t14,0(CO4) 976 daddu CO3,CO3,2*SIZE 977 ST t24,1*SIZE(CO4) 978 daddu CO4,CO4,2*SIZE 979 980 FETCH $0,0(CO1) 981 FETCH $0,0(CO2) 982 FETCH $0,0(CO3) 983 FETCH $0,0(CO4) 984 985#else 986 MUL t11, ALPHA, t11 # TRMM write back part 987 MUL t21, ALPHA, t21 988 989 ST t11, 0 * SIZE(CO1) 990 MUL t12, ALPHA, t12 991 ST t21, 1 * SIZE(CO1) 992 MUL t22, ALPHA, t22 993 994 ST t12, 0 * SIZE(CO2) 995 MUL t13, ALPHA, t13 996 ST t22, 1 * SIZE(CO2) 997 MUL t23, ALPHA, t23 998 999 ST t13, 0 * SIZE(CO3) 1000 MUL t14, ALPHA, t14 1001 ST t23, 1 * SIZE(CO3) 1002 MUL t24, ALPHA, t24 1003 1004 ST t14, 0 * SIZE(CO4) 1005 ST t24, 1 * SIZE(CO4) 1006 1007 daddiu CO1,CO1, 2 * SIZE 1008 daddiu CO2,CO2, 2 * SIZE 1009 daddiu CO3,CO3, 2 * SIZE 1010 daddiu CO4,CO4, 2 * SIZE 1011 1012 FETCH $0,0(CO1) 1013 FETCH $0,0(CO2) 1014 FETCH $0,0(CO3) 1015 FETCH $0,0(CO4) 1016 1017#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1018 dsubu TEMP,KCO,KK 1019#ifdef LEFT 1020 daddiu TEMP,TEMP,-2 1021#else 1022 daddiu TEMP,TEMP,-4 1023#endif 1024 dsll K,TEMP,1 + BASE_SHIFT 1025 dsll TEMP,TEMP,2 + BASE_SHIFT 1026 1027 daddu A,A,K # move A to next panel Ai 1028 daddu B,B,TEMP # move B to next panel Bj 1029#endif 1030 1031#ifdef LEFT 1032 daddiu KK, KK, 2 1033#endif 1034#endif 1035 1036 1037 .align 3 1038.L14_M1: 1039 andi M,MCO,1 # mr=1 1040 beqz M,.L0_N4_Loop # M = 0, finishing one panel Bj 1041 nop 1042 1043.L30: 1044#if defined(TRMMKERNEL) 1045#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1046 move B,BO # Reset B 1047#else 1048 dsll K,KK, BASE_SHIFT 1049 dsll TEMP,KK,2 + BASE_SHIFT 1050 1051 daddu A,A,K 1052 daddu B,BO,TEMP 1053#endif 1054 1055 LD a0, 0 * SIZE(A) # 
a0 1056 1057 MTC $0,t11 1058 LD b0,0*SIZE(B) 1059 1060 MOV t12,t11 1061 LD b1,1*SIZE(B) 1062 1063 MOV t13,t11 1064 LD b2,2*SIZE(B) 1065 1066 MOV t14,t11 1067 LD b3,3*SIZE(B) 1068 1069#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1070 dsubu TEMP, KCO, KK 1071#elif defined(LEFT) 1072 daddiu TEMP, KK, 1 1073#else 1074 daddiu TEMP, KK, 4 1075#endif 1076 dsra K,TEMP, 2 1077 nop 1078 beqz K,.L35 1079 nop 1080 1081#else 1082 move B,BO # Reset B, GEMM part 1083 dsra K,KCO,2 # K=KCO/2 1084 LD a0, 0 * SIZE(A) # a0 1085 1086 MTC $0,t11 1087 LD b0,0*SIZE(B) 1088 1089 MOV t12,t11 1090 LD b1,1*SIZE(B) 1091 1092 MOV t13,t11 1093 LD b2,2*SIZE(B) 1094 1095 MOV t14,t11 1096 beqz K,.L35 1097 LD b3,3*SIZE(B) 1098 1099#endif 1100 1101.L31: # nr=4,mr=1,kr=4 1102 LD a1, 1*SIZE(A) # load a1 1103 MADD t11,t11,a0,b0 1104 1105 LD b4,4*SIZE(B) 1106 LD b5,5*SIZE(B) 1107 MADD t12,t12,a0,b1 1108 1109 LD b6,6*SIZE(B) 1110 LD b7,7*SIZE(B) 1111 MADD t13,t13,a0,b2 1112 MADD t14,t14,a0,b3 1113 1114 LD a2, 2*SIZE(A) # a2 1115 MADD t11,t11,a1,b4 1116 1117 LD b0,8*SIZE(B) 1118 LD b1,9*SIZE(B) 1119 MADD t12,t12,a1,b5 1120 1121 LD b2,10*SIZE(B) 1122 LD b3,11*SIZE(B) 1123 MADD t13,t13,a1,b6 1124 MADD t14,t14,a1,b7 1125 1126 LD a3, 3*SIZE(A) # a3 1127 MADD t11,t11,a2,b0 1128 daddiu K,K,-1 1129 1130 LD b4,12*SIZE(B) 1131 LD b5,13*SIZE(B) 1132 MADD t12,t12,a2,b1 1133 daddu A,A,4*SIZE # 1mr*4kr 1134 1135 LD b6,14*SIZE(B) 1136 LD b7,15*SIZE(B) 1137 MADD t13,t13,a2,b2 1138 MADD t14,t14,a2,b3 1139 1140 LD a0, 0*SIZE(A) # a0 1141 daddu B,B,16*SIZE # 4nr*4kr 1142 MADD t11,t11,a3,b4 1143 1144 LD b0,0*SIZE(B) 1145 MADD t12,t12,a3,b5 1146 LD b1,1*SIZE(B) 1147 MADD t13,t13,a3,b6 1148 1149 LD b2,2*SIZE(B) 1150 MADD t14,t14,a3,b7 1151 bnez K,.L31 1152 LD b3,3*SIZE(B) 1153 1154 1155.L35: # kr=2 1156#ifndef TRMMKERNEL 1157 andi K,KCO,2 1158#else 1159 andi K,TEMP,2 1160#endif 1161 beqz K,.L38 1162 nop 1163 1164.L36: 1165 LD a1,1*SIZE(A) # load a1 1166 MADD t11,t11,a0,b0 1167 1168 LD 
b4,4*SIZE(B) 1169 LD b5,5*SIZE(B) 1170 MADD t12,t12,a0,b1 1171 daddu A,A,2*SIZE # mr*2kr 1172 1173 LD b6,6*SIZE(B) 1174 MADD t13,t13,a0,b2 1175 1176 LD b7,7*SIZE(B) 1177 MADD t14,t14,a0,b3 1178 daddu B,B,8*SIZE # 4nr*2kr 1179 1180 1181.L37: 1182 LD a0,0(A) 1183 MADD t11,t11,a1,b4 1184 1185 LD b0,0*SIZE(B) 1186 LD b1,1*SIZE(B) 1187 MADD t12,t12,a1,b5 1188 1189 LD b2,2*SIZE(B) 1190 LD b3,3*SIZE(B) 1191 MADD t13,t13,a1,b6 1192 MADD t14,t14,a1,b7 1193 1194 1195.L38: # kr=1 1196#ifndef TRMMKERNEL 1197 andi K,KCO,1 1198#else 1199 andi K,TEMP,1 1200#endif 1201 beqz K,.L39 1202 LD ALPHA,152($sp) # Get ALPHA 1203 1204 MADD t11,t11,a0,b0 1205 MADD t12,t12,a0,b1 1206 daddu A,A,1*SIZE 1207 daddu B,B,4*SIZE 1208 1209 MADD t13,t13,a0,b2 1210 MADD t14,t14,a0,b3 1211 1212.L39: # Write Back 1213#ifndef TRMMKERNEL 1214 LD c11,0(CO1) 1215 LD c12,0(CO2) 1216 LD c13,0(CO3) 1217 LD c14,0(CO4) 1218 1219 MADD t11,c11,t11,ALPHA 1220 MADD t12,c12,t12,ALPHA 1221 MADD t13,c13,t13,ALPHA 1222 MADD t14,c14,t14,ALPHA 1223 1224 ST t11,0(CO1) 1225 ST t12,0(CO2) 1226 ST t13,0(CO3) 1227 ST t14,0(CO4) 1228#else 1229 MUL t11, ALPHA, t11 1230 MUL t12, ALPHA, t12 1231 MUL t13, ALPHA, t13 1232 MUL t14, ALPHA, t14 1233 1234 ST t11, 0 * SIZE(CO1) 1235 ST t12, 0 * SIZE(CO2) 1236 ST t13, 0 * SIZE(CO3) 1237 ST t14, 0 * SIZE(CO4) 1238 1239#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1240 dsubu TEMP, KCO, KK 1241#ifdef LEFT 1242 daddiu TEMP, TEMP, -1 1243#else 1244 daddiu TEMP, TEMP, -4 1245#endif 1246 1247 dsll K,TEMP, BASE_SHIFT 1248 dsll TEMP,TEMP, 2 + BASE_SHIFT 1249 1250 daddu A,A,K 1251 daddu B,B,TEMP 1252#endif 1253 1254#ifdef LEFT 1255 daddiu KK, KK, 1 1256#endif 1257#endif 1258 1259 1260 .align 3 1261.L0_N4_Loop: # mc finished 1262 daddiu N,N,-1 # N-- 1263#if defined(TRMMKERNEL) && !defined(LEFT) 1264 daddiu KK, KK,4 1265#endif 1266 bnez N,.L0_N4_Lb 1267 move BO,B # Set BO point to next panel Bj 1268 1269 .align 5 1270.L0_N2: 1271 andi N,NCO,2 # nr = 2 1272 beqz 
N,.L0_N1 1273 nop 1274 1275.L0_N2_Lb: 1276 move CO1,C 1277 daddu CO2,C,LDC 1278 1279 dsra M,MCO,2 1280 move A,AO # Reset A 1281 1282 daddu PREA,AO,SPANA 1283 daddu C,CO2,LDC 1284 1285#if defined(TRMMKERNEL) && defined(LEFT) 1286 move KK, OFFSET 1287#endif 1288 beqz M,.L12_M2 1289 nop 1290 1291.L40: 1292#if defined(TRMMKERNEL) 1293#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1294 move B,BO # Reset B 1295#else 1296 dsll K,KK, 2 + BASE_SHIFT 1297 dsll TEMP, KK,1 + BASE_SHIFT 1298 1299 daddu A,A,K 1300 daddu B,BO,TEMP 1301#endif 1302 LD a0,0*SIZE(A) 1303 MTC $0,t11 # gemm part 1304 LD a1,1*SIZE(A) 1305 1306 MOV t21,t11 1307 LD b0,0*SIZE(B) 1308 MOV t31,t11 1309 LD b1,1*SIZE(B) 1310 1311 MOV t41,t11 1312 LD a2,2*SIZE(A) 1313 LD a3,3*SIZE(A) 1314 1315 MOV t12,t11 1316 MOV t22,t11 1317 1318#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1319 dsubu TEMP,KCO,KK 1320#elif defined(LEFT) 1321 daddiu TEMP, KK, 4 1322#else 1323 daddiu TEMP, KK, 2 1324#endif 1325 dsra K,TEMP,2 1326 MOV t32,t11 1327 beqz K,.L45 1328 MOV t42,t11 1329 1330#else 1331 move B,BO # Reset B 1332 LD a0,0*SIZE(A) 1333 MTC $0,t11 # gemm part 1334 LD a1,1*SIZE(A) 1335 1336 MOV t21,t11 1337 LD b0,0*SIZE(B) 1338 MOV t31,t11 1339 LD b1,1*SIZE(B) 1340 1341 MOV t41,t11 1342 LD a2,2*SIZE(A) 1343 dsra K,KCO,2 # K=KCO/2 1344 LD a3,3*SIZE(A) 1345 1346 MOV t12,t11 1347 MOV t22,t11 1348 1349 MOV t32,t11 1350 beqz K,.L45 1351 MOV t42,t11 1352 1353#endif 1354 1355.L41: # nr=2,mr=kr=4 1356 MADD t11,t11,a0,b0 1357 LD a4,4*SIZE(A) 1358 MADD t21,t21,a1,b0 1359 LD a5,5*SIZE(A) 1360 1361 MADD t12,t12,a0,b1 1362 LD b4,2*SIZE(B) 1363 MADD t22,t22,a1,b1 1364 LD b5,3*SIZE(B) 1365 1366 MADD t31,t31,a2,b0 1367 LD a6,6*SIZE(A) 1368 MADD t41,t41,a3,b0 1369 LD a7,7*SIZE(A) 1370 1371 FETCH $0,(PREA) 1372 MADD t32,t32,a2,b1 1373 MADD t42,t42,a3,b1 1374 1375.L42: 1376 MADD t11,t11,a4,b4 1377 LD a0,8*SIZE(A) 1378 MADD t21,t21,a5,b4 1379 LD a1,9*SIZE(A) 1380 1381 MADD 
t12,t12,a4,b5 1382 LD b2,4*SIZE(B) 1383 MADD t22,t22,a5,b5 1384 LD b3,5*SIZE(B) 1385 1386 MADD t31,t31,a6,b4 1387 LD a2,10*SIZE(A) 1388 MADD t41,t41,a7,b4 1389 LD a3,11*SIZE(A) 1390 1391 FETCH $0,4*SIZE(PREA) 1392 MADD t32,t32,a6,b5 1393 MADD t42,t42,a7,b5 1394 1395.L43: 1396 MADD t11,t11,a0,b2 1397 LD a4,12*SIZE(A) 1398 MADD t21,t21,a1,b2 1399 LD a5,13*SIZE(A) 1400 1401 MADD t12,t12,a0,b3 1402 LD b6,6*SIZE(B) 1403 MADD t22,t22,a1,b3 1404 LD b7,7*SIZE(B) 1405 1406 MADD t31,t31,a2,b2 1407 LD a6,14*SIZE(A) 1408 MADD t41,t41,a3,b2 1409 LD a7,15*SIZE(A) 1410 1411 FETCH $0,8*SIZE(PREA) 1412 MADD t32,t32,a2,b3 1413 MADD t42,t42,a3,b3 1414 1415 daddu A,A,16*SIZE # 4mr*4kr 1416 daddu B,B,8*SIZE # 2nr*4kr 1417 1418.L44: 1419 MADD t11,t11,a4,b6 1420 LD a0,0*SIZE(A) 1421 MADD t21,t21,a5,b6 1422 LD a1,1*SIZE(A) 1423 1424 1425 MADD t12,t12,a4,b7 1426 LD b0,0*SIZE(B) 1427 MADD t22,t22,a5,b7 1428 LD b1,1*SIZE(B) 1429 1430 daddiu K,K,-1 1431 daddu PREA,PREA,16*SIZE 1432 1433 MADD t31,t31,a6,b6 1434 LD a2,2*SIZE(A) 1435 MADD t41,t41,a7,b6 1436 LD a3,3*SIZE(A) 1437 1438 FETCH $0,-4*SIZE(PREA) 1439 MADD t32,t32,a6,b7 1440 bnez K,.L41 1441 MADD t42,t42,a7,b7 1442 1443 1444.L45: # kr=2 1445#ifndef TRMMKERNEL 1446 andi K,KCO,2 1447#else 1448 andi K,TEMP,2 1449#endif 1450 beqz K,.L48 1451 nop 1452 1453.L46: 1454 MADD t11,t11,a0,b0 1455 LD a4,4*SIZE(A) 1456 MADD t21,t21,a1,b0 1457 LD a5,5*SIZE(A) 1458 1459 MADD t12,t12,a0,b1 1460 LD b4,2*SIZE(B) 1461 MADD t22,t22,a1,b1 1462 LD b5,3*SIZE(B) 1463 1464 MADD t31,t31,a2,b0 1465 LD a6,6*SIZE(A) 1466 MADD t41,t41,a3,b0 1467 LD a7,7*SIZE(A) 1468 1469 FETCH $0,0(PREA) 1470 MADD t32,t32,a2,b1 1471 daddu B,B,4*SIZE # B+=2(nr)*2(kr)*8Byte=32 1472 1473 MADD t42,t42,a3,b1 1474 daddu A,A,8*SIZE # A+=4(mr)*2(kr)*8Byte=8*SIZE 1475 1476.L47: 1477 MADD t11,t11,a4,b4 1478 LD a0,0*SIZE(A) 1479 MADD t21,t21,a5,b4 1480 LD a1,1*SIZE(A) 1481 1482 MADD t12,t12,a4,b5 1483 LD b0,0*SIZE(B) 1484 MADD t22,t22,a5,b5 1485 LD b1,1*SIZE(B) 1486 1487 MADD t31,t31,a6,b4 1488 
LD a2,2*SIZE(A) 1489 MADD t41,t41,a7,b4 1490 LD a3,3*SIZE(A) 1491 1492 FETCH $0,4*SIZE(PREA) 1493 MADD t32,t32,a6,b5 1494 MADD t42,t42,a7,b5 1495 daddu PREA,PREA,8*SIZE 1496 1497 1498 1499.L48: # kr=1 1500#ifndef TRMMKERNEL 1501 andi K,KCO,1 1502#else 1503 andi K,TEMP,1 1504#endif 1505 beqz K,.L49 1506 LD ALPHA,152($sp) # Get ALPHA 1507 1508 FETCH $0,0(PREA) 1509 MADD t11,t11,a0,b0 1510 MADD t21,t21,a1,b0 1511 daddu A,A,4*SIZE # A+=4(mr)*1(kr)*8Byte=32 1512 1513 MADD t12,t12,a0,b1 1514 MADD t22,t22,a1,b1 1515 daddu B,B,2*SIZE 1516 daddu PREA,PREA,4*SIZE 1517 1518 MADD t31,t31,a2,b0 1519 MADD t41,t41,a3,b0 1520 1521 MADD t32,t32,a2,b1 1522 MADD t42,t42,a3,b1 1523 1524.L49: # Write Back 1525#ifndef TRMMKERNEL 1526 LD c11,0(CO1) # gemm write back part Fetch 16 C 1527 LD c21,1*SIZE(CO1) 1528 LD c31,2*SIZE(CO1) 1529 LD c41,3*SIZE(CO1) 1530 1531 LD c12,0(CO2) 1532 MADD t11,c11,t11,ALPHA 1533 LD c22,1*SIZE(CO2) 1534 MADD t21,c21,t21,ALPHA 1535 LD c32,2*SIZE(CO2) 1536 MADD t31,c31,t31,ALPHA 1537 LD c42,3*SIZE(CO2) 1538 MADD t41,c41,t41,ALPHA 1539 1540 ST t11,0(CO1) 1541 MADD t12,c12,t12,ALPHA 1542 ST t21,1*SIZE(CO1) 1543 MADD t22,c22,t22,ALPHA 1544 ST t31,2*SIZE(CO1) 1545 MADD t32,c32,t32,ALPHA 1546 ST t41,3*SIZE(CO1) 1547 MADD t42,c42,t42,ALPHA 1548 daddiu M,M,-1 1549 1550 ST t12,0(CO2) 1551 ST t22,1*SIZE(CO2) 1552 ST t32,2*SIZE(CO2) 1553 ST t42,3*SIZE(CO2) 1554 1555 FETCH $0,4*SIZE(CO1) 1556 FETCH $0,4*SIZE(CO2) 1557 FETCH $0,8*SIZE(CO1) 1558 FETCH $0,8*SIZE(CO2) 1559 1560 daddu CO1,CO1,4*SIZE 1561 bnez M,.L40 1562 daddu CO2,CO2,4*SIZE 1563 1564#else 1565 MUL t11, ALPHA, t11 1566 MUL t21, ALPHA, t21 1567 MUL t31, ALPHA, t31 1568 MUL t41, ALPHA, t41 1569 1570 MUL t12, ALPHA, t12 1571 ST t11, 0 * SIZE(CO1) 1572 MUL t22, ALPHA, t22 1573 ST t21, 1 * SIZE(CO1) 1574 MUL t32, ALPHA, t32 1575 ST t31, 2 * SIZE(CO1) 1576 MUL t42, ALPHA, t42 1577 ST t41, 3 * SIZE(CO1) 1578 1579 ST t12, 0 * SIZE(CO2) 1580 daddiu M,M,-1 1581 ST t22, 1 * SIZE(CO2) 1582 ST t32, 2 * SIZE(CO2) 1583 ST 
t42, 3 * SIZE(CO2)

        daddiu  CO1,CO1, 4*SIZE
        daddiu  CO2,CO2, 4*SIZE

/* Touch the C locations of the next tile.  FETCH is a load into $0, so the
   loaded value is discarded.
   NOTE(review): the last two use byte offset 4, not a multiple of SIZE as the
   other write-backs in this file do - confirm that this is intended. */
        FETCH   $0,0(CO1)
        FETCH   $0,0(CO2)
        FETCH   $0,4(CO1)
        FETCH   $0,4(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
/* TEMP = KCO - KK - (mr or nr): k steps this tile did not consume.
   Advance A (mr=4 panel) and B (nr=2 panel) past them. */
        dsubu   TEMP, KCO, KK
#ifdef LEFT
        daddiu  TEMP, TEMP, -4
#else
        daddiu  TEMP, TEMP, -2
#endif
        dsll    K,TEMP, 2 + BASE_SHIFT
        dsll    TEMP, TEMP, 1 + BASE_SHIFT

        daddu   A,A,K
        daddu   B,B,TEMP
#endif

#ifdef LEFT
        daddiu  KK, KK, 4
#endif
        bnez    M,.L40
        nop
#endif


/* ------------------------------------------------------------------
   nr = 2, mr = 2 remainder: one 2x2 tile of C.
   Accumulators: t11 t21 (column CO1), t12 t22 (column CO2).
   ------------------------------------------------------------------ */
        .align  3
.L12_M2:
        andi    M,MCO,2                 # mr = 2
        beqz    M,.L12_M1
        nop

.L50:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        move    B,BO
#else
/* Skip the first KK k steps of both packed panels. */
        dsll    K,    KK, 1 + BASE_SHIFT        # mr=2
        dsll    TEMP, KK, 1 + BASE_SHIFT        # nr=2

        daddu   A, A, K
        daddu   B, BO, TEMP
#endif
        LD      a0,0*SIZE(A)
        LD      a1,1*SIZE(A)

        MTC     $0,t11                  # clear the accumulators
        LD      b0,0*SIZE(B)
        MOV     t21,t11
        LD      b1,1*SIZE(B)

/* TEMP = number of k steps for this tile. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        dsubu   TEMP, KCO, KK
#else
        daddiu  TEMP, KK, 2             # mr == nr == 2, same for LEFT and !LEFT
#endif
        dsra    K,TEMP,2                # K = TEMP >> 2: passes of the 4-step loop
        MOV     t12,t11
        beqz    K,.L55
        MOV     t22,t11                 # (delay slot)

#else
        move    B,BO
        LD      a0,0*SIZE(A)
        dsra    K,KCO,2                 # K = KCO >> 2: passes of the 4-step loop
        LD      a1,1*SIZE(A)

        MTC     $0,t11                  # clear the accumulators
        LD      b0,0*SIZE(B)
        MOV     t21,t11
        LD      b1,1*SIZE(B)

        MOV     t12,t11
        beqz    K,.L55
        MOV     t22,t11                 # (delay slot)

#endif

/* Four k steps per pass.  Loads for the next step are interleaved with the
   multiply-adds of the current one.  b7 shares $f15 with ALPHA, which is why
   ALPHA is reloaded from the stack before the write-back. */
.L51:                                   # nr=2 mr=2,kr=4
        MADD    t11,t11,a0,b0
        LD      a4,2*SIZE(A)
        MADD    t21,t21,a1,b0
        LD      b4,2*SIZE(B)

        MADD    t12,t12,a0,b1
        LD      a5,3*SIZE(A)
        MADD    t22,t22,a1,b1
        LD      b5,3*SIZE(B)

        MADD    t11,t11,a4,b4
        LD      a2,4*SIZE(A)
        MADD    t21,t21,a5,b4
        LD      b2,4*SIZE(B)

        MADD    t12,t12,a4,b5
        LD      a3,5*SIZE(A)
        MADD    t22,t22,a5,b5
        daddiu  K,K,-1
        LD      b3,5*SIZE(B)

        MADD    t11,t11,a2,b2
        LD      a6,6*SIZE(A)
        MADD    t21,t21,a3,b2
        daddu   A,A,8*SIZE              # A += 2 mr * 4 kr = 8*SIZE
        LD      b6,6*SIZE(B)

        MADD    t12,t12,a2,b3
        daddu   B,B,8*SIZE              # B += 2 nr * 4 kr = 8*SIZE
        LD      a7,-1*SIZE(A)
        MADD    t22,t22,a3,b3
        LD      b7,-1*SIZE(B)

        MADD    t11,t11,a6,b6
        LD      a0,0*SIZE(A)
        MADD    t21,t21,a7,b6
        LD      b0,0*SIZE(B)

        MADD    t12,t12,a6,b7
        LD      a1,1*SIZE(A)

        MADD    t22,t22,a7,b7
        bnez    K,.L51
        LD      b1,1*SIZE(B)            # (delay slot)


.L55:                                   # kr=2
#ifndef TRMMKERNEL
        andi    K,KCO,2
#else
        andi    K,TEMP,2
#endif
        beqz    K,.L58
        nop

.L56:
        MADD    t11,t11,a0,b0
        LD      a4,2*SIZE(A)
        MADD    t21,t21,a1,b0
        daddu   A,A,4*SIZE              # A += 2 mr * 2 kr = 4*SIZE
        LD      b4,2*SIZE(B)

        MADD    t12,t12,a0,b1
        daddu   B,B,4*SIZE              # B += 2 nr * 2 kr = 4*SIZE
        LD      a5,-1*SIZE(A)
        MADD    t22,t22,a1,b1
        LD      b5,-1*SIZE(B)

.L57:
        MADD    t11,t11,a4,b4
        LD      a0,0*SIZE(A)
        MADD    t21,t21,a5,b4
        LD      b0,0*SIZE(B)

        MADD    t12,t12,a4,b5
        LD      a1,1*SIZE(A)
        MADD    t22,t22,a5,b5
        LD      b1,1*SIZE(B)

.L58:                                   # kr=1
#ifndef TRMMKERNEL
        andi    K,KCO,1
#else
        andi    K,TEMP, 1
#endif
        beqz    K,.L59
        LD      ALPHA,152($sp)          # Get ALPHA (delay slot: runs on both paths)

        MADD    t11,t11,a0,b0
        MADD    t21,t21,a1,b0
        daddu   A,A,2*SIZE              # A += 2 mr * 1 kr = 2*SIZE
        daddu   B,B,2*SIZE              # B += 2 nr * 1 kr = 2*SIZE

        MADD    t12,t12,a0,b1
        MADD    t22,t22,a1,b1


.L59:                                   # Write Back
#ifndef TRMMKERNEL
/* GEMM: C += ALPHA * tile (4 elements of C). */
        LD      c11,0(CO1)
        LD      c21,1*SIZE(CO1)
        LD      c12,0(CO2)
        LD      c22,1*SIZE(CO2)

        MADD    t11,c11,t11,ALPHA
        MADD    t21,c21,t21,ALPHA
        MADD    t12,c12,t12,ALPHA
        MADD    t22,c22,t22,ALPHA

        ST      t11,0(CO1)
        ST      t21,1*SIZE(CO1)
        ST      t12,0(CO2)
        ST      t22,1*SIZE(CO2)

        daddu   CO1,CO1,2*SIZE
        daddu   CO2,CO2,2*SIZE

        FETCH   $0,0(CO1)
        FETCH   $0,0(CO2)
#else
/* TRMM: C = ALPHA * tile; C is not read.  M is left alone here: this
   remainder runs once and .L12_M1 recomputes M from MCO. */
        daddiu  CO1,CO1, 2 * SIZE
        daddiu  CO2,CO2, 2 * SIZE
        MUL     t11, ALPHA, t11
        MUL     t21, ALPHA, t21
        MUL     t12, ALPHA, t12
        MUL     t22, ALPHA, t22

        ST      t11, -2 * SIZE(CO1)
        ST      t21, -1 * SIZE(CO1)
        ST      t12, -2 * SIZE(CO2)
        ST      t22, -1 * SIZE(CO2)

        FETCH   $0,0(CO1)
        FETCH   $0,0(CO2)

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* TEMP = KCO - KK - 2 (mr == nr == 2): k steps this tile did not consume. */
        dsubu   TEMP, KCO, KK
        daddiu  TEMP, TEMP, -2

        dsll    K,    TEMP, 1 + BASE_SHIFT
        dsll    TEMP, TEMP, 1 + BASE_SHIFT

        daddu   A, A, K
        daddu   B, B, TEMP
#endif

#ifdef LEFT
        daddiu  KK, KK, 2
#endif
#endif


/* ------------------------------------------------------------------
   nr = 2, mr = 1 remainder: one 1x2 tile of C.
   Accumulators used for the result: t11 (CO1), t12 (CO2).
   ------------------------------------------------------------------ */
        .align  3
.L12_M1:
        andi    M,MCO,1                 # mr = 1
        beqz    M,.L0_N2_Loop
        nop

.L60:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        move    B,BO                    # Reset B
#else
/* Skip the first KK k steps: 1 element per step in A, 2 per step in B. */
        dsll    K,    KK, 0 + BASE_SHIFT
        dsll    TEMP, KK, 1 + BASE_SHIFT

        daddu   A, A, K
        daddu   B, BO, TEMP
#endif
        LD      a0,0*SIZE(A)

        MTC     $0,t11                  # clear the accumulators
        MOV     t21,t11
        LD      b0,0*SIZE(B)

        MOV     t12,t11
        LD      b1,1*SIZE(B)

/* TEMP = number of k steps for this tile. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        dsubu   TEMP, KCO, KK
#elif defined(LEFT)
        daddiu  TEMP, KK, 1             # KK + mr
#else
        daddiu  TEMP, KK, 2             # KK + nr
#endif
        dsra    K,TEMP,2                # K = TEMP >> 2
        MOV     t22,t11
        beqz    K,.L65
        nop

#else
        dsra    K,KCO,2                 # K = KCO >> 2
        move    B,BO                    # Reset B
        LD      a0,0*SIZE(A)

        MTC     $0,t11                  # clear the accumulators
        MOV     t21,t11
        LD      b0,0*SIZE(B)

        MOV     t12,t11
        LD      b1,1*SIZE(B)
        beqz    K,.L65
        MOV     t22,t11                 # (delay slot)

#endif

.L61:                                   # nr=2,mr=1,kr=4
        LD      a4, 1*SIZE(A)           # A value for k+1
        LD      b4, 2*SIZE(B)
        MADD    t11,t11,a0,b0

        LD      b5,3*SIZE(B)
        MADD    t12,t12,a0,b1

        LD      a2, 2*SIZE(A)           # A value for k+2
        LD      b2,4*SIZE(B)
        MADD    t11,t11,a4,b4

        LD      b3,5*SIZE(B)
        MADD    t12,t12,a4,b5

        LD      a6, 3*SIZE(A)           # A value for k+3
        daddiu  K,K,-1
        LD      b6,6*SIZE(B)
        MADD    t11,t11,a2,b2

        LD      b7,7*SIZE(B)            # b7 shares $f15 with ALPHA
        MADD    t12,t12,a2,b3
        daddu   A,A,4*SIZE              # A += 1 mr * 4 kr = 4*SIZE

        LD      a0, 0*SIZE(A)
        daddu   B,B,8*SIZE              # B += 2 nr * 4 kr = 8*SIZE

        LD      b0,0*SIZE(B)
        MADD    t11,t11,a6,b6

        LD      b1,1*SIZE(B)
        bnez    K,.L61
        MADD    t12,t12,a6,b7           # (delay slot)



.L65:                                   # kr=2
#ifndef TRMMKERNEL
        andi    K,KCO,2
#else
        andi    K,TEMP,2
#endif
        beqz    K,.L68
        nop

.L66:
        LD      a4, 1*SIZE(A)           # A value for k+1
        MADD    t11,t11,a0,b0
        LD      b4,2*SIZE(B)
        daddu   A,A,2*SIZE              # A += 1 mr * 2 kr = 2*SIZE

        LD      b5,3*SIZE(B)
        MADD    t12,t12,a0,b1
        daddu   B,B,4*SIZE              # B += 2 nr * 2 kr = 4*SIZE

.L67:
        LD      a0,0(A)                 # A value for the next step
        LD      b0,0*SIZE(B)
        MADD    t11,t11,a4,b4

        LD      b1,1*SIZE(B)
        MADD    t12,t12,a4,b5


.L68:                                   # kr=1
#ifndef TRMMKERNEL
        andi    K,KCO,1
#else
        andi    K,TEMP,1
#endif
        beqz    K,.L69
        LD      ALPHA,152($sp)          # Get ALPHA (delay slot: runs on both paths)

        MADD    t11,t11,a0,b0
        MADD    t12,t12,a0,b1
        daddu   A,A,1*SIZE              # A += 1 mr * 1 kr = 1*SIZE
        daddu   B,B,2*SIZE              # B += 2 nr * 1 kr = 2*SIZE


.L69:                                   # Write Back
#ifndef TRMMKERNEL
/* GEMM: C += ALPHA * tile (2 elements of C). */
        LD      c11,0(CO1)
        LD      c12,0(CO2)

        MADD    t11,c11,t11,ALPHA
        MADD    t12,c12,t12,ALPHA

        ST      t11,0(CO1)
        ST      t12,0(CO2)

        daddu   CO1,CO1,1*SIZE
        daddu   CO2,CO2,1*SIZE

#else
/* TRMM: C = ALPHA * tile; C is not read. */
        MUL     t11, ALPHA, t11
        MUL     t12, ALPHA, t12

        ST      t11,  0 * SIZE(CO1)
        ST      t12,  0 * SIZE(CO2)

        daddu   CO1,CO1,1*SIZE
        daddu   CO2,CO2,1*SIZE

#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* TEMP = KCO - KK - (mr or nr): k steps this tile did not consume. */
        dsubu   TEMP, KCO, KK
#ifdef LEFT
        daddiu  TEMP, TEMP, -1
#else
        daddiu  TEMP, TEMP, -2
#endif

        dsll    K,    TEMP, 0 + BASE_SHIFT
        dsll    TEMP, TEMP, 1 + BASE_SHIFT

        daddu   A, A, K
        daddu   B, B, TEMP
#endif

#ifdef LEFT
        daddiu  KK, KK, 1
#endif
#endif

.L0_N2_Loop:
#if defined(TRMMKERNEL) && !defined(LEFT)
        daddiu  KK, KK, 2               # KK += nr
#endif
        move    BO, B                   # the next B panel starts at the current B


/* ------------------------------------------------------------------
   nr = 1: last single column of C.  CO1 walks down the column.
   ------------------------------------------------------------------ */
        .align  5
.L0_N1:
        andi    N,NCO,1                 # nr = 1
        beqz    N,.L999
        nop

        move    CO1,C
        dsra    M,MCO,2                 # M = MCO >> 2: number of mr=4 tiles

        move    A,AO                    # Reset A
        daddu   PREA,AO,SPANA
#if defined(TRMMKERNEL) && defined(LEFT)
        move    KK, OFFSET
#endif

        beqz    M,.L11_M2
        daddu   C,CO1,LDC               # (delay slot) C = CO1 + LDC

/* nr = 1, mr = 4: 4x1 tile, accumulators t11 t21 t31 t41. */
.L70:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        move    B, BO                   # Reset B
#else
/* Skip the first KK k steps: 4 elements per step in A, 1 per step in B. */
        dsll    K,    KK, 2 + BASE_SHIFT
        dsll    TEMP, KK, 0 + BASE_SHIFT

        daddu   A, A, K
        daddu   B, BO, TEMP
#endif
        LD      b0, 0*SIZE(B)

        MTC     $0,t11                  # clear the accumulators
        LD      a0,0*SIZE(A)
        MOV     t21,t11
        LD      a1,1*SIZE(A)

        MOV     t31,t11
        LD      a2,2*SIZE(A)
        MOV     t41,t11
        LD      a3,3*SIZE(A)


/* TEMP = number of k steps for this tile. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        dsubu   TEMP, KCO, KK
#elif defined(LEFT)
        daddiu  TEMP, KK, 4             # KK + mr
#else
        daddiu  TEMP, KK, 1             # KK + nr
#endif
        dsra    K,TEMP,2                # K = TEMP >> 2
        beqz    K,.L75
        nop
#else
        move    B, BO                   # Reset B
        dsra    K,KCO,2                 # K = KCO >> 2
        LD      b0, 0*SIZE(B)

        MTC     $0,t11                  # clear the accumulators
        LD      a0,0*SIZE(A)
        MOV     t21,t11
        LD      a1,1*SIZE(A)

        MOV     t31,t11
        LD      a2,2*SIZE(A)
        MOV     t41,t11
        beqz    K,.L75
        LD      a3,3*SIZE(A)            # (delay slot)

#endif

/* Four k steps per pass (.L71 to .L74), one step per label. */
.L71:                                   # nr=1,mr=kr=4
        LD      b4, 1*SIZE(B)           # B value for k+1
        MADD    t11,t11,a0,b0

        LD      a4, 4*SIZE(A)
        MADD    t21,t21,a1,b0

        LD      a5, 5*SIZE(A)
        FETCH   $0,(PREA)

        LD      a6,6*SIZE(A)
        MADD    t31,t31,a2,b0

        LD      a7,7*SIZE(A)
        MADD    t41,t41,a3,b0

.L72:
        LD      b2, 2*SIZE(B)           # B value for k+2
        MADD    t11,t11,a4,b4

        LD      a0,8*SIZE(A)
        MADD    t21,t21,a5,b4

        LD      a1,9*SIZE(A)
        FETCH   $0,4*SIZE(PREA)

        LD      a2,10*SIZE(A)
        MADD    t31,t31,a6,b4

        LD      a3,11*SIZE(A)
        MADD    t41,t41,a7,b4

.L73:
        LD      b6, 3*SIZE(B)           # B value for k+3
        MADD    t11,t11,a0,b2

        LD      a4,12*SIZE(A)
        daddu   B,B,4*SIZE              # B += 1 nr * 4 kr = 4*SIZE

        LD      a5,13*SIZE(A)
        MADD    t21,t21,a1,b2

        LD      a6,14*SIZE(A)
        FETCH   $0,8*SIZE(PREA)
        MADD    t31,t31,a2,b2

        LD      a7,15*SIZE(A)
        MADD    t41,t41,a3,b2
        daddu   A,A,16*SIZE             # A += 4 mr * 4 kr = 16*SIZE

.L74:
        LD      b0, 0*SIZE(B)
        MADD    t11,t11,a4,b6

        LD      a0,0*SIZE(A)
        daddu   PREA,PREA,16*SIZE

        LD      a1,1*SIZE(A)
        MADD    t21,t21,a5,b6

        LD      a2,2*SIZE(A)
        daddiu  K,K,-1
        MADD    t31,t31,a6,b6

        LD      a3,3*SIZE(A)
        MADD    t41,t41,a7,b6
        bnez    K,.L71
        FETCH   $0,-32(PREA)            # (delay slot) byte offset -32 from PREA


.L75:                                   # kr=2
#ifndef TRMMKERNEL
        andi    K,KCO,2
#else
        andi    K,TEMP,2
#endif
        beqz    K,.L78
        nop

.L76:
        LD      b4, 1*SIZE(B)
        MADD    t11,t11,a0,b0

        LD      a4,4*SIZE(A)
        daddu   B,B,2*SIZE              # B += 1 nr * 2 kr = 2*SIZE

        LD      a5,5*SIZE(A)
        MADD    t21,t21,a1,b0
        FETCH   $0,0(PREA)

        LD      a6,6*SIZE(A)
        MADD    t31,t31,a2,b0

        LD      a7,7*SIZE(A)
        MADD    t41,t41,a3,b0
        daddu   A,A,8*SIZE              # A += 4 mr * 2 kr = 8*SIZE

.L77:
        LD      b0,0(B)
        MADD    t11,t11,a4,b4

        LD      a0,0*SIZE(A)
        MADD    t21,t21,a5,b4
        FETCH   $0,4*SIZE(PREA)

        LD      a1,1*SIZE(A)
        MADD    t31,t31,a6,b4

        LD      a2,2*SIZE(A)
        MADD    t41,t41,a7,b4

        LD      a3,3*SIZE(A)
        daddu   PREA,PREA,8*SIZE



.L78:                                   # kr=1
#ifndef TRMMKERNEL
        andi    K,KCO,1
#else
        andi    K,TEMP,1
#endif
        beqz    K,.L79
        LD      ALPHA,152($sp)          # Get ALPHA (delay slot: runs on both paths)

        FETCH   $0,0(PREA)
        MADD    t11,t11,a0,b0
        MADD    t21,t21,a1,b0
        daddu   A,A,4*SIZE              # A += 4 mr * 1 kr = 4*SIZE

        MADD    t31,t31,a2,b0
        MADD    t41,t41,a3,b0
        daddu   B,B,1*SIZE              # B += 1 nr * 1 kr = 1*SIZE
        daddu   PREA,PREA,4*SIZE


.L79:                                   # Write Back
#ifndef TRMMKERNEL
/* GEMM: C += ALPHA * tile (4 elements of C). */
        LD      c11,0(CO1)
        LD      c21,1*SIZE(CO1)
        LD      c31,2*SIZE(CO1)
        LD      c41,3*SIZE(CO1)

        MADD    t11,c11,t11,ALPHA
        MADD    t21,c21,t21,ALPHA
        MADD    t31,c31,t31,ALPHA
        MADD    t41,c41,t41,ALPHA

        ST      t11,0(CO1)
        ST      t21,1*SIZE(CO1)
        ST      t31,2*SIZE(CO1)
        ST      t41,3*SIZE(CO1)
        daddiu  M,M,-1                  # M--

        FETCH   $0,4*SIZE(CO1)
        FETCH   $0,8*SIZE(CO1)

        bnez    M,.L70                  # M!=0
        daddu   CO1,CO1,4*SIZE          # (delay slot) CO1 += 4*SIZE
#else
/* TRMM: C = ALPHA * tile; C is not read. */
        daddiu  M,M,-1                  # M--
        MUL     t11, ALPHA, t11
        MUL     t21, ALPHA, t21
        MUL     t31, ALPHA, t31
        MUL     t41, ALPHA, t41

        ST      t11,0(CO1)
        ST      t21,1*SIZE(CO1)
        ST      t31,2*SIZE(CO1)
        ST      t41,3*SIZE(CO1)

        FETCH   $0,4*SIZE(CO1)
        FETCH   $0,8*SIZE(CO1)

        daddu   CO1,CO1,4*SIZE
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* TEMP = KCO - KK - (mr or nr): k steps this tile did not consume. */
        dsubu   TEMP, KCO, KK
#ifdef LEFT
        daddiu  TEMP, TEMP, -4
#else
        daddiu  TEMP, TEMP, -1
#endif

        dsll    K,    TEMP, 2 + BASE_SHIFT
        dsll    TEMP, TEMP, 0 + BASE_SHIFT

        daddu   A, A,K
        daddu   B, B, TEMP
#endif

#ifdef LEFT
        daddiu  KK, KK, 4
#endif
        bnez    M,.L70
        nop
#endif


/* ------------------------------------------------------------------
   nr = 1, mr = 2 remainder: 2x1 tile, accumulators t11 t21.
   ------------------------------------------------------------------ */
        .align  3
.L11_M2:
        andi    M,MCO,2                 # mr = 2
        beqz    M,.L11_M1
        nop

.L80:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        move    B, BO
#else
/* Skip the first KK k steps: 2 elements per step in A, 1 per step in B. */
        dsll    K,    KK, 1 + BASE_SHIFT
        dsll    TEMP, KK, 0 + BASE_SHIFT

        daddu   A, A, K
        daddu   B, BO, TEMP
#endif
        LD      b0, 0*SIZE(B)

        MTC     $0,t11                  # clear the accumulators
        MOV     t21,t11
        LD      a0,0*SIZE(A)
        LD      a1,1*SIZE(A)

/* TEMP = number of k steps for this tile. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        dsubu   TEMP, KCO, KK
#elif defined(LEFT)
        daddiu  TEMP, KK, 2             # KK + mr
#else
        daddiu  TEMP, KK, 1             # KK + nr
#endif
        dsra    K,TEMP,2                # K = TEMP >> 2
        beqz    K,.L85
        nop
#else
        move    B, BO
        dsra    K,KCO,2                 # K = KCO >> 2
        LD      b0, 0*SIZE(B)

        MTC     $0,t11                  # clear the accumulators
        MOV     t21,t11
        LD      a0,0*SIZE(A)

        beqz    K,.L85
        LD      a1,1*SIZE(A)            # (delay slot)

#endif

.L81:                                   # nr=1,mr=2,kr=4
        LD      b4, 1*SIZE(B)
        LD      a4,2*SIZE(A)
        MADD    t11,t11,a0,b0
        LD      a5,3*SIZE(A)
        MADD    t21,t21,a1,b0

        LD      b2, 2*SIZE(B)
        LD      a2,4*SIZE(A)
        MADD    t11,t11,a4,b4
        LD      a3,5*SIZE(A)
        MADD    t21,t21,a5,b4

        LD      b6, 3*SIZE(B)
        LD      a6,6*SIZE(A)
        MADD    t11,t11,a2,b2
        LD      a7,7*SIZE(A)
        MADD    t21,t21,a3,b2

        daddu   A,A,8*SIZE              # A += 2 mr * 4 kr = 8*SIZE
        daddu   B,B,4*SIZE              # B += 1 nr * 4 kr = 4*SIZE

        LD      b0, 0*SIZE(B)
        daddiu  K,K,-1

        LD      a0,0*SIZE(A)
        MADD    t11,t11,a6,b6

        LD      a1,1*SIZE(A)
        bnez    K,.L81
        MADD    t21,t21,a7,b6           # (delay slot)

.L85:                                   # kr=2
#ifndef TRMMKERNEL
        andi    K,KCO,2
#else
        andi    K,TEMP,2
#endif
        beqz    K,.L88
        nop

.L86:
        LD      b4, 1*SIZE(B)
        LD      a4,2*SIZE(A)
        MADD    t11,t11,a0,b0
        LD      a5,3*SIZE(A)
        MADD    t21,t21,a1,b0

        daddu   A,A,4*SIZE              # A += 2 mr * 2 kr = 4*SIZE
        daddu   B,B,2*SIZE              # B += 1 nr * 2 kr = 2*SIZE

        LD      b0,0(B)
        LD      a0,0*SIZE(A)
        MADD    t11,t11,a4,b4
        LD      a1,1*SIZE(A)
        MADD    t21,t21,a5,b4



.L88:                                   # kr=1
#ifndef TRMMKERNEL
        andi    K,KCO,1
#else
        andi    K,TEMP,1
#endif
        beqz    K,.L89
        LD      ALPHA,152($sp)          # Get ALPHA (delay slot: runs on both paths)

        MADD    t11,t11,a0,b0
        MADD    t21,t21,a1,b0
        daddu   A,A,2*SIZE              # A += 2 mr * 1 kr = 2*SIZE
        daddu   B,B,1*SIZE              # B += 1 nr * 1 kr = 1*SIZE


.L89:                                   # Write Back
#ifndef TRMMKERNEL
/* GEMM: C += ALPHA * tile (2 elements of C). */
        LD      c11,0(CO1)
        LD      c21,1*SIZE(CO1)

        MADD    t11,c11,t11,ALPHA
        MADD    t21,c21,t21,ALPHA

        ST      t11,0(CO1)
        ST      t21,1*SIZE(CO1)

        FETCH   $0,2*SIZE(CO1)

        daddu   CO1,CO1,2*SIZE          # CO1 += 2*SIZE

#else
/* TRMM: C = ALPHA * tile; C is not read.  CO1 is bumped first, so the stores
   use negative offsets. */
        daddu   CO1,CO1,2*SIZE          # CO1 += 2*SIZE
        MUL     t11, ALPHA, t11
        MUL     t21, ALPHA, t21

        FETCH   $0,0(CO1)
        ST      t11, -2 * SIZE(CO1)
        ST      t21, -1 * SIZE(CO1)
#if ( defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* TEMP = KCO - KK - (mr or nr): k steps this tile did not consume. */
        dsubu   TEMP, KCO, KK
#ifdef LEFT
        daddiu  TEMP, TEMP, -2
#else
        daddiu  TEMP, TEMP, -1
#endif

        dsll    K,    TEMP, 1 + BASE_SHIFT
        dsll    TEMP, TEMP, 0 + BASE_SHIFT

        daddu   A, A, K
        daddu   B, B, TEMP
#endif

#ifdef LEFT
        daddiu  KK, KK, 2
#endif
#endif


/* ------------------------------------------------------------------
   nr = 1, mr = 1 remainder: a single element of C, accumulator t11.
   ------------------------------------------------------------------ */
        .align  3
.L11_M1:
        andi    M,MCO,1                 # mr = 1
        beqz    M,.L999
        nop

.L90:
#if defined(TRMMKERNEL)
#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
        move    B, BO
#else
/* Skip the first KK k steps: 1 element per step in both A and B. */
        dsll    K,    KK, 0 + BASE_SHIFT
        dsll    TEMP, KK, 0 + BASE_SHIFT

        daddu   A, A, K
        daddu   B, BO, TEMP
#endif
        LD      a0, 0*SIZE(A)
        LD      b0, 0*SIZE(B)
        MTC     $0,t11                  # clear the accumulator

/* TEMP = number of k steps for this tile. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        dsubu   TEMP, KCO, KK
#else
        daddiu  TEMP, KK, 1             # mr == nr == 1, same for LEFT and !LEFT
#endif
        dsra    K, TEMP, 2              # K = TEMP >> 2
        beqz    K,.L95
        nop

#else
        move    B, BO
        LD      a0, 0*SIZE(A)
        LD      b0, 0*SIZE(B)
        dsra    K,KCO,2                 # K = KCO >> 2
        beqz    K,.L95
        MTC     $0,t11                  # (delay slot) clear the accumulator
#endif

.L91:                                   # nr=mr=1,kr=4
        LD      a4, 1*SIZE(A)
        LD      b4, 1*SIZE(B)
        MADD    t11,t11,a0,b0

        LD      a2, 2*SIZE(A)
        LD      b2, 2*SIZE(B)
        MADD    t11,t11,a4,b4

        LD      a6, 3*SIZE(A)
        LD      b6, 3*SIZE(B)
        MADD    t11,t11,a2,b2

        daddu   A,A,4*SIZE              # A += 1 mr * 4 kr = 4*SIZE
        daddu   B,B,4*SIZE              # B += 1 nr * 4 kr = 4*SIZE

        LD      a0, 0*SIZE(A)
        LD      b0, 0*SIZE(B)
        MADD    t11,t11,a6,b6

        daddiu  K,K,-1
        bnez    K,.L91
        nop

.L95:                                   # kr=2
#ifndef TRMMKERNEL
        andi    K,KCO,2
#else
        andi    K,TEMP,2
#endif
        beqz    K,.L98
        nop

.L96:
        LD      a4, 1*SIZE(A)
        LD      b4, 1*SIZE(B)
        MADD    t11,t11,a0,b0
        daddu   B,B,2*SIZE              # B += 1 nr * 2 kr = 2*SIZE
        daddu   A,A,2*SIZE              # A += 1 mr * 2 kr = 2*SIZE

        LD      b0,0(B)
        LD      a0,0(A)
        MADD    t11,t11,a4,b4

.L98:                                   # kr=1
#ifndef TRMMKERNEL
        andi    K,KCO,1
#else
        andi    K,TEMP,1
#endif
        beqz    K,.L99
        LD      ALPHA,152($sp)          # Get ALPHA (delay slot: runs on both paths)

        MADD    t11,t11,a0,b0


.L99:                                   # Write Back
#ifndef TRMMKERNEL
/* GEMM: C += ALPHA * t11 (1 element of C). */
        LD      c11,0(CO1)
        MADD    t11,c11,t11,ALPHA
        ST      t11,0(CO1)

#else
/* TRMM: C = ALPHA * t11; C is not read. */
        MUL     t11, ALPHA, t11

        ST      t11,  0 * SIZE(CO1)
#endif


/* Restore the registers from the 160-byte frame and return.
   NOTE(review): t41/t11/t21 are $f29/$f30/$f31, which are written by this
   kernel but not reloaded here - confirm against the prologue and the
   target ABI. */
.L999:                                  # End
        ld      $16,   0($sp)
        ld      $17,   8($sp)
        ld      $18,  16($sp)
        ld      $19,  24($sp)
        ld      $20,  32($sp)
        ld      $21,  40($sp)
        ld      $22,  48($sp)
        LD      $f24, 56($sp)
        LD      $f25, 64($sp)
        LD      $f26, 72($sp)
        LD      $f27, 80($sp)
        LD      $f28, 88($sp)
        ld      $23,  96($sp)
        ld      $24, 104($sp)
        ld      $25, 112($sp)
        LD      $f20,120($sp)
        LD      $f21,128($sp)
        LD      $f22,136($sp)
        LD      $f23,144($sp)

        j       $31
        daddiu  $sp, $sp, 160           # (delay slot) release the frame

        EPILOGUE