/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*****************************************************************************/ 27 28#define ASSEMBLER 29 30#include "common.h" 31 32#define M $r4 33#define N $r5 34#define K $r6 35#define A $r7 36#define B $r8 37#define C $r9 38#define LDC $r10 39#define AO $r12 40#define BO $r13 41#define I $r17 42#define J $r18 43#define L $r30 44#define PREFETCHSIZE (4 * 10) 45#define CO1 $r14 46#define CO2 $r15 47#define CO3 $r23 48#define CO4 $r24 49#define CO5 $r25 50#define CO6 $r26 51#define CO7 $r27 52#define CO8 $r28 53#define BB $r29 54 55#if defined(TRMMKERNEL) 56#define OFFSET $r11 57#define KK $r20 58#define TEMP $r16 59#endif 60 61#define a1 $f22 62#define a2 $f8 63#define a3 $f27 64#define a4 $f28 65#define b1 $f23 66#define b2 $f9 67#define b3 $f10 68#define b4 $f11 69#define b5 $f12 70#define b6 $f13 71#define b7 $f14 72#define b8 $f15 73#define a5 b8 74#define c11 $f16 75#define c12 $f17 76#define c21 $f3 77#define c22 $f1 78#define c31 $f2 79#define c32 $f4 80#define c41 $f5 81#define c42 $f6 82#define c51 $f7 83#define c52 $f18 84#define c61 $f19 85#define c62 $f20 86#define c71 $f21 87#define c72 $f24 88#define c81 $f25 89#define c82 $f26 90#define ALPHA $f0 91 92 PROLOGUE 93 94 addi.d $sp, $sp, -160 95 SDARG $r23, $sp, 0 96 SDARG $r24, $sp, 8 97 SDARG $r25, $sp, 16 98 SDARG $r26, $sp, 24 99 SDARG $r27, $sp, 32 100 SDARG $r28, $sp, 40 101 SDARG $r29, $sp, 48 102 SDARG $r30, $sp, 96 103 fst.d $f24, $sp, 56 104 fst.d $f25, $sp, 64 105 fst.d $f26, $sp, 72 106 fst.d $f27, $sp, 80 107 fst.d $f28, $sp, 88 108#if defined(TRMMKERNEL) 109 SDARG $r20, $sp, 104 110 SDARG $r16, $sp, 112 111#endif 112#ifndef __64BIT__ 113 fst.d $f18, $sp, 120 114 fst.d $f19, $sp, 128 115 fst.d $f20, $sp, 136 116 fst.d $f21, $sp, 144 117#endif 118 slli.d LDC, LDC, BASE_SHIFT 119#if defined(TRMMKERNEL) && !defined(LEFT) 120 sub.d KK, $r0, OFFSET 121#endif 122 srai.d J, N, 3 123nop 124 bge $r0, J, .L30 125.L10: 126 move CO1, C 127 MTC c11, $r0 128 add.d CO2, C, LDC 129 move AO, A 130 add.d 
CO3, CO2, LDC 131 addi.d J, J, -1 132 add.d CO4, CO3, LDC 133 MOV c21, c11 134 add.d CO5, CO4, LDC 135 MOV c31, c11 136 add.d CO6, CO5, LDC 137 MOV c41, c11 138 add.d CO7, CO6, LDC 139 MOV c51, c11 140 add.d CO8, CO7, LDC 141 srai.d I, M, 1 142 add.d C, CO8, LDC 143 slli.d BB, K, 2 + BASE_SHIFT 144 add.d BB, B, BB 145#if defined(TRMMKERNEL) && defined(LEFT) 146 move KK, OFFSET 147#endif 148MOV c61, c11 149 bge $r0, I, .L20 150.L11: 151#if defined(TRMMKERNEL) 152#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 153 move BO, B 154#else 155 slli.d L, KK, 1 + BASE_SHIFT 156 slli.d TEMP, KK, 3 + BASE_SHIFT 157 add.d AO, AO, L 158 add.d BO, B, TEMP 159#endif 160 LD a1, AO, 0 * SIZE 161 MOV c71, c11 162 LD b1, BO, 0 * SIZE 163 MOV c81, c11 164 LD a3, AO, 4 * SIZE 165 MOV c12, c11 166 LD b2, BO, 1 * SIZE 167 MOV c22, c11 168 MOV c32, c11 169 LD b3, BO, 2 * SIZE 170 MOV c42, c11 171 LD b4, BO, 3 * SIZE 172 MOV c52, c11 173 LD b5, BO, 4 * SIZE 174 MOV c62, c11 175 LD b6, BO, 8 * SIZE 176 MOV c72, c11 177 LD b7, BO, 12 * SIZE 178 MOV c82, c11 179#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 180 sub.d TEMP, K, KK 181#elif defined(LEFT) 182 addi.d TEMP, KK, 2 183#else 184 addi.d TEMP, KK, 8 185#endif 186 srai.d L, TEMP, 2 187 bge $r0, L, .L15 188#else 189 LD a1, AO, 0 * SIZE 190 MOV c71, c11 191 LD b1, B, 0 * SIZE 192 MOV c81, c11 193 preld 1, CO1, 3 * SIZE 194 preld 1, CO2, 3 * SIZE 195 LD a3, AO, 4 * SIZE 196 MOV c12, c11 197 LD b2, B, 1 * SIZE 198 MOV c22, c11 199 srai.d L, K, 2 200 MOV c32, c11 201 LD b3, B, 2 * SIZE 202 MOV c42, c11 203 LD b4, B, 3 * SIZE 204 MOV c52, c11 205 LD b5, B, 4 * SIZE 206 MOV c62, c11 207 LD b6, B, 8 * SIZE 208 MOV c72, c11 209 LD b7, B, 12 * SIZE 210 MOV c82, c11 211move BO, B 212 bge $r0, L, .L15 213#endif 214 MADD c11, b1, a1, c11 215 LD a2, AO, 1 * SIZE 216 MADD c21, b2, a1, c21 217 addi.d L, L, -1 218 MADD c31, b3, a1, c31 219 MADD c41, b4, a1, c41 220 bge $r0, L, .L13 221 preld 
1, CO3, 2 * SIZE 222 .align 3 223.L12: 224 MADD c12, b1, a2, c12 225 LD b1, BO, 16 * SIZE 226 MADD c22, b2, a2, c22 227 LD b2, BO, 5 * SIZE 228 MADD c32, b3, a2, c32 229 LD b3, BO, 6 * SIZE 230 MADD c42, b4, a2, c42 231 LD b4, BO, 7 * SIZE 232 MADD c51, b5, a1, c51 233 LD a4, AO, 2 * SIZE 234 MADD c61, b2, a1, c61 235 MADD c71, b3, a1, c71 236 MADD c81, b4, a1, c81 237 LD a1, AO, 8 * SIZE 238 MADD c52, b5, a2, c52 239 LD b5, BO, 20 * SIZE 240 MADD c62, b2, a2, c62 241 LD b2, BO, 9 * SIZE 242 MADD c72, b3, a2, c72 243 LD b3, BO, 10 * SIZE 244 MADD c82, b4, a2, c82 245 LD b4, BO, 11 * SIZE 246 MADD c11, b6, a4, c11 247 LD a2, AO, 3 * SIZE 248 MADD c21, b2, a4, c21 249 MADD c31, b3, a4, c31 250 MADD c41, b4, a4, c41 251 MADD c12, b6, a2, c12 252 LD b6, BO, 24 * SIZE 253 MADD c22, b2, a2, c22 254 LD b2, BO, 13 * SIZE 255 MADD c32, b3, a2, c32 256 LD b3, BO, 14 * SIZE 257 MADD c42, b4, a2, c42 258 LD b4, BO, 15 * SIZE 259 MADD c51, b7, a4, c51 260 MADD c61, b2, a4, c61 261 MADD c71, b3, a4, c71 262 MADD c81, b4, a4, c81 263 MADD c52, b7, a2, c52 264 LD b7, BO, 28 * SIZE 265 MADD c62, b2, a2, c62 266 LD b2, BO, 17 * SIZE 267 MADD c72, b3, a2, c72 268 LD b3, BO, 18 * SIZE 269 MADD c82, b4, a2, c82 270 LD b4, BO, 19 * SIZE 271 MADD c11, b1, a3, c11 272 LD a2, AO, 5 * SIZE 273 MADD c21, b2, a3, c21 274 MADD c31, b3, a3, c31 275 MADD c41, b4, a3, c41 276 MADD c12, b1, a2, c12 277 LD b1, BO, 32 * SIZE 278 MADD c22, b2, a2, c22 279 LD b2, BO, 21 * SIZE 280 MADD c32, b3, a2, c32 281 LD b3, BO, 22 * SIZE 282 MADD c42, b4, a2, c42 283 LD b4, BO, 23 * SIZE 284 MADD c51, b5, a3, c51 285 LD a4, AO, 6 * SIZE 286 MADD c61, b2, a3, c61 287 MADD c71, b3, a3, c71 288 MADD c81, b4, a3, c81 289 LD a3, AO, 12 * SIZE 290 MADD c52, b5, a2, c52 291 LD b5, BO, 36 * SIZE 292 MADD c62, b2, a2, c62 293 LD b2, BO, 25 * SIZE 294 MADD c72, b3, a2, c72 295 LD b3, BO, 26 * SIZE 296 MADD c82, b4, a2, c82 297 LD b4, BO, 27 * SIZE 298 MADD c11, b6, a4, c11 299 LD a2, AO, 7 * SIZE 300 MADD c21, b2, a4, c21 
301 MADD c31, b3, a4, c31 302 MADD c41, b4, a4, c41 303 addi.d L, L, -1 304 MADD c12, b6, a2, c12 305 LD b6, BO, 40 * SIZE 306 MADD c22, b2, a2, c22 307 LD b2, BO, 29 * SIZE 308 MADD c32, b3, a2, c32 309 LD b3, BO, 30 * SIZE 310 MADD c42, b4, a2, c42 311 LD b4, BO, 31 * SIZE 312 MADD c51, b7, a4, c51 313 addi.d BO, BO, 32 * SIZE 314 MADD c61, b2, a4, c61 315 addi.d AO, AO, 8 * SIZE 316 MADD c71, b3, a4, c71 317 MADD c81, b4, a4, c81 318 MADD c52, b7, a2, c52 319 LD b7, BO, 12 * SIZE 320 MADD c62, b2, a2, c62 321 LD b2, BO, 1 * SIZE 322 MADD c72, b3, a2, c72 323 LD b3, BO, 2 * SIZE 324 MADD c82, b4, a2, c82 325 LD b4, BO, 3 * SIZE 326 MADD c11, b1, a1, c11 327 LD a2, AO, 1 * SIZE 328 MADD c21, b2, a1, c21 329 MADD c31, b3, a1, c31 330 MADD c41, b4, a1, c41 331 blt $r0, L, .L12 332 .align 3 333 334.L13: 335 MADD c12, b1, a2, c12 336 LD b1, BO, 16 * SIZE 337 MADD c22, b2, a2, c22 338 LD b2, BO, 5 * SIZE 339 MADD c32, b3, a2, c32 340 LD b3, BO, 6 * SIZE 341 MADD c42, b4, a2, c42 342 LD b4, BO, 7 * SIZE 343 MADD c51, b5, a1, c51 344 MADD c61, b2, a1, c61 345 LD a4, AO, 2 * SIZE 346 MADD c71, b3, a1, c71 347 MADD c81, b4, a1, c81 348 LD a1, AO, 8 * SIZE 349 MADD c52, b5, a2, c52 350 LD b5, BO, 20 * SIZE 351 MADD c62, b2, a2, c62 352 LD b2, BO, 9 * SIZE 353 MADD c72, b3, a2, c72 354 LD b3, BO, 10 * SIZE 355 MADD c82, b4, a2, c82 356 LD b4, BO, 11 * SIZE 357 MADD c11, b6, a4, c11 358 LD a2, AO, 3 * SIZE 359 MADD c21, b2, a4, c21 360 MADD c31, b3, a4, c31 361 preld 1, CO4, 3 * SIZE 362 MADD c41, b4, a4, c41 363 MADD c12, b6, a2, c12 364 LD b6, BO, 24 * SIZE 365 MADD c22, b2, a2, c22 366 LD b2, BO, 13 * SIZE 367 MADD c32, b3, a2, c32 368 LD b3, BO, 14 * SIZE 369 MADD c42, b4, a2, c42 370 LD b4, BO, 15 * SIZE 371 MADD c51, b7, a4, c51 372 preld 1, CO5, 3 * SIZE 373 MADD c61, b2, a4, c61 374 MADD c71, b3, a4, c71 375 preld 1, CO6, 3 * SIZE 376 MADD c81, b4, a4, c81 377 MADD c52, b7, a2, c52 378 LD b7, BO, 28 * SIZE 379 MADD c62, b2, a2, c62 380 LD b2, BO, 17 * SIZE 381 MADD 
c72, b3, a2, c72 382 LD b3, BO, 18 * SIZE 383 MADD c82, b4, a2, c82 384 LD b4, BO, 19 * SIZE 385 MADD c11, b1, a3, c11 386 LD a2, AO, 5 * SIZE 387 MADD c21, b2, a3, c21 388 MADD c31, b3, a3, c31 389 preld 1, CO7, 3 * SIZE 390 MADD c41, b4, a3, c41 391 MADD c12, b1, a2, c12 392 LD b1, BO, 32 * SIZE 393 MADD c22, b2, a2, c22 394 LD b2, BO, 21 * SIZE 395 MADD c32, b3, a2, c32 396 LD b3, BO, 22 * SIZE 397 MADD c42, b4, a2, c42 398 LD b4, BO, 23 * SIZE 399 MADD c51, b5, a3, c51 400 MADD c61, b2, a3, c61 401 LD a4, AO, 6 * SIZE 402 MADD c71, b3, a3, c71 403 MADD c81, b4, a3, c81 404 MADD c52, b5, a2, c52 405 LD b5, BO, 36 * SIZE 406 MADD c62, b2, a2, c62 407 LD b2, BO, 25 * SIZE 408 MADD c72, b3, a2, c72 409 LD b3, BO, 26 * SIZE 410 MADD c82, b4, a2, c82 411 LD b4, BO, 27 * SIZE 412 MADD c11, b6, a4, c11 413 LD a2, AO, 7 * SIZE 414 MADD c21, b2, a4, c21 415 MADD c31, b3, a4, c31 416 MADD c41, b4, a4, c41 417 MADD c12, b6, a2, c12 418 LD b6, BO, 40 * SIZE 419 MADD c22, b2, a2, c22 420 LD b2, BO, 29 * SIZE 421 MADD c32, b3, a2, c32 422 LD b3, BO, 30 * SIZE 423 MADD c42, b4, a2, c42 424 LD b4, BO, 31 * SIZE 425 MADD c51, b7, a4, c51 426 addi.d BO, BO, 32 * SIZE 427 MADD c61, b2, a4, c61 428 addi.d AO, AO, 8 * SIZE 429 MADD c71, b3, a4, c71 430 MADD c81, b4, a4, c81 431 MADD c52, b7, a2, c52 432 LD b7, BO, 12 * SIZE 433 MADD c62, b2, a2, c62 434 LD b2, BO, 1 * SIZE 435 MADD c72, b3, a2, c72 436 LD b3, BO, 2 * SIZE 437 MADD c82, b4, a2, c82 438 LD b4, BO, 3 * SIZE 439 .align 3 440 441.L15: 442#ifndef TRMMKERNEL 443 andi L, K, 3 444#else 445 andi L, TEMP, 3 446#endif 447 preld 1, CO8, 3 * SIZE 448 bge $r0, L, .L18 449 .align 3 450.L16: 451 MADD c11, b1, a1, c11 452 LD a2, AO, 1 * SIZE 453 MADD c21, b2, a1, c21 454 MADD c31, b3, a1, c31 455 MADD c41, b4, a1, c41 456 MADD c12, b1, a2, c12 457 LD b1, BO, 8 * SIZE 458 MADD c22, b2, a2, c22 459 LD b2, BO, 5 * SIZE 460 MADD c32, b3, a2, c32 461 LD b3, BO, 6 * SIZE 462 MADD c42, b4, a2, c42 463 LD b4, BO, 7 * SIZE 464 MADD c51, b5, 
a1, c51 465 addi.d L, L, -1 466 MADD c61, b2, a1, c61 467 addi.d AO, AO, 2 * SIZE 468 MADD c71, b3, a1, c71 469 addi.d BO, BO, 8 * SIZE 470 MADD c81, b4, a1, c81 471 LD a1, AO, 0 * SIZE 472 MADD c52, b5, a2, c52 473 LD b5, BO, 4 * SIZE 474 MADD c62, b2, a2, c62 475 LD b2, BO, 1 * SIZE 476 MADD c72, b3, a2, c72 477 LD b3, BO, 2 * SIZE 478 MADD c82, b4, a2, c82 479 LD b4, BO, 3 * SIZE 480 blt $r0, L, .L16 481.L18: 482#ifndef TRMMKERNEL 483 LD $f22, CO1, 0 * SIZE 484 addi.d CO3,CO3, 2 * SIZE 485 LD $f8, CO1, 1 * SIZE 486 addi.d CO1,CO1, 2 * SIZE 487 LD $f23, CO2, 0 * SIZE 488 addi.d CO4,CO4, 2 * SIZE 489 LD $f9, CO2, 1 * SIZE 490 addi.d CO2,CO2, 2 * SIZE 491 LD $f10, CO3, -2 * SIZE 492 addi.d CO5,CO5, 2 * SIZE 493 LD $f11, CO3, -1 * SIZE 494 addi.d CO6,CO6, 2 * SIZE 495 LD $f12, CO4, -2 * SIZE 496 addi.d CO7,CO7, 2 * SIZE 497 LD $f13, CO4, -1 * SIZE 498 addi.d I, I, -1 499 MADD c11, c11, ALPHA, $f22 500 LD $f22, CO5, -2 * SIZE 501 MADD c12, c12, ALPHA, $f8 502 LD $f8, CO5, -1 * SIZE 503 MADD c21, c21, ALPHA, $f23 504 LD $f23, CO6, -2 * SIZE 505 MADD c22, c22, ALPHA, $f9 506 LD $f9, CO6, -1 * SIZE 507 MADD c31, c31, ALPHA, $f10 508 LD $f10, CO7, -2 * SIZE 509 MADD c32, c32, ALPHA, $f11 510 LD $f11, CO7, -1 * SIZE 511 MADD c41, c41, ALPHA, $f12 512 LD $f12, CO8, 0 * SIZE 513 MADD c42, c42, ALPHA, $f13 514 LD $f13, CO8, 1 * SIZE 515 preld 0, BB, 0 * SIZE 516 preld 0, BB, 8 * SIZE 517 ST c11, CO1, -2 * SIZE 518 MTC c11, $r0 519 ST c12, CO1, -1 * SIZE 520 addi.d CO8,CO8, 2 * SIZE 521 ST c21, CO2, -2 * SIZE 522 MOV c21, c11 523 ST c22, CO2, -1 * SIZE 524 addi.d BB, BB, 16 * SIZE 525 MADD c51, c51, ALPHA, $f22 526 ST c31, CO3, -2 * SIZE 527 MADD c52, c52, ALPHA, $f8 528 ST c32, CO3, -1 * SIZE 529 MADD c61, c61, ALPHA, $f23 530 ST c41, CO4, -2 * SIZE 531 MADD c62, c62, ALPHA, $f9 532 ST c42, CO4, -1 * SIZE 533 MADD c71, c71, ALPHA, $f10 534 ST c51, CO5, -2 * SIZE 535 MADD c72, c72, ALPHA, $f11 536 ST c52, CO5, -1 * SIZE 537 MADD c81, c81, ALPHA, $f12 538 ST c61, CO6, -2 * 
SIZE 539 MADD c82, c82, ALPHA, $f13 540 ST c62, CO6, -1 * SIZE 541 ST c71, CO7, -2 * SIZE 542 MOV c31, c11 543 ST c72, CO7, -1 * SIZE 544 MOV c41, c11 545 ST c81, CO8, -2 * SIZE 546 MOV c51, c11 547 ST c82, CO8, -1 * SIZE 548MOV c61, c11 549 blt $r0, I, .L11 550#else 551 addi.d CO4,CO4, 2 * SIZE 552 addi.d CO5,CO5, 2 * SIZE 553 addi.d CO6,CO6, 2 * SIZE 554 addi.d CO7,CO7, 2 * SIZE 555 preld 0, BB, 0 * SIZE 556 preld 0, BB, 8 * SIZE 557 MUL c11, ALPHA, c11 558 addi.d CO1,CO1, 2 * SIZE 559 MUL c12, ALPHA, c12 560 MTC a1, $r0 561 MUL c21, ALPHA, c21 562 addi.d CO2,CO2, 2 * SIZE 563 MUL c22, ALPHA, c22 564 addi.d CO3,CO3, 2 * SIZE 565 ST c11, CO1, -2 * SIZE 566 MUL c31, ALPHA, c31 567 ST c12, CO1, -1 * SIZE 568 MUL c32, ALPHA, c32 569 ST c21, CO2, -2 * SIZE 570 MUL c41, ALPHA, c41 571 ST c22, CO2, -1 * SIZE 572 MUL c42, ALPHA, c42 573 ST c31, CO3, -2 * SIZE 574 MUL c51, ALPHA, c51 575 ST c32, CO3, -1 * SIZE 576 MUL c52, ALPHA, c52 577 ST c41, CO4, -2 * SIZE 578 MUL c61, ALPHA, c61 579 ST c42, CO4, -1 * SIZE 580 MUL c62, ALPHA, c62 581 ST c51, CO5, -2 * SIZE 582 MUL c71, ALPHA, c71 583 ST c52, CO5, -1 * SIZE 584 MUL c72, ALPHA, c72 585 ST c61, CO6, -2 * SIZE 586 MUL c81, ALPHA, c81 587 ST c62, CO6, -1 * SIZE 588 MUL c82, ALPHA, c82 589 ST c71, CO7, -2 * SIZE 590 MOV c11, a1 591 ST c72, CO7, -1 * SIZE 592 MOV c21, a1 593 addi.d CO8,CO8, 2 * SIZE 594 addi.d BB, BB, 16 * SIZE 595 ST c81, CO8, -2 * SIZE 596 MOV c31, a1 597 ST c82, CO8, -1 * SIZE 598 MOV c41, a1 599 addi.d I, I, -1 600 MOV c51, a1 601#if ( defined(LEFT) && defined(TRANSA)) || \ 602 (!defined(LEFT) && !defined(TRANSA)) 603 sub.d TEMP, K, KK 604#ifdef LEFT 605 addi.d TEMP, TEMP, -2 606#else 607 addi.d TEMP, TEMP, -8 608#endif 609 slli.d L, TEMP, 1 + BASE_SHIFT 610 slli.d TEMP, TEMP, 3 + BASE_SHIFT 611 add.d AO, AO, L 612 add.d BO, BO, TEMP 613#endif 614#ifdef LEFT 615 addi.d KK, KK, 2 616#endif 617MOV c61, a1 618 blt $r0, I, .L11 619#endif 620 .align 3 621 622.L20: 623 andi I, M, 1 624 MOV c61, c11 625MOV c71, 
c11 626 bge $r0, I, .L29 627#if defined(TRMMKERNEL) 628#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 629 move BO, B 630#else 631 slli.d L, KK, 0 + BASE_SHIFT 632 slli.d TEMP, KK, 3 + BASE_SHIFT 633 add.d AO, AO, L 634 add.d BO, B, TEMP 635#endif 636 LD a1, AO, 0 * SIZE 637 LD a2, AO, 1 * SIZE 638 LD a3, AO, 2 * SIZE 639 LD a4, AO, 3 * SIZE 640 LD b1, BO, 0 * SIZE 641 LD b2, BO, 1 * SIZE 642 LD b3, BO, 2 * SIZE 643 LD b4, BO, 3 * SIZE 644 LD b5, BO, 4 * SIZE 645 LD b6, BO, 8 * SIZE 646 LD b7, BO, 12 * SIZE 647#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 648 sub.d TEMP, K, KK 649#elif defined(LEFT) 650 addi.d TEMP, KK, 1 651#else 652 addi.d TEMP, KK, 8 653#endif 654 srai.d L, TEMP, 2 655MOV c81, c11 656 bge $r0, L, .L25 657#else 658 LD a1, AO, 0 * SIZE 659 LD a2, AO, 1 * SIZE 660 LD a3, AO, 2 * SIZE 661 LD a4, AO, 3 * SIZE 662 LD b1, B, 0 * SIZE 663 LD b2, B, 1 * SIZE 664 LD b3, B, 2 * SIZE 665 LD b4, B, 3 * SIZE 666 LD b5, B, 4 * SIZE 667 LD b6, B, 8 * SIZE 668 LD b7, B, 12 * SIZE 669 srai.d L, K, 2 670 MOV c81, c11 671move BO, B 672 bge $r0, L, .L25 673#endif 674 .align 3 675.L22: 676 MADD c11, b1, a1, c11 677 LD b1, BO, 16 * SIZE 678 MADD c21, b2, a1, c21 679 LD b2, BO, 5 * SIZE 680 MADD c31, b3, a1, c31 681 LD b3, BO, 6 * SIZE 682 MADD c41, b4, a1, c41 683 LD b4, BO, 7 * SIZE 684 MADD c51, b5, a1, c51 685 LD b5, BO, 20 * SIZE 686 MADD c61, b2, a1, c61 687 LD b2, BO, 9 * SIZE 688 MADD c71, b3, a1, c71 689 LD b3, BO, 10 * SIZE 690 MADD c81, b4, a1, c81 691 LD b4, BO, 11 * SIZE 692 LD a1, AO, 4 * SIZE 693 addi.d L, L, -1 694 MADD c11, b6, a2, c11 695 LD b6, BO, 24 * SIZE 696 MADD c21, b2, a2, c21 697 LD b2, BO, 13 * SIZE 698 MADD c31, b3, a2, c31 699 LD b3, BO, 14 * SIZE 700 MADD c41, b4, a2, c41 701 LD b4, BO, 15 * SIZE 702 MADD c51, b7, a2, c51 703 LD b7, BO, 28 * SIZE 704 MADD c61, b2, a2, c61 705 LD b2, BO, 17 * SIZE 706 MADD c71, b3, a2, c71 707 LD b3, BO, 18 * SIZE 708 MADD c81, b4, a2, c81 709 
LD b4, BO, 19 * SIZE 710 LD a2, AO, 5 * SIZE 711 addi.d AO, AO, 4 * SIZE 712 MADD c11, b1, a3, c11 713 LD b1, BO, 32 * SIZE 714 MADD c21, b2, a3, c21 715 LD b2, BO, 21 * SIZE 716 MADD c31, b3, a3, c31 717 LD b3, BO, 22 * SIZE 718 MADD c41, b4, a3, c41 719 LD b4, BO, 23 * SIZE 720 MADD c51, b5, a3, c51 721 LD b5, BO, 36 * SIZE 722 MADD c61, b2, a3, c61 723 LD b2, BO, 25 * SIZE 724 MADD c71, b3, a3, c71 725 LD b3, BO, 26 * SIZE 726 MADD c81, b4, a3, c81 727 LD b4, BO, 27 * SIZE 728 LD a3, AO, 2 * SIZE 729 addi.d BO, BO, 32 * SIZE 730 MADD c11, b6, a4, c11 731 LD b6, BO, 8 * SIZE 732 MADD c21, b2, a4, c21 733 LD b2, BO, -3 * SIZE 734 MADD c31, b3, a4, c31 735 LD b3, BO, -2 * SIZE 736 MADD c41, b4, a4, c41 737 LD b4, BO, -1 * SIZE 738 MADD c51, b7, a4, c51 739 LD b7, BO, 12 * SIZE 740 MADD c61, b2, a4, c61 741 LD b2, BO, 1 * SIZE 742 MADD c71, b3, a4, c71 743 LD b3, BO, 2 * SIZE 744 MADD c81, b4, a4, c81 745 LD b4, BO, 3 * SIZE 746 LD a4, AO, 3 * SIZE 747 blt $r0, L, .L22 748 .align 3 749 750.L25: 751#ifndef TRMMKERNEL 752 andi L, K, 3 753#else 754 andi L, TEMP, 3 755#endif 756 bge $r0, L, .L28 757 .align 3 758.L26: 759 MADD c11, b1, a1, c11 760 LD b1, BO, 8 * SIZE 761 MADD c21, b2, a1, c21 762 LD b2, BO, 5 * SIZE 763 MADD c31, b3, a1, c31 764 LD b3, BO, 6 * SIZE 765 MADD c41, b4, a1, c41 766 LD b4, BO, 7 * SIZE 767 addi.d L, L, -1 768 MOV a2, a2 769 addi.d AO, AO, 1 * SIZE 770 addi.d BO, BO, 8 * SIZE 771 MADD c51, b5, a1, c51 772 LD b5, BO, 4 * SIZE 773 MADD c61, b2, a1, c61 774 LD b2, BO, 1 * SIZE 775 MADD c71, b3, a1, c71 776 LD b3, BO, 2 * SIZE 777 MADD c81, b4, a1, c81 778 LD a1, AO, 0 * SIZE 779 LD b4, BO, 3 * SIZE 780 blt $r0, L, .L26 781.L28: 782#ifndef TRMMKERNEL 783 LD $f22, CO1, 0 * SIZE 784 LD $f8, CO2, 0 * SIZE 785 LD $f23, CO3, 0 * SIZE 786 LD $f9, CO4, 0 * SIZE 787 MADD c11, c11, ALPHA, $f22 788 LD $f10, CO5, 0 * SIZE 789 MADD c21, c21, ALPHA, $f8 790 LD $f11, CO6, 0 * SIZE 791 MADD c31, c31, ALPHA, $f23 792 LD $f12, CO7, 0 * SIZE 793 MADD c41, c41, 
ALPHA, $f9 794 LD $f13, CO8, 0 * SIZE 795 MADD c51, c51, ALPHA, $f10 796 ST c11, CO1, 0 * SIZE 797 MADD c61, c61, ALPHA, $f11 798 ST c21, CO2, 0 * SIZE 799 MADD c71, c71, ALPHA, $f12 800 ST c31, CO3, 0 * SIZE 801 MADD c81, c81, ALPHA, $f13 802 ST c41, CO4, 0 * SIZE 803 ST c51, CO5, 0 * SIZE 804 ST c61, CO6, 0 * SIZE 805 ST c71, CO7, 0 * SIZE 806 ST c81, CO8, 0 * SIZE 807#else 808 MUL c11, ALPHA, c11 809 MUL c21, ALPHA, c21 810 MUL c31, ALPHA, c31 811 MUL c41, ALPHA, c41 812 ST c11, CO1, 0 * SIZE 813 MUL c51, ALPHA, c51 814 ST c21, CO2, 0 * SIZE 815 MUL c61, ALPHA, c61 816 ST c31, CO3, 0 * SIZE 817 MUL c71, ALPHA, c71 818 ST c41, CO4, 0 * SIZE 819 MUL c81, ALPHA, c81 820 ST c51, CO5, 0 * SIZE 821 ST c61, CO6, 0 * SIZE 822 ST c71, CO7, 0 * SIZE 823 ST c81, CO8, 0 * SIZE 824#if ( defined(LEFT) && defined(TRANSA)) || \ 825 (!defined(LEFT) && !defined(TRANSA)) 826 sub.d TEMP, K, KK 827#ifdef LEFT 828 addi.d TEMP, TEMP, -1 829#else 830 addi.d TEMP, TEMP, -8 831#endif 832 slli.d L, TEMP, 0 + BASE_SHIFT 833 slli.d TEMP, TEMP, 3 + BASE_SHIFT 834 add.d AO, AO, L 835 add.d BO, BO, TEMP 836#endif 837#ifdef LEFT 838 addi.d KK, KK, 1 839#endif 840#endif 841 .align 3 842 843.L29: 844#if defined(TRMMKERNEL) && !defined(LEFT) 845 addi.d KK, KK, 8 846#endif 847move B, BO 848 blt $r0, J, .L10 849 .align 3 850 851.L30: 852 andi J, N, 4 853move AO, A 854 bge $r0, J, .L50 855 move CO1, C 856 MTC c11, $r0 857 add.d CO2, C, LDC 858 add.d CO3, CO2, LDC 859 add.d CO4, CO3, LDC 860 MOV c21, c11 861 add.d C, CO4, LDC 862 MOV c31, c11 863#if defined(TRMMKERNEL) && defined(LEFT) 864 move KK, OFFSET 865#endif 866 srai.d I, M, 1 867MOV c41, c11 868 bge $r0, I, .L40 869.L31: 870#if defined(TRMMKERNEL) 871#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 872 move BO, B 873#else 874 slli.d L, KK, 1 + BASE_SHIFT 875 slli.d TEMP, KK, 2 + BASE_SHIFT 876 add.d AO, AO, L 877 add.d BO, B, TEMP 878#endif 879 LD a1, AO, 0 * SIZE 880 LD a3, AO, 4 * SIZE 881 LD b1, BO, 0 * SIZE 
882 MOV c12, c11 883 LD b2, BO, 1 * SIZE 884 MOV c22, c11 885 LD b3, BO, 2 * SIZE 886 MOV c32, c11 887 LD b4, BO, 3 * SIZE 888 MOV c42, c11 889 LD b5, BO, 4 * SIZE 890 LD b6, BO, 8 * SIZE 891 LD b7, BO, 12 * SIZE 892#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 893 sub.d TEMP, K, KK 894#elif defined(LEFT) 895 addi.d TEMP, KK, 2 896#else 897 addi.d TEMP, KK, 4 898#endif 899 srai.d L, TEMP, 2 900 bge $r0, L, .L35 901#else 902 LD a1, AO, 0 * SIZE 903 LD a3, AO, 4 * SIZE 904 LD b1, B, 0 * SIZE 905 MOV c12, c11 906 LD b2, B, 1 * SIZE 907 MOV c22, c11 908 LD b3, B, 2 * SIZE 909 MOV c32, c11 910 LD b4, B, 3 * SIZE 911 MOV c42, c11 912 LD b5, B, 4 * SIZE 913 srai.d L, K, 2 914 LD b6, B, 8 * SIZE 915 LD b7, B, 12 * SIZE 916move BO, B 917 bge $r0, L, .L35 918#endif 919 .align 3 920.L32: 921 MADD c11, b1, a1, c11 922 LD a2, AO, 1 * SIZE 923 MADD c21, b2, a1, c21 924 addi.d L, L, -1 925 MADD c31, b3, a1, c31 926 MADD c41, b4, a1, c41 927 LD a1, AO, 2 * SIZE 928 MADD c12, b1, a2, c12 929 LD b1, BO, 16 * SIZE 930 MADD c22, b2, a2, c22 931 LD b2, BO, 5 * SIZE 932 MADD c32, b3, a2, c32 933 LD b3, BO, 6 * SIZE 934 MADD c42, b4, a2, c42 935 LD b4, BO, 7 * SIZE 936 MADD c11, b5, a1, c11 937 LD a2, AO, 3 * SIZE 938 MADD c21, b2, a1, c21 939 MADD c31, b3, a1, c31 940 MADD c41, b4, a1, c41 941 LD a1, AO, 8 * SIZE 942 MADD c12, b5, a2, c12 943 LD b5, BO, 20 * SIZE 944 MADD c22, b2, a2, c22 945 LD b2, BO, 9 * SIZE 946 MADD c32, b3, a2, c32 947 LD b3, BO, 10 * SIZE 948 MADD c42, b4, a2, c42 949 LD b4, BO, 11 * SIZE 950 MADD c11, b6, a3, c11 951 LD a2, AO, 5 * SIZE 952 MADD c21, b2, a3, c21 953 MADD c31, b3, a3, c31 954 MADD c41, b4, a3, c41 955 LD a3, AO, 6 * SIZE 956 MADD c12, b6, a2, c12 957 LD b6, BO, 24 * SIZE 958 MADD c22, b2, a2, c22 959 LD b2, BO, 13 * SIZE 960 MADD c32, b3, a2, c32 961 LD b3, BO, 14 * SIZE 962 MADD c42, b4, a2, c42 963 LD b4, BO, 15 * SIZE 964 MADD c11, b7, a3, c11 965 LD a2, AO, 7 * SIZE 966 MADD c21, b2, a3, c21 967 addi.d AO, AO, 
8 * SIZE 968 MADD c31, b3, a3, c31 969 addi.d BO, BO, 16 * SIZE 970 MADD c41, b4, a3, c41 971 LD a3, AO, 4 * SIZE 972 MADD c12, b7, a2, c12 973 LD b7, BO, 12 * SIZE 974 MADD c22, b2, a2, c22 975 LD b2, BO, 1 * SIZE 976 MADD c32, b3, a2, c32 977 LD b3, BO, 2 * SIZE 978 MADD c42, b4, a2, c42 979 LD b4, BO, 3 * SIZE 980 blt $r0, L, .L32 981 .align 3 982 983.L35: 984#ifndef TRMMKERNEL 985 andi L, K, 3 986#else 987 andi L, TEMP, 3 988#endif 989 bge $r0, L, .L38 990 .align 3 991.L36: 992 MADD c11, b1, a1, c11 993 LD a2, AO, 1 * SIZE 994 MADD c21, b2, a1, c21 995 addi.d L, L, -1 996 MADD c31, b3, a1, c31 997 addi.d AO, AO, 2 * SIZE 998 MADD c41, b4, a1, c41 999 LD a1, AO, 0 * SIZE 1000 MADD c12, b1, a2, c12 1001 LD b1, BO, 4 * SIZE 1002 MADD c22, b2, a2, c22 1003 LD b2, BO, 5 * SIZE 1004 MADD c32, b3, a2, c32 1005 LD b3, BO, 6 * SIZE 1006 MADD c42, b4, a2, c42 1007 LD b4, BO, 7 * SIZE 1008addi.d BO, BO, 4 * SIZE 1009 blt $r0, L, .L36 1010.L38: 1011#ifndef TRMMKERNEL 1012 LD $f22, CO1, 0 * SIZE 1013 addi.d CO3,CO3, 2 * SIZE 1014 LD $f8, CO1, 1 * SIZE 1015 addi.d CO1,CO1, 2 * SIZE 1016 LD $f23, CO2, 0 * SIZE 1017 addi.d CO4,CO4, 2 * SIZE 1018 LD $f9, CO2, 1 * SIZE 1019 addi.d CO2,CO2, 2 * SIZE 1020 LD $f10, CO3, -2 * SIZE 1021 MADD c11, c11, ALPHA, $f22 1022 LD $f11, CO3, -1 * SIZE 1023 MADD c12, c12, ALPHA, $f8 1024 LD $f12, CO4, -2 * SIZE 1025 MADD c21, c21, ALPHA, $f23 1026 LD $f13, CO4, -1 * SIZE 1027 MADD c22, c22, ALPHA, $f9 1028 MADD c31, c31, ALPHA, $f10 1029 ST c11, CO1, -2 * SIZE 1030 MADD c32, c32, ALPHA, $f11 1031 ST c12, CO1, -1 * SIZE 1032 MADD c41, c41, ALPHA, $f12 1033 ST c21, CO2, -2 * SIZE 1034 MADD c42, c42, ALPHA, $f13 1035 ST c22, CO2, -1 * SIZE 1036 ST c31, CO3, -2 * SIZE 1037 MTC c11, $r0 1038 ST c32, CO3, -1 * SIZE 1039 addi.d I, I, -1 1040 ST c41, CO4, -2 * SIZE 1041 MOV c21, c11 1042 ST c42, CO4, -1 * SIZE 1043 MOV c31, c11 1044#else 1045 MUL c11, ALPHA, c11 1046 addi.d CO3,CO3, 2 * SIZE 1047 MUL c12, ALPHA, c12 1048 addi.d CO1,CO1, 2 * SIZE 1049 
MUL c21, ALPHA, c21 1050 addi.d CO4,CO4, 2 * SIZE 1051 MUL c22, ALPHA, c22 1052 addi.d CO2,CO2, 2 * SIZE 1053 ST c11, CO1, -2 * SIZE 1054 MUL c31, ALPHA, c31 1055 ST c12, CO1, -1 * SIZE 1056 MUL c32, ALPHA, c32 1057 ST c21, CO2, -2 * SIZE 1058 MUL c41, ALPHA, c41 1059 ST c22, CO2, -1 * SIZE 1060 MUL c42, ALPHA, c42 1061 ST c31, CO3, -2 * SIZE 1062 MTC c11, $r0 1063 ST c32, CO3, -1 * SIZE 1064 addi.d I, I, -1 1065 ST c41, CO4, -2 * SIZE 1066 MOV c21, c11 1067 ST c42, CO4, -1 * SIZE 1068 MOV c31, c11 1069#if ( defined(LEFT) && defined(TRANSA)) || \ 1070 (!defined(LEFT) && !defined(TRANSA)) 1071 sub.d TEMP, K, KK 1072#ifdef LEFT 1073 addi.d TEMP, TEMP, -2 1074#else 1075 addi.d TEMP, TEMP, -4 1076#endif 1077 slli.d L, TEMP, 1 + BASE_SHIFT 1078 slli.d TEMP, TEMP, 2 + BASE_SHIFT 1079 add.d AO, AO, L 1080 add.d BO, BO, TEMP 1081#endif 1082#ifdef LEFT 1083 addi.d KK, KK, 2 1084#endif 1085#endif 1086MOV c41, c11 1087 blt $r0, I, .L31 1088 .align 3 1089 1090.L40: 1091 andi I, M, 1 1092MOV c61, c11 1093 bge $r0, I, .L49 1094#if defined(TRMMKERNEL) 1095#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1096 move BO, B 1097#else 1098 slli.d L, KK, 0 + BASE_SHIFT 1099 slli.d TEMP, KK, 2 + BASE_SHIFT 1100 add.d AO, AO, L 1101 add.d BO, B, TEMP 1102#endif 1103 LD a1, AO, 0 * SIZE 1104 MOV c71, c11 1105 LD a2, AO, 1 * SIZE 1106 MOV c81, c11 1107 LD b1, BO, 0 * SIZE 1108 LD b2, BO, 1 * SIZE 1109 LD b3, BO, 2 * SIZE 1110 LD b4, BO, 3 * SIZE 1111 LD b5, BO, 4 * SIZE 1112 LD b6, BO, 8 * SIZE 1113 LD b7, BO, 12 * SIZE 1114#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1115 sub.d TEMP, K, KK 1116#elif defined(LEFT) 1117 addi.d TEMP, KK, 1 1118#else 1119 addi.d TEMP, KK, 4 1120#endif 1121 srai.d L, TEMP, 2 1122 bge $r0, L, .L45 1123#else 1124 LD a1, AO, 0 * SIZE 1125 MOV c71, c11 1126 LD a2, AO, 1 * SIZE 1127 MOV c81, c11 1128 LD b1, B, 0 * SIZE 1129 LD b2, B, 1 * SIZE 1130 LD b3, B, 2 * SIZE 1131 LD b4, B, 3 * SIZE 1132 LD b5, 
B, 4 * SIZE 1133 LD b6, B, 8 * SIZE 1134 LD b7, B, 12 * SIZE 1135 srai.d L, K, 2 1136move BO, B 1137 bge $r0, L, .L45 1138#endif 1139 .align 3 1140.L42: 1141 MADD c11, b1, a1, c11 1142 LD b1, BO, 16 * SIZE 1143 MADD c21, b2, a1, c21 1144 LD b2, BO, 5 * SIZE 1145 MADD c31, b3, a1, c31 1146 LD b3, BO, 6 * SIZE 1147 MADD c41, b4, a1, c41 1148 LD b4, BO, 7 * SIZE 1149 LD a1, AO, 4 * SIZE 1150 addi.d L, L, -1 1151 MADD c11, b5, a2, c11 1152 LD b5, BO, 20 * SIZE 1153 MADD c21, b2, a2, c21 1154 LD b2, BO, 9 * SIZE 1155 MADD c31, b3, a2, c31 1156 LD b3, BO, 10 * SIZE 1157 MADD c41, b4, a2, c41 1158 LD b4, BO, 11 * SIZE 1159 LD a2, AO, 2 * SIZE 1160 addi.d AO, AO, 4 * SIZE 1161 MADD c11, b6, a2, c11 1162 LD b6, BO, 24 * SIZE 1163 MADD c21, b2, a2, c21 1164 LD b2, BO, 13 * SIZE 1165 MADD c31, b3, a2, c31 1166 LD b3, BO, 14 * SIZE 1167 MADD c41, b4, a2, c41 1168 LD b4, BO, 15 * SIZE 1169 LD a2, AO, -1 * SIZE 1170 addi.d BO, BO, 16 * SIZE 1171 MADD c11, b7, a2, c11 1172 LD b7, BO, 12 * SIZE 1173 MADD c21, b2, a2, c21 1174 LD b2, BO, 1 * SIZE 1175 MADD c31, b3, a2, c31 1176 LD b3, BO, 2 * SIZE 1177 MADD c41, b4, a2, c41 1178 LD b4, BO, 3 * SIZE 1179 LD a2, AO, 1 * SIZE 1180 blt $r0, L, .L42 1181 .align 3 1182 1183.L45: 1184#ifndef TRMMKERNEL 1185 andi L, K, 3 1186#else 1187 andi L, TEMP, 3 1188#endif 1189 bge $r0, L, .L48 1190 .align 3 1191.L46: 1192 MADD c11, b1, a1, c11 1193 LD b1, BO, 4 * SIZE 1194 MADD c21, b2, a1, c21 1195 LD b2, BO, 5 * SIZE 1196 MADD c31, b3, a1, c31 1197 LD b3, BO, 6 * SIZE 1198 MADD c41, b4, a1, c41 1199 LD a1, AO, 1 * SIZE 1200 LD b4, BO, 7 * SIZE 1201 addi.d L, L, -1 1202 addi.d AO, AO, 1 * SIZE 1203 MOV a2, a2 1204addi.d BO, BO, 4 * SIZE 1205 blt $r0, L, .L46 1206.L48: 1207#ifndef TRMMKERNEL 1208 LD $f22, CO1, 0 * SIZE 1209 LD $f8, CO2, 0 * SIZE 1210 LD $f23, CO3, 0 * SIZE 1211 LD $f9, CO4, 0 * SIZE 1212 MADD c11, c11, ALPHA, $f22 1213 MADD c21, c21, ALPHA, $f8 1214 MADD c31, c31, ALPHA, $f23 1215 MADD c41, c41, ALPHA, $f9 1216 ST c11, CO1, 0 * SIZE 
1217 ST c21, CO2, 0 * SIZE 1218 ST c31, CO3, 0 * SIZE 1219 ST c41, CO4, 0 * SIZE 1220#else 1221 MUL c11, ALPHA, c11 1222 MUL c21, ALPHA, c21 1223 MUL c31, ALPHA, c31 1224 MUL c41, ALPHA, c41 1225 ST c11, CO1, 0 * SIZE 1226 ST c21, CO2, 0 * SIZE 1227 ST c31, CO3, 0 * SIZE 1228 ST c41, CO4, 0 * SIZE 1229#if ( defined(LEFT) && defined(TRANSA)) || \ 1230 (!defined(LEFT) && !defined(TRANSA)) 1231 sub.d TEMP, K, KK 1232#ifdef LEFT 1233 addi.d TEMP, TEMP, -1 1234#else 1235 addi.d TEMP, TEMP, -4 1236#endif 1237 slli.d L, TEMP, 0 + BASE_SHIFT 1238 slli.d TEMP, TEMP, 2 + BASE_SHIFT 1239 add.d AO, AO, L 1240 add.d BO, BO, TEMP 1241#endif 1242#ifdef LEFT 1243 addi.d KK, KK, 1 1244#endif 1245#endif 1246 .align 3 1247 1248.L49: 1249#if defined(TRMMKERNEL) && !defined(LEFT) 1250 addi.d KK, KK, 4 1251#endif 1252 move B, BO 1253 .align 3 1254 1255.L50: 1256 andi J, N, 2 1257move AO, A 1258 bge $r0, J, .L70 1259 move CO1, C 1260 add.d CO2, C, LDC 1261#if defined(TRMMKERNEL) && defined(LEFT) 1262 move KK, OFFSET 1263#endif 1264 srai.d I, M, 1 1265add.d C, CO2, LDC 1266 bge $r0, I, .L60 1267.L51: 1268#if defined(TRMMKERNEL) 1269#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1270 move BO, B 1271#else 1272 slli.d L, KK, 1 + BASE_SHIFT 1273 slli.d TEMP, KK, 1 + BASE_SHIFT 1274 add.d AO, AO, L 1275 add.d BO, B, TEMP 1276#endif 1277 LD a1, AO, 0 * SIZE 1278 MTC c11, $r0 1279 LD a2, AO, 1 * SIZE 1280 MOV c21, c11 1281 LD a5, AO, 4 * SIZE 1282 LD b1, BO, 0 * SIZE 1283 MOV c12, c11 1284 LD b2, BO, 1 * SIZE 1285 MOV c22, c11 1286 LD b3, BO, 2 * SIZE 1287 LD b5, BO, 4 * SIZE 1288 LD b6, BO, 8 * SIZE 1289 LD b7, BO, 12 * SIZE 1290#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1291 sub.d TEMP, K, KK 1292#elif defined(LEFT) 1293 addi.d TEMP, KK, 2 1294#else 1295 addi.d TEMP, KK, 2 1296#endif 1297 srai.d L, TEMP, 2 1298 bge $r0, L, .L55 1299#else 1300 LD a1, AO, 0 * SIZE 1301 MTC c11, $r0 1302 LD a2, AO, 1 * SIZE 1303 MOV c21, c11 1304 
LD a5, AO, 4 * SIZE 1305 LD b1, B, 0 * SIZE 1306 MOV c12, c11 1307 LD b2, B, 1 * SIZE 1308 MOV c22, c11 1309 LD b3, B, 2 * SIZE 1310 LD b5, B, 4 * SIZE 1311 srai.d L, K, 2 1312 LD b6, B, 8 * SIZE 1313 LD b7, B, 12 * SIZE 1314move BO, B 1315 bge $r0, L, .L55 1316#endif 1317 .align 3 1318.L52: 1319 MADD c11, b1, a1, c11 1320 LD a3, AO, 2 * SIZE 1321 MADD c21, b2, a1, c21 1322 LD b4, BO, 3 * SIZE 1323 MADD c12, b1, a2, c12 1324 LD a4, AO, 3 * SIZE 1325 MADD c22, b2, a2, c22 1326 LD b1, BO, 8 * SIZE 1327 MADD c11, b3, a3, c11 1328 LD a1, AO, 8 * SIZE 1329 MADD c21, b4, a3, c21 1330 LD b2, BO, 5 * SIZE 1331 MADD c12, b3, a4, c12 1332 LD a2, AO, 5 * SIZE 1333 MADD c22, b4, a4, c22 1334 LD b3, BO, 6 * SIZE 1335 MADD c11, b5, a5, c11 1336 LD a3, AO, 6 * SIZE 1337 MADD c21, b2, a5, c21 1338 LD b4, BO, 7 * SIZE 1339 MADD c12, b5, a2, c12 1340 LD a4, AO, 7 * SIZE 1341 MADD c22, b2, a2, c22 1342 LD b5, BO, 12 * SIZE 1343 MADD c11, b3, a3, c11 1344 LD a5, AO, 12 * SIZE 1345 MADD c21, b4, a3, c21 1346 LD b2, BO, 9 * SIZE 1347 MADD c12, b3, a4, c12 1348 LD a2, AO, 9 * SIZE 1349 MADD c22, b4, a4, c22 1350 LD b3, BO, 10 * SIZE 1351 addi.d AO, AO, 8 * SIZE 1352 addi.d L, L, -1 1353addi.d BO, BO, 8 * SIZE 1354 blt $r0, L, .L52 1355 .align 3 1356 1357.L55: 1358#ifndef TRMMKERNEL 1359 andi L, K, 3 1360#else 1361 andi L, TEMP, 3 1362#endif 1363 bge $r0, L, .L58 1364 .align 3 1365.L56: 1366 MADD c11, b1, a1, c11 1367 LD a2, AO, 1 * SIZE 1368 MADD c21, b2, a1, c21 1369 LD a1, AO, 2 * SIZE 1370 MADD c12, b1, a2, c12 1371 LD b1, BO, 2 * SIZE 1372 MADD c22, b2, a2, c22 1373 LD b2, BO, 3 * SIZE 1374 addi.d L, L, -1 1375 addi.d AO, AO, 2 * SIZE 1376addi.d BO, BO, 2 * SIZE 1377 blt $r0, L, .L56 1378.L58: 1379#ifndef TRMMKERNEL 1380 LD $f22, CO1, 0 * SIZE 1381 addi.d I, I, -1 1382 LD $f8, CO1, 1 * SIZE 1383 addi.d CO1,CO1, 2 * SIZE 1384 LD $f23, CO2, 0 * SIZE 1385 LD $f9, CO2, 1 * SIZE 1386 addi.d CO2,CO2, 2 * SIZE 1387 MADD c11, c11, ALPHA, $f22 1388 MADD c12, c12, ALPHA, $f8 1389 MADD c21, 
c21, ALPHA, $f23 1390 MADD c22, c22, ALPHA, $f9 1391 ST c11, CO1, -2 * SIZE 1392 ST c12, CO1, -1 * SIZE 1393 ST c21, CO2, -2 * SIZE 1394 ST c22, CO2, -1 * SIZE 1395 blt $r0, I, .L51 1396#else 1397 addi.d I, I, -1 1398 addi.d CO1,CO1, 2 * SIZE 1399 addi.d CO2,CO2, 2 * SIZE 1400 MUL c11, ALPHA, c11 1401 MUL c12, ALPHA, c12 1402 MUL c21, ALPHA, c21 1403 MUL c22, ALPHA, c22 1404 ST c11, CO1, -2 * SIZE 1405 ST c12, CO1, -1 * SIZE 1406 ST c21, CO2, -2 * SIZE 1407 ST c22, CO2, -1 * SIZE 1408#if ( defined(LEFT) && defined(TRANSA)) || \ 1409 (!defined(LEFT) && !defined(TRANSA)) 1410 sub.d TEMP, K, KK 1411#ifdef LEFT 1412 addi.d TEMP, TEMP, -2 1413#else 1414 addi.d TEMP, TEMP, -2 1415#endif 1416 slli.d L, TEMP, 1 + BASE_SHIFT 1417 slli.d TEMP, TEMP, 1 + BASE_SHIFT 1418 add.d AO, AO, L 1419 add.d BO, BO, TEMP 1420#endif 1421#ifdef LEFT 1422 addi.d KK, KK, 2 1423#endif 1424 blt $r0, I, .L51 1425#endif 1426 .align 3 1427 1428.L60: 1429 andi I, M, 1 1430 bge $r0, I, .L69 1431#if defined(TRMMKERNEL) 1432#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1433 move BO, B 1434#else 1435 slli.d L, KK, 0 + BASE_SHIFT 1436 slli.d TEMP, KK, 1 + BASE_SHIFT 1437 add.d AO, AO, L 1438 add.d BO, B, TEMP 1439#endif 1440 LD a1, AO, 0 * SIZE 1441 MTC c11, $r0 1442 LD a2, AO, 1 * SIZE 1443 MOV c21, c11 1444 LD a3, AO, 2 * SIZE 1445 MOV c31, c11 1446 LD a4, AO, 3 * SIZE 1447 MOV c41, c11 1448 LD b1, BO, 0 * SIZE 1449 LD b2, BO, 1 * SIZE 1450 LD b3, BO, 2 * SIZE 1451 LD b4, BO, 3 * SIZE 1452 LD b5, BO, 4 * SIZE 1453 LD b6, BO, 8 * SIZE 1454 LD b7, BO, 12 * SIZE 1455#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1456 sub.d TEMP, K, KK 1457#elif defined(LEFT) 1458 addi.d TEMP, KK, 1 1459#else 1460 addi.d TEMP, KK, 2 1461#endif 1462 srai.d L, TEMP, 2 1463 bge $r0, L, .L65 1464#else 1465 srai.d L, K, 2 1466 LD a1, AO, 0 * SIZE 1467 MTC c11, $r0 1468 LD a2, AO, 1 * SIZE 1469 MOV c21, c11 1470 LD a3, AO, 2 * SIZE 1471 MOV c31, c11 1472 LD a4, 
AO, 3 * SIZE 1473 MOV c41, c11 1474 LD b1, B, 0 * SIZE 1475 LD b2, B, 1 * SIZE 1476 LD b3, B, 2 * SIZE 1477 LD b4, B, 3 * SIZE 1478 LD b5, B, 4 * SIZE 1479 LD b6, B, 8 * SIZE 1480 LD b7, B, 12 * SIZE 1481move BO, B 1482 bge $r0, L, .L65 1483#endif 1484 .align 3 1485.L62: 1486 MADD c11, b1, a1, c11 1487 LD b1, BO, 4 * SIZE 1488 MADD c21, b2, a1, c21 1489 LD b2, BO, 5 * SIZE 1490 MADD c31, b3, a2, c31 1491 LD b3, BO, 6 * SIZE 1492 MADD c41, b4, a2, c41 1493 LD b4, BO, 7 * SIZE 1494 LD a1, AO, 4 * SIZE 1495 LD a2, AO, 5 * SIZE 1496 MADD c11, b1, a3, c11 1497 LD b1, BO, 8 * SIZE 1498 MADD c21, b2, a3, c21 1499 LD b2, BO, 9 * SIZE 1500 MADD c31, b3, a4, c31 1501 LD b3, BO, 10 * SIZE 1502 MADD c41, b4, a4, c41 1503 LD b4, BO, 11 * SIZE 1504 LD a3, AO, 6 * SIZE 1505 LD a4, AO, 7 * SIZE 1506 addi.d L, L, -1 1507 addi.d AO, AO, 4 * SIZE 1508addi.d BO, BO, 8 * SIZE 1509 blt $r0, L, .L62 1510 .align 3 1511 1512.L65: 1513#ifndef TRMMKERNEL 1514 andi L, K, 3 1515#else 1516 andi L, TEMP, 3 1517#endif 1518 bge $r0, L, .L68 1519 .align 3 1520.L66: 1521 MADD c11, b1, a1, c11 1522 LD b1, BO, 2 * SIZE 1523 MADD c21, b2, a1, c21 1524 LD b2, BO, 3 * SIZE 1525 LD a1, AO, 1 * SIZE 1526 addi.d L, L, -1 1527 addi.d AO, AO, 1 * SIZE 1528addi.d BO, BO, 2 * SIZE 1529 blt $r0, L, .L66 1530.L68: 1531#ifndef TRMMKERNEL 1532 LD $f22, CO1, 0 * SIZE 1533 LD $f8, CO2, 0 * SIZE 1534 ADD c11, c11, c31 1535 ADD c21, c21, c41 1536 MADD c11, c11, ALPHA, $f22 1537 MADD c21, c21, ALPHA, $f8 1538 ST c11, CO1, 0 * SIZE 1539 ST c21, CO2, 0 * SIZE 1540#else 1541 ADD c11, c11, c31 1542 ADD c21, c21, c41 1543 MUL c11, ALPHA, c11 1544 MUL c21, ALPHA, c21 1545 ST c11, CO1, 0 * SIZE 1546 ST c21, CO2, 0 * SIZE 1547#if ( defined(LEFT) && defined(TRANSA)) || \ 1548 (!defined(LEFT) && !defined(TRANSA)) 1549 sub.d TEMP, K, KK 1550#ifdef LEFT 1551 addi.d TEMP, TEMP, -1 1552#else 1553 addi.d TEMP, TEMP, -2 1554#endif 1555 slli.d L, TEMP, 0 + BASE_SHIFT 1556 slli.d TEMP, TEMP, 1 + BASE_SHIFT 1557 add.d AO, AO, L 1558 add.d 
BO, BO, TEMP 1559#endif 1560#ifdef LEFT 1561 addi.d KK, KK, 1 1562#endif 1563#endif 1564 .align 3 1565 1566.L69: 1567#if defined(TRMMKERNEL) && !defined(LEFT) 1568 addi.d KK, KK, 2 1569#endif 1570 move B, BO 1571 .align 3 1572 1573.L70: 1574 andi J, N, 1 1575move AO, A 1576 bge $r0, J, .L999 1577 move CO1, C 1578#if defined(TRMMKERNEL) && defined(LEFT) 1579 move KK, OFFSET 1580#endif 1581 srai.d I, M, 1 1582add.d C, CO1, LDC 1583 bge $r0, I, .L80 1584.L71: 1585#if defined(TRMMKERNEL) 1586#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1587 move BO, B 1588#else 1589 slli.d L, KK, 1 + BASE_SHIFT 1590 slli.d TEMP, KK, 0 + BASE_SHIFT 1591 add.d AO, AO, L 1592 add.d BO, B, TEMP 1593#endif 1594 LD a1, AO, 0 * SIZE 1595 MTC c11, $r0 1596 LD a2, AO, 1 * SIZE 1597 MOV c21, c11 1598 LD a5, AO, 4 * SIZE 1599 LD b1, BO, 0 * SIZE 1600 MOV c12, c11 1601 LD b2, BO, 1 * SIZE 1602 MOV c22, c11 1603 LD b3, BO, 2 * SIZE 1604 LD b5, BO, 4 * SIZE 1605 LD b6, BO, 8 * SIZE 1606 LD b7, BO, 12 * SIZE 1607#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1608 sub.d TEMP, K, KK 1609#elif defined(LEFT) 1610 addi.d TEMP, KK, 2 1611#else 1612 addi.d TEMP, KK, 1 1613#endif 1614 srai.d L, TEMP, 2 1615 bge $r0, L, .L75 1616#else 1617 LD a1, AO, 0 * SIZE 1618 MTC c11, $r0 1619 LD a2, AO, 1 * SIZE 1620 MOV c21, c11 1621 LD a5, AO, 4 * SIZE 1622 LD b1, B, 0 * SIZE 1623 MOV c12, c11 1624 LD b2, B, 1 * SIZE 1625 MOV c22, c11 1626 LD b3, B, 2 * SIZE 1627 LD b5, B, 4 * SIZE 1628 srai.d L, K, 2 1629 LD b6, B, 8 * SIZE 1630 LD b7, B, 12 * SIZE 1631move BO, B 1632 bge $r0, L, .L75 1633#endif 1634 .align 3 1635.L72: 1636 LD a1, AO, 0 * SIZE 1637 LD a2, AO, 1 * SIZE 1638 LD b1, BO, 0 * SIZE 1639 MADD c11, b1, a1, c11 1640 MADD c12, b1, a2, c12 1641 LD a1, AO, 2 * SIZE 1642 LD a2, AO, 3 * SIZE 1643 LD b1, BO, 1 * SIZE 1644 MADD c11, b1, a1, c11 1645 MADD c12, b1, a2, c12 1646 LD a1, AO, 4 * SIZE 1647 LD a2, AO, 5 * SIZE 1648 LD b1, BO, 2 * SIZE 1649 
MADD c11, b1, a1, c11 1650 MADD c12, b1, a2, c12 1651 LD a1, AO, 6 * SIZE 1652 LD a2, AO, 7 * SIZE 1653 LD b1, BO, 3 * SIZE 1654 MADD c11, b1, a1, c11 1655 MADD c12, b1, a2, c12 1656 addi.d L, L, -1 1657 addi.d AO, AO, 8 * SIZE 1658addi.d BO, BO, 4 * SIZE 1659 blt $r0, L, .L72 1660 .align 3 1661 1662.L75: 1663#ifndef TRMMKERNEL 1664 andi L, K, 3 1665#else 1666 andi L, TEMP, 3 1667#endif 1668 bge $r0, L, .L78 1669 .align 3 1670.L76: 1671 LD a1, AO, 0 * SIZE 1672 LD a2, AO, 1 * SIZE 1673 LD b1, BO, 0 * SIZE 1674 MADD c11, b1, a1, c11 1675 MADD c12, b1, a2, c12 1676 addi.d L, L, -1 1677 addi.d AO, AO, 2 * SIZE 1678addi.d BO, BO, 1 * SIZE 1679 blt $r0, L, .L76 1680.L78: 1681#ifndef TRMMKERNEL 1682 LD $f22, CO1, 0 * SIZE 1683 addi.d I, I, -1 1684 LD $f8, CO1, 1 * SIZE 1685 addi.d CO1,CO1, 2 * SIZE 1686 ADD c11, c11, c21 1687 ADD c12, c12, c22 1688 MADD c11, c11, ALPHA, $f22 1689 MADD c12, c12, ALPHA, $f8 1690 ST c11, CO1, -2 * SIZE 1691 ST c12, CO1, -1 * SIZE 1692 blt $r0, I, .L71 1693#else 1694 ADD c11, c11, c21 1695 addi.d I, I, -1 1696 ADD c12, c12, c22 1697 addi.d CO1,CO1, 2 * SIZE 1698 MUL c11, ALPHA, c11 1699 MUL c12, ALPHA, c12 1700 ST c11, CO1, -2 * SIZE 1701 ST c12, CO1, -1 * SIZE 1702#if ( defined(LEFT) && defined(TRANSA)) || \ 1703 (!defined(LEFT) && !defined(TRANSA)) 1704 sub.d TEMP, K, KK 1705#ifdef LEFT 1706 addi.d TEMP, TEMP, -2 1707#else 1708 addi.d TEMP, TEMP, -1 1709#endif 1710 slli.d L, TEMP, 1 + BASE_SHIFT 1711 slli.d TEMP, TEMP, 0 + BASE_SHIFT 1712 add.d AO, AO, L 1713 add.d BO, BO, TEMP 1714#endif 1715#ifdef LEFT 1716 addi.d KK, KK, 2 1717#endif 1718 blt $r0, I, .L71 1719#endif 1720 .align 3 1721 1722.L80: 1723 andi I, M, 1 1724 bge $r0, I, .L89 1725#if defined(TRMMKERNEL) 1726#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 1727 move BO, B 1728#else 1729 slli.d L, KK, 0 + BASE_SHIFT 1730 slli.d TEMP, KK, 0 + BASE_SHIFT 1731 add.d AO, AO, L 1732 add.d BO, B, TEMP 1733#endif 1734 LD a1, AO, 0 * SIZE 1735 MTC c11, $r0 
1736 LD a2, AO, 1 * SIZE 1737 MOV c21, c11 1738 LD a3, AO, 2 * SIZE 1739 LD a4, AO, 3 * SIZE 1740 LD b1, BO, 0 * SIZE 1741 LD b2, BO, 1 * SIZE 1742 LD b3, BO, 2 * SIZE 1743 LD b4, BO, 3 * SIZE 1744 LD b5, BO, 4 * SIZE 1745 LD b6, BO, 8 * SIZE 1746 LD b7, BO, 12 * SIZE 1747#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1748 sub.d TEMP, K, KK 1749#elif defined(LEFT) 1750 addi.d TEMP, KK, 1 1751#else 1752 addi.d TEMP, KK, 1 1753#endif 1754 srai.d L, TEMP, 2 1755 bge $r0, L, .L85 1756#else 1757 LD a1, AO, 0 * SIZE 1758 MTC c11, $r0 1759 LD a2, AO, 1 * SIZE 1760 MOV c21, c11 1761 LD a3, AO, 2 * SIZE 1762 LD a4, AO, 3 * SIZE 1763 LD b1, B, 0 * SIZE 1764 LD b2, B, 1 * SIZE 1765 LD b3, B, 2 * SIZE 1766 LD b4, B, 3 * SIZE 1767 LD b5, B, 4 * SIZE 1768 LD b6, B, 8 * SIZE 1769 LD b7, B, 12 * SIZE 1770 srai.d L, K, 2 1771move BO, B 1772 bge $r0, L, .L85 1773#endif 1774 .align 3 1775.L82: 1776 LD a1, AO, 0 * SIZE 1777 LD b1, BO, 0 * SIZE 1778 MADD c11, b1, a1, c11 1779 LD a1, AO, 1 * SIZE 1780 LD b1, BO, 1 * SIZE 1781 MADD c21, b1, a1, c21 1782 LD a1, AO, 2 * SIZE 1783 LD b1, BO, 2 * SIZE 1784 MADD c11, b1, a1, c11 1785 LD a1, AO, 3 * SIZE 1786 LD b1, BO, 3 * SIZE 1787 MADD c21, b1, a1, c21 1788 addi.d L, L, -1 1789 addi.d AO, AO, 4 * SIZE 1790addi.d BO, BO, 4 * SIZE 1791 blt $r0, L, .L82 1792 .align 3 1793 1794.L85: 1795#ifndef TRMMKERNEL 1796 andi L, K, 3 1797#else 1798 andi L, TEMP, 3 1799#endif 1800 bge $r0, L, .L88 1801 .align 3 1802.L86: 1803 LD a1, AO, 0 * SIZE 1804 LD b1, BO, 0 * SIZE 1805 MADD c11, b1, a1, c11 1806 addi.d L, L, -1 1807 addi.d AO, AO, 1 * SIZE 1808addi.d BO, BO, 1 * SIZE 1809 blt $r0, L, .L86 1810.L88: 1811#ifndef TRMMKERNEL 1812 LD $f22, CO1, 0 * SIZE 1813 ADD c11, c11, c21 1814 MADD c11, c11, ALPHA, $f22 1815 ST c11, CO1, 0 * SIZE 1816#else 1817 ADD c11, c11, c21 1818 MUL c11, ALPHA, c11 1819 ST c11, CO1, 0 * SIZE 1820#endif 1821 .align 3 1822 1823.L89: 1824#if defined(TRMMKERNEL) && !defined(LEFT) 1825 addi.d KK, KK, 1 
1826#endif 1827 move B, BO 1828 .align 3 1829 1830.L999: 1831 LDARG $r23, $sp, 0 1832 LDARG $r24, $sp, 8 1833 LDARG $r25, $sp, 16 1834 LDARG $r26, $sp, 24 1835 LDARG $r27, $sp, 32 1836 LDARG $r28, $sp, 40 1837 LDARG $r29, $sp, 48 1838 LDARG $r30, $sp, 96 1839 fld.d $f24, $sp, 56 1840 fld.d $f25, $sp, 64 1841 fld.d $f26, $sp, 72 1842 fld.d $f27, $sp, 80 1843 fld.d $f28, $sp, 88 1844#if defined(TRMMKERNEL) 1845 LDARG $r20, $sp, 104 1846 LDARG $r16, $sp, 112 1847#endif 1848#ifndef __64BIT__ 1849 fld.d $f18, $sp, 120 1850 fld.d $f19, $sp, 128 1851 fld.d $f20, $sp, 136 1852 fld.d $f21, $sp, 144 1853#endif 1854 addi.d $sp, $sp, 160 1855 move $r4, $r17 1856 fmov.d $f0, $f22 1857 jirl $r0, $r1, 0x0 1858 1859 EPILOGUE 1860