1/*************************************************************************** 2Copyright (c) 2013-2019, The OpenBLAS Project 3All rights reserved. 4Redistribution and use in source and binary forms, with or without 5modification, are permitted provided that the following conditions are 6met: 71. Redistributions of source code must retain the above copyright 8notice, this list of conditions and the following disclaimer. 92. Redistributions in binary form must reproduce the above copyright 10notice, this list of conditions and the following disclaimer in 11the documentation and/or other materials provided with the 12distribution. 133. Neither the name of the OpenBLAS project nor the names of 14its contributors may be used to endorse or promote products 15derived from this software without specific prior written permission. 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/

/**************************************************************************************
* Abdelrauf(quickwritereader@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/

/*********************************************************************
* Macros for N=4, M=16 *
*********************************************************************/

/* Preload one iteration for the 4x16 tile AND clear the accumulators. */
.macro LOAD4x16_1
    LOAD4x16 1
.endm

/* Preload one iteration for the 4x16 tile, keeping the accumulators. */
.macro LOAD4x16_0
    LOAD4x16 0
.endm

/* LOAD4x16 Zero
   Preload the operands of one k-iteration for the 4x16 tile:
     vs24, vs26 = 4 doubles of the packed B column (2 per vector register);
     vs25, vs27 = doubleword-swapped copies (xxpermdi imm 2) of vs24/vs26, so
                  each scalar of B can be multiplied against both lanes of A;
     vs0..vs7   = 16 doubles of the packed A panel (M=16).
   If Zero==1 the 32 accumulators vs32..vs63 (4 rows x 16 columns, 2 doubles
   per register) are cleared with the xor-zeroing idiom.
   AO/BO are NOT advanced here; the kernel macros do that. */
.macro LOAD4x16 Zero

    lxv      vs24, 0(BO)
    lxv      vs26, 16(BO)
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs27, vs26, vs26, 2

    lxv      vs0, 0(AO)
    lxv      vs1, 16(AO)
    lxv      vs2, 32(AO)
    lxv      vs3, 48(AO)

    lxv      vs4, 64(AO)
    lxv      vs5, 80(AO)
    lxv      vs6, 96(AO)
    lxv      vs7, 112(AO)
.if \Zero==1
    xxlxor   vs32, vs32, vs32
    xxlxor   vs33, vs33, vs33
    xxlxor   vs34, vs34, vs34
    xxlxor   vs35, vs35, vs35
    xxlxor   vs36, vs36, vs36
    xxlxor   vs37, vs37, vs37
    xxlxor   vs38, vs38, vs38
    xxlxor   vs39, vs39, vs39
    xxlxor   vs40, vs40, vs40
    xxlxor   vs41, vs41, vs41
    xxlxor   vs42, vs42, vs42
    xxlxor   vs43, vs43, vs43
    xxlxor   vs44, vs44, vs44
    xxlxor   vs45, vs45, vs45
    xxlxor   vs46, vs46, vs46
    xxlxor   vs47, vs47, vs47
    xxlxor   vs48, vs48, vs48
    xxlxor   vs49, vs49, vs49
    xxlxor   vs50, vs50, vs50
    xxlxor   vs51, vs51, vs51
    xxlxor   vs52, vs52, vs52
    xxlxor   vs53, vs53, vs53
    xxlxor   vs54, vs54, vs54
    xxlxor   vs55, vs55, vs55
    xxlxor   vs56, vs56, vs56
    xxlxor   vs57, vs57, vs57
    xxlxor   vs58, vs58, vs58
    xxlxor   vs59, vs59, vs59
    xxlxor   vs60, vs60, vs60
    xxlxor   vs61, vs61, vs61
    xxlxor   vs62, vs62, vs62
    xxlxor   vs63, vs63, vs63
.endif
.endm

/* Element size in bytes (double precision). */
#define unit_size 8
/* DISPn(ind,disp): byte displacement of unrolled iteration `ind` in a packed
   panel that advances n doubles per iteration, plus a fixed byte offset. */
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)

/* Two unrolled k-iterations, accumulating, with further iterations to come
   (prefetch for the next pair stays enabled). */
.macro KERNEL4x16_L1_L2 Index,IsLast
    KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0
.endm

/* First pair of iterations: the first half overwrites (xvmuldp). */
.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast
    KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0
.endm

/* Middle pair of iterations with explicit packed-panel offsets. */
.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast
    KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0
.endm

/* Tail pair of iterations: Complete=1, i.e. no prefetch past the end. */
.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
    KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1
.endm

/* Same three variants, but with caller-chosen source pointers
   (used e.g. by TRMM paths that walk A/B through temporaries). */
.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
    KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
    KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
    KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1
.endm

/* KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast, Complete
   Two software-pipelined k-iterations of the 4x16 rank-1 update:
     1) consume vs0..vs7 (A) and vs24..vs27 (B) while loading the next A
        panel into vs8..vs15 and the next B column into vs28..vs31;
     2) consume vs8..vs15 / vs28..vs31 while prefetching the pair after
        that back into vs0..vs7 / vs24..vs27 (skipped when Complete==1).
   First==1    : the first half overwrites the accumulators (xvmuldp)
                 instead of accumulating (xvmaddadp).
   Complete==1 : tail call; no prefetch beyond the data consumed.
   IsLast==1   : advance AREG/BREG past everything consumed (one iteration's
                 worth when Complete==1, two otherwise). */
.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete

.if \First ==1
    xvmuldp   vs32, vs0, vs24
    xvmuldp   vs33, vs1, vs24
    xvmuldp   vs34, vs2, vs24
    xvmuldp   vs35, vs3, vs24
.else
    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24
    xvmaddadp vs34, vs2, vs24
    xvmaddadp vs35, vs3, vs24
.endif
    /* next A panel, first half */
    lxv       vs8, DISP32(\Index,0+\OffsetA)(\AREG)
    lxv       vs9, DISP32(\Index,16+\OffsetA)(\AREG)
    lxv       vs10, DISP32(\Index,32+\OffsetA)(\AREG)
    lxv       vs11, DISP32(\Index,48+\OffsetA)(\AREG)
.if \First ==1
    xvmuldp   vs36, vs4, vs24
    xvmuldp   vs37, vs5, vs24
    xvmuldp   vs38, vs6, vs24
    xvmuldp   vs39, vs7, vs24
.else
    xvmaddadp vs36, vs4, vs24
    xvmaddadp vs37, vs5, vs24
    xvmaddadp vs38, vs6, vs24
    xvmaddadp vs39, vs7, vs24
.endif
    /* next B column plus its swapped copies */
    lxv       vs28, DISP8(\Index,0 +\OffsetB)(\BREG)
    lxv       vs30, DISP8(\Index,16 +\OffsetB)(\BREG)
    xxpermdi  vs29, vs28, vs28, 2
    xxpermdi  vs31, vs30, vs30, 2
.if \First ==1
    xvmuldp   vs40, vs0, vs25
    xvmuldp   vs41, vs1, vs25
    xvmuldp   vs42, vs2, vs25
    xvmuldp   vs43, vs3, vs25

    xvmuldp   vs44, vs4, vs25
    xvmuldp   vs45, vs5, vs25
    xvmuldp   vs46, vs6, vs25
    xvmuldp   vs47, vs7, vs25

    xvmuldp   vs48, vs0, vs26
    xvmuldp   vs49, vs1, vs26
    xvmuldp   vs50, vs2, vs26
    xvmuldp   vs51, vs3, vs26
.else
    xvmaddadp vs40, vs0, vs25
    xvmaddadp vs41, vs1, vs25
    xvmaddadp vs42, vs2, vs25
    xvmaddadp vs43, vs3, vs25

    xvmaddadp vs44, vs4, vs25
    xvmaddadp vs45, vs5, vs25
    xvmaddadp vs46, vs6, vs25
    xvmaddadp vs47, vs7, vs25

    xvmaddadp vs48, vs0, vs26
    xvmaddadp vs49, vs1, vs26
    xvmaddadp vs50, vs2, vs26
    xvmaddadp vs51, vs3, vs26
.endif
    /* next A panel, second half (interleaved with FMAs to hide latency) */
    lxv       vs12, DISP32(\Index,64+\OffsetA)(\AREG)
    lxv       vs13, DISP32(\Index,80+\OffsetA)(\AREG)
.if \First ==1
    xvmuldp   vs52, vs4, vs26
    xvmuldp   vs53, vs5, vs26
    xvmuldp   vs54, vs6, vs26
    xvmuldp   vs55, vs7, vs26
.else
    xvmaddadp vs52, vs4, vs26
    xvmaddadp vs53, vs5, vs26
    xvmaddadp vs54, vs6, vs26
    xvmaddadp vs55, vs7, vs26
.endif
    lxv       vs14, DISP32(\Index,96+\OffsetA)(\AREG)
    lxv       vs15, DISP32(\Index,112+\OffsetA)(\AREG)
.if \First ==1
    xvmuldp   vs56, vs0, vs27
    xvmuldp   vs57, vs1, vs27
    xvmuldp   vs58, vs2, vs27
    xvmuldp   vs59, vs3, vs27

    xvmuldp   vs60, vs4, vs27
    xvmuldp   vs61, vs5, vs27
    xvmuldp   vs62, vs6, vs27
    xvmuldp   vs63, vs7, vs27
.else
    xvmaddadp vs56, vs0, vs27
    xvmaddadp vs57, vs1, vs27
    xvmaddadp vs58, vs2, vs27
    xvmaddadp vs59, vs3, vs27

    xvmaddadp vs60, vs4, vs27
    xvmaddadp vs61, vs5, vs27
    xvmaddadp vs62, vs6, vs27
    xvmaddadp vs63, vs7, vs27
.endif

    /* ---- second iteration: consume vs8..vs15 / vs28..vs31 ---- */
    xvmaddadp vs32, vs8, vs28
    xvmaddadp vs33, vs9, vs28
    xvmaddadp vs34, vs10, vs28
    xvmaddadp vs35, vs11, vs28
.if \Complete==0
    lxv       vs0, DISP32(\Index,128+\OffsetA)(\AREG)
    lxv       vs1, DISP32(\Index,144+\OffsetA)(\AREG)
.endif
    xvmaddadp vs36, vs12, vs28
    xvmaddadp vs37, vs13, vs28
    xvmaddadp vs38, vs14, vs28
    xvmaddadp vs39, vs15, vs28
.if \Complete==0
    lxv       vs24, DISP8(\Index,32 +\OffsetB)(\BREG)
    lxv       vs26, DISP8(\Index,48 +\OffsetB)(\BREG)
    xxpermdi  vs25, vs24, vs24, 2
    xxpermdi  vs27, vs26, vs26, 2
.endif
    xvmaddadp vs40, vs8, vs29
    xvmaddadp vs41, vs9, vs29
    xvmaddadp vs42, vs10, vs29
    xvmaddadp vs43, vs11, vs29
.if \Complete==0
    lxv       vs2, DISP32(\Index,160+\OffsetA)(\AREG)
    lxv       vs3, DISP32(\Index,176+\OffsetA)(\AREG)
.endif
    xvmaddadp vs44, vs12, vs29
    xvmaddadp vs45, vs13, vs29
    xvmaddadp vs46, vs14, vs29
    xvmaddadp vs47, vs15, vs29

    xvmaddadp vs48, vs8, vs30
    xvmaddadp vs49, vs9, vs30
    xvmaddadp vs50, vs10, vs30
    xvmaddadp vs51, vs11, vs30
.if \Complete==0
    lxv       vs4, DISP32(\Index,192+\OffsetA)(\AREG)
    lxv       vs5, DISP32(\Index,208+\OffsetA)(\AREG)
.endif
    xvmaddadp vs52, vs12, vs30
    xvmaddadp vs53, vs13, vs30
    xvmaddadp vs54, vs14, vs30
    xvmaddadp vs55, vs15, vs30
.if \Complete==0
    lxv       vs6, DISP32(\Index,224+\OffsetA)(\AREG)
    lxv       vs7, DISP32(\Index,240+\OffsetA)(\AREG)
.endif
    xvmaddadp vs56, vs8, vs31
    xvmaddadp vs57, vs9, vs31
    xvmaddadp vs58, vs10, vs31
    xvmaddadp vs59, vs11, vs31

    xvmaddadp vs60, vs12, vs31

    xvmaddadp vs61, vs13, vs31
    xvmaddadp vs62, vs14, vs31

    xvmaddadp vs63, vs15, vs31
    /* advance the panel pointers past what this call consumed */
.if \IsLast==1
.if \Complete==1
    addi      \AREG, \AREG, DISP32(\Index,128+\OffsetA)
    addi      \BREG, \BREG, DISP8(\Index,32+\OffsetB)
.else
    addi      \AREG, \AREG, DISP32(\Index,256)
    addi      \BREG, \BREG, DISP8(\Index,64)
.endif
.endif

.endm

/* KERNEL4x16 First
   One self-contained k-iteration of the 4x16 update: loads A (16 doubles)
   and B (4 doubles + swapped copies), advances AO/BO, then either
   overwrites (First==1) or accumulates into vs32..vs63. */
.macro KERNEL4x16 First

    lxv      vs24, 0(BO)
    lxv      vs26, 16(BO)
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs27, vs26, vs26, 2

    lxv      vs0, 0(AO)
    lxv      vs1, 16(AO)
    lxv      vs2, 32(AO)
    lxv      vs3, 48(AO)

    lxv      vs4, 64(AO)
    lxv      vs5, 80(AO)
    lxv      vs6, 96(AO)
    lxv      vs7, 112(AO)

    addi     BO, BO, 32
    addi     AO, AO, 128

.if \First==1
    xvmuldp   vs32, vs0, vs24
    xvmuldp   vs33, vs1, vs24
    xvmuldp   vs34, vs2, vs24
    xvmuldp   vs35, vs3, vs24
    xvmuldp   vs36, vs4, vs24
    xvmuldp   vs37, vs5, vs24
    xvmuldp   vs38, vs6, vs24
    xvmuldp   vs39, vs7, vs24

    xvmuldp   vs40, vs0, vs25
    xvmuldp   vs41, vs1, vs25
    xvmuldp   vs42, vs2, vs25
    xvmuldp   vs43, vs3, vs25
    xvmuldp   vs44, vs4, vs25
    xvmuldp   vs45, vs5, vs25
    xvmuldp   vs46, vs6, vs25
    xvmuldp   vs47, vs7, vs25

    xvmuldp   vs48, vs0, vs26
    xvmuldp   vs49, vs1, vs26
    xvmuldp   vs50, vs2, vs26
    xvmuldp   vs51, vs3, vs26
    xvmuldp   vs52, vs4, vs26
    xvmuldp   vs53, vs5, vs26
    xvmuldp   vs54, vs6, vs26
    xvmuldp   vs55, vs7, vs26

    xvmuldp   vs56, vs0, vs27
    xvmuldp   vs57, vs1, vs27
    xvmuldp   vs58, vs2, vs27
    xvmuldp   vs59, vs3, vs27
    xvmuldp   vs60, vs4, vs27
    xvmuldp   vs61, vs5, vs27
    xvmuldp   vs62, vs6, vs27
    xvmuldp   vs63, vs7, vs27
.else
    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24
    xvmaddadp vs34, vs2, vs24
    xvmaddadp vs35, vs3, vs24
    xvmaddadp vs36, vs4, vs24
    xvmaddadp vs37, vs5, vs24
    xvmaddadp vs38, vs6, vs24
    xvmaddadp vs39, vs7, vs24

    xvmaddadp vs40, vs0, vs25
    xvmaddadp vs41, vs1, vs25
    xvmaddadp vs42, vs2, vs25
    xvmaddadp vs43, vs3, vs25

    xvmaddadp vs44, vs4, vs25
    xvmaddadp vs45, vs5, vs25
    xvmaddadp vs46, vs6, vs25
    xvmaddadp vs47, vs7, vs25

    xvmaddadp vs48, vs0, vs26
    xvmaddadp vs49, vs1, vs26
    xvmaddadp vs50, vs2, vs26
    xvmaddadp vs51, vs3, vs26

    xvmaddadp vs52, vs4, vs26
    xvmaddadp vs53, vs5, vs26
    xvmaddadp vs54, vs6, vs26
    xvmaddadp vs55, vs7, vs26

    xvmaddadp vs56, vs0, vs27
    xvmaddadp vs57, vs1, vs27
    xvmaddadp vs58, vs2, vs27
    xvmaddadp vs59, vs3, vs27
    xvmaddadp vs60, vs4, vs27
    xvmaddadp vs61, vs5, vs27
    xvmaddadp vs62, vs6, vs27
    xvmaddadp vs63, vs7, vs27

.endif
.endm

/* Derive the pointers for the next three C columns: Cn = CO + (n-1)*LDC.
   Must run before SAVE4x16, which consumes C2..C4. */
.macro SAVE4x16_REGS
    add C2, CO, LDC
    add C3, C2, LDC
    add C4, C3, LDC
.endm

/* SAVE4x16
   Write back the 4x16 tile.  The accumulator pairs (e.g. vs32 with vs40)
   hold each result split across the plain and doubleword-swapped B copies,
   so xxpermdi first recombines them into contiguous vectors vs8..vs15.
   Without TRMMKERNEL: C is loaded and updated as C += alpha_r * result.
   With TRMMKERNEL:    alpha_r * result is stored directly (no C load).
   Loads, permutes, FMAs and stores are interleaved to hide latency.
   Clobbers vs0..vs31; advances CO by 128 bytes (16 doubles).
   C2..C4 must have been set by SAVE4x16_REGS. */
.macro SAVE4x16
#ifndef TRMMKERNEL
    lxv      vs0, 0(CO)
    lxv      vs2, 16(CO)
    lxv      vs4, 32(CO)
    lxv      vs6, 48(CO)
#endif
    xxpermdi vs8, vs40, vs32, 1
    xxpermdi vs9, vs32, vs40, 1
#ifndef TRMMKERNEL
    lxv      vs24, 64(CO)
    lxv      vs26, 80(CO)
    lxv      vs28, 96(CO)
    lxv      vs30, 112(CO)
#endif
    xxpermdi vs10, vs41, vs33, 1
    xxpermdi vs11, vs33, vs41, 1
#ifndef TRMMKERNEL
    lxv      vs1, 0(C2)
    lxv      vs3, 16(C2)
    lxv      vs5, 32(C2)
    lxv      vs7, 48(C2)
#endif
    xxpermdi vs12, vs42, vs34, 1
    xxpermdi vs13, vs34, vs42, 1
#ifndef TRMMKERNEL
    lxv      vs25, 64(C2)
    lxv      vs27, 80(C2)
#endif
    xxpermdi vs14, vs43, vs35, 1
    xxpermdi vs15, vs35, vs43, 1
#ifndef TRMMKERNEL
    lxv      vs29, 96(C2)
    lxv      vs31, 112(C2)
#endif

/* columns 1-2, first 8 elements: scale and (for GEMM) add to C */
#ifndef TRMMKERNEL
    xvmaddadp vs0, vs8, alpha_r
    xvmaddadp vs1, vs9, alpha_r
    xvmaddadp vs2, vs10, alpha_r
    xvmaddadp vs3, vs11, alpha_r
#else
    xvmuldp  vs0, vs8, alpha_r
    xvmuldp  vs1, vs9, alpha_r
    xvmuldp  vs2, vs10, alpha_r
    xvmuldp  vs3, vs11, alpha_r

#endif
    xxpermdi vs8, vs44, vs36, 1
    xxpermdi vs9, vs36, vs44, 1
    xxpermdi vs10, vs45, vs37, 1
    xxpermdi vs11, vs37, vs45, 1
#ifndef TRMMKERNEL
    xvmaddadp vs4, vs12, alpha_r
    xvmaddadp vs5, vs13, alpha_r
    xvmaddadp vs6, vs14, alpha_r
    xvmaddadp vs7, vs15, alpha_r
#else
    xvmuldp  vs4, vs12, alpha_r
    xvmuldp  vs5, vs13, alpha_r
    xvmuldp  vs6, vs14, alpha_r
    xvmuldp  vs7, vs15, alpha_r
#endif
    xxpermdi vs12, vs46, vs38, 1
    xxpermdi vs13, vs38, vs46, 1
    xxpermdi vs14, vs47, vs39, 1
    xxpermdi vs15, vs39, vs47, 1

/* columns 1-2, second 8 elements */
#ifndef TRMMKERNEL
    xvmaddadp vs24, vs8, alpha_r
    xvmaddadp vs25, vs9, alpha_r
    xvmaddadp vs26, vs10, alpha_r
    xvmaddadp vs27, vs11, alpha_r

    xvmaddadp vs28, vs12, alpha_r
    xvmaddadp vs29, vs13, alpha_r
    xvmaddadp vs30, vs14, alpha_r
    xvmaddadp vs31, vs15, alpha_r
#else
    xvmuldp  vs24, vs8, alpha_r
    xvmuldp  vs25, vs9, alpha_r
    xvmuldp  vs26, vs10, alpha_r
    xvmuldp  vs27, vs11, alpha_r

    xvmuldp  vs28, vs12, alpha_r
    xvmuldp  vs29, vs13, alpha_r
    xvmuldp  vs30, vs14, alpha_r
    xvmuldp  vs31, vs15, alpha_r

#endif
    stxv     vs0, 0(CO)
    stxv     vs2, 16(CO)
    stxv     vs4, 32(CO)
    stxv     vs6, 48(CO)

    stxv     vs24, 64(CO)
    stxv     vs26, 80(CO)
    stxv     vs28, 96(CO)
    stxv     vs30, 112(CO)

    stxv     vs1, 0(C2)
    stxv     vs3, 16(C2)
    stxv     vs5, 32(C2)
    stxv     vs7, 48(C2)

    stxv     vs25, 64(C2)
    stxv     vs27, 80(C2)
    stxv     vs29, 96(C2)
    stxv     vs31, 112(C2)
/* columns 3-4: same pattern using accumulators vs48..vs63 */
#ifndef TRMMKERNEL
    lxv      vs0, 0(C3)
    lxv      vs2, 16(C3)
    lxv      vs4, 32(C3)
    lxv      vs6, 48(C3)
#endif
    xxpermdi vs8, vs56, vs48, 1
    xxpermdi vs9, vs48, vs56, 1
#ifndef TRMMKERNEL
    lxv      vs24, 64(C3)
    lxv      vs26, 80(C3)
#endif
    xxpermdi vs10, vs57, vs49, 1
    xxpermdi vs11, vs49, vs57, 1
#ifndef TRMMKERNEL
    lxv      vs28, 96(C3)
    lxv      vs30, 112(C3)
#endif
    xxpermdi vs12, vs58, vs50, 1
    xxpermdi vs13, vs50, vs58, 1
#ifndef TRMMKERNEL
    lxv      vs1, 0(C4)
    lxv      vs3, 16(C4)
#endif
    xxpermdi vs14, vs59, vs51, 1
    xxpermdi vs15, vs51, vs59, 1
#ifndef TRMMKERNEL
    lxv      vs5, 32(C4)
    lxv      vs7, 48(C4)

    lxv      vs25, 64(C4)
    lxv      vs27, 80(C4)
    lxv      vs29, 96(C4)
    lxv      vs31, 112(C4)
#endif

#ifndef TRMMKERNEL
    xvmaddadp vs0, vs8, alpha_r
    xvmaddadp vs1, vs9, alpha_r
    xvmaddadp vs2, vs10, alpha_r
    xvmaddadp vs3, vs11, alpha_r
#else
    xvmuldp  vs0, vs8, alpha_r
    xvmuldp  vs1, vs9, alpha_r
    xvmuldp  vs2, vs10, alpha_r
    xvmuldp  vs3, vs11, alpha_r

#endif

    xxpermdi vs8, vs60, vs52, 1
    xxpermdi vs9, vs52, vs60, 1
    xxpermdi vs10, vs61, vs53, 1
    xxpermdi vs11, vs53, vs61, 1
#ifndef TRMMKERNEL
    xvmaddadp vs4, vs12, alpha_r
    xvmaddadp vs5, vs13, alpha_r
    xvmaddadp vs6, vs14, alpha_r
    xvmaddadp vs7, vs15, alpha_r
#else
    xvmuldp  vs4, vs12, alpha_r
    xvmuldp  vs5, vs13, alpha_r
    xvmuldp  vs6, vs14, alpha_r
    xvmuldp  vs7, vs15, alpha_r
#endif

    xxpermdi vs12, vs62, vs54, 1
    xxpermdi vs13, vs54, vs62, 1
    xxpermdi vs14, vs63, vs55, 1
    xxpermdi vs15, vs55, vs63, 1
#ifndef TRMMKERNEL
    xvmaddadp vs24, vs8, alpha_r
    xvmaddadp vs25, vs9, alpha_r
    xvmaddadp vs26, vs10, alpha_r
    xvmaddadp vs27, vs11, alpha_r

    xvmaddadp vs28, vs12, alpha_r
    xvmaddadp vs29, vs13, alpha_r
    xvmaddadp vs30, vs14, alpha_r
    xvmaddadp vs31, vs15, alpha_r
#else
    xvmuldp  vs24, vs8, alpha_r
    xvmuldp  vs25, vs9, alpha_r
    xvmuldp  vs26, vs10, alpha_r
    xvmuldp  vs27, vs11, alpha_r

    xvmuldp  vs28, vs12, alpha_r
    xvmuldp  vs29, vs13, alpha_r
    xvmuldp  vs30, vs14, alpha_r
    xvmuldp  vs31, vs15, alpha_r
#endif
    stxv     vs0, 0(C3)
    stxv     vs2, 16(C3)
    stxv     vs4, 32(C3)
    stxv     vs6, 48(C3)

    stxv     vs24, 64(C3)
    stxv     vs26, 80(C3)
    stxv     vs28, 96(C3)
    stxv     vs30, 112(C3)

    stxv     vs1, 0(C4)
    stxv     vs3, 16(C4)
    stxv     vs5, 32(C4)
    stxv     vs7, 48(C4)

    stxv     vs25, 64(C4)
    stxv     vs27, 80(C4)
    stxv     vs29, 96(C4)
    stxv     vs31, 112(C4)

    addi     CO, CO, 128
.endm

/*********************************************************************
* Macros for N=4, M=8 *
*********************************************************************/

/* Preload one 4x8 iteration AND clear the accumulators. */
.macro LOAD4x8_1
    LOAD4x8 1
.endm

/* Preload one 4x8 iteration, keeping the accumulators. */
.macro LOAD4x8_0
    LOAD4x8 0
.endm

/* LOAD4x8 Zero
   Preload one k-iteration for the 4x8 tile: B into vs24..vs27 (with the
   swapped copies, as in LOAD4x16) and 8 doubles of A into vs0..vs3.
   If Zero==1, clear the 16 accumulators used by this tile
   (vs32..vs35, vs40..vs43, vs48..vs51, vs56..vs59). */
.macro LOAD4x8 Zero

    lxv      vs24, 0(BO)
    lxv      vs26, 16(BO)
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs27, vs26, vs26, 2

    lxv      vs0, 0(AO)
    lxv      vs1, 16(AO)
    lxv      vs2, 32(AO)
    lxv      vs3, 48(AO)

.if \Zero==1
    xxlxor   vs32, vs32, vs32
    xxlxor   vs33, vs33, vs33
    xxlxor   vs34, vs34, vs34
    xxlxor   vs35, vs35, vs35

    xxlxor   vs40, vs40, vs40
    xxlxor   vs41, vs41, vs41
    xxlxor   vs42, vs42, vs42
    xxlxor   vs43, vs43, vs43

    xxlxor   vs48, vs48, vs48
    xxlxor   vs49, vs49, vs49
    xxlxor   vs50, vs50, vs50
    xxlxor   vs51, vs51, vs51

    xxlxor   vs56, vs56, vs56
    xxlxor   vs57, vs57, vs57
    xxlxor   vs58, vs58, vs58
    xxlxor   vs59, vs59, vs59

.endif
.endm

/* Two unrolled accumulating k-iterations of the 4x8 tile. */
.macro KERNEL4x8_L1_L2 Index,IsLast
    KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0
.endm

/* First pair (first half overwrites). */
.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast
    KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0
.endm

/* Middle pair with explicit offsets. */
.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast
    KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0
.endm

/* Tail pair (Complete=1: no prefetch past the end). */
.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
    KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1
.endm

/* KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast, Complete
   Two software-pipelined k-iterations of the 4x8 update (see the 4x16
   worker for the First/IsLast/Complete semantics).
   NOTE(review): vs28/vs30 are loaded only in the \First==0 path, yet the
   xxpermdi below and the vs8..vs11 FMAs consume them in both paths --
   presumably the \First==1 variant is only invoked where vs28/vs30 were
   already primed; verify against the callers. */
.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete

    lxv       vs8, DISP16(\Index,0+\OffsetA)(AO)
    lxv       vs9, DISP16(\Index,16+\OffsetA)(AO)
.if \First ==1
    xvmuldp   vs32, vs0, vs24
    xvmuldp   vs33, vs1, vs24
    xvmuldp   vs34, vs2, vs24
    xvmuldp   vs35, vs3, vs24
.else
    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24
    xvmaddadp vs34, vs2, vs24
    xvmaddadp vs35, vs3, vs24
.endif

    lxv       vs10, DISP16(\Index,32+\OffsetA)(AO)
    lxv       vs11, DISP16(\Index,48+\OffsetA)(AO)

.if \First ==1
    xvmuldp   vs40, vs0, vs25
    xvmuldp   vs41, vs1, vs25
    xvmuldp   vs42, vs2, vs25
    xvmuldp   vs43, vs3, vs25

    xvmuldp   vs48, vs0, vs26
    xvmuldp   vs49, vs1, vs26
    xvmuldp   vs50, vs2, vs26
    xvmuldp   vs51, vs3, vs26

.else

    lxv       vs28, DISP8(\Index,0 +\OffsetB)(BO)
    lxv       vs30, DISP8(\Index,16 +\OffsetB)(BO)

    xvmaddadp vs40, vs0, vs25
    xvmaddadp vs41, vs1, vs25
    xvmaddadp vs42, vs2, vs25
    xvmaddadp vs43, vs3, vs25

    xvmaddadp vs48, vs0, vs26
    xvmaddadp vs49, vs1, vs26
    xvmaddadp vs50, vs2, vs26
    xvmaddadp vs51, vs3, vs26

.endif
    xxpermdi  vs29, vs28, vs28, 2
    xxpermdi  vs31, vs30, vs30, 2
.if \First ==1
    xvmuldp   vs56, vs0, vs27
    xvmuldp   vs57, vs1, vs27
    xvmuldp   vs58, vs2, vs27
    xvmuldp   vs59, vs3, vs27

.else
    xvmaddadp vs56, vs0, vs27
    xvmaddadp vs57, vs1, vs27
    xvmaddadp vs58, vs2, vs27
    xvmaddadp vs59, vs3, vs27

.endif

    /* second iteration: consume vs8..vs11 / vs28..vs31 */
    xvmaddadp vs32, vs8, vs28
    xvmaddadp vs33, vs9, vs28
    xvmaddadp vs34, vs10, vs28
    xvmaddadp vs35, vs11, vs28
.if \Complete==0
    lxv       vs0, DISP16(\Index,64+\OffsetA)(AO)
    lxv       vs1, DISP16(\Index,80+\OffsetA)(AO)
.endif

    xvmaddadp vs40, vs8, vs29
    xvmaddadp vs41, vs9, vs29
    xvmaddadp vs42, vs10, vs29
    xvmaddadp vs43, vs11, vs29

.if \Complete==0
    lxv       vs2, DISP16(\Index,96+\OffsetA)(AO)
    lxv       vs3, DISP16(\Index,112+\OffsetA)(AO)
.endif

    xvmaddadp vs48, vs8, vs30
    xvmaddadp vs49, vs9, vs30
    xvmaddadp vs50, vs10, vs30
    xvmaddadp vs51, vs11, vs30
.if \Complete==0
    lxv       vs24, DISP8(\Index,32 +\OffsetB)(BO)
    lxv       vs26, DISP8(\Index,48 +\OffsetB)(BO)
.endif

    xvmaddadp vs56, vs8, vs31
    xvmaddadp vs57, vs9, vs31
    xvmaddadp vs58, vs10, vs31
    xvmaddadp vs59, vs11, vs31
.if \Complete==0
    xxpermdi  vs25, vs24, vs24, 2
    xxpermdi  vs27, vs26, vs26, 2
.endif

.if \IsLast==1
.if \Complete==1
    addi      AO, AO, DISP16(\Index,64+\OffsetA)
    addi      BO, BO, DISP8(\Index,32+\OffsetB)
.else
    addi      AO, AO, DISP16(\Index,128)
    addi      BO, BO, DISP8(\Index,64)
.endif
.endif

.endm

/* KERNEL4x8 First
   One self-contained k-iteration of the 4x8 update: loads A and B,
   advances AO/BO, then multiplies (First==1) or accumulates. */
.macro KERNEL4x8 First

    lxv      vs24, 0(BO)
    lxv      vs26, 16(BO)
    xxpermdi vs25, vs24, vs24, 2
    xxpermdi vs27, vs26, vs26, 2

    lxv      vs0, 0(AO)
    lxv      vs1, 16(AO)
    lxv      vs2, 32(AO)
    lxv      vs3, 48(AO)

    addi     BO, BO, 32
    addi     AO, AO, 64

.if \First==1
    xvmuldp   vs32, vs0, vs24
    xvmuldp   vs33, vs1, vs24
    xvmuldp   vs34, vs2, vs24
    xvmuldp   vs35, vs3, vs24

    xvmuldp   vs40, vs0, vs25
    xvmuldp   vs41, vs1, vs25
    xvmuldp   vs42, vs2, vs25
    xvmuldp   vs43, vs3, vs25

    xvmuldp   vs48, vs0, vs26
    xvmuldp   vs49, vs1, vs26
    xvmuldp   vs50, vs2, vs26
    xvmuldp   vs51, vs3, vs26

    xvmuldp   vs56, vs0, vs27
    xvmuldp   vs57, vs1, vs27
    xvmuldp   vs58, vs2, vs27
    xvmuldp   vs59, vs3, vs27

.else
    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24
    xvmaddadp vs34, vs2, vs24
    xvmaddadp vs35, vs3, vs24

    xvmaddadp vs40, vs0, vs25
    xvmaddadp vs41, vs1, vs25
    xvmaddadp vs42, vs2, vs25
    xvmaddadp vs43, vs3, vs25

    xvmaddadp vs48, vs0, vs26
    xvmaddadp vs49, vs1, vs26
    xvmaddadp vs50, vs2, vs26
    xvmaddadp vs51, vs3, vs26

    xvmaddadp vs56, vs0, vs27
    xvmaddadp vs57, vs1, vs27
    xvmaddadp vs58, vs2, vs27
    xvmaddadp vs59, vs3, vs27

.endif
.endm

/* SAVE4x8
   Write back the 4x8 tile: derive T2..T4 (the next three C columns),
   recombine the split accumulator pairs with xxpermdi, then
   C += alpha_r * result (GEMM) or C = alpha_r * result (TRMM).
   Clobbers vs0..vs15; advances CO by 64 bytes (8 doubles). */
.macro SAVE4x8
    add      T2, CO, LDC
    add      T3, T2, LDC
    add      T4, T3, LDC
#ifndef TRMMKERNEL
    lxv      vs0, 0(CO)
    lxv      vs2, 16(CO)
#endif
    xxpermdi vs8, vs40, vs32, 1
    xxpermdi vs9, vs32, vs40, 1
#ifndef TRMMKERNEL
    lxv      vs4, 32(CO)
    lxv      vs6, 48(CO)
#endif
    xxpermdi vs10, vs41, vs33, 1
    xxpermdi vs11, vs33, vs41, 1
#ifndef TRMMKERNEL
    lxv      vs1, 0(T2)
    lxv      vs3, 16(T2)
#endif
    xxpermdi vs12, vs42, vs34, 1
    xxpermdi vs13, vs34, vs42, 1
#ifndef TRMMKERNEL
    lxv      vs5, 32(T2)
    lxv      vs7, 48(T2)
#endif
    xxpermdi vs14, vs43, vs35, 1
    xxpermdi vs15, vs35, vs43, 1

#ifndef TRMMKERNEL
    xvmaddadp vs0, vs8, alpha_r
    xvmaddadp vs1, vs9, alpha_r
    xvmaddadp vs2, vs10, alpha_r
    xvmaddadp vs3, vs11, alpha_r

    xvmaddadp vs4, vs12, alpha_r
    xvmaddadp vs5, vs13, alpha_r
    xvmaddadp vs6, vs14, alpha_r
    xvmaddadp vs7, vs15, alpha_r
#else
    xvmuldp  vs0, vs8, alpha_r
    xvmuldp  vs1, vs9, alpha_r
    xvmuldp  vs2, vs10, alpha_r
    xvmuldp  vs3, vs11, alpha_r

    xvmuldp  vs4, vs12, alpha_r
    xvmuldp  vs5, vs13, alpha_r
    xvmuldp  vs6, vs14, alpha_r
    xvmuldp  vs7, vs15, alpha_r

#endif

    stxv     vs0, 0(CO)
    stxv     vs2, 16(CO)
    stxv     vs4, 32(CO)
    stxv     vs6, 48(CO)

    stxv     vs1, 0(T2)
    stxv     vs3, 16(T2)
    stxv     vs5, 32(T2)
    stxv     vs7, 48(T2)

/* columns 3-4: accumulators vs48..vs51 / vs56..vs59 */
    xxpermdi vs8, vs56, vs48, 1
    xxpermdi vs9, vs48, vs56, 1
#ifndef TRMMKERNEL
    lxv      vs0, 0(T3)
    lxv      vs2, 16(T3)
#endif
    xxpermdi vs10, vs57, vs49, 1
    xxpermdi vs11, vs49, vs57, 1
#ifndef TRMMKERNEL
    lxv      vs4, 32(T3)
    lxv      vs6, 48(T3)
#endif
    xxpermdi vs12, vs58, vs50, 1
    xxpermdi vs13, vs50, vs58, 1
#ifndef TRMMKERNEL
    lxv      vs1, 0(T4)
    lxv      vs3, 16(T4)
#endif
    xxpermdi vs14, vs59, vs51, 1
    xxpermdi vs15, vs51, vs59, 1
#ifndef TRMMKERNEL
    lxv      vs5, 32(T4)
    lxv      vs7, 48(T4)

    xvmaddadp vs0, vs8, alpha_r
    xvmaddadp vs1, vs9, alpha_r
    xvmaddadp vs2, vs10, alpha_r
    xvmaddadp vs3, vs11, alpha_r

    xvmaddadp vs4, vs12, alpha_r
    xvmaddadp vs5, vs13, alpha_r
    xvmaddadp vs6, vs14, alpha_r
    xvmaddadp vs7, vs15, alpha_r
#else
    xvmuldp  vs0, vs8, alpha_r
    xvmuldp  vs1, vs9, alpha_r
    xvmuldp  vs2, vs10, alpha_r
    xvmuldp  vs3, vs11, alpha_r

    xvmuldp  vs4, vs12, alpha_r
    xvmuldp  vs5, vs13, alpha_r
    xvmuldp  vs6, vs14, alpha_r
    xvmuldp  vs7, vs15, alpha_r

#endif

    stxv     vs0, 0(T3)
    stxv     vs2, 16(T3)
    stxv     vs4, 32(T3)
    stxv     vs6, 48(T3)

    stxv     vs1, 0(T4)
    stxv     vs3, 16(T4)
    stxv     vs5, 32(T4)
    stxv     vs7, 48(T4)

    addi     CO, CO, 64
.endm

/*********************************************************************
* Macros for N=4, M=4 *
*********************************************************************/

/* Preload one 4x4 iteration: A into vs0/vs1, B broadcast-loaded one
   scalar per register into vs24..vs27 (lxvdsx splats a double).
   Uses the offset registers o8/o16/o24; advances AO and BO. */
.macro LOAD4x4_1

    lxvd2x vs0, 0, AO
    lxvd2x vs1, o16, AO

    lxvdsx vs24, 0, BO
    lxvdsx vs25, o8, BO
    lxvdsx vs26, o16, BO
    lxvdsx vs27, o24, BO

    addi   AO, AO, 32
    addi   BO, BO, 32

.endm

/* First iteration of a ping-pong pair: load the "odd" set vs8/vs9 and
   vs28..vs31, then overwrite the accumulators from the "even" set. */
.macro KERNEL4x4_I1

    lxvd2x  vs8, 0, AO
    lxvd2x  vs9, o16, AO

    lxvdsx  vs28, 0, BO
    lxvdsx  vs29, o8, BO
    lxvdsx  vs30, o16, BO
    lxvdsx  vs31, o24, BO

    addi    AO, AO, 32
    addi    BO, BO, 32

    xvmuldp vs32, vs0, vs24
    xvmuldp vs33, vs1, vs24

    xvmuldp vs40, vs0, vs25
    xvmuldp vs41, vs1, vs25

    xvmuldp vs48, vs0, vs26
    xvmuldp vs49, vs1, vs26

    xvmuldp vs56, vs0, vs27
    xvmuldp vs57, vs1, vs27

.endm

/* Ping: consume the "even" set (vs0/vs1, vs24..vs27) while loading the
   "odd" set for the next KERNEL4x4_2. */
.macro KERNEL4x4_1

    lxvd2x    vs8, 0, AO
    lxvd2x    vs9, o16, AO

    lxvdsx    vs28, 0, BO
    lxvdsx    vs29, o8, BO
    lxvdsx    vs30, o16, BO
    lxvdsx    vs31, o24, BO

    addi      AO, AO, 32
    addi      BO, BO, 32

    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24

    xvmaddadp vs40, vs0, vs25
    xvmaddadp vs41, vs1, vs25

    xvmaddadp vs48, vs0, vs26
    xvmaddadp vs49, vs1, vs26

    xvmaddadp vs56, vs0, vs27
    xvmaddadp vs57, vs1, vs27

.endm

/* Pong: consume the "odd" set (vs8/vs9, vs28..vs31) while loading the
   "even" set for the next KERNEL4x4_1. */
.macro KERNEL4x4_2

    lxvd2x    vs0, 0, AO
    lxvd2x    vs1, o16, AO

    lxvdsx    vs24, 0, BO
    lxvdsx    vs25, o8, BO
    lxvdsx    vs26, o16, BO
    lxvdsx    vs27, o24, BO

    addi      AO, AO, 32
    addi      BO, BO, 32

    xvmaddadp vs32, vs8, vs28
    xvmaddadp vs33, vs9, vs28

    xvmaddadp vs40, vs8, vs29
    xvmaddadp vs41, vs9, vs29

    xvmaddadp vs48, vs8, vs30
    xvmaddadp vs49, vs9, vs30

    xvmaddadp vs56, vs8, vs31
    xvmaddadp vs57, vs9, vs31

.endm

/* Epilogue of the ping-pong: consume the already-loaded "odd" set
   without issuing further loads. */
.macro KERNEL4x4_E2

    xvmaddadp vs32, vs8, vs28
    xvmaddadp vs33, vs9, vs28

    xvmaddadp vs40, vs8, vs29
    xvmaddadp vs41, vs9, vs29

    xvmaddadp vs48, vs8, vs30
    xvmaddadp vs49, vs9, vs30

    xvmaddadp vs56, vs8, vs31
    xvmaddadp vs57, vs9, vs31

.endm

/* Standalone single iteration that overwrites the accumulators. */
.macro KERNEL4x4_SUBI1

    lxvd2x  vs0, 0, AO
    lxvd2x  vs1, o16, AO

    lxvdsx  vs24, 0, BO
    lxvdsx  vs25, o8, BO
    lxvdsx  vs26, o16, BO
    lxvdsx  vs27, o24, BO

    addi    AO, AO, 32
    addi    BO, BO, 32

    xvmuldp vs32, vs0, vs24
    xvmuldp vs33, vs1, vs24

    xvmuldp vs40, vs0, vs25
    xvmuldp vs41, vs1, vs25

    xvmuldp vs48, vs0, vs26
    xvmuldp vs49, vs1, vs26

    xvmuldp vs56, vs0, vs27
    xvmuldp vs57, vs1, vs27

.endm

/* Standalone single iteration that accumulates. */
.macro KERNEL4x4_SUB1

    lxvd2x    vs0, 0, AO
    lxvd2x    vs1, o16, AO

    lxvdsx    vs24, 0, BO
    lxvdsx    vs25, o8, BO
    lxvdsx    vs26, o16, BO
    lxvdsx    vs27, o24, BO

    addi      AO, AO, 32
    addi      BO, BO, 32

    xvmaddadp vs32, vs0, vs24
    xvmaddadp vs33, vs1, vs24

    xvmaddadp vs40, vs0, vs25
    xvmaddadp vs41, vs1, vs25

    xvmaddadp vs48, vs0, vs26
    xvmaddadp vs49, vs1, vs26

    xvmaddadp vs56, vs0, vs27
    xvmaddadp vs57, vs1, vs27

.endm

/* SAVE4x4
   Write back the 4x4 tile column by column: T1 walks the four C columns
   (stride LDC); each column gets C += alpha_r * acc (GEMM) or
   C = alpha_r * acc (TRMM).  No xxpermdi needed here because B was
   splat-loaded.  Advances CO by 32 bytes (4 doubles). */
.macro SAVE4x4

    mr        T1, CO

#ifndef TRMMKERNEL
    lxvd2x    vs0, 0, T1
    lxvd2x    vs1, o16, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp vs0, vs32, alpha_r
    xvmaddadp vs1, vs33, alpha_r
#else
    xvmuldp   vs0, vs32, alpha_r
    xvmuldp   vs1, vs33, alpha_r
#endif

    stxvd2x   vs0, 0, T1
    stxvd2x   vs1, o16, T1

    add       T1, T1, LDC

#ifndef TRMMKERNEL
    lxvd2x    vs8, 0, T1
    lxvd2x    vs9, o16, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp vs8, vs40, alpha_r
    xvmaddadp vs9, vs41, alpha_r
#else
    xvmuldp   vs8, vs40, alpha_r
    xvmuldp   vs9, vs41, alpha_r
#endif

    stxvd2x   vs8, 0, T1
    stxvd2x   vs9, o16, T1

    add       T1, T1, LDC

#ifndef TRMMKERNEL
    lxvd2x    vs0, 0, T1
    lxvd2x    vs1, o16, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp vs0, vs48, alpha_r
    xvmaddadp vs1, vs49, alpha_r
#else
    xvmuldp   vs0, vs48, alpha_r
    xvmuldp   vs1, vs49, alpha_r
#endif

    stxvd2x   vs0, 0, T1
    stxvd2x   vs1, o16, T1

    add       T1, T1, LDC

#ifndef TRMMKERNEL
    lxvd2x    vs8, 0, T1
    lxvd2x    vs9, o16, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp vs8, vs56, alpha_r
    xvmaddadp vs9, vs57, alpha_r
#else
    xvmuldp   vs8, vs56, alpha_r
    xvmuldp   vs9, vs57, alpha_r
#endif

    stxvd2x   vs8, 0, T1
    stxvd2x   vs9, o16, T1

    addi      CO, CO, 32

.endm

/*********************************************************************
* Macros for N=4, M=2 *
*********************************************************************/

/* Preload one 4x2 iteration: 2 doubles of A in vs0, B splat-loaded
   into vs24..vs27; advances AO and BO. */
.macro LOAD4x2_1

    lxvd2x vs0, 0, AO

    lxvdsx vs24, 0, BO
    lxvdsx vs25, o8, BO
    lxvdsx vs26, o16, BO
    lxvdsx vs27, o24, BO

    addi   AO, AO, 16
    addi   BO, BO, 32

.endm

/* First iteration of the 4x2 ping-pong pair (overwrites). */
.macro KERNEL4x2_I1

    lxvd2x  vs8, 0, AO

    lxvdsx  vs28, 0, BO
    lxvdsx  vs29, o8, BO
    lxvdsx  vs30, o16, BO
    lxvdsx  vs31, o24, BO

    addi    AO, AO, 16
    addi    BO, BO, 32

    xvmuldp vs32, vs0, vs24

    xvmuldp vs40, vs0, vs25

    xvmuldp vs48, vs0, vs26

    xvmuldp vs56, vs0, vs27

.endm

/* Ping: consume vs0/vs24..vs27 while loading vs8/vs28..vs31. */
.macro KERNEL4x2_1

    lxvd2x    vs8, 0, AO

    lxvdsx    vs28, 0, BO
    lxvdsx    vs29, o8, BO
    lxvdsx    vs30, o16, BO
    lxvdsx    vs31, o24, BO

    addi      AO, AO, 16
    addi      BO, BO, 32

    xvmaddadp vs32, vs0, vs24

    xvmaddadp vs40, vs0, vs25

    xvmaddadp vs48, vs0, vs26

    xvmaddadp vs56, vs0, vs27

.endm

/* Pong: consume vs8/vs28..vs31 while loading vs0/vs24..vs27. */
.macro KERNEL4x2_2

    lxvd2x    vs0, 0, AO

    lxvdsx    vs24, 0, BO
    lxvdsx    vs25, o8, BO
    lxvdsx    vs26, o16, BO
    lxvdsx    vs27, o24, BO

    addi      AO, AO, 16
    addi      BO, BO, 32

    xvmaddadp vs32, vs8, vs28

    xvmaddadp vs40, vs8, vs29

    xvmaddadp vs48, vs8, vs30

    xvmaddadp vs56, vs8, vs31

.endm

/* Epilogue: consume the already-loaded odd set, no further loads. */
.macro KERNEL4x2_E2

    xvmaddadp vs32, vs8, vs28

    xvmaddadp vs40, vs8, vs29

    xvmaddadp vs48, vs8, vs30

    xvmaddadp vs56, vs8, vs31

.endm

/* Standalone single iteration (overwrite). */
.macro KERNEL4x2_SUBI1

    lxvd2x  vs0, 0, AO

    lxvdsx  vs24, 0, BO
    lxvdsx  vs25, o8, BO
    lxvdsx  vs26, o16, BO
    lxvdsx  vs27, o24, BO

    addi    AO, AO, 16
    addi    BO, BO, 32

    xvmuldp vs32, vs0, vs24

    xvmuldp vs40, vs0, vs25

    xvmuldp vs48, vs0, vs26

    xvmuldp vs56, vs0, vs27

.endm

/* Standalone single iteration (accumulate). */
.macro KERNEL4x2_SUB1

    lxvd2x    vs0, 0, AO

    lxvdsx    vs24, 0, BO
    lxvdsx    vs25, o8, BO
    lxvdsx    vs26, o16, BO
    lxvdsx    vs27, o24, BO

    addi      AO, AO, 16
    addi      BO, BO, 32

    xvmaddadp vs32, vs0, vs24

    xvmaddadp vs40, vs0, vs25

    xvmaddadp vs48, vs0, vs26

    xvmaddadp vs56, vs0, vs27

.endm

/* SAVE4x2
   Write back the 4x2 tile column by column (T1 walks the four C columns,
   one vector of 2 doubles each): C += alpha_r * acc (GEMM) or
   C = alpha_r * acc (TRMM).  Advances CO by 16 bytes (2 doubles). */
.macro SAVE4x2

    mr        T1, CO

#ifndef TRMMKERNEL
    lxvd2x    vs0, 0, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp vs0, vs32, alpha_r
#else
    xvmuldp   vs0, vs32, alpha_r
#endif

    stxvd2x   vs0, 0, T1

    add       T1, T1, LDC

#ifndef TRMMKERNEL
    lxvd2x    vs8, 0, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp vs8, vs40, alpha_r
#else
    xvmuldp   vs8, vs40, alpha_r
#endif

    stxvd2x   vs8, 0, T1

    add       T1, T1, LDC

#ifndef TRMMKERNEL
    lxvd2x    vs0, 0, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp vs0, vs48, alpha_r
#else
    xvmuldp   vs0, vs48, alpha_r
#endif

    stxvd2x   vs0, 0, T1

    add       T1, T1, LDC

#ifndef TRMMKERNEL
    lxvd2x    vs8, 0, T1
#endif

#ifndef TRMMKERNEL
    xvmaddadp vs8, vs56, alpha_r
#else
    xvmuldp   vs8, vs56, alpha_r
#endif

    stxvd2x   vs8, 0, T1

    addi      CO, CO, 16

.endm

/*********************************************************************
* Macros for N=4, M=1                                                *
*********************************************************************/

/* Scalar (M=1) variant: one double of A (lxsdx) against 4 B values,
   all scalar FP ops (xsmuldp/xsmaddadp). */
.macro LOAD4x1_1

        lxsdx           vs0, 0, AO              /* A[0] */

        lxsdx           vs24, 0, BO             /* B[0..3] */
        lxsdx           vs25, o8, BO
        lxsdx           vs26, o16, BO
        lxsdx           vs27, o24, BO

        addi            AO, AO, 8
        addi            BO, BO, 32

.endm

/* First unrolled iteration: init accumulators, prefetch next A/B. */
.macro KERNEL4x1_I1

        lxsdx           vs8, 0, AO

        lxsdx           vs28, 0, BO
        lxsdx           vs29, o8, BO
        lxsdx           vs30, o16, BO
        lxsdx           vs31, o24, BO

        addi            AO, AO, 8
        addi            BO, BO, 32


        xsmuldp         vs32, vs0, vs24

        xsmuldp         vs40, vs0, vs25

        xsmuldp         vs48, vs0, vs26

        xsmuldp         vs56, vs0, vs27

.endm

/* Even iteration: accumulate current regs, prefetch vs8/vs28-vs31. */
.macro KERNEL4x1_1

        lxsdx           vs8, 0, AO

        lxsdx           vs28, 0, BO
        lxsdx           vs29, o8, BO
        lxsdx           vs30, o16, BO
        lxsdx           vs31, o24, BO

        addi            AO, AO, 8
        addi            BO, BO, 32


        xsmaddadp       vs32, vs0, vs24

        xsmaddadp       vs40, vs0, vs25

        xsmaddadp       vs48, vs0, vs26

        xsmaddadp       vs56, vs0, vs27

.endm

/* Odd iteration: accumulate prefetched regs, refill vs0/vs24-vs27. */
.macro KERNEL4x1_2

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO
        lxsdx           vs25, o8, BO
        lxsdx           vs26, o16, BO
        lxsdx           vs27, o24, BO

        addi            AO, AO, 8
        addi            BO, BO, 32


        xsmaddadp       vs32, vs8, vs28

        xsmaddadp       vs40, vs8, vs29

        xsmaddadp       vs48, vs8, vs30

        xsmaddadp       vs56, vs8, vs31

.endm

/* Loop epilogue: drain the prefetched registers, no loads. */
.macro KERNEL4x1_E2


        xsmaddadp       vs32, vs8, vs28

        xsmaddadp       vs40, vs8, vs29

        xsmaddadp       vs48, vs8, vs30

        xsmaddadp       vs56, vs8, vs31

.endm

/* Single iteration, initializes accumulators (K-remainder start). */
.macro KERNEL4x1_SUBI1

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO
        lxsdx           vs25, o8, BO
        lxsdx           vs26, o16, BO
        lxsdx           vs27, o24, BO

        addi            AO, AO, 8
        addi            BO, BO, 32


        xsmuldp         vs32, vs0, vs24

        xsmuldp         vs40, vs0, vs25

        xsmuldp         vs48, vs0, vs26

        xsmuldp         vs56, vs0, vs27

.endm

/* Single iteration, accumulates into existing sums (K remainder). */
.macro KERNEL4x1_SUB1

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO
        lxsdx           vs25, o8, BO
        lxsdx           vs26, o16, BO
        lxsdx           vs27, o24, BO

        addi            AO, AO, 8
        addi            BO, BO, 32


        xsmaddadp       vs32, vs0, vs24

        xsmaddadp       vs40, vs0, vs25

        xsmaddadp       vs48, vs0, vs26

        xsmaddadp       vs56, vs0, vs27

.endm

/* Write back the 4x1 result (one double per row, 4 rows of C). */
.macro SAVE4x1

        mr              T1, CO

#ifndef TRMMKERNEL
        lxsdx           vs0, 0, T1
#endif

#ifndef TRMMKERNEL
        xsmaddadp       vs0, vs32, alpha_r
#else
        xsmuldp         vs0, vs32, alpha_r
#endif

        stxsdx          vs0, 0, T1

        add             T1, T1, LDC

#ifndef TRMMKERNEL
        lxsdx           vs8, 0, T1
#endif

#ifndef TRMMKERNEL
        xsmaddadp       vs8, vs40, alpha_r
#else
        xsmuldp         vs8, vs40, alpha_r
#endif

        stxsdx          vs8, 0, T1

        add             T1, T1, LDC

#ifndef TRMMKERNEL
        lxsdx           vs0, 0, T1
#endif

#ifndef TRMMKERNEL
        xsmaddadp       vs0, vs48, alpha_r
#else
        xsmuldp         vs0, vs48, alpha_r
#endif

        stxsdx          vs0, 0, T1

        add             T1, T1, LDC

#ifndef TRMMKERNEL
        lxsdx           vs8, 0, T1
#endif

#ifndef TRMMKERNEL
        xsmaddadp       vs8, vs56, alpha_r
#else
        xsmuldp         vs8, vs56, alpha_r
#endif

        stxsdx          vs8, 0, T1

        addi            CO, CO, 8               /* advance C by 1 double */

.endm

/*********************************************************************
* Macros for N=2, M=16                                               *
*********************************************************************/

/* Preload: vs0-vs7 = 16 doubles of A (two 64-byte groups),
   vs24/vs25 = the 2 B values splatted across both lanes. */
.macro LOAD2x16_1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16

        lxvd2x          vs4, 0, AO
        lxvd2x          vs5, o16, AO
        lxvd2x          vs6, o32, AO
        lxvd2x          vs7, o48, AO

        addi            AO, AO, 64

.endm

/* First unrolled iteration: init the 16 accumulators while
   prefetching the next A (vs8-vs15) and B (vs28/vs29). */
.macro KERNEL2x16_I1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO
        lxvd2x          vs10, o32, AO
        lxvd2x          vs11, o48, AO

        lxvdsx          vs28, 0, BO
        lxvdsx          vs29, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16

        lxvd2x          vs12, 0, AO
        lxvd2x          vs13, o16, AO
        lxvd2x          vs14, o32, AO
        lxvd2x          vs15, o48, AO

        addi            AO, AO, 64


        xvmuldp         vs32, vs0, vs24         /* row 0 accumulators */
        xvmuldp         vs33, vs1, vs24
        xvmuldp         vs34, vs2, vs24
        xvmuldp         vs35, vs3, vs24
        xvmuldp         vs36, vs4, vs24
        xvmuldp         vs37, vs5, vs24
        xvmuldp         vs38, vs6, vs24
        xvmuldp         vs39, vs7, vs24

        xvmuldp         vs40, vs0, vs25         /* row 1 accumulators */
        xvmuldp         vs41, vs1, vs25
        xvmuldp         vs42, vs2, vs25
        xvmuldp         vs43, vs3, vs25
        xvmuldp         vs44, vs4, vs25
        xvmuldp         vs45, vs5, vs25
        xvmuldp         vs46, vs6, vs25
        xvmuldp         vs47, vs7, vs25

.endm

/* Even iteration: accumulate vs0-vs7, prefetch into vs8-vs15. */
.macro KERNEL2x16_1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO
        lxvd2x          vs10, o32, AO
        lxvd2x          vs11, o48, AO

        lxvdsx          vs28, 0, BO
        lxvdsx          vs29, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16

        lxvd2x          vs12, 0, AO
        lxvd2x          vs13, o16, AO
        lxvd2x          vs14, o32, AO
        lxvd2x          vs15, o48, AO

        addi            AO, AO, 64


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24
        xvmaddadp       vs34, vs2, vs24
        xvmaddadp       vs35, vs3, vs24
        xvmaddadp       vs36, vs4, vs24
        xvmaddadp       vs37, vs5, vs24
        xvmaddadp       vs38, vs6, vs24
        xvmaddadp       vs39, vs7, vs24

        xvmaddadp       vs40, vs0, vs25
        xvmaddadp       vs41, vs1, vs25
        xvmaddadp       vs42, vs2, vs25
        xvmaddadp       vs43, vs3, vs25
        xvmaddadp       vs44, vs4, vs25
        xvmaddadp       vs45, vs5, vs25
        xvmaddadp       vs46, vs6, vs25
        xvmaddadp       vs47, vs7, vs25

.endm

/* Odd iteration: accumulate vs8-vs15, refill vs0-vs7. */
.macro KERNEL2x16_2

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16

        lxvd2x          vs4, 0, AO
        lxvd2x          vs5, o16, AO
        lxvd2x          vs6, o32, AO
        lxvd2x          vs7, o48, AO

        addi            AO, AO, 64


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28
        xvmaddadp       vs34, vs10, vs28
        xvmaddadp       vs35, vs11, vs28
        xvmaddadp       vs36, vs12, vs28
        xvmaddadp       vs37, vs13, vs28
        xvmaddadp       vs38, vs14, vs28
        xvmaddadp       vs39, vs15, vs28

        xvmaddadp       vs40, vs8, vs29
        xvmaddadp       vs41, vs9, vs29
        xvmaddadp       vs42, vs10, vs29
        xvmaddadp       vs43, vs11, vs29
        xvmaddadp       vs44, vs12, vs29
        xvmaddadp       vs45, vs13, vs29
        xvmaddadp       vs46, vs14, vs29
        xvmaddadp       vs47, vs15, vs29

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL2x16_E2


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28
        xvmaddadp       vs34, vs10, vs28
        xvmaddadp       vs35, vs11, vs28
        xvmaddadp       vs36, vs12, vs28
        xvmaddadp       vs37, vs13, vs28
        xvmaddadp       vs38, vs14, vs28
        xvmaddadp       vs39, vs15, vs28

        xvmaddadp       vs40, vs8, vs29
        xvmaddadp       vs41, vs9, vs29
        xvmaddadp       vs42, vs10, vs29
        xvmaddadp       vs43, vs11, vs29
        xvmaddadp       vs44, vs12, vs29
        xvmaddadp       vs45, vs13, vs29
        xvmaddadp       vs46, vs14, vs29
        xvmaddadp       vs47, vs15, vs29

.endm

/* Single iteration, initializes accumulators (K-remainder start). */
.macro KERNEL2x16_SUBI1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16

        lxvd2x          vs4, 0, AO
        lxvd2x          vs5, o16, AO
        lxvd2x          vs6, o32, AO
        lxvd2x          vs7, o48, AO

        addi            AO, AO, 64


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24
        xvmuldp         vs34, vs2, vs24
        xvmuldp         vs35, vs3, vs24
        xvmuldp         vs36, vs4, vs24
        xvmuldp         vs37, vs5, vs24
        xvmuldp         vs38, vs6, vs24
        xvmuldp         vs39, vs7, vs24

        xvmuldp         vs40, vs0, vs25
        xvmuldp         vs41, vs1, vs25
        xvmuldp         vs42, vs2, vs25
        xvmuldp         vs43, vs3, vs25
        xvmuldp         vs44, vs4, vs25
        xvmuldp         vs45, vs5, vs25
        xvmuldp         vs46, vs6, vs25
        xvmuldp         vs47, vs7, vs25

.endm

/* Single iteration, accumulates into existing sums (K remainder). */
.macro KERNEL2x16_SUB1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16

        lxvd2x          vs4, 0, AO
        lxvd2x          vs5, o16, AO
        lxvd2x          vs6, o32, AO
        lxvd2x          vs7, o48, AO

        addi            AO, AO, 64


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24
        xvmaddadp       vs34, vs2, vs24
        xvmaddadp       vs35, vs3, vs24
        xvmaddadp       vs36, vs4, vs24
        xvmaddadp       vs37, vs5, vs24
        xvmaddadp       vs38, vs6, vs24
        xvmaddadp       vs39, vs7, vs24

        xvmaddadp       vs40, vs0, vs25
        xvmaddadp       vs41, vs1, vs25
        xvmaddadp       vs42, vs2, vs25
        xvmaddadp       vs43, vs3, vs25
        xvmaddadp       vs44, vs4, vs25
        xvmaddadp       vs45, vs5, vs25
        xvmaddadp       vs46, vs6, vs25
        xvmaddadp       vs47, vs7, vs25

.endm

/* Write back the 2x16 tile: 16 doubles per row via T1 (first 64 B)
   and T2 (second 64 B), two rows stepped by LDC. */
.macro SAVE2x16

        mr              T1, CO
        addi            T2, T1, 64

#ifndef TRMMKERNEL
        lxvd2x          vs0, 0, T1
        lxvd2x          vs1, o16, T1
        lxvd2x          vs2, o32, T1
        lxvd2x          vs3, o48, T1

        lxvd2x          vs4, 0, T2
        lxvd2x          vs5, o16, T2
        lxvd2x          vs6, o32, T2
        lxvd2x          vs7, o48, T2
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs0, vs32, alpha_r
        xvmaddadp       vs1, vs33, alpha_r
        xvmaddadp       vs2, vs34, alpha_r
        xvmaddadp       vs3, vs35, alpha_r
        xvmaddadp       vs4, vs36, alpha_r
        xvmaddadp       vs5, vs37, alpha_r
        xvmaddadp       vs6, vs38, alpha_r
        xvmaddadp       vs7, vs39, alpha_r
#else
        xvmuldp         vs0, vs32, alpha_r
        xvmuldp         vs1, vs33, alpha_r
        xvmuldp         vs2, vs34, alpha_r
        xvmuldp         vs3, vs35, alpha_r
        xvmuldp         vs4, vs36, alpha_r
        xvmuldp         vs5, vs37, alpha_r
        xvmuldp         vs6, vs38, alpha_r
        xvmuldp         vs7, vs39, alpha_r
#endif

        stxvd2x         vs0, 0, T1
        stxvd2x         vs1, o16, T1
        stxvd2x         vs2, o32, T1
        stxvd2x         vs3, o48, T1

        stxvd2x         vs4, 0, T2
        stxvd2x         vs5, o16, T2
        stxvd2x         vs6, o32, T2
        stxvd2x         vs7, o48, T2

        add             T1, T1, LDC
        add             T2, T2, LDC

#ifndef TRMMKERNEL
        lxvd2x          vs8, 0, T1
        lxvd2x          vs9, o16, T1
        lxvd2x          vs10, o32, T1
        lxvd2x          vs11, o48, T1

        lxvd2x          vs12, 0, T2
        lxvd2x          vs13, o16, T2
        lxvd2x          vs14, o32, T2
        lxvd2x          vs15, o48, T2
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs8, vs40, alpha_r
        xvmaddadp       vs9, vs41, alpha_r
        xvmaddadp       vs10, vs42, alpha_r
        xvmaddadp       vs11, vs43, alpha_r
        xvmaddadp       vs12, vs44, alpha_r
        xvmaddadp       vs13, vs45, alpha_r
        xvmaddadp       vs14, vs46, alpha_r
        xvmaddadp       vs15, vs47, alpha_r
#else
        xvmuldp         vs8, vs40, alpha_r
        xvmuldp         vs9, vs41, alpha_r
        xvmuldp         vs10, vs42, alpha_r
        xvmuldp         vs11, vs43, alpha_r
        xvmuldp         vs12, vs44, alpha_r
        xvmuldp         vs13, vs45, alpha_r
        xvmuldp         vs14, vs46, alpha_r
        xvmuldp         vs15, vs47, alpha_r
#endif

        stxvd2x         vs8, 0, T1
        stxvd2x         vs9, o16, T1
        stxvd2x         vs10, o32, T1
        stxvd2x         vs11, o48, T1

        stxvd2x         vs12, 0, T2
        stxvd2x         vs13, o16, T2
        stxvd2x         vs14, o32, T2
        stxvd2x         vs15, o48, T2

        addi            CO, CO, 128             /* advance C by 16 doubles */

.endm

/*********************************************************************
* Macros for N=2, M=8                                                *
* (header previously mislabeled "N=4, M=8"; macros below are 2x8)    *
*********************************************************************/

/* Preload: vs0-vs3 = 8 doubles of A, vs24/vs25 = 2 splatted B values. */
.macro LOAD2x8_1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16

.endm

/* First unrolled iteration: init accumulators, prefetch next A/B. */
.macro KERNEL2x8_I1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO
        lxvd2x          vs10, o32, AO
        lxvd2x          vs11, o48, AO

        lxvdsx          vs28, 0, BO
        lxvdsx          vs29, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24
        xvmuldp         vs34, vs2, vs24
        xvmuldp         vs35, vs3, vs24

        xvmuldp         vs40, vs0, vs25
        xvmuldp         vs41, vs1, vs25
        xvmuldp         vs42, vs2, vs25
        xvmuldp         vs43, vs3, vs25

.endm

/* Even iteration: accumulate current regs, prefetch alternates. */
.macro KERNEL2x8_1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO
        lxvd2x          vs10, o32, AO
        lxvd2x          vs11, o48, AO

        lxvdsx          vs28, 0, BO
        lxvdsx          vs29, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24
        xvmaddadp       vs34, vs2, vs24
        xvmaddadp       vs35, vs3, vs24

        xvmaddadp       vs40, vs0, vs25
        xvmaddadp       vs41, vs1, vs25
        xvmaddadp       vs42, vs2, vs25
        xvmaddadp       vs43, vs3, vs25

.endm

/* Odd iteration: accumulate prefetched regs, refill current regs. */
.macro KERNEL2x8_2

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28
        xvmaddadp       vs34, vs10, vs28
        xvmaddadp       vs35, vs11, vs28

        xvmaddadp       vs40, vs8, vs29
        xvmaddadp       vs41, vs9, vs29
        xvmaddadp       vs42, vs10, vs29
        xvmaddadp       vs43, vs11, vs29

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL2x8_E2


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28
        xvmaddadp       vs34, vs10, vs28
        xvmaddadp       vs35, vs11, vs28

        xvmaddadp       vs40, vs8, vs29
        xvmaddadp       vs41, vs9, vs29
        xvmaddadp       vs42, vs10, vs29
        xvmaddadp       vs43, vs11, vs29

.endm

/* Single iteration, initializes accumulators (K-remainder start). */
.macro KERNEL2x8_SUBI1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24
        xvmuldp         vs34, vs2, vs24
        xvmuldp         vs35, vs3, vs24

        xvmuldp         vs40, vs0, vs25
        xvmuldp         vs41, vs1, vs25
        xvmuldp         vs42, vs2, vs25
        xvmuldp         vs43, vs3, vs25

.endm

/* Single iteration, accumulates into existing sums (K remainder). */
.macro KERNEL2x8_SUB1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 64
        addi            BO, BO, 16


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24
        xvmaddadp       vs34, vs2, vs24
        xvmaddadp       vs35, vs3, vs24

        xvmaddadp       vs40, vs0, vs25
        xvmaddadp       vs41, vs1, vs25
        xvmaddadp       vs42, vs2, vs25
        xvmaddadp       vs43, vs3, vs25

.endm

/* Write back the 2x8 tile: 8 doubles per row, two rows. */
.macro SAVE2x8

        mr              T1, CO

#ifndef TRMMKERNEL
        lxvd2x          vs0, 0, T1
        lxvd2x          vs1, o16, T1
        lxvd2x          vs2, o32, T1
        lxvd2x          vs3, o48, T1
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs0, vs32, alpha_r
        xvmaddadp       vs1, vs33, alpha_r
        xvmaddadp       vs2, vs34, alpha_r
        xvmaddadp       vs3, vs35, alpha_r
#else
        xvmuldp         vs0, vs32, alpha_r
        xvmuldp         vs1, vs33, alpha_r
        xvmuldp         vs2, vs34, alpha_r
        xvmuldp         vs3, vs35, alpha_r
#endif

        stxvd2x         vs0, 0, T1
        stxvd2x         vs1, o16, T1
        stxvd2x         vs2, o32, T1
        stxvd2x         vs3, o48, T1

        add             T1, T1, LDC

#ifndef TRMMKERNEL
        lxvd2x          vs8, 0, T1
        lxvd2x          vs9, o16, T1
        lxvd2x          vs10, o32, T1
        lxvd2x          vs11, o48, T1
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs8, vs40, alpha_r
        xvmaddadp       vs9, vs41, alpha_r
        xvmaddadp       vs10, vs42, alpha_r
        xvmaddadp       vs11, vs43, alpha_r
#else
        xvmuldp         vs8, vs40, alpha_r
        xvmuldp         vs9, vs41, alpha_r
        xvmuldp         vs10, vs42, alpha_r
        xvmuldp         vs11, vs43, alpha_r
#endif

        stxvd2x         vs8, 0, T1
        stxvd2x         vs9, o16, T1
        stxvd2x         vs10, o32, T1
        stxvd2x         vs11, o48, T1

        addi            CO, CO, 64              /* advance C by 8 doubles */

.endm

/*********************************************************************
* Macros for N=2, M=4                                                *
*********************************************************************/

/* Preload: vs0/vs1 = 4 doubles of A, vs24/vs25 = 2 splatted B values. */
.macro LOAD2x4_1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 32
        addi            BO, BO, 16

.endm

/* First unrolled iteration: init accumulators, prefetch next A/B. */
.macro KERNEL2x4_I1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO

        lxvdsx          vs28, 0, BO
        lxvdsx          vs29, o8, BO

        addi            AO, AO, 32
        addi            BO, BO, 16


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24

        xvmuldp         vs40, vs0, vs25
        xvmuldp         vs41, vs1, vs25

.endm

/* Even iteration: accumulate current regs, prefetch alternates. */
.macro KERNEL2x4_1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO

        lxvdsx          vs28, 0, BO
        lxvdsx          vs29, o8, BO

        addi            AO, AO, 32
        addi            BO, BO, 16


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24

        xvmaddadp       vs40, vs0, vs25
        xvmaddadp       vs41, vs1, vs25

.endm

/* Odd iteration: accumulate prefetched regs, refill current regs. */
.macro KERNEL2x4_2

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 32
        addi            BO, BO, 16


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28

        xvmaddadp       vs40, vs8, vs29
        xvmaddadp       vs41, vs9, vs29

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL2x4_E2


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28

        xvmaddadp       vs40, vs8, vs29
        xvmaddadp       vs41, vs9, vs29

.endm

/* Single iteration, initializes accumulators (K-remainder start). */
.macro KERNEL2x4_SUBI1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 32
        addi            BO, BO, 16


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24

        xvmuldp         vs40, vs0, vs25
        xvmuldp         vs41, vs1, vs25

.endm

/* Single iteration, accumulates into existing sums (K remainder). */
.macro KERNEL2x4_SUB1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 32
        addi            BO, BO, 16


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24

        xvmaddadp       vs40, vs0, vs25
        xvmaddadp       vs41, vs1, vs25

.endm

/* Write back the 2x4 tile: 4 doubles per row, two rows. */
.macro SAVE2x4

        mr              T1, CO

#ifndef TRMMKERNEL
        lxvd2x          vs0, 0, T1
        lxvd2x          vs1, o16, T1
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs0, vs32, alpha_r
        xvmaddadp       vs1, vs33, alpha_r
#else
        xvmuldp         vs0, vs32, alpha_r
        xvmuldp         vs1, vs33, alpha_r
#endif

        stxvd2x         vs0, 0, T1
        stxvd2x         vs1, o16, T1

        add             T1, T1, LDC

#ifndef TRMMKERNEL
        lxvd2x          vs8, 0, T1
        lxvd2x          vs9, o16, T1
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs8, vs40, alpha_r
        xvmaddadp       vs9, vs41, alpha_r
#else
        xvmuldp         vs8, vs40, alpha_r
        xvmuldp         vs9, vs41, alpha_r
#endif

        stxvd2x         vs8, 0, T1
        stxvd2x         vs9, o16, T1

        addi            CO, CO, 32              /* advance C by 4 doubles */

.endm

/*********************************************************************
* Macros for N=2, M=2                                                *
*********************************************************************/

/* Preload: vs0 = 2 doubles of A, vs24/vs25 = 2 splatted B values. */
.macro LOAD2x2_1

        lxvd2x          vs0, 0, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 16
        addi            BO, BO, 16

.endm

/* First unrolled iteration: init accumulators, prefetch next A/B. */
.macro KERNEL2x2_I1

        lxvd2x          vs8, 0, AO

        lxvdsx          vs28, 0, BO
        lxvdsx          vs29, o8, BO

        addi            AO, AO, 16
        addi            BO, BO, 16


        xvmuldp         vs32, vs0, vs24

        xvmuldp         vs40, vs0, vs25

.endm

/* Even iteration: accumulate current regs, prefetch alternates. */
.macro KERNEL2x2_1

        lxvd2x          vs8, 0, AO

        lxvdsx          vs28, 0, BO
        lxvdsx          vs29, o8, BO

        addi            AO, AO, 16
        addi            BO, BO, 16


        xvmaddadp       vs32, vs0, vs24

        xvmaddadp       vs40, vs0, vs25

.endm

/* Odd iteration: accumulate prefetched regs, refill current regs. */
.macro KERNEL2x2_2

        lxvd2x          vs0, 0, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 16
        addi            BO, BO, 16


        xvmaddadp       vs32, vs8, vs28

        xvmaddadp       vs40, vs8, vs29

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL2x2_E2


        xvmaddadp       vs32, vs8, vs28

        xvmaddadp       vs40, vs8, vs29

.endm

/* Single iteration, initializes accumulators (K-remainder start). */
.macro KERNEL2x2_SUBI1

        lxvd2x          vs0, 0, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 16
        addi            BO, BO, 16


        xvmuldp         vs32, vs0, vs24

        xvmuldp         vs40, vs0, vs25

.endm

/* Single iteration, accumulates into existing sums (K remainder). */
.macro KERNEL2x2_SUB1

        lxvd2x          vs0, 0, AO

        lxvdsx          vs24, 0, BO
        lxvdsx          vs25, o8, BO

        addi            AO, AO, 16
        addi            BO, BO, 16


        xvmaddadp       vs32, vs0, vs24

        xvmaddadp       vs40, vs0, vs25

.endm

/* Write back the 2x2 tile: 2 doubles per row, two rows. */
.macro SAVE2x2

        mr              T1, CO

#ifndef TRMMKERNEL
        lxvd2x          vs0, 0, T1
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs0, vs32, alpha_r
#else
        xvmuldp         vs0, vs32, alpha_r
#endif

        stxvd2x         vs0, 0, T1

        add             T1, T1, LDC

#ifndef TRMMKERNEL
        lxvd2x          vs8, 0, T1
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs8, vs40, alpha_r
#else
        xvmuldp         vs8, vs40, alpha_r
#endif

        stxvd2x         vs8, 0, T1

        addi            CO, CO, 16              /* advance C by 2 doubles */

.endm

/*********************************************************************
* Macros for N=2, M=1                                                *
*********************************************************************/

/* Scalar variant: 1 double of A against 2 B values. */
.macro LOAD2x1_1

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO
        lxsdx           vs25, o8, BO

        addi            AO, AO, 8
        addi            BO, BO, 16

.endm

/* First unrolled iteration: init accumulators, prefetch next A/B. */
.macro KERNEL2x1_I1

        lxsdx           vs8, 0, AO

        lxsdx           vs28, 0, BO
        lxsdx           vs29, o8, BO

        addi            AO, AO, 8
        addi            BO, BO, 16


        xsmuldp         vs32, vs0, vs24

        xsmuldp         vs40, vs0, vs25

.endm

/* Even iteration: accumulate current regs, prefetch alternates. */
.macro KERNEL2x1_1

        lxsdx           vs8, 0, AO

        lxsdx           vs28, 0, BO
        lxsdx           vs29, o8, BO

        addi            AO, AO, 8
        addi            BO, BO, 16


        xsmaddadp       vs32, vs0, vs24

        xsmaddadp       vs40, vs0, vs25

.endm

/* Odd iteration: accumulate prefetched regs, refill current regs. */
.macro KERNEL2x1_2

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO
        lxsdx           vs25, o8, BO

        addi            AO, AO, 8
        addi            BO, BO, 16


        xsmaddadp       vs32, vs8, vs28

        xsmaddadp       vs40, vs8, vs29

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL2x1_E2


        xsmaddadp       vs32, vs8, vs28

        xsmaddadp       vs40, vs8, vs29

.endm

/* Single iteration, initializes accumulators (K-remainder start). */
.macro KERNEL2x1_SUBI1

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO
        lxsdx           vs25, o8, BO

        addi            AO, AO, 8
        addi            BO, BO, 16


        xsmuldp         vs32, vs0, vs24

        xsmuldp         vs40, vs0, vs25

.endm

/* Single iteration, accumulates into existing sums (K remainder). */
.macro KERNEL2x1_SUB1

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO
        lxsdx           vs25, o8, BO

        addi            AO, AO, 8
        addi            BO, BO, 16


        xsmaddadp       vs32, vs0, vs24

        xsmaddadp       vs40, vs0, vs25

.endm

/* Write back the 2x1 tile: one double per row, two rows. */
.macro SAVE2x1

        mr              T1, CO

#ifndef TRMMKERNEL
        lxsdx           vs0, 0, T1
#endif

#ifndef TRMMKERNEL
        xsmaddadp       vs0, vs32, alpha_r
#else
        xsmuldp         vs0, vs32, alpha_r
#endif

        stxsdx          vs0, 0, T1

        add             T1, T1, LDC

#ifndef TRMMKERNEL
        lxsdx           vs8, 0, T1
#endif

#ifndef TRMMKERNEL
        xsmaddadp       vs8, vs40, alpha_r
#else
        xsmuldp         vs8, vs40, alpha_r
#endif

        stxsdx          vs8, 0, T1

        addi            CO, CO, 8               /* advance C by 1 double */

.endm

/*********************************************************************
* Macros for N=1, M=16                                               *
*********************************************************************/

/* Preload: vs0-vs7 = 16 doubles of A, vs24 = the single splatted B. */
.macro LOAD1x16_1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8

        lxvd2x          vs4, 0, AO
        lxvd2x          vs5, o16, AO
        lxvd2x          vs6, o32, AO
        lxvd2x          vs7, o48, AO

        addi            AO, AO, 64

.endm

/* First unrolled iteration: init accumulators, prefetch next A/B. */
.macro KERNEL1x16_I1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO
        lxvd2x          vs10, o32, AO
        lxvd2x          vs11, o48, AO

        lxvdsx          vs28, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8

        lxvd2x          vs12, 0, AO
        lxvd2x          vs13, o16, AO
        lxvd2x          vs14, o32, AO
        lxvd2x          vs15, o48, AO

        addi            AO, AO, 64


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24
        xvmuldp         vs34, vs2, vs24
        xvmuldp         vs35, vs3, vs24
        xvmuldp         vs36, vs4, vs24
        xvmuldp         vs37, vs5, vs24
        xvmuldp         vs38, vs6, vs24
        xvmuldp         vs39, vs7, vs24

.endm

/* Even iteration: accumulate current regs, prefetch alternates. */
.macro KERNEL1x16_1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO
        lxvd2x          vs10, o32, AO
        lxvd2x          vs11, o48, AO

        lxvdsx          vs28, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8

        lxvd2x          vs12, 0, AO
        lxvd2x          vs13, o16, AO
        lxvd2x          vs14, o32, AO
        lxvd2x          vs15, o48, AO

        addi            AO, AO, 64


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24
        xvmaddadp       vs34, vs2, vs24
        xvmaddadp       vs35, vs3, vs24
        xvmaddadp       vs36, vs4, vs24
        xvmaddadp       vs37, vs5, vs24
        xvmaddadp       vs38, vs6, vs24
        xvmaddadp       vs39, vs7, vs24

.endm

/* Odd iteration: accumulate prefetched regs, refill current regs. */
.macro KERNEL1x16_2

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8

        lxvd2x          vs4, 0, AO
        lxvd2x          vs5, o16, AO
        lxvd2x          vs6, o32, AO
        lxvd2x          vs7, o48, AO

        addi            AO, AO, 64


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28
        xvmaddadp       vs34, vs10, vs28
        xvmaddadp       vs35, vs11, vs28
        xvmaddadp       vs36, vs12, vs28
        xvmaddadp       vs37, vs13, vs28
        xvmaddadp       vs38, vs14, vs28
        xvmaddadp       vs39, vs15, vs28

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL1x16_E2


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28
        xvmaddadp       vs34, vs10, vs28
        xvmaddadp       vs35, vs11, vs28
        xvmaddadp       vs36, vs12, vs28
        xvmaddadp       vs37, vs13, vs28
        xvmaddadp       vs38, vs14, vs28
        xvmaddadp       vs39, vs15, vs28

.endm

/* Single iteration, initializes accumulators (K-remainder start). */
.macro KERNEL1x16_SUBI1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8

        lxvd2x          vs4, 0, AO
        lxvd2x          vs5, o16, AO
        lxvd2x          vs6, o32, AO
        lxvd2x          vs7, o48, AO

        addi            AO, AO, 64


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24
        xvmuldp         vs34, vs2, vs24
        xvmuldp         vs35, vs3, vs24
        xvmuldp         vs36, vs4, vs24
        xvmuldp         vs37, vs5, vs24
        xvmuldp         vs38, vs6, vs24
        xvmuldp         vs39, vs7, vs24

.endm

/* Single iteration, accumulates into existing sums (K remainder). */
.macro KERNEL1x16_SUB1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8

        lxvd2x          vs4, 0, AO
        lxvd2x          vs5, o16, AO
        lxvd2x          vs6, o32, AO
        lxvd2x          vs7, o48, AO

        addi            AO, AO, 64


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24
        xvmaddadp       vs34, vs2, vs24
        xvmaddadp       vs35, vs3, vs24
        xvmaddadp       vs36, vs4, vs24
        xvmaddadp       vs37, vs5, vs24
        xvmaddadp       vs38, vs6, vs24
        xvmaddadp       vs39, vs7, vs24

.endm

/* Write back the 1x16 tile: one row of 16 doubles via T1/T2. */
.macro SAVE1x16

        mr              T1, CO
        addi            T2, T1, 64

#ifndef TRMMKERNEL
        lxvd2x          vs0, 0, T1
        lxvd2x          vs1, o16, T1
        lxvd2x          vs2, o32, T1
        lxvd2x          vs3, o48, T1

        lxvd2x          vs4, 0, T2
        lxvd2x          vs5, o16, T2
        lxvd2x          vs6, o32, T2
        lxvd2x          vs7, o48, T2
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs0, vs32, alpha_r
        xvmaddadp       vs1, vs33, alpha_r
        xvmaddadp       vs2, vs34, alpha_r
        xvmaddadp       vs3, vs35, alpha_r
        xvmaddadp       vs4, vs36, alpha_r
        xvmaddadp       vs5, vs37, alpha_r
        xvmaddadp       vs6, vs38, alpha_r
        xvmaddadp       vs7, vs39, alpha_r
#else
        xvmuldp         vs0, vs32, alpha_r
        xvmuldp         vs1, vs33, alpha_r
        xvmuldp         vs2, vs34, alpha_r
        xvmuldp         vs3, vs35, alpha_r
        xvmuldp         vs4, vs36, alpha_r
        xvmuldp         vs5, vs37, alpha_r
        xvmuldp         vs6, vs38, alpha_r
        xvmuldp         vs7, vs39, alpha_r
#endif

        stxvd2x         vs0, 0, T1
        stxvd2x         vs1, o16, T1
        stxvd2x         vs2, o32, T1
        stxvd2x         vs3, o48, T1

        stxvd2x         vs4, 0, T2
        stxvd2x         vs5, o16, T2
        stxvd2x         vs6, o32, T2
        stxvd2x         vs7, o48, T2

        addi            CO, CO, 128             /* advance C by 16 doubles */

.endm

/*********************************************************************
* Macros for N=1, M=8                                                *
* (header previously mislabeled "N=4, M=8"; macros below are 1x8)    *
*********************************************************************/

/* Preload: vs0-vs3 = 8 doubles of A, vs24 = the single splatted B. */
.macro LOAD1x8_1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8

.endm

/* First unrolled iteration: init accumulators, prefetch next A/B. */
.macro KERNEL1x8_I1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO
        lxvd2x          vs10, o32, AO
        lxvd2x          vs11, o48, AO

        lxvdsx          vs28, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24
        xvmuldp         vs34, vs2, vs24
        xvmuldp         vs35, vs3, vs24

.endm

/* Even iteration: accumulate current regs, prefetch alternates. */
.macro KERNEL1x8_1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO
        lxvd2x          vs10, o32, AO
        lxvd2x          vs11, o48, AO

        lxvdsx          vs28, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24
        xvmaddadp       vs34, vs2, vs24
        xvmaddadp       vs35, vs3, vs24

.endm

/* Odd iteration: accumulate prefetched regs, refill current regs. */
.macro KERNEL1x8_2

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28
        xvmaddadp       vs34, vs10, vs28
        xvmaddadp       vs35, vs11, vs28

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL1x8_E2


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28
        xvmaddadp       vs34, vs10, vs28
        xvmaddadp       vs35, vs11, vs28

.endm

/* Single iteration, initializes accumulators (K-remainder start). */
.macro KERNEL1x8_SUBI1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24
        xvmuldp         vs34, vs2, vs24
        xvmuldp         vs35, vs3, vs24

.endm

/* Single iteration, accumulates into existing sums (K remainder). */
.macro KERNEL1x8_SUB1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO
        lxvd2x          vs2, o32, AO
        lxvd2x          vs3, o48, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 64
        addi            BO, BO, 8


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24
        xvmaddadp       vs34, vs2, vs24
        xvmaddadp       vs35, vs3, vs24

.endm

/* Write back the 1x8 tile: one row of 8 doubles. */
.macro SAVE1x8

        mr              T1, CO

#ifndef TRMMKERNEL
        lxvd2x          vs0, 0, T1
        lxvd2x          vs1, o16, T1
        lxvd2x          vs2, o32, T1
        lxvd2x          vs3, o48, T1
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs0, vs32, alpha_r
        xvmaddadp       vs1, vs33, alpha_r
        xvmaddadp       vs2, vs34, alpha_r
        xvmaddadp       vs3, vs35, alpha_r
#else
        xvmuldp         vs0, vs32, alpha_r
        xvmuldp         vs1, vs33, alpha_r
        xvmuldp         vs2, vs34, alpha_r
        xvmuldp         vs3, vs35, alpha_r
#endif

        stxvd2x         vs0, 0, T1
        stxvd2x         vs1, o16, T1
        stxvd2x         vs2, o32, T1
        stxvd2x         vs3, o48, T1

        addi            CO, CO, 64              /* advance C by 8 doubles */

.endm

/*********************************************************************
* Macros for N=1, M=4                                                *
*********************************************************************/

/* Preload: vs0/vs1 = 4 doubles of A, vs24 = the single splatted B. */
.macro LOAD1x4_1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 32
        addi            BO, BO, 8

.endm

/* First unrolled iteration: init accumulators, prefetch next A/B. */
.macro KERNEL1x4_I1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO

        lxvdsx          vs28, 0, BO

        addi            AO, AO, 32
        addi            BO, BO, 8


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24

.endm

/* Even iteration: accumulate current regs, prefetch alternates. */
.macro KERNEL1x4_1

        lxvd2x          vs8, 0, AO
        lxvd2x          vs9, o16, AO

        lxvdsx          vs28, 0, BO

        addi            AO, AO, 32
        addi            BO, BO, 8


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24

.endm

/* Odd iteration: accumulate prefetched regs, refill current regs. */
.macro KERNEL1x4_2

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 32
        addi            BO, BO, 8


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL1x4_E2


        xvmaddadp       vs32, vs8, vs28
        xvmaddadp       vs33, vs9, vs28

.endm

/* Single iteration, initializes accumulators (K-remainder start). */
.macro KERNEL1x4_SUBI1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 32
        addi            BO, BO, 8


        xvmuldp         vs32, vs0, vs24
        xvmuldp         vs33, vs1, vs24

.endm

/* Single iteration, accumulates into existing sums (K remainder). */
.macro KERNEL1x4_SUB1

        lxvd2x          vs0, 0, AO
        lxvd2x          vs1, o16, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 32
        addi            BO, BO, 8


        xvmaddadp       vs32, vs0, vs24
        xvmaddadp       vs33, vs1, vs24

.endm

/* Write back the 1x4 tile: one row of 4 doubles. */
.macro SAVE1x4

        mr              T1, CO

#ifndef TRMMKERNEL
        lxvd2x          vs0, 0, T1
        lxvd2x          vs1, o16, T1
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs0, vs32, alpha_r
        xvmaddadp       vs1, vs33, alpha_r
#else
        xvmuldp         vs0, vs32, alpha_r
        xvmuldp         vs1, vs33, alpha_r
#endif

        stxvd2x         vs0, 0, T1
        stxvd2x         vs1, o16, T1

        addi            CO, CO, 32              /* advance C by 4 doubles */

.endm

/*********************************************************************
* Macros for N=1, M=2                                                *
*********************************************************************/

/* Preload: vs0 = 2 doubles of A, vs24 = the single splatted B. */
.macro LOAD1x2_1

        lxvd2x          vs0, 0, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 16
        addi            BO, BO, 8

.endm

/* First unrolled iteration: init accumulator, prefetch next A/B. */
.macro KERNEL1x2_I1

        lxvd2x          vs8, 0, AO

        lxvdsx          vs28, 0, BO

        addi            AO, AO, 16
        addi            BO, BO, 8


        xvmuldp         vs32, vs0, vs24

.endm

/* Even iteration: accumulate current regs, prefetch alternates. */
.macro KERNEL1x2_1

        lxvd2x          vs8, 0, AO

        lxvdsx          vs28, 0, BO

        addi            AO, AO, 16
        addi            BO, BO, 8


        xvmaddadp       vs32, vs0, vs24

.endm

/* Odd iteration: accumulate prefetched regs, refill current regs. */
.macro KERNEL1x2_2

        lxvd2x          vs0, 0, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 16
        addi            BO, BO, 8


        xvmaddadp       vs32, vs8, vs28

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL1x2_E2


        xvmaddadp       vs32, vs8, vs28

.endm

/* Single iteration, initializes accumulator (K-remainder start). */
.macro KERNEL1x2_SUBI1

        lxvd2x          vs0, 0, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 16
        addi            BO, BO, 8


        xvmuldp         vs32, vs0, vs24

.endm

/* Single iteration, accumulates into existing sum (K remainder). */
.macro KERNEL1x2_SUB1

        lxvd2x          vs0, 0, AO

        lxvdsx          vs24, 0, BO

        addi            AO, AO, 16
        addi            BO, BO, 8


        xvmaddadp       vs32, vs0, vs24

.endm

/* Write back the 1x2 tile: one row of 2 doubles. */
.macro SAVE1x2

        mr              T1, CO

#ifndef TRMMKERNEL
        lxvd2x          vs0, 0, T1
#endif

#ifndef TRMMKERNEL
        xvmaddadp       vs0, vs32, alpha_r
#else
        xvmuldp         vs0, vs32, alpha_r
#endif

        stxvd2x         vs0, 0, T1

        addi            CO, CO, 16              /* advance C by 2 doubles */

.endm

/*********************************************************************
* Macros for N=1, M=1                                                *
*********************************************************************/

/* Scalar 1x1 variant: single A and single B element per iteration. */
.macro LOAD1x1_1

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO

        addi            AO, AO, 8
        addi            BO, BO, 8

.endm

/* First unrolled iteration: init accumulator, prefetch next A/B. */
.macro KERNEL1x1_I1

        lxsdx           vs8, 0, AO

        lxsdx           vs28, 0, BO

        addi            AO, AO, 8
        addi            BO, BO, 8


        xsmuldp         vs32, vs0, vs24

.endm

/* Even iteration: accumulate current regs, prefetch alternates. */
.macro KERNEL1x1_1

        lxsdx           vs8, 0, AO

        lxsdx           vs28, 0, BO

        addi            AO, AO, 8
        addi            BO, BO, 8


        xsmaddadp       vs32, vs0, vs24

.endm

/* Odd iteration: accumulate prefetched regs, refill current regs. */
.macro KERNEL1x1_2

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO

        addi            AO, AO, 8
        addi            BO, BO, 8


        xsmaddadp       vs32, vs8, vs28

.endm

/* Loop epilogue: drain prefetched registers, no loads. */
.macro KERNEL1x1_E2


        xsmaddadp       vs32, vs8, vs28

.endm

/* Single iteration, initializes accumulator (K-remainder start).
   NOTE(review): this macro continues past the end of this chunk. */
.macro KERNEL1x1_SUBI1

        lxsdx           vs0, 0, AO

        lxsdx           vs24, 0, BO

        addi            AO, AO, 8
        addi            BO, BO,
8 3472 3473 3474 xsmuldp vs32, vs0, vs24 3475 3476.endm 3477 3478.macro KERNEL1x1_SUB1 3479 3480 lxsdx vs0, 0, AO 3481 3482 lxsdx vs24, 0, BO 3483 3484 addi AO, AO, 8 3485 addi BO, BO, 8 3486 3487 3488 xsmaddadp vs32, vs0, vs24 3489 3490.endm 3491 3492.macro SAVE1x1 3493 3494 mr T1, CO 3495 3496#ifndef TRMMKERNEL 3497 lxsdx vs0, 0, T1 3498#endif 3499 3500#ifndef TRMMKERNEL 3501 xsmaddadp vs0, vs32, alpha_r 3502#else 3503 xsmuldp vs0, vs32, alpha_r 3504#endif 3505 3506 stxsdx vs0, 0, T1 3507 3508 addi CO, CO, 8 3509 3510.endm 3511 3512 3513 3514 3515/****************************TRMM POINTER REFRESH MACROSES*************************/ 3516 3517.macro SHIFT_REG REG1,REG2,SHIFT_VAL 3518 .if \SHIFT_VAL==16 3519 slwi \REG1, \REG2, 7 3520 .elseif \SHIFT_VAL==8 3521 slwi \REG1, \REG2, 6 3522 .elseif \SHIFT_VAL==4 3523 slwi \REG1, \REG2, 5 3524 .elseif \SHIFT_VAL==2 3525 slwi \REG1, \REG2, 4 3526 .elseif \SHIFT_VAL==1 3527 slwi \REG1, \REG2, 3 3528 .endif 3529.endm 3530 3531/* 3532//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 3533// ptrbb = bb; 3534// #else 3535// ptrba += off*16; 3536// ptrbb = bb + off*2; 3537// #endif 3538*/ 3539.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B 3540 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 3541 /* ptrbb = bb;*/ 3542 mr \PTR_B,\B_VAL /* refresh BPOINT */ 3543 3544 #else 3545 /* 3546 // ptrba =ptrba+ off*C_A; 3547 // ptrbb = bb + off*C_B; 3548 */ 3549 SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ 3550 SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ 3551 add \PTR_B, \B_VAL , T4 /* Add values to BO */ 3552 add \PTR_A, \PTR_A, T2 /* Add values to AO */ 3553 #endif 3554.endm 3555 3556 3557/* 3558// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3559// temp = bk-off; 3560// #elif defined(LEFT) 3561// temp = off+16; // number of values in A 3562// #else 3563// temp = off+2; // number of values in B 
3564// #endif 3565*/ 3566.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B 3567 #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3568 /* temp = bk-off;*/ 3569 sub \TEMP_BK,\BK_VAL,\OFF_VAL 3570 3571 #elif defined(LEFT) 3572 /* temp = off+INCR_A; // number of values in A */ 3573 addi \TEMP_BK, \OFF_VAL, \INCR_A 3574 #else 3575 /* temp = off+INCR_B // number of values in B*/ 3576 addi \TEMP_BK,\OFF_VAL, \INCR_B 3577 #endif 3578 3579.endm 3580/* 3581// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 3582// temp = bk - off; 3583// #ifdef LEFT 3584// temp -= 16; // number of values in A 3585// #else 3586// temp -= 2; // number of values in B 3587// #endif 3588// ptrba += temp*16; 3589// ptrbb += temp*2; 3590// #endif 3591 3592// #ifdef LEFT 3593// off += 16; // number of values in A 3594// #endif 3595*/ 3596 3597 3598.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B 3599 3600 #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) 3601 /*temp = bk - off;*/ 3602 sub \TEMP_BK,\BK_VAL,\OFF_VAL 3603 #ifdef LEFT 3604 /*temp -= 8; // number of values in A*/ 3605 addi \TEMP_BK,\TEMP_BK,-\C_A 3606 #else 3607 /*temp -= 4; // number of values in B*/ 3608 addi \TEMP_BK,\TEMP_BK,-\C_B 3609 #endif 3610 /*ptrba += temp*C_A; 3611 ptrbb += temp*C_B;*/ 3612 SHIFT_REG T4,\TEMP_BK,\C_A 3613 SHIFT_REG T2,\TEMP_BK,\C_B 3614 add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ 3615 add \PTR_B, \PTR_B,T2 3616 3617 #endif 3618 3619 #ifdef LEFT 3620 /*off += 8; // number of values in A*/ 3621 addi \OFF_VAL,\OFF_VAL,\C_A 3622 #endif 3623.endm