/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * SPARC transposed-GEMV kernel:  y += alpha * A^T * x
 *
 * Each element of y accumulates the dot product of one column of A
 * with the vector x, scaled by alpha (see the FMUL/FADD pipelines at
 * .LL111 and .LL311 below).  The row dimension is processed in panels
 * of P elements; when x is not contiguous (INCX != SIZE) the current
 * panel of x is first packed into BUFFER so the inner loops can use
 * unit-stride loads.
 *
 * SPARC branch semantics: every branch below has a delay slot -- the
 * instruction textually following a branch executes whether or not
 * the branch is taken.  Several delay slots carry real work (loop
 * pointer bumps, the FCLR that re-zeroes FZERO, ...), so DO NOT
 * reorder instructions around branches.
 */

#define ASSEMBLER
#include "common.h"

/* Panel (blocking) size: rows of A / elements of x handled per outer
   iteration of .LL10. */
#define P	1020

/* ---- integer register roles (incoming args arrive in %i0..%i5) ---- */
#define M	%i0
#define N	%i1

/* In the 32-bit double-precision build, alpha occupies two argument
   slots (%i3/%i4 -- see the two "st" stores below), which shifts the
   remaining pointer/stride arguments. */
#if defined(DOUBLE) && !defined(__64BIT__)
#define A	%i5
#define LDA	%i2
#define X	%i3
#define INCX	%i4
#else
#define A	%i4
#define LDA	%i5
#define X	%i2
#define INCX	%i3
#endif

/* Stack-passed arguments, loaded into locals in the prologue. */
#define Y	%l0
#define INCY	%l1
#define BUFFER	%l2		/* scratch area used to pack x when INCX != SIZE */

#define I	%l3		/* innermost loop counter */
#define IS	%l4		/* current panel start row (multiple of P) */
#define J	%l5		/* column-pair counter */
#define MIN_M	%l6		/* rows in the current panel: min(M - IS, P) */
#define XP	%l7		/* panel source for x: X + IS*SIZE, or BUFFER */

#define A1	%o0		/* column pointer 1 */
#define A2	%o1		/* column pointer 2 */
#define A3	%o2
#define A4	%o3
#define X1	%o4		/* cursor into the (packed) x panel */
#define Y1	%o5		/* load cursor into y */
#define PNLDA	%g1		/* P*SIZE - N*LDA: rewinds A to next panel */
#define Y2	%o7		/* store cursor into y.  Danger? -- %o7 is the
				   call return-address register; safe here only
				   because this routine makes no calls and
				   returns through the register window (%i7). */

/* ---- floating-point register roles ----
   t1..t4 = multiply pipelines, c1..c4 = accumulators,
   a1..a16 = matrix elements, b1..b8 = x elements.
   NOTE: FZERO aliases b7 and ALPHA aliases b8, so both are clobbered
   by the unrolled loops; ALPHA is reloaded from STACK_ALPHA at .LL119
   and FZERO is re-cleared via FCLR before reuse. */
#ifdef DOUBLE
#define t1	%f0
#define t2	%f2
#define t3	%f4
#define t4	%f6

#define c1	%f8
#define c2	%f10
#define c3	%f12
#define c4	%f14

#define a1	%f16
#define a2	%f18
#define a3	%f20
#define a4	%f22
#define a5	%f24
#define a6	%f26
#define a7	%f28
#define a8	%f30

#define a9	%f32
#define a10	%f34
#define a11	%f36
#define a12	%f38
#define a13	%f40
#define a14	%f42
#define a15	%f44
#define a16	%f46

#define b1	%f48
#define b2	%f50
#define b3	%f52
#define b4	%f54
#define b5	%f56
#define b6	%f58
#define b7	%f60
#define b8	%f62

#define FZERO	%f60		/* aliases b7 (see note above) */
#define ALPHA	%f62		/* aliases b8 (see note above) */

#else
#define t1	%f0
#define t2	%f1
#define t3	%f2
#define t4	%f3

#define c1	%f4
#define c2	%f5
#define c3	%f6
#define c4	%f7

#define a1	%f8
#define a2	%f9
#define a3	%f10
#define a4	%f11
#define a5	%f12
#define a6	%f13
#define a7	%f14
#define a8	%f15

#define a9	%f16
#define a10	%f17
#define a11	%f18
#define a12	%f19
#define a13	%f20
#define a14	%f21
#define a15	%f22
#define a16	%f23

#define b1	%f24
#define b2	%f25
#define b3	%f26
#define b4	%f27
#define b5	%f28
#define b6	%f29
#define b7	%f30
#define b8	%f31

#define FZERO	%f30		/* aliases b7 (see note above) */
#define ALPHA	%f31		/* aliases b8 (see note above) */
#endif

/* Stack slots used to spill/reload alpha (ALPHA's register is clobbered
   inside the unrolled loops). */
#ifndef __64BIT__
#define STACK_FZERO	[%sp + STACK_START + 8]
#define STACK_ALPHA	[%sp + STACK_START + 16]
#else
#define STACK_FZERO	[%sp + STACK_START + 32]
#define STACK_ALPHA	[%sp + STACK_START + 40]
#endif

/* Software-prefetch distance (in elements) ahead of A1/A2. */
#ifdef DOUBLE
#define PREFETCHSIZE  36
#else
#define PREFETCHSIZE  72
#endif

	PROLOGUE
	SAVESP
	nop

/* ---- prologue: fetch stack-passed arguments per ABI variant ---- */
#ifndef __64BIT__

#ifdef DOUBLE
	/* 32-bit + double: alpha arrives in %i3/%i4; spill it so it can
	   be reloaded as one FP double via LDF below. */
	st	%i3, [%sp + STACK_START + 16]	/* ALPHA */
	st	%i4, [%sp + STACK_START + 20]

	ld	[%sp + STACK_START + 28], LDA
	ld	[%sp + STACK_START + 32], X
	ld	[%sp + STACK_START + 36], INCX
	ld	[%sp + STACK_START + 40], Y
	ld	[%sp + STACK_START + 44], INCY
	ld	[%sp + STACK_START + 48], BUFFER
#else
	st	%i3, [%sp + STACK_START + 16]	/* ALPHA */

	ld	[%sp + STACK_START + 28], X
	ld	[%sp + STACK_START + 32], INCX
	ld	[%sp + STACK_START + 36], Y
	ld	[%sp + STACK_START + 40], INCY
	ld	[%sp + STACK_START + 44], BUFFER
#endif
	LDF	[%sp + STACK_START + 16], ALPHA
#else
	/* 64-bit ABI: pointer/stride args beyond the register set come
	   from the stack; alpha arrives in an FP argument register. */
	ldx	[%sp+ STACK_START + 56], X
	ldx	[%sp+ STACK_START + 64], INCX
	ldx	[%sp+ STACK_START + 72], Y
	ldx	[%sp+ STACK_START + 80], INCY
	ldx	[%sp+ STACK_START + 88], BUFFER
#ifdef DOUBLE
	FMOV	%f6, ALPHA
	STF	%f6, STACK_ALPHA	/* keep a reloadable copy of alpha */
#else
	FMOV	%f7, ALPHA
	STF	%f7, STACK_ALPHA	/* keep a reloadable copy of alpha */
#endif
#endif

	/* Presumably clears FZERO (macro from common.h) -- TODO confirm
	   the register encoding. */
#ifdef DOUBLE
	FCLR(29)
#else
	FCLR(30)
#endif

	clr	IS			/* IS = 0: first panel */
	mov	P, I
	sll	LDA, BASE_SHIFT, LDA	/* strides from elements to bytes */
	sll	I, BASE_SHIFT, I	/* I = P * SIZE */
	smul	LDA, N, PNLDA
	sll	INCX, BASE_SHIFT, INCX
	sll	INCY, BASE_SHIFT, INCY
	sub	I, PNLDA, PNLDA		/* PNLDA = P*SIZE - N*LDA: after the
					   column sweep advances A by N*LDA,
					   adding PNLDA lands on next panel */

/* ================= outer loop over row panels of size P ============ */
.LL10:
	sll	IS, BASE_SHIFT, I
	sub	M, IS, MIN_M		/* rows remaining */
	cmp	MIN_M, P
	nop
	movg	%icc, P, MIN_M		/* MIN_M = min(M - IS, P) */
	nop
	cmp	INCX, SIZE		/* contiguous x? then use it in place */
	beq	.LL100
	add	X, I, XP		/* (delay slot) XP = X + IS*SIZE */

/* ---- pack the current x panel into BUFFER (INCX != SIZE) ---- */
	sra	MIN_M, 2, I		/* 4 elements per copy iteration */
	mov	BUFFER, XP
	cmp	I, 0
	ble,pn	%icc, .LL15
	mov	BUFFER, Y1		/* (delay slot) Y1 = pack cursor */

.LL11:
	LDF	[X], a1
	add	X, INCX, X
	LDF	[X], a2
	add	X, INCX, X
	LDF	[X], a3
	add	X, INCX, X
	LDF	[X], a4
	add	X, INCX, X

	STF	a1, [Y1 + 0 * SIZE]
	add	I, -1, I
	STF	a2, [Y1 + 1 * SIZE]
	cmp	I, 0
	STF	a3, [Y1 + 2 * SIZE]
	STF	a4, [Y1 + 3 * SIZE]
	bg,pn	%icc, .LL11
	add	Y1, 4 * SIZE, Y1	/* (delay slot) */

.LL15:
	and	MIN_M, 3, I		/* remaining 0..3 elements */
	cmp	I, 0
	ble,pn	%icc, .LL100
	nop

.LL16:
	LDF	[X], a1
	add	X, INCX, X
	add	I, -1, I
	cmp	I, 0
	nop
	STF	a1, [Y1]
	bg,pn	%icc, .LL16
	add	Y1, 1 * SIZE, Y1	/* (delay slot) */

/* ========== sweep columns two at a time: y[j], y[j+1] ============= */
.LL100:
	sra	N, 1, J			/* J = N / 2 column pairs */
	cmp	J, 0
	ble	%icc, .LL200
	mov	Y, Y1			/* (delay slot) Y1 = read cursor */

.LL110:
	/* re-zero FZERO: it aliases b7 and is clobbered by the loop */
#ifdef DOUBLE
	FCLR(29)
#else
	FCLR(30)
#endif

	FMOV	FZERO, c1		/* clear 4 accumulators ... */
	FMOV	FZERO, c2
	FMOV	FZERO, c3
	FMOV	FZERO, c4

	FMOV	FZERO, t1		/* ... and 4 multiply pipelines */
	FMOV	FZERO, t2
	FMOV	FZERO, t3
	FMOV	FZERO, t4

	mov	A, A1			/* A1/A2 = this pair of columns */
	add	A, LDA, A2
	add	A2, LDA, A		/* advance A past the pair */

	mov	XP, X1

	sra	MIN_M, 3, I		/* 8-row unrolled iterations */
	cmp	I, 0
	ble	%icc, .LL115
	prefetch [Y1 + 2 * SIZE], 0	/* (delay slot) prefetch y for read */

	/* software pipeline preload: 8 elements of each column + 7 of x */
	LDF	[A1 + 0 * SIZE], a1
	deccc	I
	LDF	[A1 + 1 * SIZE], a2
	LDF	[A1 + 2 * SIZE], a3
	LDF	[A1 + 3 * SIZE], a4
	LDF	[A1 + 4 * SIZE], a5
	LDF	[A1 + 5 * SIZE], a6
	LDF	[A1 + 6 * SIZE], a7
	LDF	[A1 + 7 * SIZE], a8

	LDF	[A2 + 0 * SIZE], a9
	LDF	[A2 + 1 * SIZE], a10
	LDF	[A2 + 2 * SIZE], a11
	LDF	[A2 + 3 * SIZE], a12
	LDF	[A2 + 4 * SIZE], a13
	LDF	[A2 + 5 * SIZE], a14
	LDF	[A2 + 6 * SIZE], a15
	LDF	[A2 + 7 * SIZE], a16

	LDF	[X1 + 0 * SIZE], b1
	LDF	[X1 + 1 * SIZE], b2
	LDF	[X1 + 2 * SIZE], b3
	LDF	[X1 + 3 * SIZE], b4
	LDF	[X1 + 4 * SIZE], b5
	LDF	[X1 + 5 * SIZE], b6

	ble	%icc, .LL112		/* only one unrolled group: drain */
	LDF	[X1 + 6 * SIZE], b7	/* (delay slot) */

/* main unrolled loop: interleaves FADD (retire), FMUL (issue) and the
   loads for the NEXT group; t1..t4 pipeline two columns x 2 rows. */
.LL111:
	FADD	c1, t1, c1
	prefetch [A1 + PREFETCHSIZE * SIZE], 1
	FMUL	a1, b1, t1
	LDF	[A1 + 8 * SIZE], a1

	FADD	c2, t2, c2
	LDF	[X1 + 7 * SIZE], b8
	FMUL	a9, b1, t2
	LDF	[A2 + 8 * SIZE], a9

	FADD	c3, t3, c3
	LDF	[X1 + 8 * SIZE], b1
	FMUL	a2, b2, t3
	LDF	[A1 + 9 * SIZE], a2

	FADD	c4, t4, c4
	deccc	I
	FMUL	a10, b2, t4
	LDF	[A2 + 9 * SIZE], a10

	FADD	c1, t1, c1
	LDF	[X1 + 9 * SIZE], b2
	FMUL	a3, b3, t1
	LDF	[A1 + 10 * SIZE], a3

	FADD	c2, t2, c2
	nop
	FMUL	a11, b3, t2
	LDF	[A2 + 10 * SIZE], a11

	FADD	c3, t3, c3
	LDF	[X1 + 10 * SIZE], b3
	FMUL	a4, b4, t3
	LDF	[A1 + 11 * SIZE], a4

	FADD	c4, t4, c4
	nop
	FMUL	a12, b4, t4
	LDF	[A2 + 11 * SIZE], a12

	FADD	c1, t1, c1
	LDF	[X1 + 11 * SIZE], b4
	FMUL	a5, b5, t1
	LDF	[A1 + 12 * SIZE], a5

	FADD	c2, t2, c2
	prefetch [A2 + (PREFETCHSIZE + 4) * SIZE], 1
	FMUL	a13, b5, t2
	LDF	[A2 + 12 * SIZE], a13

	FADD	c3, t3, c3
	LDF	[X1 + 12 * SIZE], b5
	FMUL	a6, b6, t3
	LDF	[A1 + 13 * SIZE], a6

	FADD	c4, t4, c4
	FMUL	a14, b6, t4
	LDF	[A2 + 13 * SIZE], a14

	FADD	c1, t1, c1
	LDF	[X1 + 13 * SIZE], b6
	FMUL	a7, b7, t1
	LDF	[A1 + 14 * SIZE], a7

	FADD	c2, t2, c2
	add	X1, 8 * SIZE, X1	/* bump X1 mid-group; offsets below
					   are relative to the new X1 */
	FMUL	a15, b7, t2
	LDF	[A2 + 14 * SIZE], a15

	FADD	c3, t3, c3
	LDF	[X1 + 6 * SIZE], b7	/* = old X1 + 14 */
	FMUL	a8, b8, t3
	LDF	[A1 + 15 * SIZE], a8

	FADD	c4, t4, c4
	add	A1, 8 * SIZE, A1
	FMUL	a16, b8, t4
	LDF	[A2 + 15 * SIZE], a16

	bg,pn	%icc, .LL111
	add	A2, 8 * SIZE, A2	/* (delay slot) */

/* drain: finish the last preloaded group (no further loads of A). */
.LL112:
	FADD	c1, t1, c1
	LDF	[X1 + 7 * SIZE], b8
	FMUL	a1, b1, t1
	add	A1, 8 * SIZE, A1

	FADD	c2, t2, c2
	add	A2, 8 * SIZE, A2
	FMUL	a9, b1, t2
	add	X1, 8 * SIZE, X1

	FADD	c3, t3, c3
	FMUL	a2, b2, t3
	FADD	c4, t4, c4
	FMUL	a10, b2, t4

	FADD	c1, t1, c1
	FMUL	a3, b3, t1
	FADD	c2, t2, c2
	FMUL	a11, b3, t2

	FADD	c3, t3, c3
	FMUL	a4, b4, t3
	FADD	c4, t4, c4
	FMUL	a12, b4, t4

	FADD	c1, t1, c1
	FMUL	a5, b5, t1
	FADD	c2, t2, c2
	FMUL	a13, b5, t2

	FADD	c3, t3, c3
	FMUL	a6, b6, t3
	FADD	c4, t4, c4
	FMUL	a14, b6, t4

	FADD	c1, t1, c1
	FMUL	a7, b7, t1
	FADD	c2, t2, c2
	FMUL	a15, b7, t2

	FADD	c3, t3, c3
	FMUL	a8, b8, t3
	FADD	c4, t4, c4
	FMUL	a16, b8, t4

/* remainder rows (MIN_M % 8) for the column pair, one row at a time */
.LL115:
	andcc	MIN_M, 7, I
	ble	%icc, .LL119
	mov	Y1, Y2			/* (delay slot) Y2 = store cursor */

	LDF	[X1 + 0 * SIZE], b1
	deccc	I
	LDF	[A1 + 0 * SIZE], a1
	ble	%icc, .LL117
	LDF	[A2 + 0 * SIZE], a2	/* (delay slot) */

.LL116:
	FADD	c1, t1, c1
	add	X1, 1 * SIZE, X1
	FMUL	a1, b1, t1
	LDF	[A1 + 1 * SIZE], a1

	FADD	c2, t2, c2
	add	A1, 1 * SIZE, A1
	FMUL	a2, b1, t2
	LDF	[X1 + 0 * SIZE], b1

	add	A2, 1 * SIZE, A2
	deccc	I
	bg,pn	%icc, .LL116
	LDF	[A2 + 0 * SIZE], a2	/* (delay slot) */

.LL117:
	FADD	c1, t1, c1
	add	X1, 1 * SIZE, X1
	FADD	c2, t2, c2
	add	A1, 1 * SIZE, A1

	FMUL	a1, b1, t1		/* last row's products, retired below */
	add	A2, 1 * SIZE, A2
	FMUL	a2, b1, t2
	nop

/* reduce the 4 accumulators/pipelines and update y[j], y[j+1] */
.LL119:
	FADD	c1, t1, c1
	FADD	c2, t2, c2
	FADD	c3, t3, c3
	FADD	c4, t4, c4

	FADD	c1, c3, c1		/* c1 = column-1 dot product */
	FADD	c2, c4, c2		/* c2 = column-2 dot product */


	LDF	[Y1], a1
	LDF	[Y1 + INCY], a2

	add	Y1, INCY, Y1
	add	Y1, INCY, Y1

	LDF	STACK_ALPHA, ALPHA	/* reload: ALPHA aliases b8,
					   clobbered by the loop above */

	FMUL	ALPHA, c1, c1
	FMUL	ALPHA, c2, c2
	FADD	a1, c1, a1		/* y[j]   += alpha * dot1 */
	FADD	a2, c2, a2		/* y[j+1] += alpha * dot2 */

	STF	a1, [Y2]
	STF	a2, [Y2 + INCY]

	deccc	J
	bg	%icc, .LL110
	/* (delay slot, every iteration AND fall-through) re-zero FZERO */
#ifdef DOUBLE
	FCLR(29)
#else
	FCLR(30)
#endif

/* ================= odd final column, if N is odd ================== */
.LL200:
	andcc	N, 1, J
	nop
	ble	%icc, .LL400
	FMOV	FZERO, c1		/* (delay slot) */

.LL310:
	FMOV	FZERO, t1
	sra	MIN_M, 3, I		/* 8-row unrolled iterations */
	FMOV	FZERO, c2
	mov	A, A1
	FMOV	FZERO, t2
	add	A, LDA, A		/* advance A past this column */
	FMOV	FZERO, t3
	cmp	I, 0
	FMOV	FZERO, t4
	ble	%icc, .LL315
	mov	XP, X1			/* (delay slot) */

	/* software pipeline preload: 8 of A's column, 8 of x */
	LDF	[A1 + 0 * SIZE], a1
	LDF	[A1 + 1 * SIZE], a2
	LDF	[A1 + 2 * SIZE], a3
	LDF	[A1 + 3 * SIZE], a4
	LDF	[A1 + 4 * SIZE], a5
	LDF	[A1 + 5 * SIZE], a6
	LDF	[A1 + 6 * SIZE], a7
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1

	LDF	[X1 + 0 * SIZE], a9
	add	I, -1, I
	LDF	[X1 + 1 * SIZE], a10
	cmp	I, 0
	LDF	[X1 + 2 * SIZE], a11
	LDF	[X1 + 3 * SIZE], a12
	LDF	[X1 + 4 * SIZE], a13
	LDF	[X1 + 5 * SIZE], a14
	LDF	[X1 + 6 * SIZE], a15
	LDF	[X1 + 7 * SIZE], a16
	ble	%icc, .LL312
	add	X1, 8 * SIZE, X1	/* (delay slot) */

/* single-column main loop: two accumulators c1/c2, four pipelines */
.LL311:
	prefetch [A1 + PREFETCHSIZE * SIZE], 1

	FADD	c1, t1, c1
	FMUL	a1, a9, t1
	LDF	[A1 + 0 * SIZE], a1
	LDF	[X1 + 0 * SIZE], a9

	FADD	c2, t2, c2
	FMUL	a2, a10, t2
	LDF	[A1 + 1 * SIZE], a2
	LDF	[X1 + 1 * SIZE], a10

	FADD	c1, t3, c1
	add	I, -1, I
	FMUL	a3, a11, t3
	LDF	[A1 + 2 * SIZE], a3
	LDF	[X1 + 2 * SIZE], a11

	FADD	c2, t4, c2
	cmp	I, 0
	FMUL	a4, a12, t4
	LDF	[A1 + 3 * SIZE], a4
	LDF	[X1 + 3 * SIZE], a12

	FADD	c1, t1, c1
	nop
	FMUL	a5, a13, t1
	LDF	[A1 + 4 * SIZE], a5
	LDF	[X1 + 4 * SIZE], a13

	FADD	c2, t2, c2
	nop
	FMUL	a6, a14, t2
	LDF	[A1 + 5 * SIZE], a6
	LDF	[X1 + 5 * SIZE], a14

	FADD	c1, t3, c1
	FMUL	a7, a15, t3
	LDF	[A1 + 6 * SIZE], a7
	LDF	[X1 + 6 * SIZE], a15

	FADD	c2, t4, c2
	add	X1, 8 * SIZE, X1	/* bumped before the last x load */
	FMUL	a8, a16, t4
	LDF	[A1 + 7 * SIZE], a8
	add	A1, 8 * SIZE, A1
	bg,pn	%icc, .LL311
	LDF	[X1 - 1 * SIZE], a16	/* (delay slot) = old X1 + 7 */

/* drain the last preloaded group */
.LL312:
	FADD	c1, t1, c1
	FMUL	a1, a9, t1
	FADD	c2, t2, c2
	FMUL	a2, a10, t2
	FADD	c1, t3, c1
	FMUL	a3, a11, t3
	FADD	c2, t4, c2
	FMUL	a4, a12, t4

	FADD	c1, t1, c1
	FMUL	a5, a13, t1
	FADD	c2, t2, c2
	FMUL	a6, a14, t2
	FADD	c1, t3, c1
	FMUL	a7, a15, t3
	FADD	c2, t4, c2
	FMUL	a8, a16, t4

/* remainder rows (MIN_M % 8), one at a time */
.LL315:
	and	MIN_M, 7, I
	cmp	I, 0
	ble	%icc, .LL319
	nop

.LL316:
	LDF	[A1 + 0 * SIZE], a1
	add	A1, 1 * SIZE, A1
	LDF	[X1 + 0 * SIZE], b1
	nop

	FADD	c1, t1, c1
	nop
	add	I, -1, I
	FMUL	a1, b1, t1
	nop
	cmp	I, 0
	bg,pn	%icc, .LL316
	add	X1, 1 * SIZE, X1	/* (delay slot) */

/* reduce and update the final y element */
.LL319:
	FADD	c1, t1, c1
	nop
	FADD	c2, t2, c2
	nop
	FADD	c1, t3, c1
	FADD	c2, t4, c2

	FADD	c1, c2, c1		/* c1 = column dot product */

	FMUL	ALPHA, c1, c1
	LDF	[Y1 + 0 * SIZE], a1
	FADD	a1, c1, a1		/* y[N-1] += alpha * dot */
	STF	a1, [Y1 + 0 * SIZE]
	add	Y1, INCY, Y1

/* ---- advance to the next row panel ---- */
.LL400:
	add	IS, P, IS
	cmp	IS, M
	bl	%icc, .LL10
	add	A, PNLDA, A		/* (delay slot) rewind columns,
					   advance P rows */

.LL999:
	return	%i7 + 8			/* restore caller's register window */
	clr	%o0			/* (delay slot) return 0 */

	EPILOGUE