/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Plane-rotation kernel (SPARC-style assembly, run through the C
 * preprocessor).
 *
 * For every pair of values x (from X) and y (from Y) the code stores
 *
 *     x <- C * x + S * y
 *     y <- C * y - S * x
 *
 * back in place.  Each logical element consists of two consecutive
 * SIZE-wide values (offsets 0 and 1), both rotated with the same C and S.
 * Presumably this is the complex ROT kernel with real C and S (real and
 * imaginary parts rotated independently) -- TODO confirm against the
 * build rules, since the file name is not visible here.
 *
 * Inputs, as used by the code below:
 *   N    (%i0)  element count; N <= 0 returns immediately.
 *   X    (%i1)  first vector, updated in place.
 *   INCX (%i2)  X increment in elements; scaled by ZBASE_SHIFT on entry.
 *   Y    (%i3)  second vector, updated in place.
 *   INCY (%i4)  Y increment in elements; scaled by ZBASE_SHIFT on entry.
 *   C, S        rotation scalars, fetched in the entry sequence below.
 *
 * Two code paths:
 *   - INCX == INCY == 2 * SIZE after scaling: software-pipelined loop that
 *     handles four elements (eight SIZE-wide values per vector) per trip
 *     (.LL11 / .LL12), followed by a one-element remainder loop (.LL16).
 *   - any other increment: straightforward loop unrolled four times
 *     (.LL51), followed by a one-element remainder loop (.LL56).
 *
 * LDF, STF, FMUL, FADD, FSUB, FMOV, SIZE, ZBASE_SHIFT, STACK_START,
 * PROLOGUE, SAVESP and EPILOGUE are not defined in this file; presumably
 * they come from common.h and select single/double variants.
 */

#define ASSEMBLER
#include "common.h"

/* Integer argument registers. */
#define N	%i0
#define X	%i1
#define INCX	%i2
#define Y	%i3
#define INCY	%i4
/* Loop counter.  Shares %i5 with whatever arrives in that register; the
   32-bit entry sequence spills %i5 to the stack before I is first written. */
#define I	%i5

/* Written once at .LL50 but never read in the code visible here. */
#define XX	%l0
#define YY	%l1

/*
 * Floating-point register aliases:
 *   a1..a8  values loaded from X
 *   b1..b8  values loaded from Y
 *   c1..c8  products (C*a, S*b, C*b, S*a)
 *   t1..t4  sums/differences waiting to be stored (pipelined path only)
 * The DOUBLE variant uses even-numbered registers only.
 */
#ifdef DOUBLE
#define a1	%f4
#define a2	%f6
#define a3	%f8
#define a4	%f10
#define a5	%f12
#define a6	%f14
#define a7	%f16
#define a8	%f18
#define b1	%f20
#define b2	%f22
#define b3	%f24
#define b4	%f26
#define b5	%f28
#define b6	%f30
#define b7	%f32
#define b8	%f34

#define c1	%f36
#define c2	%f38
#define c3	%f40
#define c4	%f42
#define c5	%f44
#define c6	%f46
#define c7	%f48
#define c8	%f50

#define t1	%f52
#define t2	%f54
#define t3	%f56
#define t4	%f58
#else
#define a1	%f2
#define a2	%f3
#define a3	%f4
#define a4	%f5
#define a5	%f6
#define a6	%f7
#define a7	%f8
#define a8	%f9
#define b1	%f10
#define b2	%f11
#define b3	%f12
#define b4	%f13
#define b5	%f14
#define b6	%f15
#define b7	%f16
#define b8	%f17

#define c1	%f18
#define c2	%f19
#define c3	%f20
#define c4	%f21
#define c5	%f22
#define c6	%f23
#define c7	%f24
#define c8	%f25

#define t1	%f26
#define t2	%f27
#define t3	%f28
#define t4	%f29
#endif

/* Rotation scalars; they do not overlap any a/b/c/t alias above. */
#ifdef DOUBLE
#define C	%f0
#define S	%f2
#else
#define C	%f0
#define S	%f1
#endif

	PROLOGUE
	SAVESP

/*
 * Fetch C and S into floating-point registers.
 *
 * Without __64BIT__: %i5 is stored to [%sp + STACK_START + 24] and C is then
 * loaded from that same slot; S is loaded from offset 32 (DOUBLE) or 28
 * (single).  Presumably C arrives (at least partly) in %i5 and the rest is
 * already on the stack per the 32-bit calling convention -- TODO confirm.
 *
 * With __64BIT__: C and S are copied from %f10/%f12 (DOUBLE) or %f11/%f13
 * (single), presumably the registers the 64-bit calling convention uses for
 * these arguments -- TODO confirm.  The copies happen before any a/b alias
 * that shares those registers is written.
 */
#ifndef __64BIT__
#ifdef DOUBLE
	st	%i5, [%sp + STACK_START + 24]

	LDF	[%sp + STACK_START + 24], C
	LDF	[%sp + STACK_START + 32], S
#else
	st	%i5, [%sp + STACK_START + 24]

	LDF	[%sp + STACK_START + 24], C
	LDF	[%sp + STACK_START + 28], S
#endif
#else
#ifdef DOUBLE
	FMOV	%f10, C
	FMOV	%f12, S
#else
	FMOV	%f11, C
	FMOV	%f13, S
#endif
#endif

	/* Nothing to do for N <= 0. */
	cmp	N, 0
	ble	.LL19
	nop

	/* Scale both increments by ZBASE_SHIFT so they can be added to the
	   pointers directly and compared against 2 * SIZE. */
	sll	INCX, ZBASE_SHIFT, INCX
	sll	INCY, ZBASE_SHIFT, INCY

	/* Any increment other than 2 * SIZE takes the generic path. */
	cmp	INCX, 2 * SIZE
	bne	.LL50
	nop

	cmp	INCY, 2 * SIZE
	bne	.LL50
	nop

	/* ---- Contiguous path: I = N / 4 blocks of four elements. ---- */
	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL15
	nop

	/* Pipeline fill: load the whole first block (offsets 0..7). */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2
	LDF	[X + 2 * SIZE], a3
	LDF	[Y + 2 * SIZE], b3
	LDF	[X + 3 * SIZE], a4
	LDF	[Y + 3 * SIZE], b4

	LDF	[X + 4 * SIZE], a5
	LDF	[Y + 4 * SIZE], b5
	LDF	[X + 5 * SIZE], a6
	LDF	[Y + 5 * SIZE], b6
	LDF	[X + 6 * SIZE], a7
	LDF	[Y + 6 * SIZE], b7
	LDF	[X + 7 * SIZE], a8
	LDF	[Y + 7 * SIZE], b8

	/* Start the rotation of offsets 0 and 1 and immediately reload
	   a1/b1/a2/b2 with offsets 8 and 9 (first values of the next block).
	   NOTE(review): these look-ahead loads are issued before the block
	   count is tested below, and .LL12 never uses them; when the current
	   block is the last one they read two values beyond it in each
	   vector -- confirm callers tolerate that. */
	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	LDF	[Y + 8 * SIZE], b1
	FMUL	S, a1, c4
	LDF	[X + 8 * SIZE], a1

	FMUL	C, a2, c5
	FMUL	S, b2, c6
	FADD	c1, c2, t1

	FMUL	C, b2, c7
	LDF	[Y + 9 * SIZE], b2
	FMUL	S, a2, c8
	LDF	[X + 9 * SIZE], a2
	FSUB	c3, c4, t2

	/* Only one block: skip the steady-state loop and drain in .LL12. */
	addcc	I, -1, I
	ble,pt	%icc, .LL12
	nop

/* Prefetch distance, in SIZE units; only the Y stream is prefetched. */
#define PREFETCHSIZE 64

/*
 * Steady-state loop.  On entry t1/t2 hold the finished results for offset 0,
 * c5..c8 the products for offset 1, a3..a8/b3..b8 the rest of the current
 * block, and a1/b1/a2/b2 the first two values of the next block.  Each group
 * of instructions issues one multiply, stores a finished result, and
 * reloads a register whose old value is no longer needed.  The nops are part
 * of the original instruction grouping; instruction order must be kept.
 */
.LL11:
	FMUL	C, a3, c1
	nop
	prefetch [Y + PREFETCHSIZE * SIZE], 1
	nop

	FMUL	S, b3, c2
	STF	t1, [X + 0 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b3, c3
	LDF	[Y + 10 * SIZE], b3
	nop
	nop

	FMUL	S, a3, c4
	STF	t2, [Y + 0 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a4, c5
	LDF	[X + 10 * SIZE], a3
	nop
	nop

	FMUL	S, b4, c6
	STF	t3, [X + 1 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b4, c7
	LDF	[Y + 11 * SIZE], b4
	nop
	nop

	FMUL	S, a4, c8
	STF	t4, [Y + 1 * SIZE]
	FSUB	c3, c4, t2
	nop

	FMUL	C, a5, c1
	LDF	[X + 11 * SIZE], a4
	nop
	nop

	FMUL	S, b5, c2
	STF	t1, [X + 2 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b5, c3
	LDF	[Y + 12 * SIZE], b5
	nop
	nop

	FMUL	S, a5, c4
	STF	t2, [Y + 2 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a6, c5
	LDF	[X + 12 * SIZE], a5
	nop
	nop

	FMUL	S, b6, c6
	STF	t3, [X + 3 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b6, c7
	LDF	[Y + 13 * SIZE], b6
	nop
	nop

	FMUL	S, a6, c8
	STF	t4, [Y + 3 * SIZE]
	FSUB	c3, c4, t2
	nop

	FMUL	C, a7, c1
	LDF	[X + 13 * SIZE], a6
	nop
	nop

	FMUL	S, b7, c2
	STF	t1, [X + 4 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b7, c3
	LDF	[Y + 14 * SIZE], b7
	nop
	nop

	FMUL	S, a7, c4
	STF	t2, [Y + 4 * SIZE]
	FSUB	c7, c8, t4
	nop

	FMUL	C, a8, c5
	LDF	[X + 14 * SIZE], a7
	nop
	nop

	FMUL	S, b8, c6
	STF	t3, [X + 5 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b8, c7
	LDF	[Y + 15 * SIZE], b8
	nop
	nop

	FMUL	S, a8, c8
	STF	t4, [Y + 5 * SIZE]
	FSUB	c3, c4, t2
	nop

	/* From here on a1/b1/a2/b2 (offsets 8 and 9, i.e. the start of the
	   next block) are consumed.  The counter is decremented here; its
	   condition codes are tested by the bg at the bottom (the plain add
	   instructions in between do not touch them). */
	FMUL	C, a1, c1
	LDF	[X + 15 * SIZE], a8
	addcc	I, -1, I
	nop

	FMUL	S, b1, c2
	STF	t1, [X + 6 * SIZE]
	FADD	c5, c6, t3
	nop

	FMUL	C, b1, c3
	LDF	[Y + 16 * SIZE], b1
	nop
	nop

	FMUL	S, a1, c4
	STF	t2, [Y + 6 * SIZE]
	FSUB	c7, c8, t4
	nop

	/* Y is advanced before its last store of this block, so that store
	   below uses offset -1 and the following Y load uses offset 9. */
	FMUL	C, a2, c5
	LDF	[X + 16 * SIZE], a1
	add	Y, 8 * SIZE, Y
	nop

	FMUL	S, b2, c6
	STF	t3, [X + 7 * SIZE]
	FADD	c1, c2, t1
	nop

	FMUL	C, b2, c7
	LDF	[Y + 9 * SIZE], b2
	add	X, 8 * SIZE, X
	nop

	FMUL	S, a2, c8
	STF	t4, [Y - 1 * SIZE]
	FSUB	c3, c4, t2
	nop

	/* The load after the branch sits in its delay slot; X has already
	   been advanced, hence offset 9. */
	bg,pt	%icc, .LL11
	LDF	[X + 9 * SIZE], a2


/* Pipeline drain: finish and store the last block (offsets 0..7) without
   loading anything further, then step both pointers past it. */
.LL12:
	FMUL	C, a3, c1
	FMUL	S, b3, c2
	STF	t1, [X + 0 * SIZE]
	FADD	c5, c6, t3

	FMUL	C, b3, c3
	FMUL	S, a3, c4
	STF	t2, [Y + 0 * SIZE]
	FSUB	c7, c8, t4


	FMUL	C, a4, c5
	FMUL	S, b4, c6
	STF	t3, [X + 1 * SIZE]
	FADD	c1, c2, t1

	FMUL	C, b4, c7
	FMUL	S, a4, c8
	STF	t4, [Y + 1 * SIZE]
	FSUB	c3, c4, t2


	FMUL	C, a5, c1
	FMUL	S, b5, c2
	STF	t1, [X + 2 * SIZE]
	FADD	c5, c6, t3

	FMUL	C, b5, c3
	FMUL	S, a5, c4
	STF	t2, [Y + 2 * SIZE]
	FSUB	c7, c8, t4

	FMUL	C, a6, c5
	FMUL	S, b6, c6
	STF	t3, [X + 3 * SIZE]
	FADD	c1, c2, t1

	FMUL	C, b6, c7
	FMUL	S, a6, c8
	STF	t4, [Y + 3 * SIZE]
	FSUB	c3, c4, t2

	FMUL	C, a7, c1
	FMUL	S, b7, c2
	STF	t1, [X + 4 * SIZE]
	FADD	c5, c6, t3

	FMUL	C, b7, c3
	FMUL	S, a7, c4
	STF	t2, [Y + 4 * SIZE]
	FSUB	c7, c8, t4

	FMUL	C, a8, c5
	FMUL	S, b8, c6
	STF	t3, [X + 5 * SIZE]
	FADD	c1, c2, t1

	FMUL	C, b8, c7
	FMUL	S, a8, c8
	STF	t4, [Y + 5 * SIZE]
	FSUB	c3, c4, t2

	FADD	c5, c6, t3
	STF	t1, [X + 6 * SIZE]

	FSUB	c7, c8, t4
	STF	t2, [Y + 6 * SIZE]

	STF	t3, [X + 7 * SIZE]
	STF	t4, [Y + 7 * SIZE]

	add	X, 8 * SIZE, X
	add	Y, 8 * SIZE, Y


/* Contiguous remainder: I = N & 3 elements, one per trip. */
.LL15:
	andcc	N, 3, I
	nop
	ble,a,pn %icc, .LL19
	nop

.LL16:
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	/* Pointers are advanced early, so the stores below use negative
	   offsets to reach the element just loaded. */
	FMUL	C, a1, c1
	add	X, 2 * SIZE, X
	FMUL	S, b1, c2
	add	Y, 2 * SIZE, Y

	FMUL	C, b1, c3
	addcc	I, -1, I
	FMUL	S, a1, c4
	nop

	FMUL	C, a2, c5
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X - 2 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y - 2 * SIZE]
	FSUB	c7, c8, c8

	/* The final store sits in the branch delay slot. */
	STF	c6, [X - 1 * SIZE]
	bg,pt	%icc, .LL16
	STF	c8, [Y - 1 * SIZE]

/* Return to the caller (contiguous path and N <= 0). */
.LL19:
	return	%i7 + 8
	nop

/* ---- Generic-increment path. ---- */
.LL50:
	/* XX/YY receive copies of the pointers but are not read again in the
	   code visible here. */
	mov	X, XX
	mov	Y, YY

	sra	N, 2, I
	cmp	I, 0
	ble,pn	%icc, .LL55
	nop

/* Four elements per trip, each one loaded, rotated and stored in turn;
   X and Y advance by INCX / INCY after every element. */
.LL51:
	/* element 1 of 4 */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	nop
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]
	add	Y, INCY, Y

	/* element 2 of 4 */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	nop
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]
	add	Y, INCY, Y

	/* element 3 of 4 */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	nop
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]
	add	Y, INCY, Y

	/* element 4 of 4 */
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	nop
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]
	add	Y, INCY, Y

	addcc	I, -1, I
	bg,pt	%icc, .LL51
	nop


/* Generic remainder: I = N & 3 elements, one per trip. */
.LL55:
	andcc	N, 3, I
	nop
	ble	%icc, .LL59
	nop

.LL56:
	LDF	[X + 0 * SIZE], a1
	LDF	[Y + 0 * SIZE], b1
	LDF	[X + 1 * SIZE], a2
	LDF	[Y + 1 * SIZE], b2

	FMUL	C, a1, c1
	FMUL	S, b1, c2
	FMUL	C, b1, c3
	FMUL	S, a1, c4

	FMUL	C, a2, c5
	addcc	I, -1, I
	FMUL	S, b2, c6
	FADD	c1, c2, c2

	FMUL	C, b2, c7
	nop
	FMUL	S, a2, c8
	FSUB	c3, c4, c4

	STF	c2, [X + 0 * SIZE]
	FADD	c5, c6, c6
	STF	c4, [Y + 0 * SIZE]
	FSUB	c7, c8, c8

	STF	c6, [X + 1 * SIZE]
	add	X, INCX, X
	STF	c8, [Y + 1 * SIZE]

	/* The Y pointer bump sits in the branch delay slot. */
	bg	%icc, .LL56
	add	Y, INCY, Y


/* Return to the caller (generic-increment path). */
.LL59:
	return	%i7 + 8
	nop

	EPILOGUE