/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
   3. Neither the name of the ISCAS nor the names of its contributors may
      be used to endorse or promote products derived from this software
      without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

**********************************************************************************/

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 
*/
/*********************************************************************/

/*
 * AXPY-style vector update kernel using Loongson paired FP load/store
 * instructions (emitted as raw .word encodings by gsLQC1 / gsSQC1).
 *
 * For each of N elements the kernel loads one element of X and one of
 * Y, combines them with "MADD t, y, ALPHA, x" and stores the result
 * back to Y.  MADD, LD, ST, SIZE, BASE_SHIFT, PREFETCHD, NOP, PROLOGUE
 * and EPILOGUE all come from common.h; MADD is presumably
 * t = y + ALPHA * x -- confirm against common.h.
 *
 * Register roles (see the defines below):
 *   N     $4        element count (consumed by the kernel)
 *   X     $8        pointer into the X vector
 *   INCX  $9        X stride in elements, scaled to bytes on entry
 *   Y     $10       pointer into the Y vector (load side)
 *   INCY  $11       Y stride in elements, scaled to bytes on entry
 *   YY    $5        store-side Y pointer, strided path only
 *   I     $2        loop counter
 *   TEMP  $3        scratch
 *   ALPHA $f15      scalar multiplier
 *   a1-a16          X elements (a16 is $f17; $f15 is ALPHA and $f16
 *                   is not used)
 *   b1-b8           Y elements
 *   t1-t4           results waiting to be stored
 *
 * Code paths:
 *   .L11-.L13  unit stride, X and Y both on a 16-byte boundary
 *   .L30-.L32  unit stride, Y on a 16-byte boundary, X off by one
 *              element
 *   .L15-.L16  unit stride scalar remainder (N mod 16)
 *   .L20-.L26  general stride, unrolled by 8 plus scalar remainder
 *
 * Delay slots: the explicit NOPs after branches indicate that the code
 * is assembled with explicit branch delay slots (presumably
 * ".set noreorder" inside PROLOGUE).  Every branch below is therefore
 * followed by an explicit delay-slot instruction; none relies on the
 * padding that ".align 5" may or may not emit.
 */

#define ASSEMBLER
#include "common.h"


#define PREFETCH_DISTANCE 2016

#define N	$4

#define X	$8
#define INCX	$9

#define Y	$10
#define INCY	$11

#define I	$2
#define TEMP	$3

#define YY	$5

#define ALPHA	$f15

#define a1	$f0
#define a2	$f1
#define a3	$f2
#define a4	$f3
#define a5	$f4
#define a6	$f5
#define a7	$f6
#define a8	$f7

#define a9	$f8
#define a10	$f9
#define a11	$f10
#define a12	$f11
#define a13	$f12
#define a14	$f13
#define a15	$f14
#define a16	$f17

#define t1	$f18
#define t2	$f19
#define t3	$f20
#define t4	$f21

#define b1	$f22
#define b2	$f23
#define b3	$f24
#define b4	$f25

#define b5	$f26
#define b6	$f27
#define b7	$f28
#define b8	$f29


/*
 * Numeric register indices for the raw instruction encodings below.
 * Each value must match the FP register number of the corresponding
 * lower-case alias above (A1 <-> a1 = $f0, ..., A16 <-> a16 = $f17).
 */
#define A1	0
#define A2	1
#define A3	2
#define A4	3
#define A5	4
#define A6	5
#define A7	6
#define A8	7

#define A9	8
#define A10	9
#define A11	10
#define A12	11
#define A13	12
#define A14	13
#define A15	14
#define A16	17

#define T1	18
#define T2	19
#define T3	20
#define T4	21

#define B1	22
#define B2	23
#define B3	24
#define B4	25

#define B5	26
#define B6	27
#define B7	28
#define B8	29

/* Integer register numbers of X ($8) and Y ($10) for the encodings. */
#define X_BASE	8
#define Y_BASE	10

/*
 * gsLQC1 / gsSQC1: paired FP load / store emitted as a raw .word.
 * One instruction moves the register pair (ft, fq) relative to the
 * integer register "base".  The kernel uses offsets 0..7 to cover 16
 * elements and then advances the pointer by 16 * SIZE, so one offset
 * unit corresponds to one register pair (presumably 16 bytes).
 * The two-level definition lets macro arguments expand before they
 * are pasted into the expression.
 */
#define gsLQC1_(base,fq,ft,offset) .word (0x32<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsLQC1(base,fq,ft,offset) gsLQC1_((base), (fq), (ft), (offset))

#define gsSQC1_(base,fq,ft,offset) .word (0x3A<<26|base<<21|ft<<16|0x1<<15|offset<<6|0x1<<5|fq)
#define gsSQC1(base,fq,ft,offset) gsSQC1_((base), (fq), (ft), (offset))

	PROLOGUE

	// Spill the FP registers this kernel overwrites and must hand
	// back unchanged; the set depends on the FP register model.
#ifndef __64BIT__
	daddiu	$sp, $sp, -40
	sdc1	$f20, 0($sp)
	sdc1	$f22, 8($sp)
	sdc1	$f24, 16($sp)
	sdc1	$f26, 24($sp)
	sdc1	$f28, 32($sp)
#else
	daddiu	$sp, $sp, -48
	sdc1	$f24, 0($sp)
	sdc1	$f25, 8($sp)
	sdc1	$f26, 16($sp)
	sdc1	$f27, 24($sp)
	sdc1	$f28, 32($sp)
	sdc1	$f29, 40($sp)
#endif

	li	TEMP, SIZE

	// Nothing to do for N <= 0.
	blez	N, .L999
	dsll	INCX, INCX, BASE_SHIFT		// delay slot: INCX in bytes

	// Any stride other than one element takes the general path.
	bne	INCX, TEMP, .L20
	dsll	INCY, INCY, BASE_SHIFT		// delay slot: INCY in bytes

	bne	INCY, TEMP, .L20

	// Is Y on a 16-byte boundary?  Bit 3 of the address is set when
	// Y is one element past such a boundary.
	andi	TEMP, Y, 8			// delay slot of the bne above
	beq	TEMP, $0, .L10
	// Y is not on a 16-byte boundary: peel one element so that it is.
	LD	a1, 0 * SIZE(X)			// delay slot; result unused if taken
	LD	b1, 0 * SIZE(Y)

	daddiu	X, X, SIZE
	daddiu	Y, Y, SIZE

	MADD	t1, b1, ALPHA, a1
	daddiu	N, N, -1

	ST	t1, -1 * SIZE(Y)
	blez	N, .L999
	NOP					// explicit delay slot
	.align 5

.L10:
	// I = number of 16-element blocks.
	dsra	I, N, 4

	blez	I, .L15
	daddiu	I, I, -1			// delay slot: one block is peeled

	// Y is on a 16-byte boundary here.  Check X the same way.
	andi	TEMP, X, 8
	bne	TEMP, $0, .L30
	NOP					// explicit delay slot: keeps the
						// paired load below off the
						// path where X failed the check
	.align 5

.L11:
	// X and Y both on a 16-byte boundary.
	// Preload 16 X elements and the first 8 Y elements.
	gsLQC1(X_BASE,A2,A1,0)
	gsLQC1(X_BASE,A4,A3,1)
	gsLQC1(X_BASE,A6,A5,2)
	gsLQC1(X_BASE,A8,A7,3)

	gsLQC1(X_BASE,A10,A9,4)
	gsLQC1(X_BASE,A12,A11,5)
	gsLQC1(X_BASE,A14,A13,6)
	gsLQC1(X_BASE,A16,A15,7)

	gsLQC1(Y_BASE,B2,B1,0)
	gsLQC1(Y_BASE,B4,B3,1)
	gsLQC1(Y_BASE,B6,B5,2)
	gsLQC1(Y_BASE,B8,B7,3)

	blez	I, .L13
	NOP
	.align 5

.L12:
	// Main loop: finish the current 16-element block while loading
	// the Y values 8 elements ahead and, at the end, the X values
	// of the next block.
	MADD	t1, b1, ALPHA, a1
	MADD	t2, b2, ALPHA, a2
	gsSQC1(Y_BASE, T2, T1, 0)
	gsLQC1(Y_BASE,B2,B1,4)

	MADD	t3, b3, ALPHA, a3
	MADD	t4, b4, ALPHA, a4
	gsSQC1(Y_BASE, T4, T3, 1)
	gsLQC1(Y_BASE,B4,B3,5)

	PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))

	MADD	t1, b5, ALPHA, a5
	MADD	t2, b6, ALPHA, a6
	gsSQC1(Y_BASE, T2, T1, 2)
	gsLQC1(Y_BASE,B6,B5,6)

	MADD	t3, b7, ALPHA, a7
	MADD	t4, b8, ALPHA, a8
	gsSQC1(Y_BASE, T4, T3, 3)
	gsLQC1(Y_BASE,B8,B7, 7)

	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))

	MADD	t1, b1, ALPHA, a9
	MADD	t2, b2, ALPHA, a10
	gsSQC1(Y_BASE, T2, T1, 4)
	gsLQC1(Y_BASE,B2,B1,8)

	MADD	t3, b3, ALPHA, a11
	MADD	t4, b4, ALPHA, a12
	gsSQC1(Y_BASE, T4, T3, 5)
	gsLQC1(Y_BASE,B4,B3,9)

	PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))

	MADD	t1, b5, ALPHA, a13
	MADD	t2, b6, ALPHA, a14
	gsSQC1(Y_BASE, T2, T1, 6)
	gsLQC1(Y_BASE,B6,B5,10)

	MADD	t3, b7, ALPHA, a15
	MADD	t4, b8, ALPHA, a16
	gsSQC1(Y_BASE, T4, T3, 7)
	gsLQC1(Y_BASE,B8,B7,11)

	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))

	gsLQC1(X_BASE,A2,A1,8)
	gsLQC1(X_BASE,A4,A3,9)
	gsLQC1(X_BASE,A6,A5,10)
	gsLQC1(X_BASE,A8,A7,11)

	gsLQC1(X_BASE,A10,A9,12)
	gsLQC1(X_BASE,A12,A11,13)
	gsLQC1(X_BASE,A14,A13,14)
	gsLQC1(X_BASE,A16,A15,15)


	daddiu	I, I, -1
	daddiu	Y, Y, 16 * SIZE

	daddiu	X, X, 16 * SIZE
	bgtz	I, .L12
	NOP					// explicit delay slot

	.align 5

.L13:
	// Last full block: same arithmetic as .L12 but nothing is
	// loaded beyond this block.
	MADD	t1, b1, ALPHA, a1
	MADD	t2, b2, ALPHA, a2
	gsSQC1(Y_BASE, T2, T1, 0)
	gsLQC1(Y_BASE,B2,B1,4)

	MADD	t3, b3, ALPHA, a3
	MADD	t4, b4, ALPHA, a4
	gsSQC1(Y_BASE, T4, T3, 1)
	gsLQC1(Y_BASE,B4,B3,5)


	MADD	t1, b5, ALPHA, a5
	MADD	t2, b6, ALPHA, a6
	gsSQC1(Y_BASE, T2, T1, 2)
	gsLQC1(Y_BASE,B6,B5,6)

	MADD	t3, b7, ALPHA, a7
	MADD	t4, b8, ALPHA, a8
	gsSQC1(Y_BASE, T4, T3, 3)
	gsLQC1(Y_BASE,B8,B7,7)


	MADD	t1, b1, ALPHA, a9
	MADD	t2, b2, ALPHA, a10
	gsSQC1(Y_BASE, T2, T1, 4)


	MADD	t3, b3, ALPHA, a11
	MADD	t4, b4, ALPHA, a12
	gsSQC1(Y_BASE, T4, T3, 5)


	MADD	t1, b5, ALPHA, a13
	MADD	t2, b6, ALPHA, a14
	gsSQC1(Y_BASE, T2, T1, 6)


	MADD	t3, b7, ALPHA, a15
	MADD	t4, b8, ALPHA, a16
	gsSQC1(Y_BASE, T4, T3, 7)


	daddiu	X, X, 16 * SIZE
	daddiu	Y, Y, 16 * SIZE
	.align 5

.L15:
	// Unit-stride remainder: N mod 16 elements, one at a time.
	andi	I, N, 15

	blez	I, .L999
	NOP
	.align 5

.L16:
	LD	a1, 0 * SIZE(X)
	LD	b1, 0 * SIZE(Y)

	daddiu	X, X, SIZE
	daddiu	Y, Y, SIZE

	MADD	t1, b1, ALPHA, a1
	daddiu	I, I, -1

	bgtz	I, .L16
	ST	t1, -1 * SIZE(Y)		// delay slot: store this result


	// Return sequence for the unit-stride paths; same register
	// restore as at .L999.
#ifndef __64BIT__
	ldc1	$f20, 0($sp)
	ldc1	$f22, 8($sp)
	ldc1	$f24, 16($sp)
	ldc1	$f26, 24($sp)
	ldc1	$f28, 32($sp)
	daddiu	$sp, $sp, 40
#else
	ldc1	$f24, 0($sp)
	ldc1	$f25, 8($sp)
	ldc1	$f26, 16($sp)
	ldc1	$f27, 24($sp)
	ldc1	$f28, 32($sp)
	ldc1	$f29, 40($sp)
	daddiu	$sp, $sp, 48
#endif

	j	$31
	NOP
	.align 5

.L30:
	// Y on a 16-byte boundary, X one element past one,
	// INCX == INCY == 1.  Unrolled by 16.
	// The first and last X element of every block are loaded with
	// scalar LD; after X is advanced by one element the 14 elements
	// in between are fetched with paired loads.
	LD	a1, 0 * SIZE(X)
	daddiu	X, X, SIZE
	gsLQC1(X_BASE,A3,A2,0)
	gsLQC1(X_BASE,A5,A4,1)
	gsLQC1(X_BASE,A7,A6,2)
	gsLQC1(X_BASE,A9,A8,3)

	gsLQC1(X_BASE,A11,A10,4)
	gsLQC1(X_BASE,A13,A12,5)
	gsLQC1(X_BASE,A15,A14,6)
	LD	a16, 14 * SIZE(X)


	gsLQC1(Y_BASE,B2,B1,0)
	gsLQC1(Y_BASE,B4,B3,1)
	gsLQC1(Y_BASE,B6,B5,2)
	gsLQC1(Y_BASE,B8,B7,3)

	blez	I, .L32
	NOP
	.align 5

.L31:
	MADD	t1, b1, ALPHA, a1
	MADD	t2, b2, ALPHA, a2
	gsSQC1(Y_BASE, T2, T1, 0)
	gsLQC1(Y_BASE,B2,B1,4)

	MADD	t3, b3, ALPHA, a3
	MADD	t4, b4, ALPHA, a4
	gsSQC1(Y_BASE, T4, T3, 1)
	gsLQC1(Y_BASE,B4,B3,5)

	PREFETCHD(PREFETCH_DISTANCE*SIZE(Y))
	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(Y))

	MADD	t1, b5, ALPHA, a5
	MADD	t2, b6, ALPHA, a6
	gsSQC1(Y_BASE, T2, T1, 2)
	gsLQC1(Y_BASE,B6,B5,6)

	MADD	t3, b7, ALPHA, a7
	MADD	t4, b8, ALPHA, a8
	gsSQC1(Y_BASE, T4, T3, 3)
	gsLQC1(Y_BASE,B8,B7,7)

	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(Y))
	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(Y))

	MADD	t1, b1, ALPHA, a9
	MADD	t2, b2, ALPHA, a10
	gsSQC1(Y_BASE, T2, T1, 4)
	gsLQC1(Y_BASE,B2,B1,8)

	MADD	t3, b3, ALPHA, a11
	MADD	t4, b4, ALPHA, a12
	gsSQC1(Y_BASE, T4, T3, 5)
	gsLQC1(Y_BASE,B4,B3,9)

	PREFETCHD(PREFETCH_DISTANCE*SIZE(X))
	PREFETCHD((PREFETCH_DISTANCE+4)*SIZE(X))

	MADD	t1, b5, ALPHA, a13
	MADD	t2, b6, ALPHA, a14
	gsSQC1(Y_BASE, T2, T1, 6)
	gsLQC1(Y_BASE,B6,B5,10)

	MADD	t3, b7, ALPHA, a15
	MADD	t4, b8, ALPHA, a16
	gsSQC1(Y_BASE, T4, T3, 7)
	gsLQC1(Y_BASE,B8,B7,11)

	PREFETCHD((PREFETCH_DISTANCE+8)*SIZE(X))
	PREFETCHD((PREFETCH_DISTANCE+12)*SIZE(X))

	// X elements of the next block (X already points one element
	// into the current block, hence offsets 15 and 30).
	LD	a1, 15 * SIZE(X)
	gsLQC1(X_BASE,A3,A2,8)
	gsLQC1(X_BASE,A5,A4,9)
	gsLQC1(X_BASE,A7,A6,10)
	gsLQC1(X_BASE,A9,A8,11)

	gsLQC1(X_BASE,A11,A10,12)
	gsLQC1(X_BASE,A13,A12,13)
	gsLQC1(X_BASE,A15,A14,14)
	LD	a16, 30 * SIZE(X)

	daddiu	I, I, -1
	daddiu	Y, Y, 16 * SIZE

	daddiu	X, X, 16 * SIZE
	bgtz	I, .L31
	NOP					// explicit delay slot

	.align 5
	// Loop end: last full block of the path that started at .L30.
.L32:

	MADD	t1, b1, ALPHA, a1
	MADD	t2, b2, ALPHA, a2
	gsSQC1(Y_BASE, T2, T1, 0)
	gsLQC1(Y_BASE,B2,B1,4)

	MADD	t3, b3, ALPHA, a3
	MADD	t4, b4, ALPHA, a4
	gsSQC1(Y_BASE, T4, T3, 1)
	gsLQC1(Y_BASE,B4,B3,5)


	MADD	t1, b5, ALPHA, a5
	MADD	t2, b6, ALPHA, a6
	gsSQC1(Y_BASE, T2, T1, 2)
	gsLQC1(Y_BASE,B6,B5,6)

	MADD	t3, b7, ALPHA, a7
	MADD	t4, b8, ALPHA, a8
	gsSQC1(Y_BASE, T4, T3, 3)
	gsLQC1(Y_BASE,B8,B7,7)


	MADD	t1, b1, ALPHA, a9
	MADD	t2, b2, ALPHA, a10
	gsSQC1(Y_BASE, T2, T1, 4)


	MADD	t3, b3, ALPHA, a11
	MADD	t4, b4, ALPHA, a12
	gsSQC1(Y_BASE, T4, T3, 5)


	MADD	t1, b5, ALPHA, a13
	MADD	t2, b6, ALPHA, a14
	gsSQC1(Y_BASE, T2, T1, 6)


	MADD	t3, b7, ALPHA, a15
	MADD	t4, b8, ALPHA, a16
	gsSQC1(Y_BASE, T4, T3, 7)


	// X was already advanced by one element at .L30, so only 15
	// more are needed to step past this block.
	daddiu	X, X, 15 * SIZE
	daddiu	Y, Y, 16 * SIZE

	// Jump back to the shared scalar remainder loop.
	b	.L15
	NOP					// explicit delay slot
	.align 5

	// INCX != 1 or INCY != 1: general stride, unrolled by 8.
	// Y walks the load side, YY follows on the store side.
.L20:
	dsra	I, N, 3
	move	YY, Y

	blez	I, .L25
	daddiu	I, I, -1			// delay slot: one block is peeled

	// Preload the first 8 (x, y) pairs.
	LD	a1, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b1, 0 * SIZE(Y)
	daddu	Y, Y, INCY
	LD	a2, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b2, 0 * SIZE(Y)
	daddu	Y, Y, INCY
	LD	a3, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b3, 0 * SIZE(Y)
	daddu	Y, Y, INCY
	LD	a4, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b4, 0 * SIZE(Y)
	daddu	Y, Y, INCY
	LD	a5, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b5, 0 * SIZE(Y)
	daddu	Y, Y, INCY
	LD	a6, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b6, 0 * SIZE(Y)
	daddu	Y, Y, INCY
	LD	a7, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b7, 0 * SIZE(Y)
	daddu	Y, Y, INCY
	LD	a8, 0 * SIZE(X)
	daddu	X, X, INCX
	LD	b8, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	blez	I, .L23
	NOP
	.align 5

.L22:
	// Compute the current 8 results while reloading a1-a8 / b1-b8
	// with the next 8 pairs; stores go through YY.
	MADD	t1, b1, ALPHA, a1
	LD	a1, 0 * SIZE(X)
	LD	b1, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	MADD	t2, b2, ALPHA, a2
	LD	a2, 0 * SIZE(X)
	LD	b2, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	MADD	t3, b3, ALPHA, a3
	LD	a3, 0 * SIZE(X)
	LD	b3, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	MADD	t4, b4, ALPHA, a4
	LD	a4, 0 * SIZE(X)
	LD	b4, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	ST	t1, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t1, b5, ALPHA, a5

	LD	a5, 0 * SIZE(X)
	LD	b5, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	ST	t2, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t2, b6, ALPHA, a6

	LD	a6, 0 * SIZE(X)
	LD	b6, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	ST	t3, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t3, b7, ALPHA, a7

	LD	a7, 0 * SIZE(X)
	LD	b7, 0 * SIZE(Y)
	daddu	X, X, INCX
	daddu	Y, Y, INCY

	ST	t4, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t4, b8, ALPHA, a8

	LD	a8, 0 * SIZE(X)
	daddu	X, X, INCX

	LD	b8, 0 * SIZE(Y)
	daddu	Y, Y, INCY

	ST	t1, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t2, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t3, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t4, 0 * SIZE(YY)
	daddiu	I, I, -1

	bgtz	I, .L22
	daddu	YY, YY, INCY			// delay slot: step past 8th store
	.align 5

.L23:
	// Last full block of 8: operands are already in registers.
	MADD	t1, b1, ALPHA, a1
	MADD	t2, b2, ALPHA, a2
	MADD	t3, b3, ALPHA, a3
	MADD	t4, b4, ALPHA, a4

	ST	t1, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t1, b5, ALPHA, a5

	ST	t2, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t2, b6, ALPHA, a6

	ST	t3, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t3, b7, ALPHA, a7

	ST	t4, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	MADD	t4, b8, ALPHA, a8

	ST	t1, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t2, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t3, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	ST	t4, 0 * SIZE(YY)
	daddu	YY, YY, INCY
	.align 5

.L25:
	// General-stride remainder: N mod 8 elements.  Y (the load
	// pointer) already points at the first unprocessed element,
	// so it is used for both the load and the store.
	andi	I, N, 7

	blez	I, .L999
	NOP
	.align 5

.L26:
	LD	a1, 0 * SIZE(X)
	LD	b1, 0 * SIZE(Y)

	MADD	t1, b1, ALPHA, a1
	daddu	X, X, INCX

	ST	t1, 0 * SIZE(Y)
	daddiu	I, I, -1

	bgtz	I, .L26
	daddu	Y, Y, INCY			// delay slot: next Y element
	.align 5

.L999:
	// Common exit: restore the spilled FP registers and return.
#ifndef __64BIT__
	ldc1	$f20, 0($sp)
	ldc1	$f22, 8($sp)
	ldc1	$f24, 16($sp)
	ldc1	$f26, 24($sp)
	ldc1	$f28, 32($sp)
	daddiu	$sp, $sp, 40
#else
	ldc1	$f24, 0($sp)
	ldc1	$f25, 8($sp)
	ldc1	$f26, 16($sp)
	ldc1	$f27, 24($sp)
	ldc1	$f28, 32($sp)
	ldc1	$f29, 40($sp)
	daddiu	$sp, $sp, 48
#endif

	j	$31
	NOP

	EPILOGUE