/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * i386 x87 GEMV kernel (AT&T syntax).  FLD/FST/FMUL/FADD and SIZE come
 * from common.h and expand to the single- or double-precision x87
 * forms.  The FPU-stack bookkeeping below only balances if FST is the
 * store-and-pop form (fstps/fstpl).
 *
 * NOTE(review): from the access pattern, each column j of A contributes
 *   y[j] += alpha * dot(A[is : is+min_m, j], x[is : is+min_m])
 * i.e. this looks like the transposed-GEMV inner kernel — confirm
 * against the calling interface.
 *
 * The M dimension is processed in row panels of at most P elements so
 * that the active piece of x (copied into BUFFER when incx != 1) stays
 * cache resident.
 */

#ifdef PENTIUM
#define P 88                            /* small panel for tiny caches */
#endif

#ifndef P
#define P 1000                          /* default row-panel size */
#endif

#define STACK 16                        /* bytes of 4 callee-saved pushes */
#define ARGS  24                        /* bytes of local scratch below them */

/* Local scratch slots (live in the ARGS area above the saved regs) */
#define NLDA   0 + STACK(%esp)          /* byte offset applied to A after a */
                                        /* full column sweep; = (P - n*lda) */
#define XP     4 + STACK(%esp)          /* x pointer in use (x or BUFFER)   */
#define MIN_M  8 + STACK(%esp)          /* rows in panel = min(m - is, P)   */
#define J     12 + STACK(%esp)          /* remaining-columns counter        */
#define IS    16 + STACK(%esp)          /* first row index of this panel    */

/* Incoming cdecl arguments (past saved registers + scratch) */
#define M      4 + STACK + ARGS(%esp)
#define N      8 + STACK + ARGS(%esp)
#define K     12 + STACK + ARGS(%esp)   /* not referenced by this kernel */
#define ALPHA 16 + STACK + ARGS(%esp)
#ifdef DOUBLE
#define A     24 + STACK + ARGS(%esp)
#define LDA   28 + STACK + ARGS(%esp)
#define X     32 + STACK + ARGS(%esp)
#define INCX  36 + STACK + ARGS(%esp)
#define Y     40 + STACK + ARGS(%esp)
#define INCY  44 + STACK + ARGS(%esp)
#define BUFFER 48 + STACK + ARGS(%esp)
#else
#define A     20 + STACK + ARGS(%esp)
#define LDA   24 + STACK + ARGS(%esp)
#define X     28 + STACK + ARGS(%esp)
#define INCX  32 + STACK + ARGS(%esp)
#define Y     36 + STACK + ARGS(%esp)
#define INCY  40 + STACK + ARGS(%esp)
#define BUFFER 44 + STACK + ARGS(%esp)
#endif

	PROLOGUE

	subl	$ARGS, %esp             # reserve local scratch
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	FLD	ALPHA                   # alpha sits at the bottom of the FPU
                                        # stack for the whole routine

	movl	X, %edi                 # X

	movl	$0, IS                  # is = 0

	movl	M, %ebx
	movl	N, %eax

	testl	%ebx, %ebx              # quick return if m <= 0 or n <= 0
	jle	.L79
	testl	%eax, %eax
	jle	.L79

	movl	INCX, %esi              # convert both increments to bytes
	leal	(,%esi,SIZE), %esi
	movl	%esi, INCX

	movl	INCY, %esi
	leal	(, %esi, SIZE), %esi
	movl	%esi, INCY

	movl	LDA, %ebx

	imull	%ebx, %eax              # eax = n * lda (elements)
	movl	$P, %esi                # sweeping all n columns advances A by
	subl	%eax, %esi              # n*lda elements; adding (P - n*lda)
	leal	(, %esi, SIZE), %esi    # nets a move of +P rows for the next
	movl	%esi, NLDA              # panel

	leal	(,%ebx,SIZE), %esi      # LDA now caches the byte stride
	movl	%esi, LDA
	ALIGN_2

/* ---- row-panel loop: one pass per min(m - is, P) rows ---- */
.L32:
	movl	IS, %esi

	movl	$P, %edx
	movl	M, %eax
	subl	%esi, %eax              # eax = m - is
	cmpl	%edx, %eax              # min_m = min(m - is, P)
#ifdef PENTIUM
	jle	.L33                    # plain Pentium has no cmov
	movl	%edx, %eax
.L33:
#else
	cmovg	%edx, %eax
#endif
	movl	%eax, MIN_M

	movl	IS, %ecx
	leal	(%edi,%ecx,SIZE), %ecx  # xp = x + is
	movl	INCX, %ebx
	movl	%ecx, XP
	cmpl	$SIZE, %ebx             # unit stride -> use x in place
	je	.L34

	/* non-unit incx: gather min_m elements of x into BUFFER */
	movl	BUFFER, %esi
	movl	MIN_M, %ecx
	movl	%esi, XP
	sarl	$2, %ecx
	jle	.L35

	ALIGN_3

.L36:	/* copy four x elements per iteration */
	FLD	(%edi)
	addl	%ebx, %edi
	FST	0 * SIZE(%esi)

	FLD	(%edi)
	addl	%ebx, %edi
	FST	1 * SIZE(%esi)

	FLD	(%edi)
	addl	%ebx, %edi
	FST	2 * SIZE(%esi)

	FLD	(%edi)
	addl	%ebx, %edi
	FST	3 * SIZE(%esi)

	addl	$4 * SIZE, %esi
	decl	%ecx
	jg	.L36
	ALIGN_3

.L35:
	movl	MIN_M, %ecx
	andl	$3,%ecx                 # 0..3 leftover elements
	jle	.L34
	ALIGN_2

.L42:
	FLD	(%edi)
	addl	%ebx, %edi
	FST	(%esi)
	addl	$SIZE, %esi
	decl	%ecx
	jg	.L42
	ALIGN_3

/* Main Routine */

.L34:
	movl	Y, %ebp                 # coffset = y

	movl	N, %esi
	sarl	$2, %esi                # n / 4 column quads
	movl	%esi, J
	jle	.L47
	ALIGN_3

/* ---- four columns at a time ----
 * FPU stack inside the loop:
 *   st(0) = current x element (bt1), st(1)..st(4) = the four column
 *   sums, st(5) = alpha.  Each group folds one x element into all four
 *   columns; the next x element is loaded one step ahead (lookahead,
 *   discarded at .L81).
 */
.L48:
	movl	A, %ebx                 # a_offset = a (column 0)
	fldz                            # sum0 = 0
	movl	LDA, %edx
	fldz                            # sum1 = 0

	leal	(%ebx, %edx), %ecx      # a_offset2 = a + lda (column 1)
	fldz                            # sum2 = 0
	leal	(%ebx, %edx, 4), %eax
	fldz                            # sum3 = 0

	movl	%eax, A                 # a += 4 * lda (next column quad)
	movl	XP, %esi
	FLD	(%esi)                  # bt1 = x[0]

	movl	MIN_M, %eax
	sarl	$2,%eax                 # groups of 4 rows
	jle	.L51
	ALIGN_3

#define PRESIZE 8

.L80:
#ifdef PENTIUM3
	/* same math as the generic path below, with prefetches of all
	   four column streams interleaved into the schedule */
	prefetcht0	PRESIZE * SIZE(%ebx, %edx, 2)
	FLD	0 * SIZE(%ebx)
	fmul	%st(1),%st

	prefetcht0	PRESIZE * SIZE(%ecx)
	faddp	%st,%st(2)
	FLD	0 * SIZE(%ecx)

	prefetcht0	PRESIZE * SIZE(%ecx, %edx, 2)
	fmul	%st(1),%st
	faddp	%st,%st(3)

	prefetcht0	PRESIZE * SIZE(%ebx)
	FLD	0 * SIZE(%ebx, %edx, 2)
	fmul	%st(1),%st

	faddp	%st,%st(4)
	FLD	0 * SIZE(%ecx, %edx, 2)
	fmulp	%st, %st(1)             # consumes bt1

	faddp	%st,%st(4)
	FLD	1 * SIZE(%esi)          # bt1 = x[1]
	FLD	1 * SIZE(%ebx)

	fmul	%st(1),%st
	faddp	%st,%st(2)
	FLD	1 * SIZE(%ecx)

	fmul	%st(1),%st
	faddp	%st,%st(3)
	FLD	1 * SIZE(%ebx, %edx, 2)

	fmul	%st(1),%st
	faddp	%st,%st(4)
	FLD	1 * SIZE(%ecx, %edx, 2)

	fmulp	%st, %st(1)
	faddp	%st,%st(4)
	FLD	2 * SIZE(%esi)          # bt1 = x[2]

	FLD	2 * SIZE(%ebx)
	fmul	%st(1),%st
	faddp	%st,%st(2)

	FLD	2 * SIZE(%ecx)
	fmul	%st(1),%st
	faddp	%st,%st(3)

	FLD	2 * SIZE(%ebx, %edx, 2)
	fmul	%st(1),%st
	faddp	%st,%st(4)

	FLD	2 * SIZE(%ecx, %edx, 2)
	fmulp	%st, %st(1)
	faddp	%st,%st(4)

	FLD	3 * SIZE(%esi)          # bt1 = x[3]
	FLD	3 * SIZE(%ebx)
	fmul	%st(1),%st

	faddp	%st,%st(2)
	FLD	3 * SIZE(%ecx)
	fmul	%st(1),%st

	faddp	%st,%st(3)
	FLD	3 * SIZE(%ebx, %edx, 2)
	fmul	%st(1),%st

	faddp	%st,%st(4)
	FLD	3 * SIZE(%ecx, %edx, 2)
	fmulp	%st, %st(1)

	addl	$4 * SIZE, %ebx
	faddp	%st,%st(4)
	addl	$4 * SIZE, %ecx

	FLD	4 * SIZE(%esi)          # lookahead: bt1 for next group
	addl	$4 * SIZE, %esi

#else

#if defined(HAS_PREFETCH)
	prefetcht0	PRESIZE * SIZE(%ebx)
	prefetcht0	PRESIZE * SIZE(%ebx, %edx, 2)
	prefetcht0	PRESIZE * SIZE(%ecx)
	prefetcht0	PRESIZE * SIZE(%ecx, %edx, 2)
#endif

	/* row i+0 */
	FLD	0 * SIZE(%ebx)          # at = *(a_offset + 0 * lda)
	fmul	%st(1),%st              # at *= bt1
	faddp	%st,%st(2)              # sum0 += at

	FLD	0 * SIZE(%ecx)          # at = *(a_offset + 1 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(3)              # sum1 += at

	FLD	0 * SIZE(%ebx, %edx, 2) # at = *(a_offset + 2 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(4)              # sum2 += at

	FMUL	0 * SIZE(%ecx, %edx, 2) # bt1 *= *(a_offset + 3 * lda)
	faddp	%st,%st(4)              # sum3 += bt1 (consumes bt1)
	FLD	1 * SIZE(%esi)          # bt1 = x[1]

	/* row i+1 */
	FLD	1 * SIZE(%ebx)
	fmul	%st(1),%st
	faddp	%st,%st(2)              # sum0

	FLD	1 * SIZE(%ecx)
	fmul	%st(1),%st
	faddp	%st,%st(3)              # sum1

	FLD	1 * SIZE(%ebx, %edx, 2)
	fmul	%st(1),%st
	faddp	%st,%st(4)              # sum2

	FMUL	1 * SIZE(%ecx, %edx, 2)
	faddp	%st,%st(4)              # sum3
	FLD	2 * SIZE(%esi)          # bt1 = x[2]

	/* row i+2 */
	FLD	2 * SIZE(%ebx)
	fmul	%st(1),%st
	faddp	%st,%st(2)              # sum0

	FLD	2 * SIZE(%ecx)
	fmul	%st(1),%st
	faddp	%st,%st(3)              # sum1

	FLD	2 * SIZE(%ebx, %edx, 2)
	fmul	%st(1),%st
	faddp	%st,%st(4)              # sum2

	FMUL	2 * SIZE(%ecx, %edx, 2)
	faddp	%st,%st(4)              # sum3
	FLD	3 * SIZE(%esi)          # bt1 = x[3]

	/* row i+3 */
	FLD	3 * SIZE(%ebx)
	fmul	%st(1),%st
	faddp	%st,%st(2)              # sum0

	FLD	3 * SIZE(%ecx)
	fmul	%st(1),%st
	faddp	%st,%st(3)              # sum1

	FLD	3 * SIZE(%ebx, %edx, 2)
	fmul	%st(1),%st
	faddp	%st,%st(4)              # sum2

	FMUL	3 * SIZE(%ecx, %edx, 2)
	faddp	%st,%st(4)              # sum3
	FLD	4 * SIZE(%esi)          # lookahead: bt1 for next group

	addl	$4 * SIZE, %ebx
	addl	$4 * SIZE, %ecx
	addl	$4 * SIZE, %esi
#endif

	decl	%eax
	jg	.L80
	ALIGN_3

.L51:
	movl	MIN_M, %eax
	andl	$3, %eax                # 0..3 leftover rows
	je	.L81
	ALIGN_3

.L52:	/* scalar remainder: same four-column pattern, one row */
	FLD	(%ebx)                  # at = *(a_offset + 0 * lda)
	fmul	%st(1),%st              # at *= bt1
	faddp	%st,%st(2)              # sum0 += at

	FLD	(%ecx)                  # at = *(a_offset + 1 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(3)              # sum1 += at

	FLD	(%ebx, %edx, 2)         # at = *(a_offset + 2 * lda)
	fmul	%st(1),%st
	faddp	%st,%st(4)              # sum2 += at

	FMUL	(%ecx, %edx, 2)         # bt1 *= *(a_offset + 3 * lda)
	faddp	%st,%st(4)              # sum3 += bt1
	FLD	1 * SIZE(%esi)          # lookahead x

	addl	$SIZE, %ebx
	addl	$SIZE, %ecx
	addl	$SIZE, %esi
	decl	%eax
	jg	.L52
	ALIGN_3

.L81:
	/* discard the unused lookahead x element */
#ifndef C_SUN
	ffreep	%st(0)
#else
	.byte	0xdf                    # ffreep %st(0) encoding, for
	.byte	0xc0                    # assemblers lacking the mnemonic
#endif

	/* scale the four column sums by alpha (brought to st(0) by the
	   fxch, then swapped back down) */
	fxch	%st(4)
	fmul	%st, %st(4)
	fmul	%st, %st(1)
	fmul	%st, %st(2)
	fmul	%st, %st(3)
	fxch	%st(4)

	movl	INCY, %eax

	/* y[j..j+3] += scaled sums; FST pops each result */
	FADD	(%ebp)
	FST	(%ebp)
	addl	%eax, %ebp

	FADD	(%ebp)
	FST	(%ebp)
	addl	%eax, %ebp

	FADD	(%ebp)
	FST	(%ebp)
	addl	%eax, %ebp

	FADD	(%ebp)
	FST	(%ebp)
	addl	%eax, %ebp

	decl	J
	jg	.L48
	ALIGN_3

/* ---- remaining 0..3 single columns ---- */
.L47:
	movl	N, %esi
	andl	$3,%esi
	movl	%esi, J
	jle	.L60
	ALIGN_2

.L61:
	/* four independent partial sums for ONE column, to hide the
	   fadd latency; folded together at .L70 */
	movl	A, %ebx                 # a_offset = a
	fldz                            # partial sum 1 = 0
	movl	LDA, %edx
	fldz                            # partial sum 2 = 0

	addl	%ebx, %edx
	fldz                            # partial sum 3 = 0
	movl	%edx, A                 # a += lda (next column)
	fldz                            # partial sum 4 = 0

	movl	XP, %esi

	movl	MIN_M, %eax
	sarl	$3,%eax                 # groups of 8 rows
	jle	.L64
	ALIGN_3

.L65:
#ifdef HAS_PREFETCH
	prefetcht0	PRESIZE * 2 * SIZE(%ebx)
	/* NOTE(review): the original issued the identical %ebx prefetch
	   twice; the second was evidently meant for the other stream this
	   loop reads, the x vector at %esi (cf. the four distinct streams
	   prefetched in .L80).  Prefetch is a hint: no semantic change. */
	prefetcht0	PRESIZE * 2 * SIZE(%esi)
#endif

	FLD	0 * SIZE(%esi)          # x[i]   * a[i]   -> partial 1
	FMUL	0 * SIZE(%ebx)
	faddp	%st,%st(1)

	FLD	1 * SIZE(%esi)          # x[i+1] * a[i+1] -> partial 2
	FMUL	1 * SIZE(%ebx)
	faddp	%st,%st(2)

	FLD	2 * SIZE(%esi)          # x[i+2] * a[i+2] -> partial 3
	FMUL	2 * SIZE(%ebx)
	faddp	%st,%st(3)

	FLD	3 * SIZE(%esi)          # x[i+3] * a[i+3] -> partial 4
	FMUL	3 * SIZE(%ebx)
	faddp	%st,%st(4)

	FLD	4 * SIZE(%esi)          # second four rows, round-robin
	FMUL	4 * SIZE(%ebx)
	faddp	%st,%st(1)

	FLD	5 * SIZE(%esi)
	FMUL	5 * SIZE(%ebx)
	faddp	%st,%st(2)

	FLD	6 * SIZE(%esi)
	FMUL	6 * SIZE(%ebx)
	faddp	%st,%st(3)

	FLD	7 * SIZE(%esi)
	FMUL	7 * SIZE(%ebx)
	faddp	%st,%st(4)

	addl	$8 * SIZE, %esi
	addl	$8 * SIZE, %ebx

	decl	%eax
	jg	.L65
	ALIGN_3

.L64:
	movl	MIN_M, %eax
	andl	$7, %eax                # 0..7 leftover rows
	jle	.L70
	ALIGN_3

.L71:
	FLD	(%esi)
	FMUL	(%ebx)
	faddp	%st,%st(1)

	addl	$SIZE, %esi
	addl	$SIZE, %ebx
	decl	%eax
	jg	.L71
	ALIGN_3

.L70:
	faddp	%st, %st(1)             # fold the four partial sums
	faddp	%st, %st(1)
	faddp	%st, %st(1)

	fmul	%st(1),%st              # total *= alpha (alpha in st(1))
	FADD	(%ebp)                  # y[j] += total
	FST	(%ebp)                  # store and pop
	addl	INCY, %ebp
	decl	J
	jg	.L61
	ALIGN_3

.L60:
	movl	A, %ebx                 # rewind A to the next row panel
	addl	NLDA, %ebx
	movl	%ebx, A

	addl	$P, IS                  # is += P
	movl	M, %esi
	cmpl	%esi, IS
	jl	.L32
	ALIGN_3

.L79:
	/* pop alpha, restore registers */
#ifndef C_SUN
	ffreep	%st(0)
#else
	.byte	0xdf                    # ffreep %st(0) encoding
	.byte	0xc0
#endif

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	addl	$ARGS, %esp
	ret

	EPILOGUE