/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Column-wise complex matrix-vector update for 32-bit x86, written in
 * AT&T syntax and run through the C preprocessor.  All arithmetic uses
 * scalar SSE2 double-precision instructions (movsd/mulsd/addsd/subsd).
 *
 * Operation, as implemented below:
 *   1. BUFFER is cleared.
 *   2. For each of the N columns j:
 *          t       = alpha * x[j]        (sign variant selected by XCONJ)
 *          BUFFER += t * A(:, j)         (sign variant selected by CONJ/XCONJ)
 *      BUFFER is walked with unit stride, A with unit stride inside a
 *      column and LDA between columns, x with stride INCX.
 *   3. y[i] += BUFFER[i] for i in 0..M-1, y walked with stride INCY.
 *
 * Every element is a (real, imaginary) pair stored at offsets
 * 0 * SIZE and 1 * SIZE.
 *
 * NOTE(review): SIZE, ZBASE_SHIFT, PROLOGUE, PROFCODE, EPILOGUE, BRANCH
 * and the ALIGN_n macros come from common.h, which is not visible here.
 * Presumably SIZE is the byte size of one double and ZBASE_SHIFT is
 * log2 of the byte size of one complex element - confirm in common.h.
 *
 * Preserved registers: %ebp, %edi, %esi, %ebx (pushed on entry, popped
 * on exit).  Clobbered: %eax, %ecx, %edx, %xmm0-%xmm7, flags.  The
 * stack slot holding A is modified in place.
 */

#define ASSEMBLER
#include "common.h"

/* PREFETCHW is defined for ATOM builds but is not referenced below. */
#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

/* Bytes occupied by the four registers pushed on entry. */
#define STACKSIZE	16

/*
 * Stack arguments.  The numeric offsets are relative to %esp at entry;
 * STACKSIZE accounts for the four pushes that follow.  Offset 12 is
 * not read by this file.
 */
#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA_R		16 + STACKSIZE(%esp)
#define ALPHA_I		24 + STACKSIZE(%esp)
#define A		32 + STACKSIZE(%esp)
#define STACK_LDA	36 + STACKSIZE(%esp)
#define STACK_X		40 + STACKSIZE(%esp)
#define STACK_INCX	44 + STACKSIZE(%esp)
#define Y		48 + STACKSIZE(%esp)
#define STACK_INCY	52 + STACKSIZE(%esp)
#define BUFFER		56 + STACKSIZE(%esp)

/*
 * Register roles.  I is the row-block counter, J the column counter.
 * INCY aliases J: the column loop has finished before INCY is loaded.
 * X doubles as the BUFFER read pointer and A1 as the y store pointer
 * during the final update.
 */
#define I	%eax
#define J	%ebx

#define INCX	%ecx
#define INCY	J

#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

/*
 * Accumulation signs.  With a = (ar, ai) the matrix element and
 * t = (tr, ti) held in %xmm6/%xmm7:
 *   ADD1:  yr (+/-)= ar * tr
 *   ADD2:  yi (+/-)= ar * ti
 *   ADD3:  yr (+/-)= ai * ti
 *   ADD4:  yi (+/-)= ai * tr
 */
#if !defined(CONJ) && !defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   addsd
#define ADD3	   subsd
#define ADD4	   addsd
#endif

#if  defined(CONJ) && !defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   addsd
#define ADD3	   addsd
#define ADD4	   subsd
#endif

#if !defined(CONJ) &&  defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   subsd
#define ADD3	   addsd
#define ADD4	   addsd
#endif

#if  defined(CONJ) &&  defined(XCONJ)
#define ADD1	   addsd
#define ADD2	   subsd
#define ADD3	   subsd
#define ADD4	   subsd
#endif

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	/* Scale the element strides by 1 << ZBASE_SHIFT. */
	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, LDA

	/*
	 * Bias the stored A pointer by +16 * SIZE; all loads through A1
	 * below use offsets starting at -16 * SIZE to compensate.
	 */
	subl	$-16 * SIZE, A

	/* Nothing to do unless both dimensions are positive. */
	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	movl	N,  J

	pxor	%xmm7, %xmm7

	/* (M + 8) >> 3 clearing passes. */
	movl	M,  %eax
	addl	$8, %eax
	sarl	$3, %eax
	ALIGN_3

	/*
	 * Clear BUFFER.  Each pass stores eight 16-byte zero vectors and
	 * advances Y1 by 16 * SIZE bytes.  movapd requires BUFFER to be
	 * 16-byte aligned.
	 */
.L01:
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

	/* ---- Column loop: one pass per column of A, J columns left. ---- */
.L10:
	/* Y1 = BUFFER biased by +16 * SIZE, matching the bias on A1. */
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	/* A1 = current column; advance the stored A to the next one. */
	movl	A,  A1
	addl	LDA, A

	/* Load x[j] = (xr, xi) and step X. */
	movsd	0 * SIZE(X), %xmm6
	movsd	1 * SIZE(X), %xmm7
	addl	INCX, X

	/*
	 * Form the four products:
	 *   xmm6 = xr * alpha_r   xmm2 = xr * alpha_i
	 *   xmm3 = xi * alpha_i   xmm7 = xi * alpha_r
	 */
	movapd	%xmm6, %xmm2
	mulsd	ALPHA_R, %xmm6
	mulsd	ALPHA_I, %xmm2
	movapd	%xmm7, %xmm3
	mulsd	ALPHA_I, %xmm3
	mulsd	ALPHA_R, %xmm7

	/* Combine into t = (xmm6, xmm7); XCONJ flips both signs. */
#ifndef XCONJ
	subsd	%xmm3, %xmm6
	addsd	%xmm2, %xmm7
#else
	addsd	%xmm3, %xmm6
	subsd	%xmm2, %xmm7
#endif

	/* Preload the first accumulator pair from BUFFER. */
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-15 * SIZE(Y1), %xmm1
	ALIGN_3

	/* I = M / 4: number of four-element row blocks. */
	movl	M,   I
	sarl	$2,  I
	jle	.L15

	/* Pipeline prologue: first element of A and its ar products. */
	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	mulsd	%xmm7, %xmm4

	/* With a single block, skip straight to the drain code. */
	decl	I
	jle	.L14
	ALIGN_3

	/*
	 * Steady state: four complex elements per pass.  The loads and
	 * ar-products for the next element overlap the accumulation of
	 * the current one, so the last group of each pass already reads
	 * the first element of the following block.
	 */
.L13:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

	/* Element 0 */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-13 * SIZE(Y1), %xmm1

	/* Element 1 */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-11 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -14 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -13 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	/* Element 2 */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-10 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -9 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	%xmm0, -12 * SIZE(Y1)
	movsd	-10 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -11 * SIZE(Y1)
	movsd	 -9 * SIZE(Y1), %xmm1

	/* Element 3, plus the loads for element 0 of the next block */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -8 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -7 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -10 * SIZE(Y1)
	movsd	 -8 * SIZE(Y1), %xmm0
	movlpd	%xmm1,  -9 * SIZE(Y1)
	movsd	 -7 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L13
	ALIGN_3

	/*
	 * Pipeline drain: the final four-element block.  Same schedule
	 * as .L13 except that the last element does not load from A
	 * past the block.
	 */
.L14:
	/* Element 0 */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-13 * SIZE(Y1), %xmm1

	/* Element 1 */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-11 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -14 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -13 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	/* Element 2 */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-10 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -9 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	%xmm0, -12 * SIZE(Y1)
	movsd	-10 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -11 * SIZE(Y1)
	movsd	 -9 * SIZE(Y1), %xmm1

	/* Element 3 (no further loads from A) */
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	/* Store element 3 and preload the accumulators for the tails. */
	movlpd	%xmm0, -10 * SIZE(Y1)
	movsd	 -8 * SIZE(Y1), %xmm0
	movlpd	%xmm1,  -9 * SIZE(Y1)
	movsd	 -7 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1
	ALIGN_3

	/* Row tail: two elements when bit 1 of M is set. */
.L15:
	testl	$2, M
	je	.L17

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	mulsd	%xmm7, %xmm4

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-13 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -14 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -13 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

	/* Row tail: one element when bit 0 of M is set. */
.L17:
	testl	$1, M
	je	.L19

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	mulsd	%xmm7, %xmm4

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -16 * SIZE(Y1)
	movlpd	%xmm1, -15 * SIZE(Y1)
	ALIGN_3

	/* Next column. */
.L19:
	decl	J
	jg	.L10
	ALIGN_4

	/*
	 * ---- Final update: y[i] += BUFFER[i] ----
	 * Y1 reads y, A1 writes y (both stepped by INCY), X reads BUFFER
	 * with unit stride.  Each group loads all of its y values before
	 * storing any of them.
	 */
.L990:
	movl	Y,   Y1
	movl	BUFFER, X
	movl	STACK_INCY, INCY

	movl	Y1, A1
	sall	$ZBASE_SHIFT, INCY

	/* %eax = M / 4: number of four-element groups. */
	movl	M,   %eax
	sarl	$2,  %eax
	jle	.L994
	ALIGN_3

.L992:
	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm2
	movsd	1 * SIZE(Y1), %xmm3
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm4
	movsd	1 * SIZE(Y1), %xmm5
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm6
	movsd	1 * SIZE(Y1), %xmm7
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3
	addsd	4 * SIZE(X), %xmm4
	addsd	5 * SIZE(X), %xmm5
	addsd	6 * SIZE(X), %xmm6
	addsd	7 * SIZE(X), %xmm7

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm2, 0 * SIZE(A1)
	movlpd	%xmm3, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm4, 0 * SIZE(A1)
	movlpd	%xmm5, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm6, 0 * SIZE(A1)
	movlpd	%xmm7, 1 * SIZE(A1)
	addl	INCY, A1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

	/*
	 * Tail: two elements when bit 1 of M is set.  TEST clears OF and
	 * the masked result has a clear sign bit, so only ZF can decide
	 * the branch; je states that directly and matches .L15/.L17.
	 */
.L994:
	testl	$2, M
	je	.L996

	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm2
	movsd	1 * SIZE(Y1), %xmm3
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm2, 0 * SIZE(A1)
	movlpd	%xmm3, 1 * SIZE(A1)
	addl	INCY, A1

	addl	$4 * SIZE, X
	ALIGN_3

	/* Tail: one element when bit 0 of M is set. */
.L996:
	testl	$1, M
	je	.L999

	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	ALIGN_3

	/* Restore the saved registers in reverse push order and return. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE