/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Double-precision complex GEMV, no-transpose kernel for 32-bit x86
 * with SSE2 (GotoBLAS/OpenBLAS "common.h" framework; PROLOGUE,
 * PROFCODE, ALIGN_*, SIZE and ZBASE_SHIFT come from that header).
 *
 * Computes, via an intermediate contiguous buffer:
 *     buffer[0..M-1] = A * (alpha * x)        (complex, column by column)
 *     y[i*INCY]     += buffer[i]              (final add-back pass)
 * which overall is y += alpha * A * x, with CONJ/XCONJ selecting the
 * conjugation variants through sign masks and the SUBPD macro.
 *
 * Arguments are read off the stack (cdecl-style; see offsets below).
 * %ebx/%esi/%edi/%ebp are callee-saved on i386 and are pushed/popped.
 */

#define ASSEMBLER
#include "common.h"

/* Per-microarchitecture prefetch instruction and distance (in units of
   8*SIZE).  NOTE(review): PREFETCHW is defined for several targets but
   is not used anywhere in this kernel. */
#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 2)
#endif

#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 7)
#endif

#ifdef OPTERON
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 3)
/* On Opteron, movlps is used in place of movsd (same 64-bit load). */
#define movsd		movlps
#endif

#ifdef BARCELONA
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(8 * 5)
#endif

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE	(8 * 4)
#endif

/* Four callee-saved registers are pushed in the prologue, so the
   incoming arguments sit 16 bytes above %esp. */
#define STACKSIZE	16

/* Stack-passed arguments (offsets relative to %esp after the pushes).
   ALPHA_R/ALPHA_I are 8-byte doubles, hence the 16/24 gap. */
#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA_R		16 + STACKSIZE(%esp)
#define ALPHA_I		24 + STACKSIZE(%esp)
#define A		32 + STACKSIZE(%esp)
#define STACK_LDA	36 + STACKSIZE(%esp)
#define STACK_X		40 + STACKSIZE(%esp)
#define STACK_INCX	44 + STACKSIZE(%esp)
#define Y		48 + STACKSIZE(%esp)
#define STACK_INCY	52 + STACKSIZE(%esp)
#define BUFFER		56 + STACKSIZE(%esp)

/* Register roles. */
#define I	%eax		/* inner (row) loop counter            */
#define J	%ebx		/* outer (column) loop counter         */

#define INCX	%ecx
#define INCY	J		/* reuses %ebx after the column loop   */

#define A1	%esi		/* pointer into the current A column   */
#define X	%edx		/* pointer into x                      */
#define Y1	%edi		/* pointer into buffer (later into y)  */
#define LDA	%ebp

#undef SUBPD

/* Cross-term combine: subtract for the plain and doubly-conjugated
   cases, add when exactly one of CONJ/XCONJ is set. */
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ))
#define SUBPD	subpd
#else
#define SUBPD	addpd
#endif

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	/* Convert element strides to byte strides (complex elements). */
	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, LDA

	/* Bias A by +16*SIZE so the loops can use -16*SIZE-based
	   displacements (subl $-16*SIZE == addl $16*SIZE). */
	subl	$-16 * SIZE, A

	/* Quick return if M <= 0 or N <= 0. */
	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	movl	N,  J

	pxor	%xmm7, %xmm7

	/* Zero (M + 8) >> 3 blocks of 16 doubles in BUFFER; the +8
	   over-allocation pads past the last partial block. */
	movl	M,   %eax
	addl	$8,  %eax
	sarl	$3,  %eax
	ALIGN_3

.L01:	/* Clear 16 doubles (8 complex elements) per iteration. */
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

.L10:	/* Column loop: buffer += (alpha * x[j]) * A(:, j). */
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1		/* same -16*SIZE bias as A */

	movl	A,  A1
	addl	LDA, A			/* advance A to the next column */

	/* xmm6 = x[j] = (re, im); advance x by INCX. */
	movsd	 0 * SIZE(X), %xmm6
	movhpd	 1 * SIZE(X), %xmm6
	addl	INCX, X

	/* Build sign mask xmm5 = (0x0, 0x8000000000000000): flips the
	   sign of the high double only. */
	pcmpeqb	%xmm5, %xmm5
	psllq	$63,   %xmm5
	shufps	$0xc0, %xmm5, %xmm5

	pshufd	$0x4e, %xmm6, %xmm7	/* xmm7 = (im, re), swapped */

	/* Broadcast alpha_r -> xmm3, alpha_i -> xmm4. */
#ifdef HAVE_SSE3
	movddup	ALPHA_R, %xmm3
	movddup	ALPHA_I, %xmm4
#else
	movsd	ALPHA_R, %xmm3
	movsd	ALPHA_I, %xmm4

	unpcklpd %xmm3, %xmm3
	unpcklpd %xmm4, %xmm4
#endif

	xorpd	%xmm5, %xmm7		/* negate high half of swapped x[j] */

	/* Complex multiply t = alpha * x[j] (XCONJ selects +/- of the
	   cross term). */
	mulpd	%xmm3, %xmm6
	mulpd	%xmm4, %xmm7

#ifndef XCONJ
	subpd	%xmm7, %xmm6
#else
	addpd	%xmm7, %xmm6
#endif

	/* xmm6 = (t_re, t_re), xmm7 = (t_im, t_im). */
	pshufd	$0xee, %xmm6, %xmm7
	pshufd	$0x44, %xmm6, %xmm6

	/* Fold the CONJ sign into one of the broadcast factors so the
	   inner loop needs no extra sign handling. */
#ifndef CONJ
	xorpd	%xmm5, %xmm7
#else
	xorpd	%xmm5, %xmm6
#endif

	/* Preload the first two buffer elements. */
	movapd	-16 * SIZE(Y1), %xmm0
	movapd	-14 * SIZE(Y1), %xmm1
	ALIGN_3

	movl	M, I
	sarl	$2, I			/* four complex elements per pass */
	jle	.L15

	/* Software-pipeline: preload two A elements before the loop. */
	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm4
	movhpd	-13 * SIZE(A1), %xmm4

	decl	I
	jle	.L14
	ALIGN_3

.L13:	/* Main loop, 4 complex elements per iteration.  Each element:
	   y += t_re * a + t_im * swap(a)  (SUBPD applies the sign). */
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) * SIZE(A1)
#endif

	pshufd	$0x4e, %xmm2, %xmm3
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2	/* prefetch next A element */
	movhpd	-11 * SIZE(A1), %xmm2
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm6, %xmm4
	addpd	%xmm4, %xmm1
	movsd	-10 * SIZE(A1), %xmm4
	movhpd	 -9 * SIZE(A1), %xmm4

	mulpd	%xmm7, %xmm3
	SUBPD	%xmm3, %xmm0
	movapd	%xmm0, -16 * SIZE(Y1)	/* store, then reload next pair */
	movapd	-12 * SIZE(Y1), %xmm0
	mulpd	%xmm7, %xmm5
	SUBPD	%xmm5, %xmm1
	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

	pshufd	$0x4e, %xmm2, %xmm3
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	 -8 * SIZE(A1), %xmm2
	movhpd	 -7 * SIZE(A1), %xmm2
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm6, %xmm4
	addpd	%xmm4, %xmm1
	movsd	 -6 * SIZE(A1), %xmm4
	movhpd	 -5 * SIZE(A1), %xmm4

	mulpd	%xmm7, %xmm3
	SUBPD	%xmm3, %xmm0
	movapd	%xmm0, -12 * SIZE(Y1)
	movapd	 -8 * SIZE(Y1), %xmm0
	mulpd	%xmm7, %xmm5
	SUBPD	%xmm5, %xmm1
	movapd	%xmm1, -10 * SIZE(Y1)
	movapd	 -6 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1		/* advance by 4 complex elements */
	subl	$-8 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L13
	ALIGN_3

.L14:	/* Loop tail: drain the last pipelined group (no fresh loads in
	   the second half). */
	pshufd	$0x4e, %xmm2, %xmm3
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	movhpd	-11 * SIZE(A1), %xmm2
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm6, %xmm4
	addpd	%xmm4, %xmm1
	movsd	-10 * SIZE(A1), %xmm4
	movhpd	 -9 * SIZE(A1), %xmm4

	mulpd	%xmm7, %xmm3
	SUBPD	%xmm3, %xmm0
	movapd	%xmm0, -16 * SIZE(Y1)
	movapd	-12 * SIZE(Y1), %xmm0
	mulpd	%xmm7, %xmm5
	SUBPD	%xmm5, %xmm1
	movapd	%xmm1, -14 * SIZE(Y1)
	movapd	-10 * SIZE(Y1), %xmm1

	pshufd	$0x4e, %xmm2, %xmm3
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm6, %xmm4
	addpd	%xmm4, %xmm1

	mulpd	%xmm7, %xmm3
	SUBPD	%xmm3, %xmm0
	movapd	%xmm0, -12 * SIZE(Y1)
	movapd	 -8 * SIZE(Y1), %xmm0
	mulpd	%xmm7, %xmm5
	SUBPD	%xmm5, %xmm1
	movapd	%xmm1, -10 * SIZE(Y1)
	movapd	 -6 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1
	ALIGN_3

.L15:	/* Remainder: two complex elements if bit 1 of M is set. */
	testl	$2, M
	je	.L17

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2
	movsd	-14 * SIZE(A1), %xmm4
	movhpd	-13 * SIZE(A1), %xmm4

	pshufd	$0x4e, %xmm2, %xmm3
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	pshufd	$0x4e, %xmm4, %xmm5
	mulpd	%xmm6, %xmm4
	addpd	%xmm4, %xmm1

	mulpd	%xmm7, %xmm3
	SUBPD	%xmm3, %xmm0
	movapd	%xmm0, -16 * SIZE(Y1)
	mulpd	%xmm7, %xmm5
	SUBPD	%xmm5, %xmm1
	movapd	%xmm1, -14 * SIZE(Y1)

	movapd	-12 * SIZE(Y1), %xmm0

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

.L17:	/* Remainder: one final complex element if M is odd. */
	testl	$1, M
	je	.L19

	movsd	-16 * SIZE(A1), %xmm2
	movhpd	-15 * SIZE(A1), %xmm2

	pshufd	$0x4e, %xmm2, %xmm3
	mulpd	%xmm6, %xmm2
	addpd	%xmm2, %xmm0
	mulpd	%xmm7, %xmm3
	SUBPD	%xmm3, %xmm0

	movapd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J
	jg	.L10
	ALIGN_4

.L990:	/* Add-back pass: y[i*INCY] += buffer[i].  J is dead here, so
	   %ebx is reused as INCY. */
	movl	Y, Y1
	movl	BUFFER, X

	movl	STACK_INCY, INCY
	sall	$ZBASE_SHIFT, INCY

	movl	M, %eax
	sarl	$2, %eax		/* four complex elements per pass */
	jle	.L994
	ALIGN_3

.L992:	/* Unaligned complex load/add/store on strided y, x contiguous. */
	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	2 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	4 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	6 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

.L994:	/* Remainder: two complex elements. */
	testl	$2, M
	jle	.L996

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	2 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	addl	INCY, Y1

	addl	$4 * SIZE, X
	ALIGN_3

.L996:	/* Remainder: one last complex element. */
	testl	$1, M
	jle	.L999

	movsd	0 * SIZE(Y1), %xmm0
	movhpd	1 * SIZE(Y1), %xmm0

	addpd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, 0 * SIZE(Y1)
	movhpd	%xmm0, 1 * SIZE(Y1)
	ALIGN_3

.L999:	/* Restore callee-saved registers and return. */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE