/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/* Complex double-precision GEMV kernel (i386, AT&T syntax, SSE2 scalar
 * doubles).  Presumably the ZGEMV "no-transpose" kernel for the Atom
 * target (see the ATOM prefetch block below) -- the exported symbol name
 * comes from the PROLOGUE macro in common.h, so confirm against the
 * build system.
 *
 * Structure visible in this file:
 *   1. Zero a contiguous bounce buffer (BUFFER) covering M complex
 *      elements (rounded up to a multiple of 8).
 *   2. For each of the N columns of A: form t = alpha * x[j] (with the
 *      XCONJ variant negating the appropriate cross terms), then
 *      accumulate t * A[:,j] into the buffer with a 4x-unrolled
 *      complex multiply-accumulate loop (ADD1..ADD4 select add/sub per
 *      the CONJ/XCONJ conjugation case).
 *   3. Add the buffer into the caller's strided y vector.
 *
 * All arguments are read from the stack (cdecl); %ebp/%edi/%esi/%ebx are
 * saved/restored as callee-saved registers.
 */

#define ASSEMBLER
#include "common.h"

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

/* Bytes pushed by the prologue (4 registers x 4 bytes); stack-argument
 * offsets below are relative to %esp after those pushes. */
#define STACKSIZE	16

/* Incoming stack arguments. */
#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA_R		16 + STACKSIZE(%esp)
#define ALPHA_I		24 + STACKSIZE(%esp)
#define A		32 + STACKSIZE(%esp)
#define STACK_LDA	36 + STACKSIZE(%esp)
#define STACK_X		40 + STACKSIZE(%esp)
#define STACK_INCX	44 + STACKSIZE(%esp)
#define Y		48 + STACKSIZE(%esp)
#define STACK_INCY	52 + STACKSIZE(%esp)
#define BUFFER		56 + STACKSIZE(%esp)

/* Register roles. */
#define I	%eax		/* inner (row) loop counter            */
#define J	%ebx		/* outer (column) loop counter         */

#define INCX	%ecx		/* x stride, scaled to bytes           */
#define INCY	J		/* reuses J after the column loop ends */

#define A1	%esi		/* current column of A / output cursor */
#define X	%edx		/* cursor into x (later: buffer)       */
#define Y1	%edi		/* cursor into the bounce buffer / y   */
#define LDA	%ebp		/* column stride of A, in bytes        */

/* ADD1..ADD4 pick add vs. subtract for the four partial products of a
 * complex multiply-accumulate, covering the four CONJ/XCONJ cases:
 *   re += ADD1(ar*tr);  im += ADD2(ar*ti);
 *   re  = ADD3(ai*ti);  im  = ADD4(ai*tr);
 */
#if !defined(CONJ) && !defined(XCONJ)
#define ADD1	addsd
#define ADD2	addsd
#define ADD3	subsd
#define ADD4	addsd
#endif

#if defined(CONJ) && !defined(XCONJ)
#define ADD1	addsd
#define ADD2	addsd
#define ADD3	addsd
#define ADD4	subsd
#endif

#if !defined(CONJ) && defined(XCONJ)
#define ADD1	addsd
#define ADD2	subsd
#define ADD3	addsd
#define ADD4	addsd
#endif

#if defined(CONJ) && defined(XCONJ)
#define ADD1	addsd
#define ADD2	subsd
#define ADD3	subsd
#define ADD4	subsd
#endif

	PROLOGUE

	/* Save callee-saved registers (STACKSIZE accounts for these). */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	/* Convert element strides to byte strides (complex elements). */
	sall	$ZBASE_SHIFT, INCX
	sall	$ZBASE_SHIFT, LDA

	/* Bias A by +16*SIZE so the inner loops can use the negative
	 * displacements (-16..-7)*SIZE seen below. */
	subl	$-16 * SIZE, A

	/* Quick exit for empty problems. */
	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	movl	N, J

	pxor	%xmm7, %xmm7		/* zero for clearing the buffer */

	/* Clear ceil((M+?)/8)... : (M + 8) >> 3 iterations of 16 doubles,
	 * i.e. the buffer is zeroed in 8-complex-element chunks, slightly
	 * past M elements. */
	movl	M, %eax
	addl	$8, %eax
	sarl	$3, %eax
	ALIGN_3

.L01:
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

/* Outer loop: one column of A per iteration (J counts down from N). */
.L10:
	/* Rewind the buffer cursor (biased by +16*SIZE like A). */
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	movl	A, A1			/* A1 -> current column      */
	addl	LDA, A			/* advance A to next column  */

	/* Load x[j] = (xmm6, xmm7) and advance x. */
	movsd	0 * SIZE(X), %xmm6
	movsd	1 * SIZE(X), %xmm7
	addl	INCX, X

	/* t = alpha * x[j]:
	 *   xmm6 = alpha_r*x_r,  xmm2 = alpha_i*x_r
	 *   xmm7 = alpha_r*x_i,  xmm3 = alpha_i*x_i         */
	movapd	%xmm6, %xmm2
	mulsd	ALPHA_R, %xmm6
	mulsd	ALPHA_I, %xmm2
	movapd	%xmm7, %xmm3
	mulsd	ALPHA_I, %xmm3
	mulsd	ALPHA_R, %xmm7

#ifndef XCONJ
	/* t_r = ar*xr - ai*xi ; t_i = ar*xi + ai*xr */
	subsd	%xmm3, %xmm6
	addsd	%xmm2, %xmm7
#else
	/* conj(x): t_r = ar*xr + ai*xi ; t_i = ar*xi - ai*xr */
	addsd	%xmm3, %xmm6
	subsd	%xmm2, %xmm7
#endif

	/* Prime the running accumulators with the first buffer element. */
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-15 * SIZE(Y1), %xmm1
	ALIGN_3

	movl	M, I
	sarl	$2, I			/* I = M / 4 (4 complex/iter) */
	jle	.L15

	/* Software-pipeline: preload A element and start its products. */
	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	mulsd	%xmm7, %xmm4

	decl	I
	jle	.L14			/* only one 4-group: go to tail */
	ALIGN_3

/* Main loop: 4 complex elements per iteration.  Pattern per element:
 *   buf_r = ADD3(ai*t_i, ADD1(ar*t_r, buf_r))
 *   buf_i = ADD4(ai*t_r, ADD2(ar*t_i, buf_i))
 * with the next A element's load/multiply interleaved for latency hiding. */
.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	/* Store finished element, reload next buffer element. */
	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-13 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-11 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -14 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -13 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-10 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -9 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	%xmm0, -12 * SIZE(Y1)
	movsd	-10 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -11 * SIZE(Y1)
	movsd	 -9 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	 -8 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -7 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -10 * SIZE(Y1)
	movsd	 -8 * SIZE(Y1), %xmm0
	movlpd	%xmm1,  -9 * SIZE(Y1)
	movsd	 -7 * SIZE(Y1), %xmm1

	/* subl $-k == addl $k (shorter encoding for large k). */
	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L13
	ALIGN_3

/* Loop epilogue: drain the pipelined last 4-element group (the fourth
 * element's products were already started, so no trailing A loads). */
.L14:
	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-13 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-11 * SIZE(A1), %xmm3
	mulsd	%xmm7, %xmm4
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -14 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -13 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-10 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	 -9 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	%xmm0, -12 * SIZE(Y1)
	movsd	-10 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -11 * SIZE(Y1)
	movsd	 -9 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -10 * SIZE(Y1)
	movsd	 -8 * SIZE(Y1), %xmm0
	movlpd	%xmm1,  -9 * SIZE(Y1)
	movsd	 -7 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1
	ALIGN_3

/* Remainder: 2 complex elements if bit 1 of M is set. */
.L15:
	testl	$2, M
	je	.L17

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	mulsd	%xmm7, %xmm4

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	ADD3	%xmm3, %xmm0
	movsd	-13 * SIZE(A1), %xmm3
	ADD4	%xmm5, %xmm1
	mulsd	%xmm7, %xmm4

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-13 * SIZE(Y1), %xmm1

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -14 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -13 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

/* Remainder: final single complex element if M is odd. */
.L17:
	testl	$1, M
	je	.L19

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3

	movapd	%xmm2, %xmm4
	mulsd	%xmm6, %xmm2
	mulsd	%xmm7, %xmm4

	movapd	%xmm3, %xmm5
	mulsd	%xmm7, %xmm3
	ADD1	%xmm2, %xmm0
	mulsd	%xmm6, %xmm5
	ADD2	%xmm4, %xmm1

	ADD3	%xmm3, %xmm0
	ADD4	%xmm5, %xmm1

	movlpd	%xmm0, -16 * SIZE(Y1)
	movlpd	%xmm1, -15 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J
	jg	.L10
	ALIGN_4

/* Final pass: y += buffer, honoring the caller's INCY stride.
 * Y1 reads y, A1 writes y back (same cursor positions), X walks the
 * contiguous buffer.  INCY aliases J, which is free after the column
 * loop. */
.L990:
	movl	Y, Y1
	movl	BUFFER, X
	movl	STACK_INCY, INCY

	movl	Y1, A1
	sall	$ZBASE_SHIFT, INCY	/* element stride -> bytes */

	movl	M, %eax
	sarl	$2, %eax		/* 4 complex elements per iter */
	jle	.L994
	ALIGN_3

.L992:
	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm2
	movsd	1 * SIZE(Y1), %xmm3
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm4
	movsd	1 * SIZE(Y1), %xmm5
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm6
	movsd	1 * SIZE(Y1), %xmm7
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3
	addsd	4 * SIZE(X), %xmm4
	addsd	5 * SIZE(X), %xmm5
	addsd	6 * SIZE(X), %xmm6
	addsd	7 * SIZE(X), %xmm7

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm2, 0 * SIZE(A1)
	movlpd	%xmm3, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm4, 0 * SIZE(A1)
	movlpd	%xmm5, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm6, 0 * SIZE(A1)
	movlpd	%xmm7, 1 * SIZE(A1)
	addl	INCY, A1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

/* Merge remainder: 2 elements. */
.L994:
	testl	$2, M
	jle	.L996

	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1
	addl	INCY, Y1

	movsd	0 * SIZE(Y1), %xmm2
	movsd	1 * SIZE(Y1), %xmm3
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	addl	INCY, A1

	movlpd	%xmm2, 0 * SIZE(A1)
	movlpd	%xmm3, 1 * SIZE(A1)
	addl	INCY, A1

	addl	$4 * SIZE, X
	ALIGN_3

/* Merge remainder: last element if M is odd. */
.L996:
	testl	$1, M
	jle	.L999

	movsd	0 * SIZE(Y1), %xmm0
	movsd	1 * SIZE(Y1), %xmm1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1

	movlpd	%xmm0, 0 * SIZE(A1)
	movlpd	%xmm1, 1 * SIZE(A1)
	ALIGN_3

/* Restore callee-saved registers and return. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE