1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define OLD_M %rdi 43#define OLD_N %rsi 44#define M %r13 45#define N %r14 46#define K %rdx 47 48#define A %rcx 49#define B %r8 50#define C %r9 51#define LDC %r10 52#define I %r11 53#define AO %rdi 54#define BO %rsi 55#define CO1 %r15 56#define CO2 %rbp 57#define BB %r12 58 59#ifndef WINDOWS_ABI 60 61#define STACKSIZE 64 62 63#else 64 65#define STACKSIZE 256 66 67#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) 68#define OLD_A 48 + STACKSIZE(%rsp) 69#define OLD_B 56 + STACKSIZE(%rsp) 70#define OLD_C 64 + STACKSIZE(%rsp) 71#define OLD_LDC 72 + STACKSIZE(%rsp) 72#define OLD_OFFSET 80 + STACKSIZE(%rsp) 73 74#endif 75 76#define POSINV 0(%rsp) 77#define ALPHA_R 16(%rsp) 78#define ALPHA_I 32(%rsp) 79#define J 48(%rsp) 80#define OFFSET 56(%rsp) 81#define KK 64(%rsp) 82#define KKK 72(%rsp) 83#define BUFFER 256(%rsp) 84 85#ifdef OPTERON 86#define PREFETCH prefetch 87#define PREFETCHW prefetchw 88#define PREFETCHSIZE (8 * 9 + 4) 89 90#define RPREFETCHSIZE (8 * 7 + 4) 91#define WPREFETCHSIZE (8 * 8 + 4) 92#endif 93 94#ifdef GENERIC 95#define PREFETCH prefetcht0 96#define PREFETCHW prefetcht0 97#define PREFETCHSIZE (8 * 5 + 4) 98 99#define RPREFETCHSIZE (8 * 7 + 4) 100#define WPREFETCHSIZE (8 * 8 + 4) 101#endif 102 103#ifndef GENERIC 104#define KERNEL1(xx) \ 105 mulpd %xmm0, %xmm1 ;\ 106 addpd %xmm1, %xmm8 ;\ 107 movapd -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ 108 mulpd %xmm0, %xmm3 ;\ 109 addpd %xmm3, %xmm9 ;\ 110 movapd -14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ 111 mulpd %xmm0, %xmm5 ;\ 112 PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ 113 mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ 114 addpd %xmm5, %xmm10 ;\ 115 movapd -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ 116 addpd %xmm0, %xmm11 ;\ 117 movapd -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 118 119#define KERNEL2(xx) \ 120 mulpd %xmm2, 
%xmm1 ;\ 121 addpd %xmm1, %xmm12 ;\ 122 movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ 123 mulpd %xmm2, %xmm3 ;\ 124 addpd %xmm3, %xmm13 ;\ 125 movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ 126 mulpd %xmm2, %xmm5 ;\ 127 mulpd -10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ 128 addpd %xmm5, %xmm14 ;\ 129 movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ 130 addpd %xmm2, %xmm15 ;\ 131 movapd -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 132 133#define KERNEL3(xx) \ 134 mulpd %xmm4, %xmm7 ;\ 135 addpd %xmm7, %xmm8 ;\ 136 movapd -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ 137 mulpd %xmm4, %xmm3 ;\ 138 addpd %xmm3, %xmm9 ;\ 139 movapd -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ 140 mulpd %xmm4, %xmm5 ;\ 141 mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ 142 addpd %xmm5, %xmm10 ;\ 143 movapd -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ 144 addpd %xmm4, %xmm11 ;\ 145 movapd -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 146 147#define KERNEL4(xx) \ 148 mulpd %xmm6, %xmm7 ;\ 149 addpd %xmm7, %xmm12 ;\ 150 movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ 151 mulpd %xmm6, %xmm3 ;\ 152 addpd %xmm3, %xmm13 ;\ 153 movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ 154 mulpd %xmm6, %xmm5 ;\ 155 mulpd -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ 156 addpd %xmm5, %xmm14 ;\ 157 movapd 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ 158 PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\ 159 addpd %xmm6, %xmm15 ;\ 160 movapd -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 161 162#define KERNEL5(xx) \ 163 mulpd %xmm0, %xmm1 ;\ 164 addpd %xmm1, %xmm8 ;\ 165 movapd 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ 166 mulpd %xmm0, %xmm3 ;\ 167 addpd %xmm3, %xmm9 ;\ 168 movapd 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ 169 mulpd %xmm0, %xmm5 ;\ 170 mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\ 171 addpd %xmm5, %xmm10 ;\ 172 movapd 4 * SIZE + 2 * (xx) * SIZE(BO, 
%rax, 8), %xmm5 ;\ 173 addpd %xmm0, %xmm11 ;\ 174 movapd 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0 175 176#define KERNEL6(xx) \ 177 mulpd %xmm2, %xmm1 ;\ 178 addpd %xmm1, %xmm12 ;\ 179 movapd 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\ 180 mulpd %xmm2, %xmm3 ;\ 181 addpd %xmm3, %xmm13 ;\ 182 movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ 183 mulpd %xmm2, %xmm5 ;\ 184 mulpd 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\ 185 addpd %xmm5, %xmm14 ;\ 186 movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ 187 addpd %xmm2, %xmm15 ;\ 188 movapd 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2 189 190#define KERNEL7(xx) \ 191 mulpd %xmm4, %xmm7 ;\ 192 addpd %xmm7, %xmm8 ;\ 193 movapd 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ 194 mulpd %xmm4, %xmm3 ;\ 195 addpd %xmm3, %xmm9 ;\ 196 movapd 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ 197 mulpd %xmm4, %xmm5 ;\ 198 mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\ 199 addpd %xmm5, %xmm10 ;\ 200 movapd 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ 201 addpd %xmm4, %xmm11 ;\ 202 movapd 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4 203 204#define KERNEL8(xx) \ 205 mulpd %xmm6, %xmm7 ;\ 206 addpd %xmm7, %xmm12 ;\ 207 movapd 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\ 208 mulpd %xmm6, %xmm3 ;\ 209 addpd %xmm3, %xmm13 ;\ 210 movapd 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\ 211 mulpd %xmm6, %xmm5 ;\ 212 mulpd 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\ 213 addpd %xmm5, %xmm14 ;\ 214 movapd 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\ 215 addpd %xmm6, %xmm15 ;\ 216 movapd 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6 217 218#else 219#define KERNEL1(xx) \ 220 mulpd %xmm0, %xmm1 ;\ 221 addpd %xmm1, %xmm8 ;\ 222 movapd -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ 223 mulpd %xmm0, %xmm3 ;\ 224 addpd %xmm3, %xmm9 ;\ 225 movapd -14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ 226 mulpd %xmm0, %xmm5 ;\ 227 PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ 
228 mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\ 229 addpd %xmm5, %xmm10 ;\ 230 movapd -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ 231 addpd %xmm0, %xmm11 ;\ 232 movapd -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 233 234#define KERNEL2(xx) \ 235 mulpd %xmm2, %xmm1 ;\ 236 addpd %xmm1, %xmm12 ;\ 237 movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ 238 mulpd %xmm2, %xmm3 ;\ 239 addpd %xmm3, %xmm13 ;\ 240 movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ 241 mulpd %xmm2, %xmm5 ;\ 242 mulpd -10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ 243 addpd %xmm5, %xmm14 ;\ 244 movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ 245 addpd %xmm2, %xmm15 ;\ 246 movapd -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 247 248#define KERNEL3(xx) \ 249 mulpd %xmm4, %xmm7 ;\ 250 addpd %xmm7, %xmm8 ;\ 251 movapd -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ 252 mulpd %xmm4, %xmm3 ;\ 253 addpd %xmm3, %xmm9 ;\ 254 movapd -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ 255 mulpd %xmm4, %xmm5 ;\ 256 mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ 257 addpd %xmm5, %xmm10 ;\ 258 movapd -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ 259 addpd %xmm4, %xmm11 ;\ 260 movapd -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 261 262#define KERNEL4(xx) \ 263 mulpd %xmm6, %xmm7 ;\ 264 addpd %xmm7, %xmm12 ;\ 265 movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ 266 mulpd %xmm6, %xmm3 ;\ 267 addpd %xmm3, %xmm13 ;\ 268 movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ 269 mulpd %xmm6, %xmm5 ;\ 270 mulpd -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ 271 addpd %xmm5, %xmm14 ;\ 272 movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ 273 PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ 274 addpd %xmm6, %xmm15 ;\ 275 movapd -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 276 277#define KERNEL5(xx) \ 278 mulpd %xmm0, %xmm1 ;\ 279 addpd %xmm1, %xmm8 ;\ 280 movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ 281 mulpd %xmm0, %xmm3 ;\ 282 addpd %xmm3, %xmm9 ;\ 283 movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ 284 mulpd %xmm0, %xmm5 ;\ 285 mulpd 6 * SIZE + 2 * (xx) * 
SIZE(BO), %xmm0 ;\ 286 addpd %xmm5, %xmm10 ;\ 287 movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ 288 addpd %xmm0, %xmm11 ;\ 289 movapd 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0 290 291#define KERNEL6(xx) \ 292 mulpd %xmm2, %xmm1 ;\ 293 addpd %xmm1, %xmm12 ;\ 294 movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\ 295 mulpd %xmm2, %xmm3 ;\ 296 addpd %xmm3, %xmm13 ;\ 297 movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ 298 mulpd %xmm2, %xmm5 ;\ 299 mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\ 300 addpd %xmm5, %xmm14 ;\ 301 movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ 302 addpd %xmm2, %xmm15 ;\ 303 movapd 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 304 305#define KERNEL7(xx) \ 306 mulpd %xmm4, %xmm7 ;\ 307 addpd %xmm7, %xmm8 ;\ 308 movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ 309 mulpd %xmm4, %xmm3 ;\ 310 addpd %xmm3, %xmm9 ;\ 311 movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ 312 mulpd %xmm4, %xmm5 ;\ 313 mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\ 314 addpd %xmm5, %xmm10 ;\ 315 movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ 316 addpd %xmm4, %xmm11 ;\ 317 movapd 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4 318 319#define KERNEL8(xx) \ 320 mulpd %xmm6, %xmm7 ;\ 321 addpd %xmm7, %xmm12 ;\ 322 movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\ 323 mulpd %xmm6, %xmm3 ;\ 324 addpd %xmm3, %xmm13 ;\ 325 movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\ 326 mulpd %xmm6, %xmm5 ;\ 327 mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\ 328 addpd %xmm5, %xmm14 ;\ 329 movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\ 330 addpd %xmm6, %xmm15 ;\ 331 movapd 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6 332 333#endif 334 335 PROLOGUE 336 PROFCODE 337 338 subq $STACKSIZE, %rsp 339 340 movq %rbx, 0(%rsp) 341 movq %rbp, 8(%rsp) 342 movq %r12, 16(%rsp) 343 movq %r13, 24(%rsp) 344 movq %r14, 32(%rsp) 345 movq %r15, 40(%rsp) 346 347#ifdef WINDOWS_ABI 348 movq %rdi, 48(%rsp) 349 movq %rsi, 56(%rsp) 350 movups %xmm6, 64(%rsp) 351 movups %xmm7, 80(%rsp) 352 movups %xmm8, 96(%rsp) 353 movups %xmm9, 112(%rsp) 
354 movups %xmm10, 128(%rsp) 355 movups %xmm11, 144(%rsp) 356 movups %xmm12, 160(%rsp) 357 movups %xmm13, 176(%rsp) 358 movups %xmm14, 192(%rsp) 359 movups %xmm15, 208(%rsp) 360 361 movq ARG1, OLD_M 362 movq ARG2, OLD_N 363 movq ARG3, K 364 movq OLD_A, A 365 movq OLD_B, B 366 movq OLD_C, C 367 movq OLD_LDC, LDC 368#ifdef TRMMKERNEL 369 movsd OLD_OFFSET, %xmm12 370#endif 371 movaps %xmm3, %xmm0 372 movsd OLD_ALPHA_I, %xmm1 373#else 374 movq 72(%rsp), LDC 375#ifdef TRMMKERNEL 376 movsd 80(%rsp), %xmm12 377#endif 378 379#endif 380 381 EMMS 382 383 movq %rsp, %rbx # save old stack 384 subq $256 + LOCAL_BUFFER_SIZE, %rsp 385 andq $-4096, %rsp # align stack 386 387 STACK_TOUCHING 388 389 movq OLD_M, M 390 movq OLD_N, N 391 392 pcmpeqb %xmm7, %xmm7 393 psllq $63, %xmm7 # Generate mask 394 pxor %xmm10, %xmm10 395 396 movlpd %xmm0, 0 + ALPHA_R 397 movlpd %xmm0, 8 + ALPHA_R 398 399 movlpd %xmm1, 8 + ALPHA_I 400 xorpd %xmm7, %xmm1 401 movlpd %xmm1, 0 + ALPHA_I 402 403 movlpd %xmm10, 0 + POSINV 404 movlpd %xmm7, 8 + POSINV 405 406#ifdef TRMMKERNEL 407 movlpd %xmm12, OFFSET 408 movlpd %xmm12, KK 409#ifndef LEFT 410 negq KK 411#endif 412#endif 413 414 subq $-16 * SIZE, A 415 416 salq $ZBASE_SHIFT, LDC 417 418 movq N, J 419 sarq $1, J # j = (n >> 2) 420 jle .L100 421 ALIGN_4 422 423.L01: 424 movq C, CO1 # coffset1 = c 425 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 426 427#if defined(TRMMKERNEL) && defined(LEFT) 428 movq OFFSET, %rax 429 movq %rax, KK 430#endif 431 432 leaq 16 * SIZE + BUFFER, BO 433 434 movq K, %rax 435 sarq $2, %rax 436 jle .L03 437 ALIGN_4 438 439.L02: 440 PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) 441 442 movq 0 * SIZE(B), %mm0 443 movq %mm0, -16 * SIZE(BO) 444 movq %mm0, -15 * SIZE(BO) 445 movq 1 * SIZE(B), %mm1 446 movq %mm1, -14 * SIZE(BO) 447 movq %mm1, -13 * SIZE(BO) 448 449 movq 2 * SIZE(B), %mm2 450 movq %mm2, -12 * SIZE(BO) 451 movq %mm2, -11 * SIZE(BO) 452 movq 3 * SIZE(B), %mm3 453 movq %mm3, -10 * SIZE(BO) 454 movq %mm3, -9 * SIZE(BO) 455 456 PREFETCHW 
(WPREFETCHSIZE + 0) * SIZE(BO) 457 458 movq 4 * SIZE(B), %mm4 459 movq %mm4, -8 * SIZE(BO) 460 movq %mm4, -7 * SIZE(BO) 461 movq 5 * SIZE(B), %mm5 462 movq %mm5, -6 * SIZE(BO) 463 movq %mm5, -5 * SIZE(BO) 464 465 PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) 466 467 movq 6 * SIZE(B), %mm6 468 movq %mm6, -4 * SIZE(BO) 469 movq %mm6, -3 * SIZE(BO) 470 movq 7 * SIZE(B), %mm7 471 movq %mm7, -2 * SIZE(BO) 472 movq %mm7, -1 * SIZE(BO) 473 474 PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) 475 476 movq 8 * SIZE(B), %mm0 477 movq %mm0, 0 * SIZE(BO) 478 movq %mm0, 1 * SIZE(BO) 479 movq 9 * SIZE(B), %mm1 480 movq %mm1, 2 * SIZE(BO) 481 movq %mm1, 3 * SIZE(BO) 482 483 movq 10 * SIZE(B), %mm2 484 movq %mm2, 4 * SIZE(BO) 485 movq %mm2, 5 * SIZE(BO) 486 movq 11 * SIZE(B), %mm3 487 movq %mm3, 6 * SIZE(BO) 488 movq %mm3, 7 * SIZE(BO) 489 490 PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) 491 492 movq 12 * SIZE(B), %mm4 493 movq %mm4, 8 * SIZE(BO) 494 movq %mm4, 9 * SIZE(BO) 495 movq 13 * SIZE(B), %mm5 496 movq %mm5, 10 * SIZE(BO) 497 movq %mm5, 11 * SIZE(BO) 498 499 PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) 500 501 movq 14 * SIZE(B), %mm6 502 movq %mm6, 12 * SIZE(BO) 503 movq %mm6, 13 * SIZE(BO) 504 movq 15 * SIZE(B), %mm7 505 movq %mm7, 14 * SIZE(BO) 506 movq %mm7, 15 * SIZE(BO) 507 508 addq $ 32 * SIZE, BO 509 subq $-16 * SIZE, B 510 decq %rax 511 jne .L02 512 ALIGN_4 513 514.L03: 515 movq K, %rax 516 andq $3, %rax 517 BRANCH 518 jle .L05 519 ALIGN_4 520 521.L04: 522 movq 0 * SIZE(B), %mm0 523 movq %mm0, -16 * SIZE(BO) 524 movq %mm0, -15 * SIZE(BO) 525 movq 1 * SIZE(B), %mm1 526 movq %mm1, -14 * SIZE(BO) 527 movq %mm1, -13 * SIZE(BO) 528 529 movq 2 * SIZE(B), %mm2 530 movq %mm2, -12 * SIZE(BO) 531 movq %mm2, -11 * SIZE(BO) 532 movq 3 * SIZE(B), %mm3 533 movq %mm3, -10 * SIZE(BO) 534 movq %mm3, -9 * SIZE(BO) 535 536 addq $ 4 * SIZE, B 537 addq $ 8 * SIZE, BO 538 539 decq %rax 540 jne .L04 541 ALIGN_4 542 543.L05: 544 movq A, AO # aoffset = a 545 546 leaq (RPREFETCHSIZE + 0) * SIZE(B), BB 547 548 
movq M, I 549 sarq $1, I # i = (m >> 2) 550 jle .L30 551 ALIGN_4 552 553.L10: 554#if !defined(TRMMKERNEL) || \ 555 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 556 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 557 558 leaq 16 * SIZE + BUFFER, BO 559#else 560 leaq 16 * SIZE + BUFFER, BO 561 movq KK, %rax 562 leaq (, %rax, SIZE), %rax 563 leaq (AO, %rax, 4), AO 564 leaq (BO, %rax, 8), BO 565#endif 566 567 movapd -16 * SIZE(AO), %xmm0 568 movapd -16 * SIZE(BO), %xmm1 569 pxor %xmm8, %xmm8 570 PREFETCH 0 * SIZE(BB) 571 movapd -14 * SIZE(AO), %xmm2 572 movapd -14 * SIZE(BO), %xmm3 573 pxor %xmm9, %xmm9 574 movapd -12 * SIZE(AO), %xmm4 575 movapd -12 * SIZE(BO), %xmm5 576 pxor %xmm10, %xmm10 577 movapd -10 * SIZE(AO), %xmm6 578 movapd -8 * SIZE(BO), %xmm7 579 pxor %xmm11, %xmm11 580 581 pxor %xmm12, %xmm12 582 PREFETCHW 3 * SIZE(CO1) 583 pxor %xmm13, %xmm13 584 PREFETCHW 3 * SIZE(CO2) 585 pxor %xmm14, %xmm14 586 pxor %xmm15, %xmm15 587 588#ifndef TRMMKERNEL 589 movq K, %rax 590#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 591 movq K, %rax 592 subq KK, %rax 593 movq %rax, KKK 594#else 595 movq KK, %rax 596#ifdef LEFT 597 addq $2, %rax 598#else 599 addq $2, %rax 600#endif 601 movq %rax, KKK 602#endif 603#ifndef GENERIC 604 andq $-8, %rax 605 606 leaq (, %rax, SIZE), %rax 607 leaq (AO, %rax, 4), AO 608 leaq (BO, %rax, 8), BO 609 negq %rax 610 NOBRANCH 611 je .L15 612 ALIGN_3 613 614.L12: 615 KERNEL1(16 * 0) 616 KERNEL2(16 * 0) 617 KERNEL3(16 * 0) 618 KERNEL4(16 * 0) 619 KERNEL5(16 * 0) 620 KERNEL6(16 * 0) 621 KERNEL7(16 * 0) 622 KERNEL8(16 * 0) 623 624 KERNEL1(16 * 1) 625 KERNEL2(16 * 1) 626 KERNEL3(16 * 1) 627 KERNEL4(16 * 1) 628 KERNEL5(16 * 1) 629 KERNEL6(16 * 1) 630 KERNEL7(16 * 1) 631 KERNEL8(16 * 1) 632 633 addq $8 * SIZE, %rax 634 NOBRANCH 635 je .L15 636 KERNEL1(16 * 0) 637 KERNEL2(16 * 0) 638 KERNEL3(16 * 0) 639 KERNEL4(16 * 0) 640 KERNEL5(16 * 0) 641 KERNEL6(16 * 0) 642 KERNEL7(16 * 0) 643 
KERNEL8(16 * 0) 644 645 KERNEL1(16 * 1) 646 KERNEL2(16 * 1) 647 KERNEL3(16 * 1) 648 KERNEL4(16 * 1) 649 KERNEL5(16 * 1) 650 KERNEL6(16 * 1) 651 KERNEL7(16 * 1) 652 KERNEL8(16 * 1) 653 654 addq $8 * SIZE, %rax 655 NOBRANCH 656 je .L15 657 KERNEL1(16 * 0) 658 KERNEL2(16 * 0) 659 KERNEL3(16 * 0) 660 KERNEL4(16 * 0) 661 KERNEL5(16 * 0) 662 KERNEL6(16 * 0) 663 KERNEL7(16 * 0) 664 KERNEL8(16 * 0) 665 666 KERNEL1(16 * 1) 667 KERNEL2(16 * 1) 668 KERNEL3(16 * 1) 669 KERNEL4(16 * 1) 670 KERNEL5(16 * 1) 671 KERNEL6(16 * 1) 672 KERNEL7(16 * 1) 673 KERNEL8(16 * 1) 674 675 addq $8 * SIZE, %rax 676 NOBRANCH 677 je .L15 678 KERNEL1(16 * 0) 679 KERNEL2(16 * 0) 680 KERNEL3(16 * 0) 681 KERNEL4(16 * 0) 682 KERNEL5(16 * 0) 683 KERNEL6(16 * 0) 684 KERNEL7(16 * 0) 685 KERNEL8(16 * 0) 686 687 KERNEL1(16 * 1) 688 KERNEL2(16 * 1) 689 KERNEL3(16 * 1) 690 KERNEL4(16 * 1) 691 KERNEL5(16 * 1) 692 KERNEL6(16 * 1) 693 KERNEL7(16 * 1) 694 KERNEL8(16 * 1) 695 696 addq $8 * SIZE, %rax 697 NOBRANCH 698 je .L15 699 KERNEL1(16 * 0) 700 KERNEL2(16 * 0) 701 KERNEL3(16 * 0) 702 KERNEL4(16 * 0) 703 KERNEL5(16 * 0) 704 KERNEL6(16 * 0) 705 KERNEL7(16 * 0) 706 KERNEL8(16 * 0) 707 708 KERNEL1(16 * 1) 709 KERNEL2(16 * 1) 710 KERNEL3(16 * 1) 711 KERNEL4(16 * 1) 712 KERNEL5(16 * 1) 713 KERNEL6(16 * 1) 714 KERNEL7(16 * 1) 715 KERNEL8(16 * 1) 716 717 addq $8 * SIZE, %rax 718 NOBRANCH 719 je .L15 720 KERNEL1(16 * 0) 721 KERNEL2(16 * 0) 722 KERNEL3(16 * 0) 723 KERNEL4(16 * 0) 724 KERNEL5(16 * 0) 725 KERNEL6(16 * 0) 726 KERNEL7(16 * 0) 727 KERNEL8(16 * 0) 728 729 KERNEL1(16 * 1) 730 KERNEL2(16 * 1) 731 KERNEL3(16 * 1) 732 KERNEL4(16 * 1) 733 KERNEL5(16 * 1) 734 KERNEL6(16 * 1) 735 KERNEL7(16 * 1) 736 KERNEL8(16 * 1) 737 738 addq $8 * SIZE, %rax 739 NOBRANCH 740 je .L15 741 KERNEL1(16 * 0) 742 KERNEL2(16 * 0) 743 KERNEL3(16 * 0) 744 KERNEL4(16 * 0) 745 KERNEL5(16 * 0) 746 KERNEL6(16 * 0) 747 KERNEL7(16 * 0) 748 KERNEL8(16 * 0) 749 750 KERNEL1(16 * 1) 751 KERNEL2(16 * 1) 752 KERNEL3(16 * 1) 753 KERNEL4(16 * 1) 754 
KERNEL5(16 * 1) 755 KERNEL6(16 * 1) 756 KERNEL7(16 * 1) 757 KERNEL8(16 * 1) 758 759 addq $8 * SIZE, %rax 760 NOBRANCH 761 je .L15 762 KERNEL1(16 * 0) 763 KERNEL2(16 * 0) 764 KERNEL3(16 * 0) 765 KERNEL4(16 * 0) 766 KERNEL5(16 * 0) 767 KERNEL6(16 * 0) 768 KERNEL7(16 * 0) 769 KERNEL8(16 * 0) 770 771 KERNEL1(16 * 1) 772 KERNEL2(16 * 1) 773 KERNEL3(16 * 1) 774 KERNEL4(16 * 1) 775 KERNEL5(16 * 1) 776 KERNEL6(16 * 1) 777 KERNEL7(16 * 1) 778 KERNEL8(16 * 1) 779 780 addq $8 * SIZE, %rax 781 BRANCH 782 jl .L12 783 ALIGN_3 784 785.L15: 786 PREFETCH 8 * SIZE(BB) 787 subq $-16 * SIZE, BB 788 789#ifndef TRMMKERNEL 790 movq K, %rax 791#else 792 movq KKK, %rax 793#endif 794 testq $4, %rax 795 je .L16 796 xorq %rax, %rax 797 ALIGN_3 798 799 KERNEL1(16 * 0) 800 KERNEL2(16 * 0) 801 KERNEL3(16 * 0) 802 KERNEL4(16 * 0) 803 KERNEL5(16 * 0) 804 KERNEL6(16 * 0) 805 KERNEL7(16 * 0) 806 KERNEL8(16 * 0) 807 808 addq $32 * SIZE, BO 809 addq $16 * SIZE, AO 810 ALIGN_3 811#else 812 sarq $2, %rax 813 NOBRANCH 814 jle .L16 815 ALIGN_3 816 817.L12: 818 KERNEL1(16 * 0) 819 KERNEL2(16 * 0) 820 KERNEL3(16 * 0) 821 KERNEL4(16 * 0) 822 KERNEL5(16 * 0) 823 KERNEL6(16 * 0) 824 KERNEL7(16 * 0) 825 KERNEL8(16 * 0) 826 827 addq $ 32 * SIZE, BO 828 subq $-16 * SIZE, AO 829 decq %rax 830 BRANCH 831 jg .L12 832#endif 833 834.L16: 835 movapd POSINV, %xmm5 836 movapd ALPHA_R, %xmm6 837 movapd ALPHA_I, %xmm7 838 839#ifndef TRMMKERNEL 840 movq K, %rax 841#else 842 movq KKK, %rax 843#endif 844 andq $3, %rax # if (k & 1) 845 je .L19 846 847 leaq (, %rax, SIZE), %rax 848 leaq (AO, %rax, 4), AO 849 leaq (BO, %rax, 8), BO 850 negq %rax 851 ALIGN_3 852 853.L17: 854 mulpd %xmm0, %xmm1 855 addpd %xmm1, %xmm8 856 movapd -14 * SIZE(BO, %rax, 8), %xmm1 857 mulpd %xmm0, %xmm1 858 addpd %xmm1, %xmm9 859 movapd -12 * SIZE(BO, %rax, 8), %xmm1 860 mulpd %xmm0, %xmm1 861 mulpd -10 * SIZE(BO, %rax, 8), %xmm0 862 addpd %xmm1, %xmm10 863 movapd -16 * SIZE(BO, %rax, 8), %xmm1 864 addpd %xmm0, %xmm11 865 movapd -12 * SIZE(AO, %rax, 4), 
%xmm0 866 mulpd %xmm2, %xmm1 867 addpd %xmm1, %xmm12 868 movapd -14 * SIZE(BO, %rax, 8), %xmm1 869 mulpd %xmm2, %xmm1 870 addpd %xmm1, %xmm13 871 movapd -12 * SIZE(BO, %rax, 8), %xmm1 872 mulpd %xmm2, %xmm1 873 mulpd -10 * SIZE(BO, %rax, 8), %xmm2 874 addpd %xmm1, %xmm14 875 movapd -8 * SIZE(BO, %rax, 8), %xmm1 876 addpd %xmm2, %xmm15 877 movapd -10 * SIZE(AO, %rax, 4), %xmm2 878 879 addq $SIZE, %rax 880 jl .L17 881 ALIGN_3 882 883.L19: 884#ifndef TRMMKERNEL 885 movlpd 0 * SIZE(CO1), %xmm0 886 movhpd 1 * SIZE(CO1), %xmm0 887 movlpd 2 * SIZE(CO1), %xmm2 888 movhpd 3 * SIZE(CO1), %xmm2 889 890 movlpd 0 * SIZE(CO2), %xmm1 891 movhpd 1 * SIZE(CO2), %xmm1 892 movlpd 2 * SIZE(CO2), %xmm3 893 movhpd 3 * SIZE(CO2), %xmm3 894#endif 895 896 SHUFPD_1 %xmm9, %xmm9 897 SHUFPD_1 %xmm11, %xmm11 898 SHUFPD_1 %xmm13, %xmm13 899 SHUFPD_1 %xmm15, %xmm15 900 901#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 902 defined(NR) || defined(NC) || defined(TR) || defined(TC) 903 xorpd %xmm5, %xmm9 904 xorpd %xmm5, %xmm11 905 xorpd %xmm5, %xmm13 906 xorpd %xmm5, %xmm15 907#else 908 xorpd %xmm5, %xmm8 909 xorpd %xmm5, %xmm10 910 xorpd %xmm5, %xmm12 911 xorpd %xmm5, %xmm14 912#endif 913 914#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 915 defined(RR) || defined(RC) || defined(CR) || defined(CC) 916 subpd %xmm9, %xmm8 917 subpd %xmm11, %xmm10 918 subpd %xmm13, %xmm12 919 subpd %xmm15, %xmm14 920#else 921 addpd %xmm9, %xmm8 922 addpd %xmm11, %xmm10 923 addpd %xmm13, %xmm12 924 addpd %xmm15, %xmm14 925#endif 926 927 pshufd $0x4e, %xmm8, %xmm9 928 pshufd $0x4e, %xmm10, %xmm11 929 pshufd $0x4e, %xmm12, %xmm13 930 pshufd $0x4e, %xmm14, %xmm15 931 932 mulpd %xmm6, %xmm8 933 mulpd %xmm7, %xmm9 934 mulpd %xmm6, %xmm10 935 mulpd %xmm7, %xmm11 936 937 mulpd %xmm6, %xmm12 938 mulpd %xmm7, %xmm13 939 mulpd %xmm6, %xmm14 940 mulpd %xmm7, %xmm15 941 942 addpd %xmm9, %xmm8 943 addpd %xmm11, %xmm10 944 addpd %xmm13, %xmm12 945 addpd %xmm15, %xmm14 946 947#ifndef TRMMKERNEL 
948 addpd %xmm0, %xmm8 949 addpd %xmm2, %xmm12 950 addpd %xmm1, %xmm10 951 addpd %xmm3, %xmm14 952#endif 953 954 movlpd %xmm8, 0 * SIZE(CO1) 955 movhpd %xmm8, 1 * SIZE(CO1) 956 movlpd %xmm12, 2 * SIZE(CO1) 957 movhpd %xmm12, 3 * SIZE(CO1) 958 959 movlpd %xmm10, 0 * SIZE(CO2) 960 movhpd %xmm10, 1 * SIZE(CO2) 961 movlpd %xmm14, 2 * SIZE(CO2) 962 movhpd %xmm14, 3 * SIZE(CO2) 963 964#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 965 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 966 movq K, %rax 967 subq KKK, %rax 968 leaq (,%rax, SIZE), %rax 969 leaq (AO, %rax, 4), AO 970 leaq (BO, %rax, 8), BO 971#endif 972 973#if defined(TRMMKERNEL) && defined(LEFT) 974 addq $2, KK 975#endif 976 977 addq $4 * SIZE, CO1 # coffset += 4 978 addq $4 * SIZE, CO2 # coffset += 4 979 decq I # i -- 980 jg .L10 981 ALIGN_4 982 983.L30: 984 testq $1, M 985 jle .L99 986 987#if !defined(TRMMKERNEL) || \ 988 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 989 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 990 991 leaq 16 * SIZE + BUFFER, BO 992#else 993 leaq 16 * SIZE + BUFFER, BO 994 movq KK, %rax 995 leaq (, %rax, SIZE), %rax 996 leaq (AO, %rax, 2), AO 997 leaq (BO, %rax, 8), BO 998#endif 999 1000 movapd -16 * SIZE(AO), %xmm0 1001 pxor %xmm8, %xmm8 1002 movapd -8 * SIZE(AO), %xmm2 1003 pxor %xmm9, %xmm9 1004 movapd -16 * SIZE(BO), %xmm1 1005 pxor %xmm10, %xmm10 1006 movapd -8 * SIZE(BO), %xmm3 1007 pxor %xmm11, %xmm11 1008 1009#ifndef TRMMKERNEL 1010 movq K, %rax 1011#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1012 movq K, %rax 1013 subq KK, %rax 1014 movq %rax, KKK 1015#else 1016 movq KK, %rax 1017#ifdef LEFT 1018 addq $1, %rax 1019#else 1020 addq $2, %rax 1021#endif 1022 movq %rax, KKK 1023#endif 1024 sarq $3, %rax 1025 je .L44 1026 ALIGN_4 1027 1028.L41: 1029 mulpd %xmm0, %xmm1 1030 addpd %xmm1, %xmm8 1031 movapd -14 * SIZE(BO), %xmm1 1032 mulpd %xmm0, %xmm1 1033 PREFETCH (PREFETCHSIZE + 0) * 
SIZE(AO) 1034 addpd %xmm1, %xmm9 1035 movapd -12 * SIZE(BO), %xmm1 1036 mulpd %xmm0, %xmm1 1037 mulpd -10 * SIZE(BO), %xmm0 1038 addpd %xmm1, %xmm10 1039 movapd 0 * SIZE(BO), %xmm1 1040 addpd %xmm0, %xmm11 1041 movapd -14 * SIZE(AO), %xmm0 1042 mulpd %xmm0, %xmm3 1043 addpd %xmm3, %xmm8 1044 movapd -6 * SIZE(BO), %xmm3 1045 mulpd %xmm0, %xmm3 1046 addpd %xmm3, %xmm9 1047 movapd -4 * SIZE(BO), %xmm3 1048 mulpd %xmm0, %xmm3 1049 mulpd -2 * SIZE(BO), %xmm0 1050 addpd %xmm3, %xmm10 1051 movapd 8 * SIZE(BO), %xmm3 1052 addpd %xmm0, %xmm11 1053 movapd -12 * SIZE(AO), %xmm0 1054 mulpd %xmm0, %xmm1 1055 addpd %xmm1, %xmm8 1056 movapd 2 * SIZE(BO), %xmm1 1057 mulpd %xmm0, %xmm1 1058 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 1059 addpd %xmm1, %xmm9 1060 movapd 4 * SIZE(BO), %xmm1 1061 mulpd %xmm0, %xmm1 1062 mulpd 6 * SIZE(BO), %xmm0 1063 addpd %xmm1, %xmm10 1064 movapd 16 * SIZE(BO), %xmm1 1065 addpd %xmm0, %xmm11 1066 movapd -10 * SIZE(AO), %xmm0 1067 mulpd %xmm0, %xmm3 1068 addpd %xmm3, %xmm8 1069 movapd 10 * SIZE(BO), %xmm3 1070 mulpd %xmm0, %xmm3 1071 addpd %xmm3, %xmm9 1072 movapd 12 * SIZE(BO), %xmm3 1073 mulpd %xmm0, %xmm3 1074 mulpd 14 * SIZE(BO), %xmm0 1075 addpd %xmm3, %xmm10 1076 movapd 24 * SIZE(BO), %xmm3 1077 addpd %xmm0, %xmm11 1078 movapd 0 * SIZE(AO), %xmm0 1079 mulpd %xmm2, %xmm1 1080 addpd %xmm1, %xmm8 1081 movapd 18 * SIZE(BO), %xmm1 1082 mulpd %xmm2, %xmm1 1083 addpd %xmm1, %xmm9 1084 movapd 20 * SIZE(BO), %xmm1 1085 mulpd %xmm2, %xmm1 1086 mulpd 22 * SIZE(BO), %xmm2 1087 addpd %xmm1, %xmm10 1088 movapd 32 * SIZE(BO), %xmm1 1089 addpd %xmm2, %xmm11 1090 movapd -6 * SIZE(AO), %xmm2 1091 mulpd %xmm2, %xmm3 1092 addpd %xmm3, %xmm8 1093 movapd 26 * SIZE(BO), %xmm3 1094 mulpd %xmm2, %xmm3 1095 addpd %xmm3, %xmm9 1096 movapd 28 * SIZE(BO), %xmm3 1097 mulpd %xmm2, %xmm3 1098 mulpd 30 * SIZE(BO), %xmm2 1099 addpd %xmm3, %xmm10 1100 movapd 40 * SIZE(BO), %xmm3 1101 addpd %xmm2, %xmm11 1102 movapd -4 * SIZE(AO), %xmm2 1103 mulpd %xmm2, %xmm1 1104 addpd %xmm1, %xmm8 
1105 movapd 34 * SIZE(BO), %xmm1 1106 mulpd %xmm2, %xmm1 1107 addpd %xmm1, %xmm9 1108 movapd 36 * SIZE(BO), %xmm1 1109 mulpd %xmm2, %xmm1 1110 mulpd 38 * SIZE(BO), %xmm2 1111 addpd %xmm1, %xmm10 1112 movapd 48 * SIZE(BO), %xmm1 1113 addpd %xmm2, %xmm11 1114 movapd -2 * SIZE(AO), %xmm2 1115 mulpd %xmm2, %xmm3 1116 addpd %xmm3, %xmm8 1117 movapd 42 * SIZE(BO), %xmm3 1118 mulpd %xmm2, %xmm3 1119 addpd %xmm3, %xmm9 1120 movapd 44 * SIZE(BO), %xmm3 1121 mulpd %xmm2, %xmm3 1122 mulpd 46 * SIZE(BO), %xmm2 1123 addpd %xmm3, %xmm10 1124 movapd 56 * SIZE(BO), %xmm3 1125 addpd %xmm2, %xmm11 1126 movapd 8 * SIZE(AO), %xmm2 1127 1128 subq $-16 * SIZE, AO 1129 addq $64 * SIZE, BO 1130 decq %rax 1131 jne .L41 1132 ALIGN_4 1133 1134.L44: 1135#ifndef TRMMKERNEL 1136 movq K, %rax 1137#else 1138 movq KKK, %rax 1139#endif 1140 andq $4, %rax 1141 BRANCH 1142 jle .L45 1143 1144 mulpd %xmm0, %xmm1 1145 addpd %xmm1, %xmm8 1146 movapd -14 * SIZE(BO), %xmm1 1147 mulpd %xmm0, %xmm1 1148 addpd %xmm1, %xmm9 1149 movapd -12 * SIZE(BO), %xmm1 1150 mulpd %xmm0, %xmm1 1151 mulpd -10 * SIZE(BO), %xmm0 1152 addpd %xmm1, %xmm10 1153 movapd 0 * SIZE(BO), %xmm1 1154 addpd %xmm0, %xmm11 1155 movapd -14 * SIZE(AO), %xmm0 1156 mulpd %xmm0, %xmm3 1157 addpd %xmm3, %xmm8 1158 movapd -6 * SIZE(BO), %xmm3 1159 mulpd %xmm0, %xmm3 1160 addpd %xmm3, %xmm9 1161 movapd -4 * SIZE(BO), %xmm3 1162 mulpd %xmm0, %xmm3 1163 mulpd -2 * SIZE(BO), %xmm0 1164 addpd %xmm3, %xmm10 1165 movapd 8 * SIZE(BO), %xmm3 1166 addpd %xmm0, %xmm11 1167 movapd -12 * SIZE(AO), %xmm0 1168 mulpd %xmm0, %xmm1 1169 addpd %xmm1, %xmm8 1170 movapd 2 * SIZE(BO), %xmm1 1171 mulpd %xmm0, %xmm1 1172 addpd %xmm1, %xmm9 1173 movapd 4 * SIZE(BO), %xmm1 1174 mulpd %xmm0, %xmm1 1175 mulpd 6 * SIZE(BO), %xmm0 1176 addpd %xmm1, %xmm10 1177 movapd 16 * SIZE(BO), %xmm1 1178 addpd %xmm0, %xmm11 1179 movapd -10 * SIZE(AO), %xmm0 1180 mulpd %xmm0, %xmm3 1181 addpd %xmm3, %xmm8 1182 movapd 10 * SIZE(BO), %xmm3 1183 mulpd %xmm0, %xmm3 1184 addpd %xmm3, %xmm9 
1185 movapd 12 * SIZE(BO), %xmm3 1186 mulpd %xmm0, %xmm3 1187 mulpd 14 * SIZE(BO), %xmm0 1188 addpd %xmm3, %xmm10 1189 movapd 24 * SIZE(BO), %xmm3 1190 addpd %xmm0, %xmm11 1191 movapd -8 * SIZE(AO), %xmm0 1192 1193 addq $ 8 * SIZE, AO 1194 addq $32 * SIZE, BO 1195 ALIGN_4 1196 1197.L45: 1198#ifndef TRMMKERNEL 1199 movq K, %rax 1200#else 1201 movq KKK, %rax 1202#endif 1203 movapd POSINV, %xmm5 1204 movapd ALPHA_R, %xmm6 1205 movapd ALPHA_I, %xmm7 1206 andq $3, %rax # if (k & 1) 1207 BRANCH 1208 jle .L47 1209 ALIGN_4 1210 1211.L46: 1212 mulpd %xmm0, %xmm1 1213 addpd %xmm1, %xmm8 1214 movapd -14 * SIZE(BO), %xmm1 1215 mulpd %xmm0, %xmm1 1216 addpd %xmm1, %xmm9 1217 movapd -12 * SIZE(BO), %xmm1 1218 mulpd %xmm0, %xmm1 1219 mulpd -10 * SIZE(BO), %xmm0 1220 addpd %xmm1, %xmm10 1221 movapd -8 * SIZE(BO), %xmm1 1222 addpd %xmm0, %xmm11 1223 movapd -14 * SIZE(AO), %xmm0 1224 1225 addq $2 * SIZE, AO 1226 addq $8 * SIZE, BO 1227 1228 decq %rax 1229 jg .L46 1230 ALIGN_4 1231 1232.L47: 1233#ifndef TRMMKERNEL 1234 movlpd 0 * SIZE(CO1), %xmm0 1235 movhpd 1 * SIZE(CO1), %xmm0 1236 movlpd 0 * SIZE(CO2), %xmm1 1237 movhpd 1 * SIZE(CO2), %xmm1 1238#endif 1239 1240 SHUFPD_1 %xmm9, %xmm9 1241 SHUFPD_1 %xmm11, %xmm11 1242 1243#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1244 defined(NR) || defined(NC) || defined(TR) || defined(TC) 1245 xorpd %xmm5, %xmm9 1246 xorpd %xmm5, %xmm11 1247#else 1248 xorpd %xmm5, %xmm8 1249 xorpd %xmm5, %xmm10 1250#endif 1251 1252#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1253 defined(RR) || defined(RC) || defined(CR) || defined(CC) 1254 subpd %xmm9, %xmm8 1255 subpd %xmm11, %xmm10 1256#else 1257 addpd %xmm9, %xmm8 1258 addpd %xmm11, %xmm10 1259#endif 1260 1261 pshufd $0x4e, %xmm8, %xmm9 1262 pshufd $0x4e, %xmm10, %xmm11 1263 1264 mulpd %xmm6, %xmm8 1265 mulpd %xmm7, %xmm9 1266 mulpd %xmm6, %xmm10 1267 mulpd %xmm7, %xmm11 1268 1269 addpd %xmm9, %xmm8 1270 addpd %xmm11, %xmm10 1271 1272#ifndef TRMMKERNEL 1273 addpd 
%xmm0, %xmm8 1274 addpd %xmm1, %xmm10 1275#endif 1276 1277 movlpd %xmm8, 0 * SIZE(CO1) 1278 movhpd %xmm8, 1 * SIZE(CO1) 1279 movlpd %xmm10, 0 * SIZE(CO2) 1280 movhpd %xmm10, 1 * SIZE(CO2) 1281 1282#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1283 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1284 movq K, %rax 1285 subq KKK, %rax 1286 leaq (,%rax, SIZE), %rax 1287 leaq (AO, %rax, 2), AO 1288 leaq (BO, %rax, 8), BO 1289#endif 1290 1291#if defined(TRMMKERNEL) && defined(LEFT) 1292 addq $1, KK 1293#endif 1294 ALIGN_4 1295 1296.L99: 1297#if defined(TRMMKERNEL) && !defined(LEFT) 1298 addl $2, KK 1299#endif 1300 1301 leaq (C, LDC, 2), C # c += 2 * ldc 1302 decq J # j -- 1303 jg .L01 1304 1305.L100: 1306 testq $1, N 1307 jle .L999 1308 1309.L101: 1310#if defined(TRMMKERNEL) && defined(LEFT) 1311 movq OFFSET, %rax 1312 movq %rax, KK 1313#endif 1314 1315/* Copying to Sub Buffer */ 1316 leaq BUFFER, BO 1317 1318 movq K, %rax 1319 sarq $2, %rax 1320 jle .L103 1321 ALIGN_4 1322 1323.L102: 1324 movlpd 0 * SIZE(B), %xmm8 1325 movlpd 1 * SIZE(B), %xmm9 1326 movlpd 2 * SIZE(B), %xmm10 1327 movlpd 3 * SIZE(B), %xmm11 1328 movlpd 4 * SIZE(B), %xmm12 1329 movlpd 5 * SIZE(B), %xmm13 1330 movlpd 6 * SIZE(B), %xmm14 1331 movlpd 7 * SIZE(B), %xmm15 1332 1333 movlpd %xmm8, 0 * SIZE(BO) 1334 movlpd %xmm8, 1 * SIZE(BO) 1335 movlpd %xmm9, 2 * SIZE(BO) 1336 movlpd %xmm9, 3 * SIZE(BO) 1337 movlpd %xmm10, 4 * SIZE(BO) 1338 movlpd %xmm10, 5 * SIZE(BO) 1339 movlpd %xmm11, 6 * SIZE(BO) 1340 movlpd %xmm11, 7 * SIZE(BO) 1341 movlpd %xmm12, 8 * SIZE(BO) 1342 movlpd %xmm12, 9 * SIZE(BO) 1343 movlpd %xmm13, 10 * SIZE(BO) 1344 movlpd %xmm13, 11 * SIZE(BO) 1345 movlpd %xmm14, 12 * SIZE(BO) 1346 movlpd %xmm14, 13 * SIZE(BO) 1347 movlpd %xmm15, 14 * SIZE(BO) 1348 movlpd %xmm15, 15 * SIZE(BO) 1349 1350 subq $-16 * SIZE, BO 1351 addq $ 8 * SIZE, B 1352 decq %rax 1353 jne .L102 1354 ALIGN_4 1355 1356.L103: 1357 movq K, %rax 1358 andq $3, %rax 1359 BRANCH 1360 jle .L105 1361 
	ALIGN_4

.L104:
# Copy the K % 4 leftover B elements, duplicating each scalar into
# both halves of a 16-byte buffer slot (same layout as .L102).
	movlpd	0 * SIZE(B), %xmm8
	movlpd	1 * SIZE(B), %xmm9

	movlpd	%xmm8, 0 * SIZE(BO)
	movlpd	%xmm8, 1 * SIZE(BO)
	movlpd	%xmm9, 2 * SIZE(BO)
	movlpd	%xmm9, 3 * SIZE(BO)

	addq	$4 * SIZE, BO
	addq	$2 * SIZE, B
	decq	%rax
	jne	.L104
	ALIGN_4

.L105:
# Single remaining column of C: sweep the rows, two complex elements
# (2x1 tile) per iteration of the .L110 loop.
	movq	C, CO1			# coffset1 = c
	movq	A, AO			# aoffset = a

	movq	M, I
	sarq	$1, I			# i = (m >> 1)
	jle	.L130
	ALIGN_4

.L110:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
# TRMM: skip the first KK steps of both panels for this tile.
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

# Preload the first A/B vectors and clear the four accumulators
# (xmm8/xmm9 and xmm12/xmm13 hold the partial products).
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	-8 * SIZE(AO), %xmm2
	pxor	%xmm12, %xmm12
	movapd	-8 * SIZE(BO), %xmm3
	pxor	%xmm13, %xmm13
	PREFETCHW	3 * SIZE(CO1)

# Select the k trip count: full K for GEMM, the TRMM-clipped count
# (stored in KKK) otherwise.
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax		# 2 rows in this tile
#else
	addq	$1, %rax		# 1 column in this tile
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax		# main loop is unrolled by 4 k-steps
	je	.L112

.L111:
# Software-pipelined inner product: multiply/accumulate interleaved
# with the reloads for the next step.  (Continues into the next chunk.)
	mulpd	%xmm0, %xmm1
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9		# (continuation of the .L111 loop body)
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm3
	PREFETCH  (PREFETCHSIZE +  8) * SIZE(AO)
	mulpd	-6 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	-8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	-6 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	-4 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	-4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	-2 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	-4 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	-2 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	8 * SIZE(AO), %xmm2

	subq	$-16 * SIZE, AO		# advance 4 k-steps of A
	subq	$-16 * SIZE, BO		# advance 4 k-steps of duplicated B
	decq	%rax
	jne	.L111
	ALIGN_4

.L112:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	POSINV, %xmm5		# sign mask used for conjugation below
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7
	andq	$3, %rax		# if (k & 3): leftover k iterations
	BRANCH
	jle	.L114

.L113:
# One k-step for the 2x1 tile.
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0

	addq	$4 * SIZE, AO		# aoffset += 4
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L113
	ALIGN_4

.L114:
#ifndef TRMMKERNEL
# GEMM path updates C in place: load the current values first.
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	2 * SIZE(CO1), %xmm2
	movhpd	3 * SIZE(CO1), %xmm2
#endif

	SHUFPD_1 %xmm9, %xmm9		# swap the two packed doubles
	SHUFPD_1 %xmm13, %xmm13		# swap the two packed doubles

# Apply the conjugation sign mask according to the NN/NT/.../CC
# variant this file is compiled as.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm13
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm12
#endif

# Combine the real/imaginary partial sums.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm13, %xmm12
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm13, %xmm12
#endif

# Complex alpha scaling: 0x4e swaps the 64-bit halves, so
# result = acc * alpha_r + swapped(acc) * alpha_i.
	pshufd	$0x4e, %xmm8, %xmm9
	pshufd	$0x4e, %xmm12, %xmm13

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13

	addpd	%xmm9, %xmm8
	addpd	%xmm13, %xmm12

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm8		# GEMM: accumulate into loaded C
	addpd	%xmm2, %xmm12
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)

#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
# TRMM: advance AO/BO past the untouched remainder of the panels.
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK			# two rows of A consumed
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L110
	ALIGN_4

.L130:
# M-remainder: one last row against the single remaining column.
	testq	$1, M
	jle	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
# TRMM: skip the first KK steps of both panels for this tile.
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	movapd	-8 * SIZE(AO), %xmm2
	movapd	-8 * SIZE(BO), %xmm3
	pxor	%xmm8, %xmm8		# clear the four accumulators
	pxor	%xmm9, %xmm9
	pxor	%xmm10, %xmm10
	pxor	%xmm11, %xmm11

# k trip count: full K for GEMM, the TRMM-clipped count (KKK) otherwise.
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		# 1 row in this tile
#else
	addq	$1, %rax		# 1 column in this tile
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# main loop is unrolled by 8 k-steps
	je	.L144
	ALIGN_4

.L141:
# Unrolled-by-8 inner product for the 1x1 tile; multiply/accumulate
# interleaved with reloads for the following steps.
	mulpd	%xmm0, %xmm1
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	-6 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	-4 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	-2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm1
	PREFETCH  (PREFETCHSIZE +  8) * SIZE(AO)
	mulpd	2 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm8
	movapd	4 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	movapd	-6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm1
	mulpd	6 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	-4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	24 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	8 * SIZE(AO), %xmm2

	subq	$-16 * SIZE, AO		# 8 k-steps of A
	subq	$-32 * SIZE, BO		# 8 k-steps of duplicated B
	decq	%rax			# (bottom of the .L141 loop)
	jne	.L141
	ALIGN_4


.L144:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$4, %rax		# if (k & 4): one unrolled-by-4 chunk
	BRANCH
	jle	.L145

	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	-6 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	-4 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	-2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	addpd	%xmm0, %xmm11
	movapd	-8 * SIZE(AO), %xmm0

	addq	$8 * SIZE, AO		# 4 k-steps of A
	subq	$-16 * SIZE, BO		# 4 k-steps of duplicated B
	ALIGN_4

.L145:
	movapd	POSINV, %xmm5		# sign mask used for conjugation below
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 3): leftover k iterations
	BRANCH
	jle	.L148
	ALIGN_4

.L146:
# One k-step for the 1x1 tile.
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO		# aoffset += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L146
	ALIGN_4

.L148:
# Fold the two accumulator pairs together.
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#ifndef TRMMKERNEL
# GEMM path updates C in place: load the current value first.
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
#endif

	SHUFPD_1 %xmm9, %xmm9		# swap the two packed doubles

# Conjugation sign handling, as in .L114 above.
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
#else
	xorpd	%xmm5, %xmm8
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8		# combine real/imag partial products
#else
	addpd	%xmm9, %xmm8
#endif

# Complex alpha scaling (0x4e swaps the 64-bit halves):
# result = acc * alpha_r + swapped(acc) * alpha_i.
	pshufd	$0x4e, %xmm8, %xmm9

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9

	addpd	%xmm9, %xmm8

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm8		# GEMM: accumulate into loaded C
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	ALIGN_4

.L999:
# Common exit: unwind the stack and restore callee-saved registers.
# %rbx holds the original %rsp saved by the prologue (not in this chunk).
	movq	%rbx, %rsp
	EMMS

	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
# Windows x64 additionally treats rdi/rsi and xmm6-xmm15 as callee-saved.
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE