/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define OLD_M %rdi 43#define OLD_N %rsi 44#define M %r13 45#define N %r14 46#define K %rdx 47 48#define A %rcx 49#define B %r8 50#define C %r9 51#define LDC %r10 52 53#define I %r11 54#define AO %rdi 55#define BO %rsi 56#define CO1 %r15 57#define CO2 %r12 58 59#ifndef WINDOWS_ABI 60 61#define STACKSIZE 64 62 63#else 64 65#define STACKSIZE 256 66 67#define OLD_A 40 + STACKSIZE(%rsp) 68#define OLD_B 48 + STACKSIZE(%rsp) 69#define OLD_C 56 + STACKSIZE(%rsp) 70#define OLD_LDC 64 + STACKSIZE(%rsp) 71#define OLD_OFFSET 72 + STACKSIZE(%rsp) 72 73#endif 74 75#define ALPHA 0(%rsp) 76#define J 16(%rsp) 77#define OFFSET 24(%rsp) 78#define KK 32(%rsp) 79#define KKK 40(%rsp) 80#define BUFFER 128(%rsp) 81 82#define PREFETCH prefetch 83#define PREFETCHSIZE (16 * 17 + 0) 84 85#define RPREFETCHSIZE (16 * 4 + 0) 86#define WPREFETCHSIZE (16 * 9 + 0) 87 88#define KERNEL1(xx) \ 89 vfmaddps %xmm8,%xmm1,%xmm0,%xmm8 ;\ 90 vmovaps %xmm2, %xmm0 ;\ 91 vmovups -28 * SIZE(AO, %rax, 4),%xmm2 ;\ 92 vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\ 93 vmovups -24 * SIZE(BO, %rax, 8), %xmm1 ;\ 94 vfmaddps %xmm9,%xmm3, %xmm0, %xmm9 ;\ 95 vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ 96 vmovups -20 * SIZE(BO, %rax, 8), %xmm3 ;\ 97 vfmaddps %xmm10,%xmm1, %xmm0, %xmm10 ;\ 98 vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\ 99 vfmaddps %xmm11,%xmm3, %xmm0, %xmm11 ;\ 100 vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ 101 vmovups -24 * SIZE(AO, %rax, 4), %xmm0 ;\ 102 vmovups -16 * SIZE(BO, %rax, 8), %xmm1 ;\ 103 vmovups -12 * SIZE(BO, %rax, 8), %xmm3 ;\ 104 vmovaps %xmm0, %xmm2 105 106 107#define KERNEL2(xx) \ 108 vfmaddps %xmm8,%xmm1,%xmm0,%xmm8 ;\ 109 vmovaps %xmm2, %xmm0 ;\ 110 vmovups -20 * SIZE(AO, %rax, 4),%xmm2 ;\ 111 vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\ 112 vmovups -8 * SIZE(BO, %rax, 8), %xmm1 ;\ 113 vfmaddps %xmm9,%xmm3, %xmm0, %xmm9 ;\ 114 vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ 115 vmovups -4 * 
SIZE(BO, %rax, 8), %xmm3 ;\ 116 vfmaddps %xmm10,%xmm1, %xmm0, %xmm10 ;\ 117 vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\ 118 vfmaddps %xmm11,%xmm3, %xmm0, %xmm11 ;\ 119 vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ 120 vmovups 4 * SIZE(BO, %rax, 8), %xmm3 ;\ 121 vmovaps %xmm4, %xmm2 122 123 124 125#define KERNEL3(xx) \ 126 vfmaddps %xmm8,%xmm5,%xmm4,%xmm8 ;\ 127 vmovups -12 * SIZE(AO, %rax, 4),%xmm2 ;\ 128 vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\ 129 vmovups 32 * SIZE(BO, %rax, 8), %xmm1 ;\ 130 vmovups 8 * SIZE(BO, %rax, 8), %xmm5 ;\ 131 vfmaddps %xmm9,%xmm3, %xmm4, %xmm9 ;\ 132 vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ 133 vmovups 12 * SIZE(BO, %rax, 8), %xmm3 ;\ 134 vfmaddps %xmm10,%xmm5, %xmm4, %xmm10 ;\ 135 vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\ 136 vfmaddps %xmm11,%xmm3, %xmm4, %xmm11 ;\ 137 vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ 138 vmovups -8 * SIZE(AO, %rax, 4), %xmm4 ;\ 139 vmovups 16 * SIZE(BO, %rax, 8), %xmm5 ;\ 140 vmovups 20 * SIZE(BO, %rax, 8), %xmm3 ;\ 141 vmovaps %xmm4, %xmm2 142 143#define KERNEL4(xx) \ 144 vfmaddps %xmm8,%xmm5, %xmm4, %xmm8 ;\ 145 vmovups -4 * SIZE(AO, %rax, 4),%xmm2 ;\ 146 vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\ 147 vmovups 24 * SIZE(BO, %rax, 8), %xmm5 ;\ 148 vfmaddps %xmm9,%xmm3, %xmm4, %xmm9 ;\ 149 vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ 150 vmovups 28 * SIZE(BO, %rax, 8), %xmm3 ;\ 151 vfmaddps %xmm10,%xmm5, %xmm4, %xmm10 ;\ 152 vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\ 153 vmovups 64 * SIZE(BO, %rax, 8), %xmm5 ;\ 154 vfmaddps %xmm11,%xmm3, %xmm4, %xmm11 ;\ 155 vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ 156 vmovups (AO, %rax, 4), %xmm6 ;\ 157 vmovups 36 * SIZE(BO, %rax, 8), %xmm3 ;\ 158 vmovaps %xmm6, %xmm2 159 160#define KERNEL5(xx) \ 161 vfmaddps %xmm8,%xmm1, %xmm6, %xmm8 ;\ 162 vmovups 4 * SIZE(AO, %rax, 4),%xmm2 ;\ 163 vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\ 164 vmovups 40 * SIZE(BO, %rax, 8), %xmm1 ;\ 165 vfmaddps %xmm9,%xmm3, %xmm6, %xmm9 ;\ 166 vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ 167 vmovups 16 * SIZE(AO, %rax, 4), %xmm7 ;\ 168 
vmovups 44 * SIZE(BO, %rax, 8), %xmm3 ;\ 169 vfmaddps %xmm10,%xmm1, %xmm6, %xmm10 ;\ 170 vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\ 171 vfmaddps %xmm11,%xmm3, %xmm6, %xmm11 ;\ 172 vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ 173 vmovups 8 * SIZE(AO, %rax, 4), %xmm6 ;\ 174 vmovups 48 * SIZE(BO, %rax, 8), %xmm1 ;\ 175 vmovups 52 * SIZE(BO, %rax, 8), %xmm3 ;\ 176 vmovaps %xmm6, %xmm2 177 178#define KERNEL6(xx) \ 179 vfmaddps %xmm8,%xmm1, %xmm6, %xmm8 ;\ 180 vmovups 12 * SIZE(AO, %rax, 4),%xmm2 ;\ 181 vfmaddps %xmm12,%xmm2, %xmm1, %xmm12 ;\ 182 vmovups 56 * SIZE(BO, %rax, 8), %xmm1 ;\ 183 vfmaddps %xmm9,%xmm3, %xmm6, %xmm9 ;\ 184 vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ 185 vmovups 60 * SIZE(BO, %rax, 8), %xmm3 ;\ 186 vfmaddps %xmm10,%xmm1, %xmm6, %xmm10 ;\ 187 vfmaddps %xmm14,%xmm2, %xmm1, %xmm14 ;\ 188 vfmaddps %xmm11,%xmm3, %xmm6, %xmm11 ;\ 189 vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ 190 vmovups 32 * SIZE(AO, %rax, 4), %xmm0 ;\ 191 vmovups 68 * SIZE(BO, %rax, 8), %xmm3 ;\ 192 vmovaps %xmm7, %xmm2 193 194#define KERNEL7(xx) \ 195 vfmaddps %xmm8,%xmm5, %xmm7, %xmm8 ;\ 196 vmovups 20 * SIZE(AO, %rax, 4),%xmm2 ;\ 197 vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\ 198 vmovups 96 * SIZE(BO, %rax, 8), %xmm1 ;\ 199 vmovups 72 * SIZE(BO, %rax, 8), %xmm5 ;\ 200 vfmaddps %xmm9,%xmm3, %xmm7, %xmm9 ;\ 201 vfmaddps %xmm13,%xmm2, %xmm3, %xmm13 ;\ 202 vmovups 76 * SIZE(BO, %rax, 8), %xmm3 ;\ 203 vfmaddps %xmm10,%xmm5, %xmm7, %xmm10 ;\ 204 vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\ 205 vfmaddps %xmm11,%xmm3, %xmm7, %xmm11 ;\ 206 vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ 207 vmovups 24 * SIZE(AO, %rax, 4), %xmm7 ;\ 208 vmovups 80 * SIZE(BO, %rax, 8), %xmm5 ;\ 209 vmovups 84 * SIZE(BO, %rax, 8), %xmm3 ;\ 210 movaps %xmm7, %xmm2 211 212#define KERNEL8(xx) \ 213 vfmaddps %xmm8,%xmm5, %xmm7, %xmm8 ;\ 214 vmovups 28 * SIZE(AO, %rax, 4),%xmm2 ;\ 215 vfmaddps %xmm12,%xmm2, %xmm5, %xmm12 ;\ 216 vmovups 88 * SIZE(BO, %rax, 8), %xmm5 ;\ 217 vfmaddps %xmm9, %xmm3, %xmm7, %xmm9 ;\ 218 vfmaddps %xmm13,%xmm2, %xmm3, 
%xmm13 ;\ 219 vmovups 92 * SIZE(BO, %rax, 8), %xmm3 ;\ 220 vfmaddps %xmm10,%xmm5, %xmm7, %xmm10 ;\ 221 vfmaddps %xmm14,%xmm2, %xmm5, %xmm14 ;\ 222 vmovups 48 * SIZE(AO, %rax, 4), %xmm4 ;\ 223 vmovups 128 * SIZE(BO, %rax, 8), %xmm5 ;\ 224 vfmaddps %xmm11,%xmm3, %xmm7, %xmm11 ;\ 225 vfmaddps %xmm15,%xmm2, %xmm3, %xmm15 ;\ 226 vmovups 100 * SIZE(BO, %rax, 8), %xmm3 ;\ 227 vmovaps %xmm0, %xmm2 ;\ 228 addq $16 * SIZE, %rax 229 230#define KERNEL_SUB1(xx) \ 231 mulps %xmm1, %xmm0 ;\ 232 mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ 233 addps %xmm0, %xmm8 ;\ 234 movaps %xmm2, %xmm0 ;\ 235 addps %xmm1, %xmm12 ;\ 236 movaps -24 * SIZE(BO, %rax, 8), %xmm1 ;\ 237 mulps %xmm3, %xmm2 ;\ 238 mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ 239 addps %xmm2, %xmm9 ;\ 240 movaps %xmm0, %xmm2 ;\ 241 addps %xmm3, %xmm13 ;\ 242 movaps -20 * SIZE(BO, %rax, 8), %xmm3 ;\ 243 mulps %xmm1, %xmm0 ;\ 244 mulps -28 * SIZE(AO, %rax, 4), %xmm1 ;\ 245 addps %xmm0, %xmm10 ;\ 246 movaps -24 * SIZE(AO, %rax, 4), %xmm0 ;\ 247 addps %xmm1, %xmm14 ;\ 248 movaps -16 * SIZE(BO, %rax, 8), %xmm1 ;\ 249 mulps %xmm3, %xmm2 ;\ 250 mulps -28 * SIZE(AO, %rax, 4), %xmm3 ;\ 251 addps %xmm2, %xmm11 ;\ 252 addps %xmm3, %xmm15 ;\ 253 movaps -12 * SIZE(BO, %rax, 8), %xmm3 ;\ 254 movaps %xmm0, %xmm2 255 256#define KERNEL_SUB2(xx) \ 257 mulps %xmm1, %xmm0 ;\ 258 mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ 259 addps %xmm0, %xmm8 ;\ 260 movaps %xmm2, %xmm0 ;\ 261 addps %xmm1, %xmm12 ;\ 262 movaps -8 * SIZE(BO, %rax, 8), %xmm1 ;\ 263 mulps %xmm3, %xmm2 ;\ 264 mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ 265 addps %xmm2, %xmm9 ;\ 266 movaps %xmm0, %xmm2 ;\ 267 addps %xmm3, %xmm13 ;\ 268 movaps -4 * SIZE(BO, %rax, 8), %xmm3 ;\ 269 mulps %xmm1, %xmm0 ;\ 270 mulps -20 * SIZE(AO, %rax, 4), %xmm1 ;\ 271 addps %xmm0, %xmm10 ;\ 272 movaps (AO, %rax, 4), %xmm0 ;\ 273 addps %xmm1, %xmm14 ;\ 274 movaps 32 * SIZE(BO, %rax, 8), %xmm1 ;\ 275 mulps %xmm3, %xmm2 ;\ 276 mulps -20 * SIZE(AO, %rax, 4), %xmm3 ;\ 277 addps %xmm2, %xmm11 ;\ 278 addps %xmm3, %xmm15 ;\ 
279 movaps 4 * SIZE(BO, %rax, 8), %xmm3 ;\ 280 movaps %xmm4, %xmm2 281 282#define KERNEL_SUB3(xx) \ 283 mulps %xmm5, %xmm4 ;\ 284 mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ 285 addps %xmm4, %xmm8 ;\ 286 movaps %xmm2, %xmm4 ;\ 287 addps %xmm5, %xmm12 ;\ 288 movaps 8 * SIZE(BO, %rax, 8), %xmm5 ;\ 289 mulps %xmm3, %xmm2 ;\ 290 mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ 291 addps %xmm2, %xmm9 ;\ 292 movaps %xmm4, %xmm2 ;\ 293 addps %xmm3, %xmm13 ;\ 294 movaps 12 * SIZE(BO, %rax, 8), %xmm3 ;\ 295 mulps %xmm5, %xmm4 ;\ 296 mulps -12 * SIZE(AO, %rax, 4), %xmm5 ;\ 297 addps %xmm4, %xmm10 ;\ 298 movaps -8 * SIZE(AO, %rax, 4), %xmm4 ;\ 299 addps %xmm5, %xmm14 ;\ 300 movaps 16 * SIZE(BO, %rax, 8), %xmm5 ;\ 301 mulps %xmm3, %xmm2 ;\ 302 mulps -12 * SIZE(AO, %rax, 4), %xmm3 ;\ 303 addps %xmm2, %xmm11 ;\ 304 addps %xmm3, %xmm15 ;\ 305 movaps 20 * SIZE(BO, %rax, 8), %xmm3 ;\ 306 movaps %xmm4, %xmm2 307 308#define KERNEL_SUB4(xx) \ 309 mulps %xmm5, %xmm4 ;\ 310 mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ 311 addps %xmm4, %xmm8 ;\ 312 movaps %xmm2, %xmm4 ;\ 313 addps %xmm5, %xmm12 ;\ 314 movaps 24 * SIZE(BO, %rax, 8), %xmm5 ;\ 315 mulps %xmm3, %xmm2 ;\ 316 mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ 317 addps %xmm2, %xmm9 ;\ 318 movaps %xmm4, %xmm2 ;\ 319 addps %xmm3, %xmm13 ;\ 320 movaps 28 * SIZE(BO, %rax, 8), %xmm3 ;\ 321 mulps %xmm5, %xmm4 ;\ 322 mulps -4 * SIZE(AO, %rax, 4), %xmm5 ;\ 323 addps %xmm4, %xmm10 ;\ 324 addps %xmm5, %xmm14 ;\ 325 mulps %xmm3, %xmm2 ;\ 326 mulps -4 * SIZE(AO, %rax, 4), %xmm3 ;\ 327 addps %xmm2, %xmm11 ;\ 328 addps %xmm3, %xmm15 ;\ 329 movaps 36 * SIZE(BO, %rax, 8), %xmm3 ;\ 330 movaps %xmm0, %xmm2 331 332#if defined(OS_LINUX) && defined(CORE_BULLDOZER) && !defined(TRMMKERNEL) 333 .align 32768 334#endif 335 PROLOGUE 336 PROFCODE 337 338 subq $STACKSIZE, %rsp 339 340 movq %rbx, 0(%rsp) 341 movq %rbp, 8(%rsp) 342 movq %r12, 16(%rsp) 343 movq %r13, 24(%rsp) 344 movq %r14, 32(%rsp) 345 movq %r15, 40(%rsp) 346 347#ifdef WINDOWS_ABI 348 movq %rdi, 48(%rsp) 349 movq %rsi, 
56(%rsp) 350 movups %xmm6, 64(%rsp) 351 movups %xmm7, 80(%rsp) 352 movups %xmm8, 96(%rsp) 353 movups %xmm9, 112(%rsp) 354 movups %xmm10, 128(%rsp) 355 movups %xmm11, 144(%rsp) 356 movups %xmm12, 160(%rsp) 357 movups %xmm13, 176(%rsp) 358 movups %xmm14, 192(%rsp) 359 movups %xmm15, 208(%rsp) 360 361 movq ARG1, OLD_M 362 movq ARG2, OLD_N 363 movq ARG3, K 364 movq OLD_A, A 365 movq OLD_B, B 366 movq OLD_C, C 367 movq OLD_LDC, LDC 368#ifdef TRMMKERNEL 369 movsd OLD_OFFSET, %xmm12 370#endif 371 movaps %xmm3, %xmm0 372 373#else 374 movq 72(%rsp), LDC 375#ifdef TRMMKERNEL 376 movsd 80(%rsp), %xmm12 377#endif 378 379#endif 380 381 movq %rsp, %rbx # save old stack 382 subq $128 + LOCAL_BUFFER_SIZE, %rsp 383 andq $-4096, %rsp # align stack 384 385 STACK_TOUCHING 386 387 movq OLD_M, M 388 movq OLD_N, N 389 390 shufps $0, %xmm0, %xmm0 391 movaps %xmm0, ALPHA 392 393#ifdef TRMMKERNEL 394 movsd %xmm12, OFFSET 395 movsd %xmm12, KK 396#ifndef LEFT 397 negq KK 398#endif 399#endif 400 401 subq $-32 * SIZE, A 402 403 leaq (, LDC, SIZE), LDC 404 405 movq N, J 406 sarq $2, J # j = (n >> 2) 407 jle .L50 408 409.L01: 410#if defined(TRMMKERNEL) && defined(LEFT) 411 movq OFFSET, %rax 412 movq %rax, KK 413#endif 414 415/* Copying to Sub Buffer */ 416 leaq BUFFER, BO 417 418 movq K, %rax 419 sarq $2, %rax 420 jle .L03 421 ALIGN_4 422 423.L02: 424 425 prefetcht0 192(B) 426 prefetcht0 256(B) 427 prefetcht0 192(BO) 428 prefetcht0 256(BO) 429 movaps 0 * SIZE(B), %xmm3 430 movaps 0 * SIZE(B), %xmm3 431 movaps 4 * SIZE(B), %xmm7 432 movaps 8 * SIZE(B), %xmm11 433 movaps 12 * SIZE(B), %xmm15 434 435 436 pshufd $0x00, %xmm3, %xmm0 437 pshufd $0x55, %xmm3, %xmm1 438 pshufd $0xaa, %xmm3, %xmm2 439 pshufd $0xff, %xmm3, %xmm3 440 441 442 pshufd $0x00, %xmm7, %xmm4 443 pshufd $0x55, %xmm7, %xmm5 444 pshufd $0xaa, %xmm7, %xmm6 445 pshufd $0xff, %xmm7, %xmm7 446 447 movaps %xmm0, 0 * SIZE(BO) 448 movaps %xmm1, 4 * SIZE(BO) 449 movaps %xmm2, 8 * SIZE(BO) 450 movaps %xmm3, 12 * SIZE(BO) 451 movaps %xmm4, 16 
* SIZE(BO) 452 movaps %xmm5, 20 * SIZE(BO) 453 movaps %xmm6, 24 * SIZE(BO) 454 movaps %xmm7, 28 * SIZE(BO) 455 456 457 pshufd $0x00, %xmm11, %xmm0 458 pshufd $0x55, %xmm11, %xmm1 459 pshufd $0xaa, %xmm11, %xmm2 460 pshufd $0xff, %xmm11, %xmm3 461 462 463 pshufd $0x00, %xmm15, %xmm4 464 pshufd $0x55, %xmm15, %xmm5 465 pshufd $0xaa, %xmm15, %xmm6 466 pshufd $0xff, %xmm15, %xmm7 467 468 movaps %xmm0, 32 * SIZE(BO) 469 movaps %xmm1, 36 * SIZE(BO) 470 movaps %xmm2, 40 * SIZE(BO) 471 movaps %xmm3, 44 * SIZE(BO) 472 movaps %xmm4, 48 * SIZE(BO) 473 movaps %xmm5, 52 * SIZE(BO) 474 movaps %xmm6, 56 * SIZE(BO) 475 movaps %xmm7, 60 * SIZE(BO) 476 477 addq $16 * SIZE, B 478 addq $64 * SIZE, BO 479 480 decq %rax 481 jne .L02 482 ALIGN_4 483 484.L03: 485 movq K, %rax 486 andq $3, %rax 487 BRANCH 488 jle .L10 489 ALIGN_4 490 491.L04: 492 movaps 0 * SIZE(B), %xmm3 493 494 pshufd $0x00, %xmm3, %xmm0 495 pshufd $0x55, %xmm3, %xmm1 496 pshufd $0xaa, %xmm3, %xmm2 497 pshufd $0xff, %xmm3, %xmm3 498 499 movaps %xmm0, 0 * SIZE(BO) 500 movaps %xmm1, 4 * SIZE(BO) 501 movaps %xmm2, 8 * SIZE(BO) 502 movaps %xmm3, 12 * SIZE(BO) 503 504 addq $ 4 * SIZE, B 505 addq $16 * SIZE, BO 506 decq %rax 507 jne .L04 508 ALIGN_4 509 510.L10: 511 movq C, CO1 512 leaq (C, LDC, 1), CO2 513 movq A, AO 514 515 516 movq M, I 517 sarq $3, I # i = (m >> 3) 518 jle .L20 519 ALIGN_4 520 521.L11: 522#if !defined(TRMMKERNEL) || \ 523 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 524 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 525 526 leaq 32 * SIZE + BUFFER, BO 527#else 528 leaq 32 * SIZE + BUFFER, BO 529 movq KK, %rax 530 leaq (, %rax, 8), %rax 531 leaq (AO, %rax, 4), AO 532 leaq (BO, %rax, 8), BO 533#endif 534 535 movaps -32 * SIZE(AO), %xmm0 536 movaps -32 * SIZE(BO), %xmm1 537 xorps %xmm8, %xmm8 538 movaps -28 * SIZE(BO), %xmm3 539 xorps %xmm9, %xmm9 540 movaps -16 * SIZE(AO), %xmm4 541 xorps %xmm10, %xmm10 542 movaps 0 * SIZE(BO), %xmm5 543 xorps %xmm11, %xmm11 544 545 546 xorps 
%xmm12, %xmm12 547 xorps %xmm13, %xmm13 548 xorps %xmm14, %xmm14 549 xorps %xmm15, %xmm15 550 movaps %xmm0, %xmm2 551 prefetcht0 (CO1) 552 prefetcht0 (CO1,LDC, 2) 553 prefetcht0 (CO2) 554 prefetcht0 (CO2,LDC, 2) 555 556#ifndef TRMMKERNEL 557 movq K, %rax 558#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 559 movq K, %rax 560 subq KK, %rax 561 movq %rax, KKK 562#else 563 movq KK, %rax 564#ifdef LEFT 565 addq $8, %rax 566#else 567 addq $4, %rax 568#endif 569 movq %rax, KKK 570#endif 571 andq $-8, %rax 572 573 leaq (, %rax, 8), %rax 574 leaq (AO, %rax, 4), AO 575 leaq (BO, %rax, 8), BO 576 negq %rax 577 NOBRANCH 578 je .L15 579 ALIGN_3 580 581.L12: 582 KERNEL1(32 * 0) 583 KERNEL2(32 * 0) 584 KERNEL3(32 * 0) 585 KERNEL4(32 * 0) 586 KERNEL5(32 * 0) 587 KERNEL6(32 * 0) 588 KERNEL7(32 * 0) 589 KERNEL8(32 * 0) 590 NOBRANCH 591 je .L15 592 KERNEL1(32 * 0) 593 KERNEL2(32 * 0) 594 KERNEL3(32 * 0) 595 KERNEL4(32 * 0) 596 KERNEL5(32 * 0) 597 KERNEL6(32 * 0) 598 KERNEL7(32 * 0) 599 KERNEL8(32 * 0) 600 NOBRANCH 601 je .L15 602 KERNEL1(32 * 0) 603 KERNEL2(32 * 0) 604 KERNEL3(32 * 0) 605 KERNEL4(32 * 0) 606 KERNEL5(32 * 0) 607 KERNEL6(32 * 0) 608 KERNEL7(32 * 0) 609 KERNEL8(32 * 0) 610 NOBRANCH 611 je .L15 612 KERNEL1(32 * 0) 613 KERNEL2(32 * 0) 614 KERNEL3(32 * 0) 615 KERNEL4(32 * 0) 616 KERNEL5(32 * 0) 617 KERNEL6(32 * 0) 618 KERNEL7(32 * 0) 619 KERNEL8(32 * 0) 620 NOBRANCH 621 je .L15 622 KERNEL1(32 * 0) 623 KERNEL2(32 * 0) 624 KERNEL3(32 * 0) 625 KERNEL4(32 * 0) 626 KERNEL5(32 * 0) 627 KERNEL6(32 * 0) 628 KERNEL7(32 * 0) 629 KERNEL8(32 * 0) 630 NOBRANCH 631 je .L15 632 KERNEL1(32 * 0) 633 KERNEL2(32 * 0) 634 KERNEL3(32 * 0) 635 KERNEL4(32 * 0) 636 KERNEL5(32 * 0) 637 KERNEL6(32 * 0) 638 KERNEL7(32 * 0) 639 KERNEL8(32 * 0) 640 NOBRANCH 641 je .L15 642 KERNEL1(32 * 0) 643 KERNEL2(32 * 0) 644 KERNEL3(32 * 0) 645 KERNEL4(32 * 0) 646 KERNEL5(32 * 0) 647 KERNEL6(32 * 0) 648 KERNEL7(32 * 0) 649 KERNEL8(32 * 0) 650 NOBRANCH 651 je .L15 652 KERNEL1(32 * 
0) 653 KERNEL2(32 * 0) 654 KERNEL3(32 * 0) 655 KERNEL4(32 * 0) 656 KERNEL5(32 * 0) 657 KERNEL6(32 * 0) 658 KERNEL7(32 * 0) 659 KERNEL8(32 * 0) 660 BRANCH 661 jl .L12 662 ALIGN_4 663 664.L15: 665 666 movaps ALPHA, %xmm7 667 668#ifndef TRMMKERNEL 669 movq K, %rax 670#else 671 movq KKK, %rax 672#endif 673 testq $4, %rax 674 je .L16 675 xorq %rax, %rax 676 ALIGN_3 677 678 KERNEL_SUB1(32 * 0) 679 KERNEL_SUB2(32 * 0) 680 KERNEL_SUB3(32 * 0) 681 KERNEL_SUB4(32 * 0) 682 683 addq $32 * SIZE, AO 684 addq $64 * SIZE, BO 685 ALIGN_3 686 687.L16: 688#ifndef TRMMKERNEL 689 movq K, %rax 690#else 691 movq KKK, %rax 692#endif 693 andq $3, %rax # if (k & 1) 694 je .L18 695 696 leaq (, %rax, 8), %rax 697 leaq (AO, %rax, 4), AO 698 leaq (BO, %rax, 8), BO 699 negq %rax 700 ALIGN_4 701 702.L17: 703 mulps %xmm1, %xmm0 704 mulps -28 * SIZE(AO, %rax, 4), %xmm1 705 addps %xmm0, %xmm8 706 movaps %xmm2, %xmm0 707 addps %xmm1, %xmm12 708 movaps -24 * SIZE(BO, %rax, 8), %xmm1 709 mulps %xmm3, %xmm2 710 mulps -28 * SIZE(AO, %rax, 4), %xmm3 711 addps %xmm2, %xmm9 712 movaps %xmm0, %xmm2 713 addps %xmm3, %xmm13 714 movaps -20 * SIZE(BO, %rax, 8), %xmm3 715 mulps %xmm1, %xmm0 716 mulps -28 * SIZE(AO, %rax, 4), %xmm1 717 addps %xmm0, %xmm10 718 movaps -24 * SIZE(AO, %rax, 4), %xmm0 719 addps %xmm1, %xmm14 720 movaps -16 * SIZE(BO, %rax, 8), %xmm1 721 mulps %xmm3, %xmm2 722 mulps -28 * SIZE(AO, %rax, 4), %xmm3 723 addps %xmm2, %xmm11 724 addps %xmm3, %xmm15 725 movaps -12 * SIZE(BO, %rax, 8), %xmm3 726 movaps %xmm0, %xmm2 727 728 addq $SIZE * 2, %rax 729 jl .L17 730 ALIGN_4 731 732.L18: 733#ifndef TRMMKERNEL 734 735 vfmaddps 0 * SIZE(CO1),%xmm7, %xmm8, %xmm8 736 vfmaddps 4 * SIZE(CO1),%xmm7, %xmm12, %xmm12 737 vfmaddps 0 * SIZE(CO2),%xmm7, %xmm9, %xmm9 738 vfmaddps 4 * SIZE(CO2),%xmm7, %xmm13, %xmm13 739 vfmaddps 0 * SIZE(CO1, LDC, 2),%xmm7, %xmm10, %xmm10 740 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm7, %xmm14, %xmm14 741 vfmaddps 0 * SIZE(CO2, LDC, 2),%xmm7, %xmm11, %xmm11 742 vfmaddps 4 * SIZE(CO2, LDC, 
2),%xmm7, %xmm15, %xmm15 743 744#else 745 746 vmulps %xmm7, %xmm8, %xmm8 747 vmulps %xmm7, %xmm9, %xmm9 748 vmulps %xmm7, %xmm10, %xmm10 749 vmulps %xmm7, %xmm11, %xmm11 750 751 vmulps %xmm7, %xmm12,%xmm12 752 vmulps %xmm7, %xmm13,%xmm13 753 vmulps %xmm7, %xmm14,%xmm14 754 vmulps %xmm7, %xmm15,%xmm15 755 756#endif 757 758 759 vmovups %xmm8, 0 * SIZE(CO1) 760 vmovups %xmm12, 4 * SIZE(CO1) 761 vmovups %xmm9, 0 * SIZE(CO2) 762 vmovups %xmm13, 4 * SIZE(CO2) 763 vmovups %xmm10, 0 * SIZE(CO1, LDC, 2) 764 vmovups %xmm14, 4 * SIZE(CO1, LDC, 2) 765 vmovups %xmm11, 0 * SIZE(CO2, LDC, 2) 766 vmovups %xmm15, 4 * SIZE(CO2, LDC, 2) 767 prefetcht0 64(CO1) 768 prefetcht0 64(CO1,LDC, 2) 769 prefetcht0 64(CO2) 770 prefetcht0 64(CO2,LDC, 2) 771 772#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 773 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 774 movq K, %rax 775 subq KKK, %rax 776 leaq (,%rax, 8), %rax 777 leaq (AO, %rax, 4), AO 778 leaq (BO, %rax, 8), BO 779#endif 780 781#if defined(TRMMKERNEL) && defined(LEFT) 782 addq $8, KK 783#endif 784 785 addq $8 * SIZE, CO1 # coffset += 4 786 addq $8 * SIZE, CO2 # coffset += 4 787 decq I # i -- 788 jg .L11 789 ALIGN_4 790 791.L20: 792 testq $4, M 793 je .L30 794 795#if !defined(TRMMKERNEL) || \ 796 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 797 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 798 799 leaq BUFFER, BO 800#else 801 leaq BUFFER, BO 802 movq KK, %rax 803 leaq (, %rax, 8), %rax 804 leaq (AO, %rax, 2), AO 805 leaq (BO, %rax, 8), BO 806#endif 807 808 movaps -32 * SIZE(AO), %xmm8 809 movaps -16 * SIZE(AO), %xmm10 810 811 movaps 0 * SIZE(BO), %xmm9 812 movaps 16 * SIZE(BO), %xmm11 813 movaps 32 * SIZE(BO), %xmm13 814 movaps 48 * SIZE(BO), %xmm15 815 816 xorps %xmm0, %xmm0 817 xorps %xmm1, %xmm1 818 xorps %xmm2, %xmm2 819 xorps %xmm3, %xmm3 820 821#ifndef TRMMKERNEL 822 movq K, %rax 823#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 824 
movq K, %rax 825 subq KK, %rax 826 movq %rax, KKK 827#else 828 movq KK, %rax 829#ifdef LEFT 830 addq $4, %rax 831#else 832 addq $4, %rax 833#endif 834 movq %rax, KKK 835#endif 836 sarq $3, %rax 837 je .L25 838 ALIGN_4 839 840.L22: 841 mulps %xmm8, %xmm9 842 addps %xmm9, %xmm0 843 movaps 4 * SIZE(BO), %xmm9 844 mulps %xmm8, %xmm9 845 addps %xmm9, %xmm1 846 movaps 8 * SIZE(BO), %xmm9 847 mulps %xmm8, %xmm9 848 mulps 12 * SIZE(BO), %xmm8 849 addps %xmm9, %xmm2 850 movaps 64 * SIZE(BO), %xmm9 851 addps %xmm8, %xmm3 852 movaps -28 * SIZE(AO), %xmm8 853 854 mulps %xmm8, %xmm11 855 addps %xmm11, %xmm0 856 movaps 20 * SIZE(BO), %xmm11 857 mulps %xmm8, %xmm11 858 addps %xmm11, %xmm1 859 movaps 24 * SIZE(BO), %xmm11 860 mulps %xmm8, %xmm11 861 mulps 28 * SIZE(BO), %xmm8 862 addps %xmm11, %xmm2 863 movaps 80 * SIZE(BO), %xmm11 864 addps %xmm8, %xmm3 865 movaps -24 * SIZE(AO), %xmm8 866 867 mulps %xmm8, %xmm13 868 addps %xmm13, %xmm0 869 movaps 36 * SIZE(BO), %xmm13 870 mulps %xmm8, %xmm13 871 addps %xmm13, %xmm1 872 movaps 40 * SIZE(BO), %xmm13 873 mulps %xmm8, %xmm13 874 mulps 44 * SIZE(BO), %xmm8 875 addps %xmm13, %xmm2 876 movaps 96 * SIZE(BO), %xmm13 877 addps %xmm8, %xmm3 878 movaps -20 * SIZE(AO), %xmm8 879 880 mulps %xmm8, %xmm15 881 addps %xmm15, %xmm0 882 movaps 52 * SIZE(BO), %xmm15 883 mulps %xmm8, %xmm15 884 addps %xmm15, %xmm1 885 movaps 56 * SIZE(BO), %xmm15 886 mulps %xmm8, %xmm15 887 mulps 60 * SIZE(BO), %xmm8 888 addps %xmm15, %xmm2 889 movaps 112 * SIZE(BO), %xmm15 890 addps %xmm8, %xmm3 891 movaps 0 * SIZE(AO), %xmm8 892 893 mulps %xmm10, %xmm9 894 addps %xmm9, %xmm0 895 movaps 68 * SIZE(BO), %xmm9 896 mulps %xmm10, %xmm9 897 addps %xmm9, %xmm1 898 movaps 72 * SIZE(BO), %xmm9 899 mulps %xmm10, %xmm9 900 mulps 76 * SIZE(BO), %xmm10 901 addps %xmm9, %xmm2 902 movaps 128 * SIZE(BO), %xmm9 903 addps %xmm10, %xmm3 904 movaps -12 * SIZE(AO), %xmm10 905 906 mulps %xmm10, %xmm11 907 addps %xmm11, %xmm0 908 movaps 84 * SIZE(BO), %xmm11 909 mulps %xmm10, %xmm11 910 
addps %xmm11, %xmm1 911 movaps 88 * SIZE(BO), %xmm11 912 mulps %xmm10, %xmm11 913 mulps 92 * SIZE(BO), %xmm10 914 addps %xmm11, %xmm2 915 movaps 144 * SIZE(BO), %xmm11 916 addps %xmm10, %xmm3 917 movaps -8 * SIZE(AO), %xmm10 918 919 mulps %xmm10, %xmm13 920 addps %xmm13, %xmm0 921 movaps 100 * SIZE(BO), %xmm13 922 mulps %xmm10, %xmm13 923 addps %xmm13, %xmm1 924 movaps 104 * SIZE(BO), %xmm13 925 mulps %xmm10, %xmm13 926 mulps 108 * SIZE(BO), %xmm10 927 addps %xmm13, %xmm2 928 movaps 160 * SIZE(BO), %xmm13 929 addps %xmm10, %xmm3 930 movaps -4 * SIZE(AO), %xmm10 931 932 mulps %xmm10, %xmm15 933 addps %xmm15, %xmm0 934 movaps 116 * SIZE(BO), %xmm15 935 mulps %xmm10, %xmm15 936 addps %xmm15, %xmm1 937 movaps 120 * SIZE(BO), %xmm15 938 mulps %xmm10, %xmm15 939 mulps 124 * SIZE(BO), %xmm10 940 addps %xmm15, %xmm2 941 movaps 176 * SIZE(BO), %xmm15 942 addps %xmm10, %xmm3 943 movaps 16 * SIZE(AO), %xmm10 944 945 addq $ 32 * SIZE, AO 946 addq $128 * SIZE, BO 947 decq %rax 948 jne .L22 949 ALIGN_4 950 951.L25: 952#ifndef TRMMKERNEL 953 movq K, %rax 954#else 955 movq KKK, %rax 956#endif 957 movaps ALPHA, %xmm15 958 andq $7, %rax # if (k & 1) 959 BRANCH 960 je .L28 961 ALIGN_4 962 963.L26: 964 mulps %xmm8, %xmm9 965 addps %xmm9, %xmm0 966 movaps 4 * SIZE(BO), %xmm9 967 mulps %xmm8, %xmm9 968 addps %xmm9, %xmm1 969 movaps 8 * SIZE(BO), %xmm9 970 mulps %xmm8, %xmm9 971 mulps 12 * SIZE(BO), %xmm8 972 addps %xmm9, %xmm2 973 movaps 16 * SIZE(BO), %xmm9 974 addps %xmm8, %xmm3 975 movaps -28 * SIZE(AO), %xmm8 976 977 addq $ 4 * SIZE, AO # aoffset += 4 978 addq $16 * SIZE, BO # boffset1 += 8 979 decq %rax 980 jg .L26 981 ALIGN_4 982 983.L28: 984 mulps %xmm15, %xmm0 985 mulps %xmm15, %xmm1 986 mulps %xmm15, %xmm2 987 mulps %xmm15, %xmm3 988 989#ifndef TRMMKERNEL 990 movsd 0 * SIZE(CO1), %xmm8 991 movhps 2 * SIZE(CO1), %xmm8 992 movsd 0 * SIZE(CO2), %xmm10 993 movhps 2 * SIZE(CO2), %xmm10 994 995 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 996 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 997 movsd 0 * 
SIZE(CO2, LDC, 2), %xmm14 998 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 999 1000 addps %xmm8, %xmm0 1001 addps %xmm10, %xmm1 1002 addps %xmm12, %xmm2 1003 addps %xmm14, %xmm3 1004#endif 1005 1006 vmovups %xmm0, 0 * SIZE(CO1) 1007 vmovups %xmm1, 0 * SIZE(CO2) 1008 1009 vmovups %xmm2, 0 * SIZE(CO1, LDC, 2) 1010 vmovups %xmm3, 0 * SIZE(CO2, LDC, 2) 1011 1012#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1013 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1014 movq K, %rax 1015 subq KKK, %rax 1016 leaq (,%rax, 8), %rax 1017 leaq (AO, %rax, 2), AO 1018 leaq (BO, %rax, 8), BO 1019#endif 1020 1021#if defined(TRMMKERNEL) && defined(LEFT) 1022 addq $4, KK 1023#endif 1024 1025 addq $4 * SIZE, CO1 # coffset += 4 1026 addq $4 * SIZE, CO2 # coffset += 4 1027 ALIGN_4 1028 1029.L30: 1030 testq $2, M 1031 je .L40 1032 1033#if !defined(TRMMKERNEL) || \ 1034 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1035 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1036 1037 leaq BUFFER, BO 1038#else 1039 leaq BUFFER, BO 1040 movq KK, %rax 1041 leaq (, %rax, 8), %rax 1042 leaq (AO, %rax, 1), AO 1043 leaq (BO, %rax, 8), BO 1044#endif 1045 1046 movaps -32 * SIZE(AO), %xmm8 1047 movaps -24 * SIZE(AO), %xmm10 1048 1049 movaps 0 * SIZE(BO), %xmm9 1050 movaps 16 * SIZE(BO), %xmm11 1051 movaps 32 * SIZE(BO), %xmm13 1052 movaps 48 * SIZE(BO), %xmm15 1053 1054 xorps %xmm0, %xmm0 1055 xorps %xmm1, %xmm1 1056 xorps %xmm2, %xmm2 1057 xorps %xmm3, %xmm3 1058 1059#ifndef TRMMKERNEL 1060 movq K, %rax 1061#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1062 movq K, %rax 1063 subq KK, %rax 1064 movq %rax, KKK 1065#else 1066 movq KK, %rax 1067#ifdef LEFT 1068 addq $2, %rax 1069#else 1070 addq $4, %rax 1071#endif 1072 movq %rax, KKK 1073#endif 1074 sarq $3, %rax 1075 je .L35 1076 ALIGN_4 1077 1078.L32: 1079 mulps %xmm8, %xmm9 1080 addps %xmm9, %xmm0 1081 movsd 4 * SIZE(BO), %xmm9 1082 mulps %xmm8, %xmm9 1083 addps %xmm9, 
%xmm1 1084 movsd 8 * SIZE(BO), %xmm9 1085 mulps %xmm8, %xmm9 1086 addps %xmm9, %xmm2 1087 movsd 12 * SIZE(BO), %xmm9 1088 mulps %xmm8, %xmm9 1089 movsd -30 * SIZE(AO), %xmm8 1090 addps %xmm9, %xmm3 1091 movsd 64 * SIZE(BO), %xmm9 1092 1093 mulps %xmm8, %xmm11 1094 addps %xmm11, %xmm0 1095 movsd 20 * SIZE(BO), %xmm11 1096 mulps %xmm8, %xmm11 1097 addps %xmm11, %xmm1 1098 movsd 24 * SIZE(BO), %xmm11 1099 mulps %xmm8, %xmm11 1100 addps %xmm11, %xmm2 1101 movsd 28 * SIZE(BO), %xmm11 1102 mulps %xmm8, %xmm11 1103 movsd -28 * SIZE(AO), %xmm8 1104 addps %xmm11, %xmm3 1105 movsd 80 * SIZE(BO), %xmm11 1106 1107 mulps %xmm8, %xmm13 1108 addps %xmm13, %xmm0 1109 movsd 36 * SIZE(BO), %xmm13 1110 mulps %xmm8, %xmm13 1111 addps %xmm13, %xmm1 1112 movsd 40 * SIZE(BO), %xmm13 1113 mulps %xmm8, %xmm13 1114 addps %xmm13, %xmm2 1115 movsd 44 * SIZE(BO), %xmm13 1116 mulps %xmm8, %xmm13 1117 movsd -26 * SIZE(AO), %xmm8 1118 addps %xmm13, %xmm3 1119 movsd 96 * SIZE(BO), %xmm13 1120 1121 mulps %xmm8, %xmm15 1122 addps %xmm15, %xmm0 1123 movsd 52 * SIZE(BO), %xmm15 1124 mulps %xmm8, %xmm15 1125 addps %xmm15, %xmm1 1126 movsd 56 * SIZE(BO), %xmm15 1127 mulps %xmm8, %xmm15 1128 addps %xmm15, %xmm2 1129 movsd 60 * SIZE(BO), %xmm15 1130 mulps %xmm8, %xmm15 1131 movsd -16 * SIZE(AO), %xmm8 1132 addps %xmm15, %xmm3 1133 movsd 112 * SIZE(BO), %xmm15 1134 1135 mulps %xmm10, %xmm9 1136 addps %xmm9, %xmm0 1137 movsd 68 * SIZE(BO), %xmm9 1138 mulps %xmm10, %xmm9 1139 addps %xmm9, %xmm1 1140 movsd 72 * SIZE(BO), %xmm9 1141 mulps %xmm10, %xmm9 1142 addps %xmm9, %xmm2 1143 movsd 76 * SIZE(BO), %xmm9 1144 mulps %xmm10, %xmm9 1145 movsd -22 * SIZE(AO), %xmm10 1146 addps %xmm9, %xmm3 1147 movsd 128 * SIZE(BO), %xmm9 1148 1149 mulps %xmm10, %xmm11 1150 addps %xmm11, %xmm0 1151 movsd 84 * SIZE(BO), %xmm11 1152 mulps %xmm10, %xmm11 1153 addps %xmm11, %xmm1 1154 movsd 88 * SIZE(BO), %xmm11 1155 mulps %xmm10, %xmm11 1156 addps %xmm11, %xmm2 1157 movsd 92 * SIZE(BO), %xmm11 1158 mulps %xmm10, %xmm11 1159 movsd 
-20 * SIZE(AO), %xmm10                  # (fragment) operands of a movsd whose mnemonic is on the previous, unseen line
        # ------------------------------------------------------------------
        # Interior of the s/gemm (and, under TRMMKERNEL, trmm) kernel.
        # AT&T syntax, SSE scalar/packed single precision.
        # Register roles (from the #defines in the file head):
        #   AO/BO   = current A panel / packed-B buffer pointers
        #   CO1/CO2 = C column pointers, LDC = C leading dimension (bytes*SIZE)
        #   K       = inner (depth) count; KK/KKK = TRMM offset bookkeeping
        #   M/N/I/J = remaining tile counters; ALPHA = scalar alpha on stack
        # B has been expanded into BUFFER with each b value splat 4x, so all
        # "n * SIZE(BO)" loads below read pre-broadcast coefficients.
        # NOTE(review): several inline comments in the original were stale
        # (copy-pasted from a different unroll width); they are corrected here.
        # ------------------------------------------------------------------

        # --- tail of the .L32 unrolled loop: N=4, M&2 micro-tile, 8 k-steps
        # xmm0..xmm3 accumulate the 2x4 tile; xmm9/11/13/15 rotate b values.
        addps   %xmm11, %xmm3
        movsd   144 * SIZE(BO), %xmm11

        mulps   %xmm10, %xmm13
        addps   %xmm13, %xmm0
        movsd   100 * SIZE(BO), %xmm13
        mulps   %xmm10, %xmm13
        addps   %xmm13, %xmm1
        movsd   104 * SIZE(BO), %xmm13
        mulps   %xmm10, %xmm13
        addps   %xmm13, %xmm2
        movsd   108 * SIZE(BO), %xmm13
        mulps   %xmm10, %xmm13
        movsd   -18 * SIZE(AO), %xmm10
        addps   %xmm13, %xmm3
        movsd   160 * SIZE(BO), %xmm13

        mulps   %xmm10, %xmm15
        addps   %xmm15, %xmm0
        movsd   116 * SIZE(BO), %xmm15
        mulps   %xmm10, %xmm15
        addps   %xmm15, %xmm1
        movsd   120 * SIZE(BO), %xmm15
        mulps   %xmm10, %xmm15
        addps   %xmm15, %xmm2
        movsd   124 * SIZE(BO), %xmm15
        mulps   %xmm10, %xmm15
        movsd   -8 * SIZE(AO), %xmm10   # preload A for the next iteration
        addps   %xmm15, %xmm3
        movsd   176 * SIZE(BO), %xmm15

        addq    $ 16 * SIZE, AO         # A advances 2 rows * 8 k-steps
        addq    $128 * SIZE, BO         # B advances 4 cols * 8 k-steps * 4 (splat)
        decq    %rax
        jne     .L32
        ALIGN_4

.L35:
        # k-remainder setup for the M&2, N=4 tile
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif
        movaps  ALPHA, %xmm15
        andq    $7, %rax                # remaining k = K & 7
        BRANCH
        je .L38
        ALIGN_4

.L36:
        # one k-step: 2 A values (xmm8) times 4 splat b columns
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm0
        movsd   4 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm1
        movsd   8 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm2
        movsd   12 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        movsd   -30 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm3
        movsd   16 * SIZE(BO), %xmm9

        addq    $ 2 * SIZE, AO          # aoffset += 2 (comment was stale: "+= 4")
        addq    $16 * SIZE, BO          # boffset += 16 (4 cols * 4-way splat)
        decq    %rax
        jg      .L36
        ALIGN_4

.L38:
        # scale 2x4 tile by alpha, add C (unless TRMM), store back
        mulps   %xmm15, %xmm0
        mulps   %xmm15, %xmm1
        mulps   %xmm15, %xmm2
        mulps   %xmm15, %xmm3

#ifndef TRMMKERNEL
        movsd   0 * SIZE(CO1), %xmm8
        movsd   0 * SIZE(CO2), %xmm10
        movsd   0 * SIZE(CO1, LDC, 2), %xmm12
        movsd   0 * SIZE(CO2, LDC, 2), %xmm14

        addps   %xmm8, %xmm0
        addps   %xmm10, %xmm1
        addps   %xmm12, %xmm2
        addps   %xmm14, %xmm3
#endif

        movsd   %xmm0, 0 * SIZE(CO1)
        movsd   %xmm1, 0 * SIZE(CO2)
        movsd   %xmm2, 0 * SIZE(CO1, LDC, 2)
        movsd   %xmm3, 0 * SIZE(CO2, LDC, 2)

        # TRMM: skip the already-consumed part of A/B panels
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        leaq    (,%rax, 8), %rax
        leaq    (AO, %rax, 1), AO
        leaq    (BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

        addq    $2 * SIZE, CO1          # coffset += 2 (comment was stale: "+= 4")
        addq    $2 * SIZE, CO2
        ALIGN_4

        # ================= N=4, M&1 micro-tile (1x4) =================
.L40:
        testq   $1, M
        je      .L49

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leaq    BUFFER, BO
#else
        leaq    BUFFER, BO
        movq    KK, %rax
        leaq    (, %rax, 4), %rax
        leaq    (AO, %rax, 1), AO       # A: 1 row -> 1 float per k
        leaq    (BO, %rax, 8), BO       # B: 4 cols * 4-way splat -> 16 floats per k
        leaq    (BO, %rax, 8), BO
#endif

        movss   -32 * SIZE(AO), %xmm8
        movss   -28 * SIZE(AO), %xmm10

        movss    0 * SIZE(BO), %xmm9
        movss   16 * SIZE(BO), %xmm11
        movss   32 * SIZE(BO), %xmm13
        movss   48 * SIZE(BO), %xmm15

        xorps   %xmm0, %xmm0            # clear 1x4 accumulators
        xorps   %xmm1, %xmm1
        xorps   %xmm2, %xmm2
        xorps   %xmm3, %xmm3

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax                # left: M-block width 1
#else
        addq    $4, %rax                # right: N-block width 4
#endif
        movq    %rax, KKK
#endif
        sarq    $3, %rax                # unrolled by 8 k-steps
        je      .L45
        ALIGN_4

.L42:
        # 8 scalar k-steps, fully unrolled; xmm9/11/13/15 rotate b loads
        mulss   %xmm8, %xmm9
        addss   %xmm9, %xmm0
        movss   4 * SIZE(BO), %xmm9
        mulss   %xmm8, %xmm9
        addss   %xmm9, %xmm1
        movss   8 * SIZE(BO), %xmm9
        mulss   %xmm8, %xmm9
        addss   %xmm9, %xmm2
        movss   12 * SIZE(BO), %xmm9
        mulss   %xmm8, %xmm9
        movss   -31 * SIZE(AO), %xmm8
        addss   %xmm9, %xmm3
        movss   64 * SIZE(BO), %xmm9

        mulss   %xmm8, %xmm11
        addss   %xmm11, %xmm0
        movss   20 * SIZE(BO), %xmm11
        mulss   %xmm8, %xmm11
        addss   %xmm11, %xmm1
        movss   24 * SIZE(BO), %xmm11
        mulss   %xmm8, %xmm11
        addss   %xmm11, %xmm2
        movss   28 * SIZE(BO), %xmm11
        mulss   %xmm8, %xmm11
        movss   -30 * SIZE(AO), %xmm8
        addss   %xmm11, %xmm3
        movss   80 * SIZE(BO), %xmm11

        mulss   %xmm8, %xmm13
        addss   %xmm13, %xmm0
        movss   36 * SIZE(BO), %xmm13
        mulss   %xmm8, %xmm13
        addss   %xmm13, %xmm1
        movss   40 * SIZE(BO), %xmm13
        mulss   %xmm8, %xmm13
        addss   %xmm13, %xmm2
        movss   44 * SIZE(BO), %xmm13
        mulss   %xmm8, %xmm13
        movss   -29 * SIZE(AO), %xmm8
        addss   %xmm13, %xmm3
        movss   96 * SIZE(BO), %xmm13

        mulss   %xmm8, %xmm15
        addss   %xmm15, %xmm0
        movss   52 * SIZE(BO), %xmm15
        mulss   %xmm8, %xmm15
        addss   %xmm15, %xmm1
        movss   56 * SIZE(BO), %xmm15
        mulss   %xmm8, %xmm15
        addss   %xmm15, %xmm2
        movss   60 * SIZE(BO), %xmm15
        mulss   %xmm8, %xmm15
        movss   -24 * SIZE(AO), %xmm8   # preload A for next loop iteration
        addss   %xmm15, %xmm3
        movss   112 * SIZE(BO), %xmm15

        mulss   %xmm10, %xmm9
        addss   %xmm9, %xmm0
        movss   68 * SIZE(BO), %xmm9
        mulss   %xmm10, %xmm9
        addss   %xmm9, %xmm1
        movss   72 * SIZE(BO), %xmm9
        mulss   %xmm10, %xmm9
        addss   %xmm9, %xmm2
        movss   76 * SIZE(BO), %xmm9
        mulss   %xmm10, %xmm9
        movss   -27 * SIZE(AO), %xmm10
        addss   %xmm9, %xmm3
        movss   128 * SIZE(BO), %xmm9

        mulss   %xmm10, %xmm11
        addss   %xmm11, %xmm0
        movss   84 * SIZE(BO), %xmm11
        mulss   %xmm10, %xmm11
        addss   %xmm11, %xmm1
        movss   88 * SIZE(BO), %xmm11
        mulss   %xmm10, %xmm11
        addss   %xmm11, %xmm2
        movss   92 * SIZE(BO), %xmm11
        mulss   %xmm10, %xmm11
        movss   -26 * SIZE(AO), %xmm10
        addss   %xmm11, %xmm3
        movss   144 * SIZE(BO), %xmm11

        mulss   %xmm10, %xmm13
        addss   %xmm13, %xmm0
        movss   100 * SIZE(BO), %xmm13
        mulss   %xmm10, %xmm13
        addss   %xmm13, %xmm1
        movss   104 * SIZE(BO), %xmm13
        mulss   %xmm10, %xmm13
        addss   %xmm13, %xmm2
        movss   108 * SIZE(BO), %xmm13
        mulss   %xmm10, %xmm13
        movss   -25 * SIZE(AO), %xmm10
        addss   %xmm13, %xmm3
        movss   160 * SIZE(BO), %xmm13

        mulss   %xmm10, %xmm15
        addss   %xmm15, %xmm0
        movss   116 * SIZE(BO), %xmm15
        mulss   %xmm10, %xmm15
        addss   %xmm15, %xmm1
        movss   120 * SIZE(BO), %xmm15
        mulss   %xmm10, %xmm15
        addss   %xmm15, %xmm2
        movss   124 * SIZE(BO), %xmm15
        mulss   %xmm10, %xmm15
        movss   -20 * SIZE(AO), %xmm10  # preload A for next loop iteration
        addss   %xmm15, %xmm3
        movss   176 * SIZE(BO), %xmm15

        addq    $  8 * SIZE, AO         # 1 row * 8 k-steps
        addq    $128 * SIZE, BO         # 4 cols * 8 k-steps * 4 (splat)
        decq    %rax
        jne     .L42
        ALIGN_4

.L45:
        # k-remainder for the M&1, N=4 tile
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif
        movaps  ALPHA, %xmm15
        andq    $7, %rax                # remaining k = K & 7
        BRANCH
        je .L48
        ALIGN_4

.L46:
        # one k-step; only lane 0 of the packed ops is meaningful here,
        # since sources are movss-loaded and the result is stored with movss
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm0
        movss   4 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm1
        movss   8 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm2
        movss   12 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        movss   -31 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm3
        movss   16 * SIZE(BO), %xmm9

        addq    $ 1 * SIZE, AO          # aoffset += 1 (comment was stale: "+= 4")
        addq    $16 * SIZE, BO          # boffset += 16
        decq    %rax
        jg      .L46
        ALIGN_4

.L48:
        # alpha-scale 1x4 result, add C (unless TRMM), store one float per column
        mulss   %xmm15, %xmm0
        mulss   %xmm15, %xmm1
        mulss   %xmm15, %xmm2
        mulss   %xmm15, %xmm3

#ifndef TRMMKERNEL
        movss   0 * SIZE(CO1), %xmm8
        movss   0 * SIZE(CO2), %xmm10
        movss   0 * SIZE(CO1, LDC, 2), %xmm12
        movss   0 * SIZE(CO2, LDC, 2), %xmm14

        addss   %xmm8, %xmm0
        addss   %xmm10, %xmm1
        addss   %xmm12, %xmm2
        addss   %xmm14, %xmm3
#endif

        movss   %xmm0, 0 * SIZE(CO1)
        movss   %xmm1, 0 * SIZE(CO2)
        movss   %xmm2, 0 * SIZE(CO1, LDC, 2)
        movss   %xmm3, 0 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        leaq    (,%rax, 4), %rax
        leaq    (AO, %rax, 1), AO
        leaq    (BO, %rax, 8), BO
        leaq    (BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif
        ALIGN_4

.L49:
        # end of one N=4 strip: advance C by 4 columns, next j
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl    $4, KK                  # NOTE(review): 32-bit add on the 64-bit KK
                                        # stack slot (read with movq elsewhere);
                                        # works while KK's upper half is 0 — verify
#endif
        leaq    (C, LDC, 4), C          # c += 4 * ldc
        decq    J                       # j --
        jg      .L01

        # ======================= N & 2 section =======================
.L50:
        testq   $2, N
        je      .L100

.L51:
#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK
#endif

/* Copying to Sub Buffer */
        # splat each of 2 B values per k 4x into BUFFER (4 k-steps per pass)
        leaq    BUFFER, BO

        movq    K, %rax
        sarq    $2, %rax
        jle     .L53
        ALIGN_4

.L52:
        movaps  0 * SIZE(B), %xmm3
        movaps  4 * SIZE(B), %xmm7

        pshufd  $0x00, %xmm3, %xmm0
        pshufd  $0x55, %xmm3, %xmm1
        pshufd  $0xaa, %xmm3, %xmm2
        pshufd  $0xff, %xmm3, %xmm3

        pshufd  $0x00, %xmm7, %xmm4
        pshufd  $0x55, %xmm7, %xmm5
        pshufd  $0xaa, %xmm7, %xmm6
        pshufd  $0xff, %xmm7, %xmm7

        movaps  %xmm0,  0 * SIZE(BO)
        movaps  %xmm1,  4 * SIZE(BO)
        movaps  %xmm2,  8 * SIZE(BO)
        movaps  %xmm3, 12 * SIZE(BO)
        movaps  %xmm4, 16 * SIZE(BO)
        movaps  %xmm5, 20 * SIZE(BO)
        movaps  %xmm6, 24 * SIZE(BO)
        movaps  %xmm7, 28 * SIZE(BO)

        addq    $ 8 * SIZE, B
        addq    $32 * SIZE, BO

        decq    %rax
        jne     .L52
        ALIGN_4

.L53:
        movq    K, %rax
        andq    $3, %rax
        BRANCH
        jle     .L60
        ALIGN_4

.L54:
        # remainder: splat the 2 B values of one k-step
        movsd   0 * SIZE(B), %xmm3

        pshufd  $0x00, %xmm3, %xmm0
        pshufd  $0x55, %xmm3, %xmm1

        # NOTE(review): the four xmm7 shuffles below appear to be dead code —
        # xmm4..xmm7 are never stored in this loop (likely copy-paste from .L52)
        pshufd  $0x00, %xmm7, %xmm4
        pshufd  $0x55, %xmm7, %xmm5
        pshufd  $0xaa, %xmm7, %xmm6
        pshufd  $0xff, %xmm7, %xmm7

        movaps  %xmm0, 0 * SIZE(BO)
        movaps  %xmm1, 4 * SIZE(BO)

        addq    $ 2 * SIZE, B
        addq    $ 8 * SIZE, BO
        decq    %rax
        jne     .L54
        ALIGN_4

        # --------- N=2, M>=8 tiles (8x2), loop over i ---------
.L60:
        movq    C, CO1                  # coffset1 = c
        leaq    (C, LDC, 1), CO2        # coffset2 = c + ldc
        movq    A, AO                   # aoffset = a

        movq    M, I
        sarq    $3, I                   # i = (m >> 3)
        jle     .L70
        ALIGN_4

.L61:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leaq    BUFFER, BO
#else
        leaq    BUFFER, BO
        movq    KK, %rax
        leaq    (, %rax, 8), %rax
        leaq    (AO, %rax, 4), AO       # A: 8 rows per k
        leaq    (BO, %rax, 4), BO       # B: 2 cols * 4-way splat per k
#endif

        movaps  -32 * SIZE(AO), %xmm8
        movaps  -16 * SIZE(AO), %xmm10
        movaps    0 * SIZE(AO), %xmm12
        movaps   16 * SIZE(AO), %xmm14

        movaps   0 * SIZE(BO), %xmm9
        movaps  16 * SIZE(BO), %xmm11
        movaps  32 * SIZE(BO), %xmm13
        movaps  48 * SIZE(BO), %xmm15

        xorps   %xmm0, %xmm0            # accumulators: xmm0/xmm4 col 1,
        xorps   %xmm1, %xmm1            # xmm1/xmm5 col 2 (low/high 4 rows)

        prefetchw       4 * SIZE(CO1)
        xorps   %xmm4, %xmm4
        prefetchw       4 * SIZE(CO2)
        xorps   %xmm5, %xmm5

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $8, %rax
#else
        addq    $2, %rax
#endif
        movq    %rax, KKK
#endif
        sarq    $3, %rax                # unrolled by 8 k-steps
        je      .L65
        ALIGN_4

.L62:
        # 8 k-steps of the 8x2 tile, software-pipelined loads
        mulps   %xmm8, %xmm9
        mulps   4 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm0
        movaps  0 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm1
        movaps  -28 * SIZE(AO), %xmm8
        mulps   %xmm8, %xmm9
        mulps   4 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm4
        movaps  8 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm5
        movaps  -24 * SIZE(AO), %xmm8

        mulps   %xmm8, %xmm9
        mulps   12 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm0
        movaps  8 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm1
        movaps  -20 * SIZE(AO), %xmm8
        mulps   %xmm8, %xmm9
        mulps   12 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm4
        movaps  64 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm5
        movaps  32 * SIZE(AO), %xmm8

        mulps   %xmm10, %xmm11
        mulps   20 * SIZE(BO), %xmm10
        addps   %xmm11, %xmm0
        movaps  16 * SIZE(BO), %xmm11
        addps   %xmm10, %xmm1
        movaps  -12 * SIZE(AO), %xmm10
        mulps   %xmm10, %xmm11
        mulps   20 * SIZE(BO), %xmm10
        addps   %xmm11, %xmm4
        movaps  24 * SIZE(BO), %xmm11
        addps   %xmm10, %xmm5
        movaps  -8 * SIZE(AO), %xmm10

        mulps   %xmm10, %xmm11
        mulps   28 * SIZE(BO), %xmm10
        addps   %xmm11, %xmm0
        movaps  24 * SIZE(BO), %xmm11
        addps   %xmm10, %xmm1
        movaps  -4 * SIZE(AO), %xmm10
        mulps   %xmm10, %xmm11
        mulps   28 * SIZE(BO), %xmm10
        addps   %xmm11, %xmm4
        movaps  80 * SIZE(BO), %xmm11
        addps   %xmm10, %xmm5
        movaps  48 * SIZE(AO), %xmm10

        mulps   %xmm12, %xmm13
        mulps   36 * SIZE(BO), %xmm12
        addps   %xmm13, %xmm0
        movaps  32 * SIZE(BO), %xmm13
        addps   %xmm12, %xmm1
        movaps  4 * SIZE(AO), %xmm12
        mulps   %xmm12, %xmm13
        mulps   36 * SIZE(BO), %xmm12
        addps   %xmm13, %xmm4
        movaps  40 * SIZE(BO), %xmm13
        addps   %xmm12, %xmm5
        movaps  8 * SIZE(AO), %xmm12

        mulps   %xmm12, %xmm13
        mulps   44 * SIZE(BO), %xmm12
        addps   %xmm13, %xmm0
        movaps  40 * SIZE(BO), %xmm13
        addps   %xmm12, %xmm1
        movaps  12 * SIZE(AO), %xmm12
        mulps   %xmm12, %xmm13
        mulps   44 * SIZE(BO), %xmm12
        addps   %xmm13, %xmm4
        movaps  96 * SIZE(BO), %xmm13
        addps   %xmm12, %xmm5
        movaps  64 * SIZE(AO), %xmm12

        mulps   %xmm14, %xmm15
        mulps   52 * SIZE(BO), %xmm14
        addps   %xmm15, %xmm0
        movaps  48 * SIZE(BO), %xmm15
        addps   %xmm14, %xmm1
        movaps  20 * SIZE(AO), %xmm14
        mulps   %xmm14, %xmm15
        mulps   52 * SIZE(BO), %xmm14
        addps   %xmm15, %xmm4
        movaps  56 * SIZE(BO), %xmm15
        addps   %xmm14, %xmm5
        movaps  24 * SIZE(AO), %xmm14

        mulps   %xmm14, %xmm15
        mulps   60 * SIZE(BO), %xmm14
        addps   %xmm15, %xmm0
        movaps  56 * SIZE(BO), %xmm15
        addps   %xmm14, %xmm1
        movaps  28 * SIZE(AO), %xmm14
        mulps   %xmm14, %xmm15
        mulps   60 * SIZE(BO), %xmm14
        addps   %xmm15, %xmm4
        movaps  112 * SIZE(BO), %xmm15
        addps   %xmm14, %xmm5
        movaps  80 * SIZE(AO), %xmm14

        addq    $64 * SIZE, AO          # 8 rows * 8 k-steps
        addq    $64 * SIZE, BO          # 2 cols * 8 k-steps * 4 (splat)
        decq    %rax
        jne     .L62
        ALIGN_4

.L65:
        # k-remainder for 8x2 tile
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif
        movaps  ALPHA, %xmm15
        andq    $7, %rax                # remaining k = K & 7
        BRANCH
        je .L68
        ALIGN_4

.L66:
        mulps   %xmm8, %xmm9
        mulps   4 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm0
        movaps  0 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm1
        movaps  -28 * SIZE(AO), %xmm8
        mulps   %xmm8, %xmm9
        mulps   4 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm4
        movaps  8 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm5
        movaps  -24 * SIZE(AO), %xmm8

        addq    $8 * SIZE, AO           # aoffset += 8
        addq    $8 * SIZE, BO           # boffset += 8
        decq    %rax
        jg      .L66
        ALIGN_4

.L68:
        # alpha-scale 8x2 tile, add unaligned C rows (unless TRMM), store
#ifndef TRMMKERNEL
        movsd   0 * SIZE(CO1), %xmm8
        movhps  2 * SIZE(CO1), %xmm8
        movsd   4 * SIZE(CO1), %xmm9
        movhps  6 * SIZE(CO1), %xmm9

        movsd   0 * SIZE(CO2), %xmm10
        movhps  2 * SIZE(CO2), %xmm10
        movsd   4 * SIZE(CO2), %xmm11
        movhps  6 * SIZE(CO2), %xmm11
#endif

        mulps   %xmm15, %xmm0
        mulps   %xmm15, %xmm4
        mulps   %xmm15, %xmm1
        mulps   %xmm15, %xmm5

#ifndef TRMMKERNEL
        addps   %xmm8, %xmm0
        addps   %xmm9, %xmm4
        addps   %xmm10, %xmm1
        addps   %xmm11, %xmm5
#endif

        vmovups %xmm0, 0 * SIZE(CO1)
        vmovups %xmm4, 4 * SIZE(CO1)

        vmovups %xmm1, 0 * SIZE(CO2)
        vmovups %xmm5, 4 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        leaq    (,%rax, 8), %rax
        leaq    (AO, %rax, 4), AO
        leaq    (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $8, KK
#endif

        addq    $8 * SIZE, CO1          # coffset += 8
        addq    $8 * SIZE, CO2
        decq    I                       # i --
        jg      .L61
        ALIGN_4

        # --------- N=2, M&4 tile (4x2) ---------
.L70:
        testq   $4, M
        je      .L80

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leaq    BUFFER, BO
#else
        leaq    BUFFER, BO
        movq    KK, %rax
        leaq    (, %rax, 8), %rax
        leaq    (AO, %rax, 2), AO       # A: 4 rows per k
        leaq    (BO, %rax, 4), BO
#endif

        movaps  -32 * SIZE(AO), %xmm8
        movaps  -16 * SIZE(AO), %xmm10

        movaps   0 * SIZE(BO), %xmm9
        movaps  16 * SIZE(BO), %xmm11
        movaps  32 * SIZE(BO), %xmm13
        movaps  48 * SIZE(BO), %xmm15

        xorps   %xmm0, %xmm0            # xmm0/xmm2 col 1, xmm1/xmm3 col 2
        xorps   %xmm1, %xmm1            # (pairs merged at .L78)
        xorps   %xmm2, %xmm2
        xorps   %xmm3, %xmm3

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $4, %rax
#else
        addq    $2, %rax
#endif
        movq    %rax, KKK
#endif
        sarq    $3, %rax
        je      .L75
        ALIGN_4

.L72:
        # 8 k-steps of the 4x2 tile
        mulps   %xmm8, %xmm9

        mulps   4 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm0
        movaps  8 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm1
        movaps  -28 * SIZE(AO), %xmm8

        mulps   %xmm8, %xmm9
        mulps   12 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm2
        movaps  64 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm3
        movaps  -24 * SIZE(AO), %xmm8

        mulps   %xmm8, %xmm11
        mulps   20 * SIZE(BO), %xmm8
        addps   %xmm11, %xmm0
        movaps  24 * SIZE(BO), %xmm11
        addps   %xmm8, %xmm1
        movaps  -20 * SIZE(AO), %xmm8

        mulps   %xmm8, %xmm11
        mulps   28 * SIZE(BO), %xmm8
        addps   %xmm11, %xmm2
        movaps  80 * SIZE(BO), %xmm11
        addps   %xmm8, %xmm3
        movaps  0 * SIZE(AO), %xmm8

        mulps   %xmm10, %xmm13
        mulps   36 * SIZE(BO), %xmm10
        addps   %xmm13, %xmm0
        movaps  40 * SIZE(BO), %xmm13
        addps   %xmm10, %xmm1
        movaps  -12 * SIZE(AO), %xmm10

        mulps   %xmm10, %xmm13
        mulps   44 * SIZE(BO), %xmm10
        addps   %xmm13, %xmm2
        movaps  96 * SIZE(BO), %xmm13
        addps   %xmm10, %xmm3
        movaps  -8 * SIZE(AO), %xmm10

        mulps   %xmm10, %xmm15
        mulps   52 * SIZE(BO), %xmm10
        addps   %xmm15, %xmm0
        movaps  56 * SIZE(BO), %xmm15
        addps   %xmm10, %xmm1
        movaps  -4 * SIZE(AO), %xmm10

        mulps   %xmm10, %xmm15
        mulps   60 * SIZE(BO), %xmm10
        addps   %xmm15, %xmm2
        movaps  112 * SIZE(BO), %xmm15
        addps   %xmm10, %xmm3
        movaps  16 * SIZE(AO), %xmm10

        addq    $32 * SIZE, AO          # 4 rows * 8 k-steps
        addq    $64 * SIZE, BO
        decq    %rax
        jne     .L72
        ALIGN_4

.L75:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif
        movaps  ALPHA, %xmm15
        andq    $7, %rax                # remaining k = K & 7
        BRANCH
        je .L78
        ALIGN_4

.L76:
        mulps   %xmm8, %xmm9
        mulps   4 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm0
        movaps  8 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm1
        movaps  -28 * SIZE(AO), %xmm8

        addq    $4 * SIZE, AO           # aoffset += 4
        addq    $8 * SIZE, BO           # boffset += 8
        decq    %rax
        jg      .L76
        ALIGN_4

.L78:
        # merge dual accumulators, alpha-scale, add C, store 4x2 tile
#ifndef TRMMKERNEL
        movsd   0 * SIZE(CO1), %xmm8
        movhps  2 * SIZE(CO1), %xmm8
        movsd   0 * SIZE(CO2), %xmm10
        movhps  2 * SIZE(CO2), %xmm10
#endif

        addps   %xmm2, %xmm0
        addps   %xmm3, %xmm1

        mulps   %xmm15, %xmm0
        mulps   %xmm15, %xmm1

#ifndef TRMMKERNEL
        addps   %xmm8, %xmm0
        addps   %xmm10, %xmm1
#endif

        vmovups %xmm0, 0 * SIZE(CO1)
        vmovups %xmm1, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        leaq    (,%rax, 8), %rax
        leaq    (AO, %rax, 2), AO
        leaq    (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $4, KK
#endif

        addq    $4 * SIZE, CO1          # coffset += 4
        addq    $4 * SIZE, CO2
        ALIGN_4

        # --------- N=2, M&2 tile (2x2) ---------
.L80:
        testq   $2, M
        je      .L90

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leaq    BUFFER, BO
#else
        leaq    BUFFER, BO
        movq    KK, %rax
        leaq    (, %rax, 8), %rax
        leaq    (AO, %rax, 1), AO       # A: 2 rows per k
        leaq    (BO, %rax, 4), BO
#endif

        movaps  -32 * SIZE(AO), %xmm8
        movaps  -24 * SIZE(AO), %xmm10

        movaps   0 * SIZE(BO), %xmm9
        movaps  16 * SIZE(BO), %xmm11
        movaps  32 * SIZE(BO), %xmm13
        movaps  48 * SIZE(BO), %xmm15

        xorps   %xmm0, %xmm0
        xorps   %xmm1, %xmm1
        xorps   %xmm2, %xmm2
        xorps   %xmm3, %xmm3

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax                # both branches add 2 here (M width == N width)
#else
        addq    $2, %rax
#endif
        movq    %rax, KKK
#endif
        sarq    $3, %rax
        je      .L85
        ALIGN_4

.L82:
        # 8 k-steps of the 2x2 tile (movsd = 2 floats per load)
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm0
        movsd   4 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        movsd   -30 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm1
        movsd   8 * SIZE(BO), %xmm9

        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm2
        movsd   12 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        movsd   -28 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm3
        movsd   64 * SIZE(BO), %xmm9

        mulps   %xmm8, %xmm11
        addps   %xmm11, %xmm0
        movsd   20 * SIZE(BO), %xmm11
        mulps   %xmm8, %xmm11
        movsd   -26 * SIZE(AO), %xmm8
        addps   %xmm11, %xmm1
        movsd   24 * SIZE(BO), %xmm11

        mulps   %xmm8, %xmm11
        addps   %xmm11, %xmm2
        movsd   28 * SIZE(BO), %xmm11
        mulps   %xmm8, %xmm11
        movsd   -16 * SIZE(AO), %xmm8
        addps   %xmm11, %xmm3
        movsd   80 * SIZE(BO), %xmm11

        mulps   %xmm10, %xmm13
        addps   %xmm13, %xmm0
        movsd   36 * SIZE(BO), %xmm13
        mulps   %xmm10, %xmm13
        movsd   -22 * SIZE(AO), %xmm10
        addps   %xmm13, %xmm1
        movsd   40 * SIZE(BO), %xmm13

        mulps   %xmm10, %xmm13
        addps   %xmm13, %xmm2
        movsd   44 * SIZE(BO), %xmm13
        mulps   %xmm10, %xmm13
        movsd   -20 * SIZE(AO), %xmm10
        addps   %xmm13, %xmm3
        movsd   96 * SIZE(BO), %xmm13

        mulps   %xmm10, %xmm15
        addps   %xmm15, %xmm0
        movsd   52 * SIZE(BO), %xmm15
        mulps   %xmm10, %xmm15
        movsd   -18 * SIZE(AO), %xmm10
        addps   %xmm15, %xmm1
        movsd   56 * SIZE(BO), %xmm15

        mulps   %xmm10, %xmm15
        addps   %xmm15, %xmm2
        movsd   60 * SIZE(BO), %xmm15
        mulps   %xmm10, %xmm15
        movsd   -8 * SIZE(AO), %xmm10
        addps   %xmm15, %xmm3
        movsd   112 * SIZE(BO), %xmm15

        addq    $16 * SIZE, AO          # 2 rows * 8 k-steps
        addq    $64 * SIZE, BO
        decq    %rax
        jne     .L82
        ALIGN_4

.L85:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif
        movaps  ALPHA, %xmm15
        andq    $7, %rax                # remaining k = K & 7
        BRANCH
        je .L88
        ALIGN_4

.L86:
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm0
        movsd   4 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        movsd   -30 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm1
        movsd   8 * SIZE(BO), %xmm9

        addq    $2 * SIZE, AO           # aoffset += 2 (comment was stale: "+= 4")
        addq    $8 * SIZE, BO           # boffset += 8
        decq    %rax
        jg      .L86
        ALIGN_4

.L88:
        # merge, alpha-scale, add C, store 2x2 tile
#ifndef TRMMKERNEL
        movsd   0 * SIZE(CO1), %xmm8
        movsd   0 * SIZE(CO2), %xmm10
#endif

        addps   %xmm2, %xmm0
        addps   %xmm3, %xmm1

        mulps   %xmm15, %xmm0
        mulps   %xmm15, %xmm1

#ifndef TRMMKERNEL
        addps   %xmm8, %xmm0
        addps   %xmm10, %xmm1
#endif

        movsd   %xmm0, 0 * SIZE(CO1)
        movsd   %xmm1, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        leaq    (,%rax, 8), %rax
        leaq    (AO, %rax, 1), AO
        leaq    (BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

        addq    $2 * SIZE, CO1          # coffset += 2 (comment was stale: "+= 4")
        addq    $2 * SIZE, CO2
        ALIGN_4

        # --------- N=2, M&1 tile (1x2) ---------
.L90:
        testq   $1, M
        je      .L99

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leaq    BUFFER, BO
#else
        leaq    BUFFER, BO
        movq    KK, %rax
        leaq    (, %rax, 4), %rax
        leaq    (AO, %rax, 1), AO       # A: 1 row per k
        leaq    (BO, %rax, 8), BO
#endif

        movss   -32 * SIZE(AO), %xmm8
        movss   -28 * SIZE(AO), %xmm10

        movss    0 * SIZE(BO), %xmm9
        movss   16 * SIZE(BO), %xmm11
        movss   32 * SIZE(BO), %xmm13
        movss   48 * SIZE(BO), %xmm15

        xorps   %xmm0, %xmm0
        xorps   %xmm1, %xmm1
        xorps   %xmm2, %xmm2
        xorps   %xmm3, %xmm3

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax
#else
        addq    $2, %rax
#endif
        movq    %rax, KKK
#endif
        sarq    $3, %rax
        je      .L95
        ALIGN_4

.L92:
        # 8 scalar k-steps of the 1x2 tile (packed ops, only lane 0 used)
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm0
        movss   4 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        movss   -31 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm1
        movss   8 * SIZE(BO), %xmm9

        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm2
        movss   12 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        movss   -30 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm3
        movss   64 * SIZE(BO), %xmm9

        mulps   %xmm8, %xmm11
        addps   %xmm11, %xmm0
        movss   20 * SIZE(BO), %xmm11
        mulps   %xmm8, %xmm11
        movss   -29 * SIZE(AO), %xmm8
        addps   %xmm11, %xmm1
        movss   24 * SIZE(BO), %xmm11

        mulps   %xmm8, %xmm11
        addps   %xmm11, %xmm2
        movss   28 * SIZE(BO), %xmm11
        mulps   %xmm8, %xmm11
        movss   -24 * SIZE(AO), %xmm8
        addps   %xmm11, %xmm3
        movss   80 * SIZE(BO), %xmm11

        mulps   %xmm10, %xmm13
        addps   %xmm13, %xmm0
        movss   36 * SIZE(BO), %xmm13
        mulps   %xmm10, %xmm13
        movss   -27 * SIZE(AO), %xmm10
        addps   %xmm13, %xmm1
        movss   40 * SIZE(BO), %xmm13

        mulps   %xmm10, %xmm13
        addps   %xmm13, %xmm2
        movss   44 * SIZE(BO), %xmm13
        mulps   %xmm10, %xmm13
        movss   -26 * SIZE(AO), %xmm10
        addps   %xmm13, %xmm3
        movss   96 * SIZE(BO), %xmm13

        mulps   %xmm10, %xmm15
        addps   %xmm15, %xmm0
        movss   52 * SIZE(BO), %xmm15
        mulps   %xmm10, %xmm15
        movss   -25 * SIZE(AO), %xmm10
        addps   %xmm15, %xmm1
        movss   56 * SIZE(BO), %xmm15

        mulps   %xmm10, %xmm15
        addps   %xmm15, %xmm2
        movss   60 * SIZE(BO), %xmm15
        mulps   %xmm10, %xmm15
        movss   -20 * SIZE(AO), %xmm10
        addps   %xmm15, %xmm3
        movss   112 * SIZE(BO), %xmm15

        addq    $ 8 * SIZE, AO          # 1 row * 8 k-steps
        addq    $64 * SIZE, BO
        decq    %rax
        jne     .L92
        ALIGN_4

.L95:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif
        movaps  ALPHA, %xmm15
        andq    $7, %rax                # remaining k = K & 7
        BRANCH
        je .L98
        ALIGN_4

.L96:
        mulps   %xmm8, %xmm9
        addps   %xmm9, %xmm0
        movss   4 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        movss   -31 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm1
        movss   8 * SIZE(BO), %xmm9

        addq    $1 * SIZE, AO           # aoffset += 1 (comment was stale: "+= 4")
        addq    $8 * SIZE, BO           # boffset += 8
        decq    %rax
        jg      .L96
        ALIGN_4

.L98:
        # merge, alpha-scale, add C, store the 1x2 result
#ifndef TRMMKERNEL
        movss   0 * SIZE(CO1), %xmm8
        movss   0 * SIZE(CO2), %xmm10
#endif

        addss   %xmm2, %xmm0
        addss   %xmm3, %xmm1
        mulss   %xmm15, %xmm0
        mulss   %xmm15, %xmm1

#ifndef TRMMKERNEL
        addss   %xmm8, %xmm0
        addss   %xmm10, %xmm1
#endif

        movss   %xmm0, 0 * SIZE(CO1)
        movss   %xmm1, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        leaq    (,%rax, 4), %rax
        leaq    (AO, %rax, 1), AO
        leaq    (BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif
        ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addl    $2, KK                  # NOTE(review): 32-bit add on 64-bit KK slot — see .L49
#endif
        leaq    (C, LDC, 2), C          # c += 2 * ldc (comment was stale: "4 * ldc")
        ALIGN_4

        # ======================= N & 1 section =======================
.L100:
        testq   $1, N
        je      .L999

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK
#endif

/* Copying to Sub Buffer */
        # splat the single B value per k 4x into BUFFER (8 k-steps per pass)
        leaq    BUFFER, BO

        movq    K, %rax
        sarq    $3, %rax
        jle     .L103
        ALIGN_4

.L102:
        movups  0 * SIZE(B), %xmm3
        movups  4 * SIZE(B), %xmm7

        pshufd  $0x00, %xmm3, %xmm0
        pshufd  $0x55, %xmm3, %xmm1
        pshufd  $0xaa, %xmm3, %xmm2
        pshufd  $0xff, %xmm3, %xmm3

        pshufd  $0x00, %xmm7, %xmm4
        pshufd  $0x55, %xmm7, %xmm5
        pshufd  $0xaa, %xmm7, %xmm6
        pshufd  $0xff, %xmm7, %xmm7

        movaps  %xmm0,  0 * SIZE(BO)
        movaps  %xmm1,  4 * SIZE(BO)
        movaps  %xmm2,  8 * SIZE(BO)
        movaps  %xmm3, 12 * SIZE(BO)
        movaps  %xmm4, 16 * SIZE(BO)
        movaps  %xmm5, 20 * SIZE(BO)
        movaps  %xmm6, 24 * SIZE(BO)
        movaps  %xmm7, 28 * SIZE(BO)

        addq    $ 8 * SIZE, B
        addq    $32 * SIZE, BO

        decq    %rax
        jne     .L102
        ALIGN_4

.L103:
        movq    K, %rax
        andq    $7, %rax
        BRANCH
        jle     .L110
        ALIGN_4

.L104:
        movss   0 * SIZE(B), %xmm3

        pshufd  $0x00, %xmm3, %xmm0

        movaps  %xmm0, 0 * SIZE(BO)

        addq    $ 1 * SIZE, B
        addq    $ 4 * SIZE, BO
        decq    %rax
        jne     .L104
        ALIGN_4

        # --------- N=1, M>=8 tiles (8x1), loop over i ---------
.L110:
        movq    C, CO1                  # coffset1 = c
        movq    A, AO                   # aoffset = a

        movq    M, I
        sarq    $3, I                   # i = (m >> 3)
        jle     .L120
        ALIGN_4

.L111:
#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leaq    BUFFER, BO
#else
        leaq    BUFFER, BO
        movq    KK, %rax
        leaq    (, %rax, 8), %rax
        leaq    (AO, %rax, 4), AO       # A: 8 rows per k
        leaq    (BO, %rax, 2), BO       # B: 1 col * 4-way splat per k
#endif

        movaps  -32 * SIZE(AO), %xmm8
        movaps  -16 * SIZE(AO), %xmm10
        movaps    0 * SIZE(AO), %xmm12
        movaps   16 * SIZE(AO), %xmm14

        movaps   0 * SIZE(BO), %xmm9
        movaps  16 * SIZE(BO), %xmm11
        movaps  32 * SIZE(BO), %xmm13
        movaps  48 * SIZE(BO), %xmm15

        xorps   %xmm0, %xmm0            # xmm0 = rows 0-3, xmm4 = rows 4-7
        xorps   %xmm1, %xmm1

        prefetchw       4 * SIZE(CO1)
        xorps   %xmm4, %xmm4
        xorps   %xmm5, %xmm5

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $8, %rax
#else
        addq    $1, %rax
#endif
        movq    %rax, KKK
#endif
        sarq    $3, %rax
        je      .L115
        ALIGN_4

.L112:
        # 8 k-steps of the 8x1 tile
        mulps   %xmm9, %xmm8

        mulps   -28 * SIZE(AO), %xmm9
        addps   %xmm8, %xmm0
        movaps  -24 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm4
        movaps  4 * SIZE(BO), %xmm9

        mulps   %xmm9, %xmm8
        mulps   -20 * SIZE(AO), %xmm9
        addps   %xmm8, %xmm0
        movaps  32 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm4
        movaps  8 * SIZE(BO), %xmm9

        mulps   %xmm9, %xmm10
        mulps   -12 * SIZE(AO), %xmm9
        addps   %xmm10, %xmm0
        movaps  -8 * SIZE(AO), %xmm10
        addps   %xmm9, %xmm4
        movaps  12 * SIZE(BO), %xmm9

        mulps   %xmm9, %xmm10
        mulps   -4 * SIZE(AO), %xmm9
        addps   %xmm10, %xmm0
        movaps  48 * SIZE(AO), %xmm10
        addps   %xmm9, %xmm4
        movaps  32 * SIZE(BO), %xmm9

        mulps   %xmm11, %xmm12
        mulps   4 * SIZE(AO), %xmm11
        addps   %xmm12, %xmm0
        movaps  8 * SIZE(AO), %xmm12
        addps   %xmm11, %xmm4
        movaps  20 * SIZE(BO), %xmm11

        mulps   %xmm11, %xmm12
        mulps   12 * SIZE(AO), %xmm11
        addps   %xmm12, %xmm0
        movaps  64 * SIZE(AO), %xmm12
        addps   %xmm11, %xmm4
        movaps  24 * SIZE(BO), %xmm11

        mulps   %xmm11, %xmm14
        mulps   20 * SIZE(AO), %xmm11
        addps   %xmm14, %xmm0
        movaps  24 * SIZE(AO), %xmm14
        addps   %xmm11, %xmm4
        movaps  28 * SIZE(BO), %xmm11

        mulps   %xmm11, %xmm14
        mulps   28 * SIZE(AO), %xmm11
        addps   %xmm14, %xmm0
        movaps  80 * SIZE(AO), %xmm14
        addps   %xmm11, %xmm4
        movaps  48 * SIZE(BO), %xmm11

        addq    $64 * SIZE, AO          # 8 rows * 8 k-steps
        addq    $32 * SIZE, BO          # 1 col * 8 k-steps * 4 (splat)
        decq    %rax
        jne     .L112
        ALIGN_4

.L115:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif
        movaps  ALPHA, %xmm15
        andq    $7, %rax                # remaining k = K & 7
        BRANCH
        je .L118
        ALIGN_4

.L116:
        mulps   %xmm9, %xmm8
        mulps   -28 * SIZE(AO), %xmm9
        addps   %xmm8, %xmm0
        movaps  -24 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm4
        movaps  4 * SIZE(BO), %xmm9

        addq    $8 * SIZE, AO           # aoffset += 8 (comment was stale: "+= 4")
        addq    $4 * SIZE, BO           # boffset += 4
        decq    %rax
        jg      .L116
        ALIGN_4

.L118:
        # alpha-scale 8x1 column, add C (unless TRMM), store
#ifndef TRMMKERNEL
        movsd   0 * SIZE(CO1), %xmm8
        movhps  2 * SIZE(CO1), %xmm8
        movsd   4 * SIZE(CO1), %xmm9
        movhps  6 * SIZE(CO1), %xmm9
#endif

        mulps   %xmm15, %xmm0
        mulps   %xmm15, %xmm4
#ifndef TRMMKERNEL
        addps   %xmm8, %xmm0
        addps   %xmm9, %xmm4
#endif

        vmovups %xmm0, 0 * SIZE(CO1)
        vmovups %xmm4, 4 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        leaq    (,%rax, 8), %rax
        leaq    (AO, %rax, 4), AO
        leaq    (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $8, KK
#endif

        addq    $8 * SIZE, CO1          # coffset += 8
        decq    I                       # i --
        jg      .L111
        ALIGN_4

        # --------- N=1, M&4 tile (4x1) ---------
.L120:
        testq   $4, M
        je      .L130

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leaq    BUFFER, BO
#else
        leaq    BUFFER, BO
        movq    KK, %rax
        leaq    (, %rax, 8), %rax
        leaq    (AO, %rax, 2), AO       # A: 4 rows per k
        leaq    (BO, %rax, 2), BO
#endif

        movaps  -32 * SIZE(AO), %xmm8
        movaps  -16 * SIZE(AO), %xmm10

        movaps   0 * SIZE(BO), %xmm9
        movaps  16 * SIZE(BO), %xmm11

        xorps   %xmm0, %xmm0            # 4 accumulators, summed at .L128
        xorps   %xmm1, %xmm1
        xorps   %xmm2, %xmm2
        xorps   %xmm3, %xmm3

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $4, %rax
#else
        addq    $1, %rax
#endif
        movq    %rax, KKK
#endif
        sarq    $3, %rax
        je      .L125
        ALIGN_4

.L122:
        # 8 k-steps of the 4x1 tile, 4-way accumulator rotation
        mulps   %xmm8, %xmm9
        movaps  -28 * SIZE(AO), %xmm8
        mulps   4 * SIZE(BO), %xmm8
        addps   %xmm9, %xmm0
        movaps  32 * SIZE(BO), %xmm9
        addps   %xmm8, %xmm1
        movaps  -24 * SIZE(AO), %xmm8
        mulps   8 * SIZE(BO), %xmm8
        addps   %xmm8, %xmm2
        movaps  -20 * SIZE(AO), %xmm8
        mulps   12 * SIZE(BO), %xmm8
        addps   %xmm8, %xmm3
        movaps  0 * SIZE(AO), %xmm8

        mulps   %xmm10, %xmm11
        movaps  -12 * SIZE(AO), %xmm10
        mulps   20 * SIZE(BO), %xmm10
        addps   %xmm11, %xmm0
        movaps  48 * SIZE(BO), %xmm11
        addps   %xmm10, %xmm1
        movaps  -8 * SIZE(AO), %xmm10
        mulps   24 * SIZE(BO), %xmm10
        addps   %xmm10, %xmm2
        movaps  -4 * SIZE(AO), %xmm10
        mulps   28 * SIZE(BO), %xmm10
        addps   %xmm10, %xmm3
        movaps  16 * SIZE(AO), %xmm10

        addq    $32 * SIZE, AO          # 4 rows * 8 k-steps
        addq    $32 * SIZE, BO
        decq    %rax
        jne     .L122
        ALIGN_4

.L125:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif
        movaps  ALPHA, %xmm15
        andq    $7, %rax                # remaining k = K & 7
        BRANCH
        je .L128
        ALIGN_4

.L126:
        mulps   %xmm8, %xmm9
        movaps  -28 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm0
        movaps  4 * SIZE(BO), %xmm9

        addq    $4 * SIZE, AO           # aoffset += 4
        addq    $4 * SIZE, BO           # boffset += 4
        decq    %rax
        jg      .L126
        ALIGN_4

.L128:
        # reduce 4 accumulators, alpha-scale, add C, store 4x1 column
#ifndef TRMMKERNEL
        movsd   0 * SIZE(CO1), %xmm8
        movhps  2 * SIZE(CO1), %xmm8
#endif

        addps   %xmm1, %xmm0
        addps   %xmm3, %xmm2
        addps   %xmm2, %xmm0

        mulps   %xmm15, %xmm0
#ifndef TRMMKERNEL
        addps   %xmm8, %xmm0
#endif

        vmovups %xmm0, 0 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        leaq    (,%rax, 8), %rax
        leaq    (AO, %rax, 2), AO
        leaq    (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $4, KK
#endif

        addq    $4 * SIZE, CO1          # coffset += 4
        ALIGN_4

        # --------- N=1, M&2 tile (2x1) ---------
.L130:
        testq   $2, M
        je      .L140

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leaq    BUFFER, BO
#else
        leaq    BUFFER, BO
        movq    KK, %rax
        leaq    (, %rax, 8), %rax
        leaq    (AO, %rax, 1), AO       # A: 2 rows per k
        leaq    (BO, %rax, 2), BO
#endif

        movaps  -32 * SIZE(AO), %xmm8
        movaps  -24 * SIZE(AO), %xmm10

        movaps   0 * SIZE(BO), %xmm9
        movaps  16 * SIZE(BO), %xmm11

        xorps   %xmm0, %xmm0
        xorps   %xmm1, %xmm1
        xorps   %xmm2, %xmm2
        xorps   %xmm3, %xmm3

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax
#else
        addq    $1, %rax
#endif
        movq    %rax, KKK
#endif
        sarq    $3, %rax
        je      .L135
        ALIGN_4

.L132:
        # 8 k-steps of the 2x1 tile (movsd = 2 floats per load)
        mulps   %xmm8, %xmm9
        movsd   -30 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm0
        movsd   4 * SIZE(BO), %xmm9
        mulps   %xmm8, %xmm9
        movsd   -28 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm1
        movsd   8 * SIZE(BO), %xmm9

        mulps   %xmm8, %xmm9
        movsd   -26 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm0
        movsd   12 * SIZE(BO), %xmm9

        mulps   %xmm8, %xmm9
        movsd   -16 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm1
        movsd   32 * SIZE(BO), %xmm9

        mulps   %xmm10, %xmm11
        movsd   -22 * SIZE(AO), %xmm10
        addps   %xmm11, %xmm0
        movsd   20 * SIZE(BO), %xmm11

        mulps   %xmm10, %xmm11
        movsd   -20 * SIZE(AO), %xmm10
        addps   %xmm11, %xmm1
        movsd   24 * SIZE(BO), %xmm11

        mulps   %xmm10, %xmm11
        movsd   -18 * SIZE(AO), %xmm10
        addps   %xmm11, %xmm0
        movsd   28 * SIZE(BO), %xmm11

        mulps   %xmm10, %xmm11
        movsd   -8 * SIZE(AO), %xmm10
        addps   %xmm11, %xmm1
        movsd   48 * SIZE(BO), %xmm11

        addq    $16 * SIZE, AO          # 2 rows * 8 k-steps
        addq    $32 * SIZE, BO
        decq    %rax
        jne     .L132
        ALIGN_4

.L135:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif
        movaps  ALPHA, %xmm15
        andq    $7, %rax                # remaining k = K & 7
        BRANCH
        je .L138
        ALIGN_4

.L136:
        mulps   %xmm8, %xmm9
        movsd   -30 * SIZE(AO), %xmm8
        addps   %xmm9, %xmm0
        movsd   4 * SIZE(BO), %xmm9

        addq    $2 * SIZE, AO           # aoffset += 2 (comment was stale: "+= 4")
        addq    $4 * SIZE, BO           # boffset += 4
        decq    %rax
        jg      .L136
        ALIGN_4

.L138:
        # merge, alpha-scale, add C, store the 2x1 result
        addps   %xmm1, %xmm0
        mulps   %xmm15, %xmm0

#ifndef TRMMKERNEL
        movsd   0 * SIZE(CO1), %xmm8
        addps   %xmm8, %xmm0
#endif

        movsd   %xmm0, 0 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        leaq    (,%rax, 8), %rax
        leaq    (AO, %rax, 1), AO
        leaq    (BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

        addq    $2 * SIZE, CO1          # coffset += 2 (comment was stale: "+= 4")
        ALIGN_4

        # --------- N=1, M&1 tile (1x1) ---------
.L140:
        testq   $1, M
        je      .L999

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

        leaq    BUFFER, BO
#else
        leaq    BUFFER, BO
        movq    KK, %rax
        leaq    (, %rax, 4), %rax
        leaq    (AO, %rax, 1), AO       # A: 1 row per k
        leaq    (BO, %rax, 4), BO
#endif

        movss   -32 * SIZE(AO), %xmm8
        movss   -28 * SIZE(AO), %xmm10

        movss    0 * SIZE(BO), %xmm9
        movss   16 * SIZE(BO), %xmm11

        xorps   %xmm0, %xmm0
        xorps   %xmm1, %xmm1
        xorps   %xmm2, %xmm2
        xorps   %xmm3, %xmm3

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax                # both branches add 1 (1x1 tile)
#else
        addq    $1, %rax
#endif
        movq    %rax, KKK
#endif
        sarq    $3, %rax
        je      .L145
        ALIGN_4

.L142:
        # 8 scalar k-steps of the 1x1 dot-product
        mulss   %xmm8, %xmm9
        movss   -31 * SIZE(AO), %xmm8
        mulss   4 * SIZE(BO), %xmm8
        addss   %xmm9, %xmm0
        movss   32 * SIZE(BO), %xmm9
        addss   %xmm8, %xmm1
        movss   -30 * SIZE(AO), %xmm8
        mulss   8 * SIZE(BO), %xmm8
        addss   %xmm8, %xmm2
        movss   -29 * SIZE(AO), %xmm8
        mulss   12 * SIZE(BO), %xmm8
        addss   %xmm8, %xmm3
        movss   -24 * SIZE(AO), %xmm8
        mulss   %xmm10, %xmm11
        movss   -27 * SIZE(AO), %xmm10
        mulss   20 * SIZE(BO), %xmm10
        addss   %xmm11, %xmm0
        movss   48 * SIZE(BO), %xmm11
        addss                           # (fragment) chunk ends mid-instruction; continues past this view
%xmm10, %xmm1 3008 movss -26 * SIZE(AO), %xmm10 3009 mulss 24 * SIZE(BO), %xmm10 3010 addss %xmm10, %xmm2 3011 movss -25 * SIZE(AO), %xmm10 3012 mulss 28 * SIZE(BO), %xmm10 3013 addss %xmm10, %xmm3 3014 movss -20 * SIZE(AO), %xmm10 3015 3016 addq $ 8 * SIZE, AO 3017 addq $32 * SIZE, BO 3018 decq %rax 3019 jne .L142 3020 ALIGN_4 3021 3022.L145: 3023#ifndef TRMMKERNEL 3024 movq K, %rax 3025#else 3026 movq KKK, %rax 3027#endif 3028 movss ALPHA, %xmm15 3029 andq $7, %rax # if (k & 1) 3030 BRANCH 3031 je .L148 3032 ALIGN_4 3033 3034.L146: 3035 mulss %xmm8, %xmm9 3036 movss -31 * SIZE(AO), %xmm8 3037 addss %xmm9, %xmm0 3038 movss 4 * SIZE(BO), %xmm9 3039 3040 addq $1 * SIZE, AO 3041 addq $4 * SIZE, BO 3042 decq %rax 3043 jg .L146 3044 ALIGN_4 3045 3046.L148: 3047 addss %xmm1, %xmm0 3048 addss %xmm3, %xmm2 3049 addss %xmm2, %xmm0 3050 3051 mulss %xmm15, %xmm0 3052 3053#ifndef TRMMKERNEL 3054 movss 0 * SIZE(CO1), %xmm8 3055 addss %xmm8, %xmm0 3056#endif 3057 movss %xmm0, 0 * SIZE(CO1) 3058 ALIGN_4 3059 3060.L999: 3061 movq %rbx, %rsp 3062 movq 0(%rsp), %rbx 3063 movq 8(%rsp), %rbp 3064 movq 16(%rsp), %r12 3065 movq 24(%rsp), %r13 3066 movq 32(%rsp), %r14 3067 movq 40(%rsp), %r15 3068 3069#ifdef WINDOWS_ABI 3070 movq 48(%rsp), %rdi 3071 movq 56(%rsp), %rsi 3072 movups 64(%rsp), %xmm6 3073 movups 80(%rsp), %xmm7 3074 movups 96(%rsp), %xmm8 3075 movups 112(%rsp), %xmm9 3076 movups 128(%rsp), %xmm10 3077 movups 144(%rsp), %xmm11 3078 movups 160(%rsp), %xmm12 3079 movups 176(%rsp), %xmm13 3080 movups 192(%rsp), %xmm14 3081 movups 208(%rsp), %xmm15 3082#endif 3083 3084 addq $STACKSIZE, %rsp 3085 ret 3086 3087 EPILOGUE 3088