1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

/*********************************************************************/
/* 4x4 double-precision GEMM/TRMM micro-kernel, x86-64, SSE2.        */
/*                                                                   */
/* Computes C = alpha * A * B (+ C unless TRMMKERNEL), tiling the    */
/* output in 4-column by 4/2/1-row panels.  B is first expanded into */
/* an on-stack BUFFER with every element duplicated into both lanes  */
/* of a quadword pair (via MMX movq), so the inner loops can use     */
/* mulpd as a broadcast multiply.  SIZE is the element size defined  */
/* by common.h (presumably 8 for double -- confirm there).           */
/*                                                                   */
/* Register roles after argument shuffling:                          */
/*   M=%r13  N=%r14  K=%rdx   A=%rcx  B=%r8  C=%r9  LDC=%r10 (bytes) */
/*   AO/BO   = running A-panel / B-buffer pointers (reuse rdi/rsi)   */
/*   CO1/CO2 = output pointers for column j and j+1                  */
/*   I       = row-panel counter, J (stack) = column-panel counter   */
/*   BB      = read-ahead prefetch cursor into B                     */
/*   xmm8..xmm11  accumulate rows 0-1 of columns j..j+3              */
/*   xmm12..xmm15 accumulate rows 2-3 of columns j..j+3              */
/* PROLOGUE/PROFCODE/EMMS/BRANCH/NOBRANCH/ALIGN_3/STACK_TOUCHING are */
/* platform macros supplied by common.h.                             */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Incoming argument registers (SysV) before they are re-purposed.   */
#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define CO1	%r15
#define CO2	%rbp
#define BB	%r12

#ifndef WINDOWS_ABI

#define STACKSIZE 64

/* Stack-passed arguments, addressed above the saved-register area.  */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

/* Win64: args 5.. arrive on the stack above the 32-byte shadow area. */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#endif

/* Locals on the realigned stack; BUFFER is the duplicated-B panel.  */
#define ALPHA	 0(%rsp)
#define J	16(%rsp)
#define OFFSET	24(%rsp)
#define KK	32(%rsp)
#define KKK	40(%rsp)
#define BUFFER	256(%rsp)

#ifdef OPTERON
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (8 * 9 + 4)
#define movsd	movlps
#define movapd	movaps
#endif

#ifdef GENERIC
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE (8 * 13 + 4)
#define movapd	movaps
#endif

#ifndef GENERIC

/*
 * KERNEL1..KERNEL8 together process 4 values of k for the 4x4 tile.
 * This variant indexes with %rax (the negated, scaled loop counter):
 * AO advances 4 doubles per k, BO 8 doubles per k (duplicated pairs).
 * KERNEL1/KERNEL5 and KERNEL3/KERNEL7 update rows 0-1 (xmm8..11),
 * KERNEL2/KERNEL6 and KERNEL4/KERNEL8 update rows 2-3 (xmm12..15).
 * Loads for the next step are interleaved with the multiplies.
 * NOTE(review): the first reload below uses movaps where every other
 * reload uses movapd -- functionally identical on SSE2, presumably a
 * historical artifact; confirm before "fixing".
 */
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movaps	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE +  0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	 -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	  2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	 -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE +  8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	  2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	  6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	  0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	  6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	  2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	  4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	  6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#else

/*
 * GENERIC variants: same computation, but AO/BO are advanced
 * explicitly each iteration instead of being indexed by %rax.
 */
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	 -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	  2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	 -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE +  8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	  2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	  6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	  0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	  6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	  2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	  4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	  6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
#endif

	PROLOGUE
	PROFCODE

	/* Save callee-saved GPRs in the fixed STACKSIZE frame. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,   0(%rsp)
	movq	%rbp,   8(%rsp)
	movq	%r12,  16(%rsp)
	movq	%r13,  24(%rsp)
	movq	%r14,  32(%rsp)
	movq	%r15,  40(%rsp)

#ifdef WINDOWS_ABI
	/* Win64: rdi/rsi and xmm6-15 are callee-saved too. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm12
#endif
	/* On Win64 alpha arrives in the 4th FP slot (xmm3). */
	movaps	%xmm3, %xmm0

#else
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm12
#endif

#endif

	/* Clear MMX/x87 state: the copy loops below use %mm0-%mm7. */
	EMMS

	movq	%rsp, %rbx	# save old stack
	subq	$256 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	movq	OLD_M, M
	movq	OLD_N, N

	/* Bias A by +16 elements so the kernels can use negative
	   displacements (keeps them in signed-8/32-bit range). */
	subq	$-16 * SIZE, A

	/* Duplicate alpha into both lanes and park it on the stack. */
	unpcklpd %xmm0, %xmm0
	movapd	%xmm0, ALPHA

	/* LDC: elements -> bytes. */
	leaq	(, LDC, SIZE), LDC

#ifdef TRMMKERNEL
	movsd	%xmm12, OFFSET
	movsd	%xmm12, KK
#ifndef LEFT
	negq	KK
#endif
#endif
	movq	N,  J
	sarq	$2, J		# j = (n >> 2)
	jle	.L40
	ALIGN_3

.L01:
/* ---- N-panel of 4 columns: duplicate B into BUFFER ------------- */
	leaq	16 * SIZE + BUFFER, BO
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	K, %rax
	sarq	$2, %rax
	jle	.L03
	ALIGN_3


#define RPREFETCHSIZE (8 *  7 + 4)
#define WPREFETCHSIZE (8 *  8 + 4)

.L02:
	/* Copy 16 B elements per pass, each written twice so a plain
	   mulpd acts as multiply-by-broadcast in the kernels. */
	PREFETCH	 (RPREFETCHSIZE +  0)  * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE +  0)  * SIZE(BO)

	movq	 4 * SIZE(B), %mm4
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	 5 * SIZE(B), %mm5
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE +  8)  * SIZE(BO)

	movq	 6 * SIZE(B), %mm6
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	 7 * SIZE(B), %mm7
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)

	PREFETCH	 (RPREFETCHSIZE +  8)  * SIZE(B)

	movq	 8 * SIZE(B), %mm0
	movq	%mm0,   0 * SIZE(BO)
	movq	%mm0,   1 * SIZE(BO)
	movq	 9 * SIZE(B), %mm1
	movq	%mm1,   2 * SIZE(BO)
	movq	%mm1,   3 * SIZE(BO)

	movq	10 * SIZE(B), %mm2
	movq	%mm2,   4 * SIZE(BO)
	movq	%mm2,   5 * SIZE(BO)
	movq	11 * SIZE(B), %mm3
	movq	%mm3,   6 * SIZE(BO)
	movq	%mm3,   7 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE + 16)  * SIZE(BO)

	movq	12 * SIZE(B), %mm4
	movq	%mm4,   8 * SIZE(BO)
	movq	%mm4,   9 * SIZE(BO)
	movq	13 * SIZE(B), %mm5
	movq	%mm5,  10 * SIZE(BO)
	movq	%mm5,  11 * SIZE(BO)

	PREFETCHW	 (WPREFETCHSIZE + 24)  * SIZE(BO)

	movq	14 * SIZE(B), %mm6
	movq	%mm6,  12 * SIZE(BO)
	movq	%mm6,  13 * SIZE(BO)
	movq	15 * SIZE(B), %mm7
	movq	%mm7,  14 * SIZE(BO)
	movq	%mm7,  15 * SIZE(BO)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, B

	subq	$1, %rax
	jne	.L02
	ALIGN_3

.L03:
	/* Remainder (k & 3) of the B copy. */
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L10
	ALIGN_3

.L04:
	movq	 0 * SIZE(B), %mm0
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)

	addq	$4 * SIZE, B
	addq	$8 * SIZE, BO
	subq	$1, %rax
	jne	.L04
	ALIGN_3

.L10:
/* ---- 4x4 tiles over the rows of this column panel --------------- */
	movq	A, AO		# aoffset = a

	leaq	(RPREFETCHSIZE +  0) * SIZE(B), BB

	movq	M,  I
	sarq	$2, I		# i = (m >> 2)
	jle	.L20
	ALIGN_3

.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	/* TRMM: skip the KK already-processed part of A and B. */
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

	/* Prime xmm0..7 with the first A/B operands, clear the
	   4x4 accumulator block xmm8..15. */
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm8, %xmm8
	movapd	-14 * SIZE(AO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3
	pxor	%xmm9, %xmm9
	movapd	-12 * SIZE(AO), %xmm4
	movapd	-12 * SIZE(BO), %xmm5
	pxor	%xmm10, %xmm10
	movapd	-10 * SIZE(AO), %xmm6
	movapd	 -8 * SIZE(BO), %xmm7
	pxor	%xmm11, %xmm11

	/* Warm the four destination C lines. */
	PREFETCHW	3 * SIZE(CO1)
	pxor	%xmm12, %xmm12
	PREFETCHW	7 * SIZE(CO2)
	pxor	%xmm13, %xmm13
	PREFETCHW	3 * SIZE(CO1, LDC, 2)
	pxor	%xmm14, %xmm14
	PREFETCHW	7 * SIZE(CO2, LDC, 2)
	pxor	%xmm15, %xmm15

	PREFETCH	 0 * SIZE(BB)

	/* Effective trip count for this tile (plain GEMM: K;
	   TRMM: depends on LEFT/TRANSA; both branches add 4 here
	   because MR == NR == 4). */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif

#ifndef GENERIC
	/* Run floor(k/8)*8 steps; %rax counts up from -(k&~7) to 0
	   while AO/BO point past the end, so the KERNEL macros index
	   with (AO,%rax,4)/(BO,%rax,8). */
	andq	$-8, %rax

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	NOBRANCH
	je	.L15
	ALIGN_3

.L12:
	/* 8x-unrolled software-pipelined body; the early-exit tests
	   every 8 steps keep the loop branch cheap. */
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	BRANCH
	jl	.L12
	ALIGN_3

.L15:
	/* Handle a leftover group of 4 k-steps (k & 4). */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	testq	$4, %rax
	je .L16
	xorq	%rax, %rax
	ALIGN_3

	KERNEL1(16  * 0)
	KERNEL2(16  * 0)
	KERNEL3(16  * 0)
	KERNEL4(16  * 0)
	KERNEL5(16  * 0)
	KERNEL6(16  * 0)
	KERNEL7(16  * 0)
	KERNEL8(16  * 0)

	addq	$32 * SIZE, BO
	addq	$16 * SIZE, AO
	ALIGN_3

#else
	/* GENERIC: simple 4-steps-per-iteration loop. */
	sarq	$2, %rax
	NOBRANCH
	jle	.L16
	ALIGN_3

.L12:
	KERNEL1(16  * 0)
	KERNEL2(16  * 0)
	KERNEL3(16  * 0)
	KERNEL4(16  * 0)
	KERNEL5(16  * 0)
	KERNEL6(16  * 0)
	KERNEL7(16  * 0)
	KERNEL8(16  * 0)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, AO
	decq	%rax
	BRANCH
	jg	.L12
#endif

.L16:
	movapd	ALPHA, %xmm7

	/* Final k remainder (k & 3), one step per iteration. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	je .L19

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	ALIGN_3

.L17:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm0
	addpd	%xmm1, %xmm10
	movapd	-16 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm12
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm13
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm2
	addpd	%xmm1, %xmm14
	movapd	 -8 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm2, %xmm15
	movapd	-10 * SIZE(AO, %rax, 4), %xmm2

	addq	$SIZE, %rax
	jl	.L17
	ALIGN_3

.L19:
	/* Write-back: scale by alpha; accumulate into C unless TRMM. */
	PREFETCH	 8 * SIZE(BB)
	subq	 $-12 * SIZE, BB

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	2 * SIZE(CO1), %xmm1
	movhpd	3 * SIZE(CO1), %xmm1

	movsd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2
	movsd	2 * SIZE(CO2), %xmm3
	movhpd	3 * SIZE(CO2), %xmm3
#endif

	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm7, %xmm10
	mulpd	%xmm7, %xmm11

	mulpd	%xmm7, %xmm12
	mulpd	%xmm7, %xmm13
	mulpd	%xmm7, %xmm14
	mulpd	%xmm7, %xmm15

#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1, LDC, 2), %xmm4
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm4
	movlpd	2 * SIZE(CO1, LDC, 2), %xmm5
	movhpd	3 * SIZE(CO1, LDC, 2), %xmm5

	movlpd	0 * SIZE(CO2, LDC, 2), %xmm6
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm6
	movlpd	2 * SIZE(CO2, LDC, 2), %xmm7
	movhpd	3 * SIZE(CO2, LDC, 2), %xmm7

	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm12
	addpd	%xmm2, %xmm9
	addpd	%xmm3, %xmm13
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)

	movlpd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)
	movlpd	%xmm13, 2 * SIZE(CO2)
	movhpd	%xmm13, 3 * SIZE(CO2)

#ifndef TRMMKERNEL
	addpd	%xmm4, %xmm10
	addpd	%xmm5, %xmm14
	addpd	%xmm6, %xmm11
	addpd	%xmm7, %xmm15
#endif

	movlpd	%xmm10, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm10, 1 * SIZE(CO1, LDC, 2)
	movlpd	%xmm14, 2 * SIZE(CO1, LDC, 2)
	movhpd	%xmm14, 3 * SIZE(CO1, LDC, 2)

	movlpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm11, 1 * SIZE(CO2, LDC, 2)
	movlpd	%xmm15, 2 * SIZE(CO2, LDC, 2)
	movhpd	%xmm15, 3 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: advance AO/BO past the untouched K - KKK tail. */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	BRANCH
	jg	.L11
	ALIGN_3

.L20:
/* ---- M remainder: 2-row then 1-row tiles ------------------------ */
	testq	$3, M
	je	.L39

	testq	$2, M
	je	.L30
	ALIGN_3

.L21:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

	/* 2x4 tile: xmm8..11 hold rows 0-1 of columns j..j+3. */
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movapd	16 * SIZE(BO), %xmm5
	movapd	24 * SIZE(BO), %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax		# MR = 2
#else
	addq	$4, %rax		# NR = 4
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L25
	ALIGN_3

.L22:
	/* 8 k-steps per iteration. */
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
	movapd	 2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	 4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	32 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	10 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	12 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	40 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm5
	addpd	%xmm5, %xmm8
	movapd	18 * SIZE(BO), %xmm5
	mulpd	%xmm0, %xmm5
	addpd	%xmm5, %xmm9
	movapd	20 * SIZE(BO), %xmm5
	mulpd	%xmm0, %xmm5
	mulpd	22 * SIZE(BO), %xmm0
	addpd	%xmm5, %xmm10
	movapd	48 * SIZE(BO), %xmm5
	addpd	%xmm0, %xmm11
	movapd	-10 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm7
	addpd	%xmm7, %xmm8
	movapd	26 * SIZE(BO), %xmm7
	mulpd	%xmm0, %xmm7
	addpd	%xmm7, %xmm9
	movapd	28 * SIZE(BO), %xmm7
	mulpd	%xmm0, %xmm7
	mulpd	30 * SIZE(BO), %xmm0
	addpd	%xmm7, %xmm10
	movapd	56 * SIZE(BO), %xmm7
	addpd	%xmm0, %xmm11
	movapd	 0 * SIZE(AO), %xmm0

	PREFETCH	(PREFETCHSIZE +  8) * SIZE(AO)
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm8
	movapd	34 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm9
	movapd	36 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	38 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	64 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	-6 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movapd	42 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm9
	movapd	44 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	46 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	72 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	-4 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm5
	addpd	%xmm5, %xmm8
	movapd	50 * SIZE(BO), %xmm5
	mulpd	%xmm2, %xmm5
	addpd	%xmm5, %xmm9
	movapd	52 * SIZE(BO), %xmm5
	mulpd	%xmm2, %xmm5
	mulpd	54 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm10
	movapd	80 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm11
	movapd	-2 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm7
	addpd	%xmm7, %xmm8
	movapd	58 * SIZE(BO), %xmm7
	mulpd	%xmm2, %xmm7
	addpd	%xmm7, %xmm9
	movapd	60 * SIZE(BO), %xmm7
	mulpd	%xmm2, %xmm7
	mulpd	62 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm10
	movapd	88 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm11
	movapd	 8 * SIZE(AO), %xmm2

	addq	$16 * SIZE, AO
	addq	$64 * SIZE, BO
	decq	%rax
	jne	.L22
	ALIGN_3

.L25:
	/* k remainder (k & 7) for the 2x4 tile. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L29
	ALIGN_3

.L26:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	 2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	 4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 8 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO		# aoffset  += 4
	addq	$8 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L26
	ALIGN_3

.L29:
	/* Write-back of the 2x4 tile. */
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2

	movlpd	0 * SIZE(CO1, LDC, 2), %xmm4
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm4
	movlpd	0 * SIZE(CO2, LDC, 2), %xmm6
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm6
#endif
	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm7, %xmm10
	mulpd	%xmm7, %xmm11

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm8
	addpd	%xmm2, %xmm9
	addpd	%xmm4, %xmm10
	addpd	%xmm6, %xmm11
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movlpd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)
	movlpd	%xmm10, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm10, 1 * SIZE(CO1, LDC, 2)
	movlpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm11, 1 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 4
	addq	$2 * SIZE, CO2		# coffset += 4
	ALIGN_3

.L30:
	testq	$1, M
	je	.L39
	ALIGN_3

.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

	/* 1x4 tile: scalar SSE; xmm8..11 are the four column sums. */
	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movsd	16 * SIZE(BO), %xmm5
	movsd	24 * SIZE(BO), %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		# MR = 1
#else
	addq	$4, %rax		# NR = 4
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L35
	ALIGN_3

.L32:
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm8
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
	movsd	 2 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm9
	movsd	 4 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	32 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-15 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	addsd	%xmm3, %xmm8
	movsd	10 * SIZE(BO), %xmm3
	mulsd	%xmm0, %xmm3
	addsd	%xmm3, %xmm9
	movsd	12 * SIZE(BO), %xmm3
	mulsd	%xmm0, %xmm3
	mulsd	14 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm10
	movsd	40 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm11
	movsd	-14 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm5
	addsd	%xmm5, %xmm8
	movsd	18 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	addsd	%xmm5, %xmm9
	movsd	20 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	mulsd	22 * SIZE(BO), %xmm0
	addsd	%xmm5, %xmm10
	movsd	48 * SIZE(BO), %xmm5
	addsd	%xmm0, %xmm11
	movsd	-13 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm7
	addsd	%xmm7, %xmm8
	movsd	26 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm7
	addsd	%xmm7, %xmm9
	movsd	28 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm7
	mulsd	30 * SIZE(BO), %xmm0
	addsd	%xmm7, %xmm10
	movsd	56 * SIZE(BO), %xmm7
	addsd	%xmm0, %xmm11
	movsd	-12 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm8
	movsd	34 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm9
	movsd	36 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	mulsd	38 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	64 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-11 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	addsd	%xmm3, %xmm8
	movsd	42 * SIZE(BO), %xmm3
	mulsd	%xmm0, %xmm3
	addsd	%xmm3, %xmm9
	movsd	44 * SIZE(BO), %xmm3
	mulsd	%xmm0, %xmm3
	mulsd	46 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm10
	movsd	72 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm11
	movsd	-10 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm5
	addsd	%xmm5, %xmm8
	movsd	50 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	addsd	%xmm5, %xmm9
	movsd	52 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	mulsd	54 * SIZE(BO), %xmm0
	addsd	%xmm5, %xmm10
	movsd	80 * SIZE(BO), %xmm5
	addsd	%xmm0, %xmm11
	movsd	-9 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm7
	addsd	%xmm7, %xmm8
	movsd	58 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm7
	addsd	%xmm7, %xmm9
	movsd	60 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm7
	mulsd	62 * SIZE(BO), %xmm0
	addsd	%xmm7, %xmm10
	movsd	88 * SIZE(BO), %xmm7
	addsd	%xmm0, %xmm11
	movsd	-8 * SIZE(AO), %xmm0

	addq	$ 8 * SIZE, AO
	addq	$64 * SIZE, BO
	decq	%rax
	jne	.L32
	ALIGN_3

.L35:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L38
	ALIGN_3

.L36:
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm8
	movsd	 2 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm9
	movsd	 4 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	 8 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-15 * SIZE(AO), %xmm0

	addq	$1 * SIZE, AO		# aoffset  += 4
	addq	$8 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L36
	ALIGN_3

.L38:
	/* Write-back of the 1x4 tile. */
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
	movsd	0 * SIZE(CO1, LDC, 2), %xmm4
	movsd	0 * SIZE(CO2, LDC, 2), %xmm6
#endif

	mulsd	%xmm7, %xmm8
	mulsd	%xmm7, %xmm9
	mulsd	%xmm7, %xmm10
	mulsd	%xmm7, %xmm11

#ifndef TRMMKERNEL
	addsd	%xmm0, %xmm8
	addsd	%xmm2, %xmm9
	addsd	%xmm4, %xmm10
	addsd	%xmm6, %xmm11
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movsd	%xmm10, 0 * SIZE(CO1, LDC, 2)
	movsd	%xmm11, 0 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_3

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	/* NOTE(review): addl touches only the low 32 bits of the
	   64-bit KK stack slot, while every other KK update in this
	   file is 64-bit (movq/subq/negq/addq); KK can be negative
	   here (negq KK above), so this looks like it should be
	   addq -- confirm against upstream before changing. */
	addl	$4, KK
#endif

	leaq	(C, LDC, 4), C		# c += 4 * ldc
	decq	J			# j --
	jg	.L01
	ALIGN_3

.L40:
/* ---- N remainder: 2-column then 1-column panels ----------------- */
	testq	$3, N
	je	.L999

	testq	$2, N
	je	.L80
	ALIGN_4

.L41:
/* Copying to Sub Buffer */
	leaq	BUFFER, BO

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	K, %rax
	sarq	$2, %rax
	jle	.L43
	ALIGN_3

.L42:
	PREFETCH	 56 * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1
	movq	 2 * SIZE(B), %mm2
	movq	 3 * SIZE(B), %mm3
	movq	 4 * SIZE(B), %mm4
	movq	 5 * SIZE(B), %mm5
	movq	 6 * SIZE(B), %mm6
	movq	 7 * SIZE(B), %mm7

	addq	$ 8 * SIZE, B
	addq	$16 * SIZE, BO

	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)

	decq	%rax
	jne	.L42
	ALIGN_3

.L43:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L50
	ALIGN_3

.L44:
	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1

	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  1 * SIZE(BO)
	movq	%mm1,  2 * SIZE(BO)
	movq	%mm1,  3 * SIZE(BO)

	addq	$2 * SIZE, B
	addq	$4 * SIZE, BO
	decq	%rax
	jne	.L44
	ALIGN_3

.L50:
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO			# aoffset = a

	movq	M,  I
	sarq	$2, I			# i = (m >> 2)
	jle	.L60
	ALIGN_3

.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA))
|| \ 1553 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1554 1555 leaq BUFFER, BO 1556#else 1557 leaq BUFFER, BO 1558 movq KK, %rax 1559 leaq (, %rax, SIZE), %rax 1560 leaq (AO, %rax, 4), AO 1561 leaq (BO, %rax, 4), BO 1562#endif 1563 1564 movapd -16 * SIZE(AO), %xmm0 1565 pxor %xmm8, %xmm8 1566 movapd 0 * SIZE(BO), %xmm1 1567 pxor %xmm9, %xmm9 1568 movapd -8 * SIZE(AO), %xmm2 1569 pxor %xmm12, %xmm12 1570 movapd 8 * SIZE(BO), %xmm3 1571 pxor %xmm13, %xmm13 1572 1573 movapd 0 * SIZE(AO), %xmm4 1574 movapd 16 * SIZE(BO), %xmm5 1575 movapd 8 * SIZE(AO), %xmm6 1576 movapd 24 * SIZE(BO), %xmm7 1577 1578 PREFETCHW 4 * SIZE(CO1) 1579 PREFETCHW 4 * SIZE(CO2) 1580 1581#ifndef TRMMKERNEL 1582 movq K, %rax 1583#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1584 movq K, %rax 1585 subq KK, %rax 1586 movq %rax, KKK 1587#else 1588 movq KK, %rax 1589#ifdef LEFT 1590 addq $4, %rax 1591#else 1592 addq $2, %rax 1593#endif 1594 movq %rax, KKK 1595#endif 1596 sarq $3, %rax 1597 je .L55 1598 ALIGN_3 1599 1600.L52: 1601 mulpd %xmm0, %xmm1 1602 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1603 mulpd 2 * SIZE(BO), %xmm0 1604 addpd %xmm1, %xmm8 1605 movapd 0 * SIZE(BO), %xmm1 1606 addpd %xmm0, %xmm9 1607 movapd -14 * SIZE(AO), %xmm0 1608 mulpd %xmm0, %xmm1 1609 mulpd 2 * SIZE(BO), %xmm0 1610 addpd %xmm1, %xmm12 1611 movapd 4 * SIZE(BO), %xmm1 1612 addpd %xmm0, %xmm13 1613 movapd -12 * SIZE(AO), %xmm0 1614 1615 mulpd %xmm0, %xmm1 1616 mulpd 6 * SIZE(BO), %xmm0 1617 addpd %xmm1, %xmm8 1618 movapd 4 * SIZE(BO), %xmm1 1619 addpd %xmm0, %xmm9 1620 movapd -10 * SIZE(AO), %xmm0 1621 mulpd %xmm0, %xmm1 1622 mulpd 6 * SIZE(BO), %xmm0 1623 addpd %xmm1, %xmm12 1624 movapd 32 * SIZE(BO), %xmm1 1625 addpd %xmm0, %xmm13 1626 movapd 16 * SIZE(AO), %xmm0 1627 1628 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 1629 mulpd %xmm2, %xmm3 1630 mulpd 10 * SIZE(BO), %xmm2 1631 addpd %xmm3, %xmm8 1632 movapd 8 * SIZE(BO), %xmm3 1633 addpd %xmm2, %xmm9 1634 movapd -6 * SIZE(AO), 
%xmm2 1635 mulpd %xmm2, %xmm3 1636 mulpd 10 * SIZE(BO), %xmm2 1637 addpd %xmm3, %xmm12 1638 movapd 12 * SIZE(BO), %xmm3 1639 addpd %xmm2, %xmm13 1640 movapd -4 * SIZE(AO), %xmm2 1641 1642 mulpd %xmm2, %xmm3 1643 mulpd 14 * SIZE(BO), %xmm2 1644 addpd %xmm3, %xmm8 1645 movapd 12 * SIZE(BO), %xmm3 1646 addpd %xmm2, %xmm9 1647 movapd -2 * SIZE(AO), %xmm2 1648 mulpd %xmm2, %xmm3 1649 mulpd 14 * SIZE(BO), %xmm2 1650 addpd %xmm3, %xmm12 1651 movapd 40 * SIZE(BO), %xmm3 1652 addpd %xmm2, %xmm13 1653 movapd 24 * SIZE(AO), %xmm2 1654 1655 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 1656 mulpd %xmm4, %xmm5 1657 mulpd 18 * SIZE(BO), %xmm4 1658 addpd %xmm5, %xmm8 1659 movapd 16 * SIZE(BO), %xmm5 1660 addpd %xmm4, %xmm9 1661 movapd 2 * SIZE(AO), %xmm4 1662 mulpd %xmm4, %xmm5 1663 mulpd 18 * SIZE(BO), %xmm4 1664 addpd %xmm5, %xmm12 1665 movapd 20 * SIZE(BO), %xmm5 1666 addpd %xmm4, %xmm13 1667 movapd 4 * SIZE(AO), %xmm4 1668 1669 mulpd %xmm4, %xmm5 1670 mulpd 22 * SIZE(BO), %xmm4 1671 addpd %xmm5, %xmm8 1672 movapd 20 * SIZE(BO), %xmm5 1673 addpd %xmm4, %xmm9 1674 movapd 6 * SIZE(AO), %xmm4 1675 mulpd %xmm4, %xmm5 1676 mulpd 22 * SIZE(BO), %xmm4 1677 addpd %xmm5, %xmm12 1678 movapd 48 * SIZE(BO), %xmm5 1679 addpd %xmm4, %xmm13 1680 movapd 32 * SIZE(AO), %xmm4 1681 1682 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) 1683 mulpd %xmm6, %xmm7 1684 mulpd 26 * SIZE(BO), %xmm6 1685 addpd %xmm7, %xmm8 1686 movapd 24 * SIZE(BO), %xmm7 1687 addpd %xmm6, %xmm9 1688 movapd 10 * SIZE(AO), %xmm6 1689 mulpd %xmm6, %xmm7 1690 mulpd 26 * SIZE(BO), %xmm6 1691 addpd %xmm7, %xmm12 1692 movapd 28 * SIZE(BO), %xmm7 1693 addpd %xmm6, %xmm13 1694 movapd 12 * SIZE(AO), %xmm6 1695 1696 mulpd %xmm6, %xmm7 1697 mulpd 30 * SIZE(BO), %xmm6 1698 addpd %xmm7, %xmm8 1699 movapd 28 * SIZE(BO), %xmm7 1700 addpd %xmm6, %xmm9 1701 movapd 14 * SIZE(AO), %xmm6 1702 mulpd %xmm6, %xmm7 1703 mulpd 30 * SIZE(BO), %xmm6 1704 addpd %xmm7, %xmm12 1705 movapd 56 * SIZE(BO), %xmm7 1706 addpd %xmm6, %xmm13 1707 movapd 40 * SIZE(AO), %xmm6 
1708 1709 addq $32 * SIZE, AO 1710 addq $32 * SIZE, BO 1711 decq %rax 1712 jne .L52 1713 ALIGN_3 1714 1715.L55: 1716#ifndef TRMMKERNEL 1717 movq K, %rax 1718#else 1719 movq KKK, %rax 1720#endif 1721 movapd ALPHA, %xmm7 1722 andq $7, %rax # if (k & 1) 1723 BRANCH 1724 je .L59 1725 ALIGN_3 1726 1727.L56: 1728 movapd 0 * SIZE(BO), %xmm1 1729 mulpd %xmm0, %xmm1 1730 addpd %xmm1, %xmm8 1731 mulpd 2 * SIZE(BO), %xmm0 1732 addpd %xmm0, %xmm9 1733 movapd -14 * SIZE(AO), %xmm0 1734 movapd 0 * SIZE(BO), %xmm1 1735 mulpd %xmm0, %xmm1 1736 addpd %xmm1, %xmm12 1737 mulpd 2 * SIZE(BO), %xmm0 1738 addpd %xmm0, %xmm13 1739 movapd -12 * SIZE(AO), %xmm0 1740 1741 addq $4 * SIZE, AO # aoffset += 4 1742 addq $4 * SIZE, BO # boffset1 += 8 1743 decq %rax 1744 jg .L56 1745 ALIGN_3 1746 1747.L59: 1748#ifndef TRMMKERNEL 1749 movsd 0 * SIZE(CO1), %xmm0 1750 movhpd 1 * SIZE(CO1), %xmm0 1751 movsd 2 * SIZE(CO1), %xmm1 1752 movhpd 3 * SIZE(CO1), %xmm1 1753 movsd 0 * SIZE(CO2), %xmm2 1754 movhpd 1 * SIZE(CO2), %xmm2 1755 movsd 2 * SIZE(CO2), %xmm3 1756 movhpd 3 * SIZE(CO2), %xmm3 1757#endif 1758 1759 mulpd %xmm7, %xmm8 1760 mulpd %xmm7, %xmm9 1761 mulpd %xmm7, %xmm12 1762 mulpd %xmm7, %xmm13 1763 1764#ifndef TRMMKERNEL 1765 addpd %xmm0, %xmm8 1766 addpd %xmm1, %xmm12 1767 addpd %xmm2, %xmm9 1768 addpd %xmm3, %xmm13 1769#endif 1770 1771 movsd %xmm8, 0 * SIZE(CO1) 1772 movhpd %xmm8, 1 * SIZE(CO1) 1773 movsd %xmm12, 2 * SIZE(CO1) 1774 movhpd %xmm12, 3 * SIZE(CO1) 1775 movsd %xmm9, 0 * SIZE(CO2) 1776 movhpd %xmm9, 1 * SIZE(CO2) 1777 movsd %xmm13, 2 * SIZE(CO2) 1778 movhpd %xmm13, 3 * SIZE(CO2) 1779 1780#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1781 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1782 movq K, %rax 1783 subq KKK, %rax 1784 leaq (,%rax, SIZE), %rax 1785 leaq (AO, %rax, 4), AO 1786 leaq (BO, %rax, 4), BO 1787#endif 1788 1789#if defined(TRMMKERNEL) && defined(LEFT) 1790 addq $4, KK 1791#endif 1792 1793 addq $4 * SIZE, CO1 # coffset += 4 1794 addq $4 
* SIZE, CO2 # coffset += 4 1795 decq I # i -- 1796 jg .L51 1797 ALIGN_3 1798 1799.L60: 1800 testq $2, M 1801 je .L70 1802 ALIGN_3 1803 1804.L61: 1805#if !defined(TRMMKERNEL) || \ 1806 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1807 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1808 1809 leaq BUFFER, BO 1810#else 1811 leaq BUFFER, BO 1812 movq KK, %rax 1813 leaq (, %rax, SIZE), %rax 1814 leaq (AO, %rax, 2), AO 1815 leaq (BO, %rax, 4), BO 1816#endif 1817 1818 movapd -16 * SIZE(AO), %xmm0 1819 pxor %xmm8, %xmm8 1820 movapd 0 * SIZE(BO), %xmm1 1821 pxor %xmm9, %xmm9 1822 movapd -8 * SIZE(AO), %xmm2 1823 pxor %xmm10, %xmm10 1824 movapd 8 * SIZE(BO), %xmm3 1825 pxor %xmm11, %xmm11 1826 1827 movapd 16 * SIZE(BO), %xmm5 1828 movapd 24 * SIZE(BO), %xmm7 1829 1830#ifndef TRMMKERNEL 1831 movq K, %rax 1832#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1833 movq K, %rax 1834 subq KK, %rax 1835 movq %rax, KKK 1836#else 1837 movq KK, %rax 1838#ifdef LEFT 1839 addq $2, %rax 1840#else 1841 addq $2, %rax 1842#endif 1843 movq %rax, KKK 1844#endif 1845 sarq $3, %rax 1846 je .L65 1847 ALIGN_3 1848 1849.L62: 1850 mulpd %xmm0, %xmm1 1851 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1852 mulpd 2 * SIZE(BO), %xmm0 1853 addpd %xmm1, %xmm8 1854 movapd 4 * SIZE(BO), %xmm1 1855 addpd %xmm0, %xmm9 1856 movapd -14 * SIZE(AO), %xmm0 1857 1858 mulpd %xmm0, %xmm1 1859 mulpd 6 * SIZE(BO), %xmm0 1860 addpd %xmm1, %xmm10 1861 movapd 32 * SIZE(BO), %xmm1 1862 addpd %xmm0, %xmm11 1863 movapd -12 * SIZE(AO), %xmm0 1864 1865 mulpd %xmm0, %xmm3 1866 mulpd 10 * SIZE(BO), %xmm0 1867 addpd %xmm3, %xmm8 1868 movapd 12 * SIZE(BO), %xmm3 1869 addpd %xmm0, %xmm9 1870 movapd -10 * SIZE(AO), %xmm0 1871 1872 mulpd %xmm0, %xmm3 1873 mulpd 14 * SIZE(BO), %xmm0 1874 addpd %xmm3, %xmm10 1875 movapd 40 * SIZE(BO), %xmm3 1876 addpd %xmm0, %xmm11 1877 movapd 0 * SIZE(AO), %xmm0 1878 1879 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 1880 mulpd %xmm2, %xmm5 1881 mulpd 
18 * SIZE(BO), %xmm2 1882 addpd %xmm5, %xmm8 1883 movapd 20 * SIZE(BO), %xmm5 1884 addpd %xmm2, %xmm9 1885 movapd -6 * SIZE(AO), %xmm2 1886 1887 mulpd %xmm2, %xmm5 1888 mulpd 22 * SIZE(BO), %xmm2 1889 addpd %xmm5, %xmm10 1890 movapd 48 * SIZE(BO), %xmm5 1891 addpd %xmm2, %xmm11 1892 movapd -4 * SIZE(AO), %xmm2 1893 1894 mulpd %xmm2, %xmm7 1895 mulpd 26 * SIZE(BO), %xmm2 1896 addpd %xmm7, %xmm8 1897 movapd 28 * SIZE(BO), %xmm7 1898 addpd %xmm2, %xmm9 1899 movapd -2 * SIZE(AO), %xmm2 1900 1901 mulpd %xmm2, %xmm7 1902 mulpd 30 * SIZE(BO), %xmm2 1903 addpd %xmm7, %xmm10 1904 movapd 56 * SIZE(BO), %xmm7 1905 addpd %xmm2, %xmm11 1906 movapd 8 * SIZE(AO), %xmm2 1907 1908 addq $16 * SIZE, AO 1909 addq $32 * SIZE, BO 1910 decq %rax 1911 jne .L62 1912 ALIGN_3 1913 1914.L65: 1915#ifndef TRMMKERNEL 1916 movq K, %rax 1917#else 1918 movq KKK, %rax 1919#endif 1920 movapd ALPHA, %xmm7 1921 andq $7, %rax # if (k & 1) 1922 BRANCH 1923 je .L69 1924 ALIGN_3 1925 1926.L66: 1927 mulpd %xmm0, %xmm1 1928 mulpd 2 * SIZE(BO), %xmm0 1929 addpd %xmm1, %xmm8 1930 movapd 4 * SIZE(BO), %xmm1 1931 addpd %xmm0, %xmm9 1932 movapd -14 * SIZE(AO), %xmm0 1933 1934 addq $2 * SIZE, AO # aoffset += 4 1935 addq $4 * SIZE, BO # boffset1 += 8 1936 decq %rax 1937 jg .L66 1938 ALIGN_3 1939 1940.L69: 1941#ifndef TRMMKERNEL 1942 movsd 0 * SIZE(CO1), %xmm0 1943 movhpd 1 * SIZE(CO1), %xmm0 1944 movsd 0 * SIZE(CO2), %xmm2 1945 movhpd 1 * SIZE(CO2), %xmm2 1946#endif 1947 1948 addpd %xmm10, %xmm8 1949 addpd %xmm11, %xmm9 1950 1951 mulpd %xmm7, %xmm8 1952 mulpd %xmm7, %xmm9 1953 1954#ifndef TRMMKERNEL 1955 addpd %xmm0, %xmm8 1956 addpd %xmm2, %xmm9 1957#endif 1958 1959 movsd %xmm8, 0 * SIZE(CO1) 1960 movhpd %xmm8, 1 * SIZE(CO1) 1961 movsd %xmm9, 0 * SIZE(CO2) 1962 movhpd %xmm9, 1 * SIZE(CO2) 1963 1964#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1965 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1966 movq K, %rax 1967 subq KKK, %rax 1968 leaq (,%rax, SIZE), %rax 1969 leaq (AO, %rax, 
2), AO 1970 leaq (BO, %rax, 4), BO 1971#endif 1972 1973#if defined(TRMMKERNEL) && defined(LEFT) 1974 addq $2, KK 1975#endif 1976 1977 addq $2 * SIZE, CO1 # coffset += 4 1978 addq $2 * SIZE, CO2 # coffset += 4 1979 ALIGN_3 1980 1981.L70: 1982 testq $1, M 1983 je .L79 1984 ALIGN_3 1985 1986.L71: 1987#if !defined(TRMMKERNEL) || \ 1988 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1989 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1990 1991 leaq BUFFER, BO 1992#else 1993 leaq BUFFER, BO 1994 movq KK, %rax 1995 leaq (, %rax, SIZE), %rax 1996 leaq (AO, %rax, 1), AO 1997 leaq (BO, %rax, 4), BO 1998#endif 1999 2000 movsd -16 * SIZE(AO), %xmm0 2001 pxor %xmm8, %xmm8 2002 movsd 0 * SIZE(BO), %xmm1 2003 pxor %xmm9, %xmm9 2004 movsd -12 * SIZE(AO), %xmm2 2005 pxor %xmm10, %xmm10 2006 movsd 8 * SIZE(BO), %xmm3 2007 pxor %xmm11, %xmm11 2008 2009 movsd 16 * SIZE(BO), %xmm5 2010 movsd 24 * SIZE(BO), %xmm7 2011 2012#ifndef TRMMKERNEL 2013 movq K, %rax 2014#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2015 movq K, %rax 2016 subq KK, %rax 2017 movq %rax, KKK 2018#else 2019 movq KK, %rax 2020#ifdef LEFT 2021 addq $1, %rax 2022#else 2023 addq $2, %rax 2024#endif 2025 movq %rax, KKK 2026#endif 2027 sarq $3, %rax 2028 je .L75 2029 ALIGN_3 2030 2031.L72: 2032 mulsd %xmm0, %xmm1 2033 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2034 mulsd 2 * SIZE(BO), %xmm0 2035 addsd %xmm1, %xmm8 2036 movsd 4 * SIZE(BO), %xmm1 2037 addsd %xmm0, %xmm9 2038 movsd -15 * SIZE(AO), %xmm0 2039 2040 mulsd %xmm0, %xmm1 2041 mulsd 6 * SIZE(BO), %xmm0 2042 addsd %xmm1, %xmm10 2043 movsd 32 * SIZE(BO), %xmm1 2044 addsd %xmm0, %xmm11 2045 movsd -14 * SIZE(AO), %xmm0 2046 2047 mulsd %xmm0, %xmm3 2048 mulsd 10 * SIZE(BO), %xmm0 2049 addsd %xmm3, %xmm8 2050 movsd 12 * SIZE(BO), %xmm3 2051 addsd %xmm0, %xmm9 2052 movsd -13 * SIZE(AO), %xmm0 2053 2054 mulsd %xmm0, %xmm3 2055 mulsd 14 * SIZE(BO), %xmm0 2056 addsd %xmm3, %xmm10 2057 movsd 40 * SIZE(BO), %xmm3 2058 
	addsd	%xmm0, %xmm11
	movsd	 -8 * SIZE(AO), %xmm0	# reload A ahead of the next loop pass

	# Remaining k steps of the unrolled .L72 loop (M=1, N=2 tile):
	# one A scalar (%xmm2) times two B values per step; %xmm8..%xmm11
	# hold two partial sums per C column (folded together at .L78).
	mulsd	%xmm2, %xmm5
	mulsd	 18 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm8
	movsd	 20 * SIZE(BO), %xmm5
	addsd	%xmm2, %xmm9
	movsd	-11 * SIZE(AO), %xmm2	# a = next A element

	mulsd	%xmm2, %xmm5
	mulsd	 22 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm10
	movsd	 48 * SIZE(BO), %xmm5	# preload B
	addsd	%xmm2, %xmm11
	movsd	-10 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm7
	mulsd	 26 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm8
	movsd	 28 * SIZE(BO), %xmm7
	addsd	%xmm2, %xmm9
	movsd	 -9 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm7
	mulsd	 30 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm10
	movsd	 56 * SIZE(BO), %xmm7	# preload B
	addsd	%xmm2, %xmm11
	movsd	 -4 * SIZE(AO), %xmm2	# preload A for the next pass

	addq	$ 8 * SIZE, AO		# consumed 8 A elements
	addq	$32 * SIZE, BO		# consumed 8 k steps x 4 packed B slots
	decq	%rax
	jne	.L72
	ALIGN_3

.L35_equivalent:			# (label .L75 follows)
.L75:
# Remainder count for the M=1, N=2 tile: k (or KKK under TRMM) mod 8.
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7		# alpha, applied after the k loop
	andq	$7, %rax		# k_left = k & 7 (remainder iterations)
	BRANCH
	je .L78
	ALIGN_3

.L76:
	# One k step per iteration: c0 += a*b0, c1 += a*b1.
	mulsd	%xmm0, %xmm1
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm8
	addsd	%xmm0, %xmm9
	movsd	-15 * SIZE(AO), %xmm0
	movsd	 4 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO		# aoffset += 1 (one A element per k)
	addq	$4 * SIZE, BO		# boffset += 4 (2 columns, 2-way duplicated B)
	decq	%rax
	jg	.L76
	ALIGN_3

.L78:
	# Scale by alpha and store the two C entries (one per column).
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0	# existing C values (GEMM update)
	movsd	0 * SIZE(CO2), %xmm2
#endif

	addsd	%xmm10, %xmm8		# fold the two partial sums per column
	addsd	%xmm11, %xmm9

	mulsd	%xmm7, %xmm8		# *= alpha
	mulsd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addsd	%xmm0, %xmm8		# += old C (skipped for TRMM)
	addsd	%xmm2, %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)

# TRMM bookkeeping: advance AO/BO past the untouched part of the panel.
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif
2149 2150#if defined(TRMMKERNEL) && defined(LEFT) 2151 addq $1, KK 2152#endif 2153 ALIGN_3 2154 2155.L79: 2156#if defined(TRMMKERNEL) && !defined(LEFT) 2157 addl $2, KK 2158#endif 2159 leaq (C, LDC, 2), C 2160 ALIGN_3 2161 2162.L80: 2163 testq $1, N 2164 je .L999 2165 ALIGN_4 2166 2167.L81: 2168/* Copying to Sub Buffer */ 2169 leaq BUFFER, BO 2170 2171#if defined(TRMMKERNEL) && defined(LEFT) 2172 movq OFFSET, %rax 2173 movq %rax, KK 2174#endif 2175 2176 movq K, %rax 2177 sarq $3, %rax 2178 jle .L83 2179 ALIGN_3 2180 2181.L82: 2182 PREFETCH 56 * SIZE(B) 2183 2184 movq 0 * SIZE(B), %mm0 2185 movq 1 * SIZE(B), %mm1 2186 movq 2 * SIZE(B), %mm2 2187 movq 3 * SIZE(B), %mm3 2188 movq 4 * SIZE(B), %mm4 2189 movq 5 * SIZE(B), %mm5 2190 movq 6 * SIZE(B), %mm6 2191 movq 7 * SIZE(B), %mm7 2192 2193 addq $ 8 * SIZE, B 2194 addq $16 * SIZE, BO 2195 2196 movq %mm0, -16 * SIZE(BO) 2197 movq %mm0, -15 * SIZE(BO) 2198 movq %mm1, -14 * SIZE(BO) 2199 movq %mm1, -13 * SIZE(BO) 2200 movq %mm2, -12 * SIZE(BO) 2201 movq %mm2, -11 * SIZE(BO) 2202 movq %mm3, -10 * SIZE(BO) 2203 movq %mm3, -9 * SIZE(BO) 2204 movq %mm4, -8 * SIZE(BO) 2205 movq %mm4, -7 * SIZE(BO) 2206 movq %mm5, -6 * SIZE(BO) 2207 movq %mm5, -5 * SIZE(BO) 2208 movq %mm6, -4 * SIZE(BO) 2209 movq %mm6, -3 * SIZE(BO) 2210 movq %mm7, -2 * SIZE(BO) 2211 movq %mm7, -1 * SIZE(BO) 2212 2213 decq %rax 2214 jne .L82 2215 ALIGN_3 2216 2217.L83: 2218 movq K, %rax 2219 andq $7, %rax 2220 BRANCH 2221 jle .L90 2222 ALIGN_3 2223 2224.L84: 2225 movq 0 * SIZE(B), %mm0 2226 2227 movq %mm0, 0 * SIZE(BO) 2228 movq %mm0, 1 * SIZE(BO) 2229 2230 addq $1 * SIZE, B 2231 addq $2 * SIZE, BO 2232 decq %rax 2233 jne .L84 2234 ALIGN_3 2235 2236.L90: 2237 movq C, CO1 # coffset1 = c 2238 movq A, AO # aoffset = a 2239 2240 movq M, I 2241 sarq $2, I # i = (m >> 2) 2242 jle .L100 2243 ALIGN_3 2244 2245.L91: 2246#if !defined(TRMMKERNEL) || \ 2247 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2248 (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) 2249 2250 leaq BUFFER, BO 2251#else 2252 leaq BUFFER, BO 2253 movq KK, %rax 2254 leaq (, %rax, SIZE), %rax 2255 leaq (AO, %rax, 4), AO 2256 leaq (BO, %rax, 2), BO 2257#endif 2258 2259 movapd -16 * SIZE(AO), %xmm0 2260 pxor %xmm8, %xmm8 2261 movapd 0 * SIZE(BO), %xmm1 2262 pxor %xmm9, %xmm9 2263 movapd -8 * SIZE(AO), %xmm2 2264 pxor %xmm10, %xmm10 2265 movapd 8 * SIZE(BO), %xmm3 2266 pxor %xmm11, %xmm11 2267 2268 movapd 0 * SIZE(AO), %xmm4 2269 movapd 8 * SIZE(AO), %xmm6 2270 2271 PREFETCHW 4 * SIZE(CO1) 2272 2273#ifndef TRMMKERNEL 2274 movq K, %rax 2275#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2276 movq K, %rax 2277 subq KK, %rax 2278 movq %rax, KKK 2279#else 2280 movq KK, %rax 2281#ifdef LEFT 2282 addq $4, %rax 2283#else 2284 addq $1, %rax 2285#endif 2286 movq %rax, KKK 2287#endif 2288 sarq $3, %rax 2289 je .L95 2290 ALIGN_3 2291 2292.L92: 2293 mulpd %xmm1, %xmm0 2294 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2295 mulpd -14 * SIZE(AO), %xmm1 2296 addpd %xmm0, %xmm8 2297 movapd -12 * SIZE(AO), %xmm0 2298 addpd %xmm1, %xmm9 2299 movapd 2 * SIZE(BO), %xmm1 2300 mulpd %xmm1, %xmm0 2301 mulpd -10 * SIZE(AO), %xmm1 2302 addpd %xmm0, %xmm10 2303 movapd 16 * SIZE(AO), %xmm0 2304 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 2305 addpd %xmm1, %xmm11 2306 movapd 4 * SIZE(BO), %xmm1 2307 mulpd %xmm1, %xmm2 2308 mulpd -6 * SIZE(AO), %xmm1 2309 addpd %xmm2, %xmm8 2310 movapd -4 * SIZE(AO), %xmm2 2311 addpd %xmm1, %xmm9 2312 movapd 6 * SIZE(BO), %xmm1 2313 mulpd %xmm1, %xmm2 2314 mulpd -2 * SIZE(AO), %xmm1 2315 addpd %xmm2, %xmm10 2316 movapd 24 * SIZE(AO), %xmm2 2317 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 2318 addpd %xmm1, %xmm11 2319 movapd 16 * SIZE(BO), %xmm1 2320 mulpd %xmm3, %xmm4 2321 mulpd 2 * SIZE(AO), %xmm3 2322 addpd %xmm4, %xmm8 2323 movapd 4 * SIZE(AO), %xmm4 2324 addpd %xmm3, %xmm9 2325 movapd 10 * SIZE(BO), %xmm3 2326 mulpd %xmm3, %xmm4 2327 mulpd 6 * SIZE(AO), %xmm3 2328 addpd %xmm4, %xmm10 2329 movapd 32 * 
SIZE(AO), %xmm4 2330 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) 2331 addpd %xmm3, %xmm11 2332 movapd 12 * SIZE(BO), %xmm3 2333 mulpd %xmm3, %xmm6 2334 mulpd 10 * SIZE(AO), %xmm3 2335 addpd %xmm6, %xmm8 2336 movapd 12 * SIZE(AO), %xmm6 2337 addpd %xmm3, %xmm9 2338 movapd 14 * SIZE(BO), %xmm3 2339 mulpd %xmm3, %xmm6 2340 mulpd 14 * SIZE(AO), %xmm3 2341 addpd %xmm6, %xmm10 2342 movapd 40 * SIZE(AO), %xmm6 2343 addpd %xmm3, %xmm11 2344 movapd 24 * SIZE(BO), %xmm3 2345 2346 addq $32 * SIZE, AO 2347 addq $16 * SIZE, BO 2348 decq %rax 2349 jne .L92 2350 ALIGN_3 2351 2352.L95: 2353#ifndef TRMMKERNEL 2354 movq K, %rax 2355#else 2356 movq KKK, %rax 2357#endif 2358 movapd ALPHA, %xmm7 2359 andq $7, %rax # if (k & 1) 2360 BRANCH 2361 je .L99 2362 ALIGN_3 2363 2364.L96: 2365 mulpd %xmm1, %xmm0 2366 mulpd -14 * SIZE(AO), %xmm1 2367 addpd %xmm0, %xmm8 2368 movapd -12 * SIZE(AO), %xmm0 2369 addpd %xmm1, %xmm9 2370 movapd 2 * SIZE(BO), %xmm1 2371 2372 addq $4 * SIZE, AO # aoffset += 4 2373 addq $2 * SIZE, BO # boffset1 += 8 2374 decq %rax 2375 jg .L96 2376 ALIGN_3 2377 2378.L99: 2379#ifndef TRMMKERNEL 2380 movsd 0 * SIZE(CO1), %xmm0 2381 movhpd 1 * SIZE(CO1), %xmm0 2382 movsd 2 * SIZE(CO1), %xmm1 2383 movhpd 3 * SIZE(CO1), %xmm1 2384#endif 2385 2386 addpd %xmm10, %xmm8 2387 addpd %xmm11, %xmm9 2388 2389 mulpd %xmm7, %xmm8 2390 mulpd %xmm7, %xmm9 2391 2392#ifndef TRMMKERNEL 2393 addpd %xmm0, %xmm8 2394 addpd %xmm1, %xmm9 2395#endif 2396 2397 movsd %xmm8, 0 * SIZE(CO1) 2398 movhpd %xmm8, 1 * SIZE(CO1) 2399 movsd %xmm9, 2 * SIZE(CO1) 2400 movhpd %xmm9, 3 * SIZE(CO1) 2401 2402#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2403 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2404 movq K, %rax 2405 subq KKK, %rax 2406 leaq (,%rax, SIZE), %rax 2407 leaq (AO, %rax, 4), AO 2408 leaq (BO, %rax, 2), BO 2409#endif 2410 2411#if defined(TRMMKERNEL) && defined(LEFT) 2412 addq $4, KK 2413#endif 2414 2415 addq $4 * SIZE, CO1 # coffset += 4 2416 decq I # i -- 2417 jg 
.L91 2418 ALIGN_3 2419 2420.L100: 2421 testq $2, M 2422 je .L110 2423 ALIGN_3 2424 2425.L101: 2426#if !defined(TRMMKERNEL) || \ 2427 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2428 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2429 2430 leaq BUFFER, BO 2431#else 2432 leaq BUFFER, BO 2433 movq KK, %rax 2434 leaq (, %rax, SIZE), %rax 2435 leaq (AO, %rax, 2), AO 2436 leaq (BO, %rax, 2), BO 2437#endif 2438 2439 movapd -16 * SIZE(AO), %xmm0 2440 pxor %xmm8, %xmm8 2441 movapd 0 * SIZE(BO), %xmm1 2442 pxor %xmm9, %xmm9 2443 movapd -8 * SIZE(AO), %xmm2 2444 pxor %xmm10, %xmm10 2445 movapd 8 * SIZE(BO), %xmm3 2446 pxor %xmm11, %xmm11 2447 2448#ifndef TRMMKERNEL 2449 movq K, %rax 2450#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2451 movq K, %rax 2452 subq KK, %rax 2453 movq %rax, KKK 2454#else 2455 movq KK, %rax 2456#ifdef LEFT 2457 addq $2, %rax 2458#else 2459 addq $1, %rax 2460#endif 2461 movq %rax, KKK 2462#endif 2463 sarq $3, %rax 2464 je .L105 2465 ALIGN_3 2466 2467.L102: 2468 mulpd %xmm0, %xmm1 2469 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2470 movapd -14 * SIZE(AO), %xmm0 2471 mulpd 2 * SIZE(BO), %xmm0 2472 addpd %xmm1, %xmm8 2473 movapd 16 * SIZE(BO), %xmm1 2474 addpd %xmm0, %xmm9 2475 movapd -12 * SIZE(AO), %xmm0 2476 mulpd 4 * SIZE(BO), %xmm0 2477 addpd %xmm0, %xmm10 2478 movapd -10 * SIZE(AO), %xmm0 2479 mulpd 6 * SIZE(BO), %xmm0 2480 addpd %xmm0, %xmm11 2481 movapd 0 * SIZE(AO), %xmm0 2482 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2483 mulpd %xmm2, %xmm3 2484 movapd -6 * SIZE(AO), %xmm2 2485 mulpd 10 * SIZE(BO), %xmm2 2486 addpd %xmm3, %xmm8 2487 movapd 24 * SIZE(BO), %xmm3 2488 addpd %xmm2, %xmm9 2489 movapd -4 * SIZE(AO), %xmm2 2490 mulpd 12 * SIZE(BO), %xmm2 2491 addpd %xmm2, %xmm10 2492 movapd -2 * SIZE(AO), %xmm2 2493 mulpd 14 * SIZE(BO), %xmm2 2494 addpd %xmm2, %xmm11 2495 movapd 8 * SIZE(AO), %xmm2 2496 2497 addq $16 * SIZE, AO 2498 addq $16 * SIZE, BO 2499 decq %rax 2500 jne .L102 2501 
	ALIGN_3

.L105:
# Remainder handling for the M=2, N=1 tile.
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7		# alpha broadcast pair (ALPHA stored duplicated)
	andq	$7, %rax		# k_left = K & 7 (remainder iterations)
	BRANCH
	je .L109
	ALIGN_3

.L106:
	# One k step: two A elements (packed) times one duplicated B value.
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(AO), %xmm0
	movapd	 2 * SIZE(BO), %xmm1

	addq	$2 * SIZE, AO		# aoffset += 2 (two A elements per k)
	addq	$2 * SIZE, BO		# boffset += 2 (one column, 2-way duplicated)
	decq	%rax
	jg	.L106
	ALIGN_3

.L109:
	# Fold the four partial accumulators into one packed pair,
	# scale by alpha, optionally add old C, and store C[0..1].
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm10, %xmm8

	mulpd	%xmm7, %xmm8		# *= alpha

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0

	addpd	%xmm0, %xmm8		# += old C (GEMM update only)
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	addq	$2 * SIZE, CO1		# coffset += 2

# TRMM bookkeeping: skip the untouched rest of the panel.
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif
	ALIGN_3

.L110:
# Last case: M odd and N odd — a single remaining C element.
	testq	$1, M
	je	.L999
	ALIGN_3

.L111:
# Point BO at the packed B panel; under TRMM, offset AO/BO by KK.
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

	# Clear the four partial accumulators and preload A/B.
	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-12 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

# Trip count: K for GEMM; for TRMM either K-KK or KK+1 (KKK).
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		# M-block width = 1
#else
	addq	$1, %rax		# N-block width = 1 (both branches add 1)
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# unrolled-by-8 iteration count
	je	.L115
	ALIGN_3

.L112:
	# Eight k steps per iteration; four interleaved accumulators
	# (%xmm8..%xmm11) break the addsd dependency chain.
	mulsd	%xmm0, %xmm1
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO)
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	16 * SIZE(BO), %xmm1	# preload B for the next pass
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm9
	movsd	-14 * SIZE(AO), %xmm0
	mulsd	 4 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm10
	movsd	-13 * SIZE(AO), %xmm0
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm11
	movsd	-8 * SIZE(AO), %xmm0	# preload A for the next pass
	mulsd	%xmm2, %xmm3
	movsd	-11 * SIZE(AO), %xmm2
	addsd	%xmm3, %xmm8
	movsd	24 * SIZE(BO), %xmm3	# preload B
	mulsd	10 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm9
	movsd	-10 * SIZE(AO), %xmm2
	mulsd	12 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm10
	movsd	 -9 * SIZE(AO), %xmm2
	mulsd	14 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm11
	movsd	 -4 * SIZE(AO), %xmm2	# preload A

	addq	$ 8 * SIZE, AO		# 8 A elements consumed
	addq	$16 * SIZE, BO		# 8 k x 2 duplicated B slots
	decq	%rax
	jne	.L112
	ALIGN_3

.L115:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# k_left = K & 7 (remainder iterations)
	BRANCH
	je .L118
	ALIGN_3

.L116:
	# One k step: c += a * b.
	mulsd	%xmm0, %xmm1
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	 2 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO		# aoffset += 1
	addq	$2 * SIZE, BO		# boffset += 2 (2-way duplicated B)
	decq	%rax
	jg	.L116
	ALIGN_3

.L118:
	# Fold the four partials, scale by alpha, update the single C entry.
	addsd	%xmm10, %xmm8
	addsd	%xmm11, %xmm9
	addsd	%xmm9, %xmm8

	mulsd	%xmm7, %xmm8
#ifndef TRMMKERNEL
	addsd	0 * SIZE(CO1), %xmm8	# += old C (GEMM update only)
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	ALIGN_3

.L999:
# Function epilogue: tear down the frame and restore callee-saved state.
	movq	%rbx, %rsp		# %rbx appears to hold the pre-aligned stack
					# pointer saved by the prologue (not visible
					# here) -- NOTE(review): confirm in prologue

	EMMS				# clear MMX state after the %mm copy loops

	movq	  0(%rsp), %rbx		# restore callee-saved GPRs
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	# Win64 additionally treats %rdi/%rsi and %xmm6-%xmm15 as callee-saved.
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE