1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

/*
 * Double-precision 4x4 GEMM/TRMM micro-kernel for x86-64, AT&T syntax,
 * SSE3 (movddup).  GotoBLAS/OpenBLAS style: the file is preprocessed
 * with cpp ("common.h" supplies SIZE, BASE_SHIFT, PROLOGUE/PROFCODE,
 * ALIGN_*, NOBRANCH/BRANCH, and the WINDOWS_ABI/TRMMKERNEL switches).
 *
 * Accumulator layout while computing a 4 (rows) x 4 (cols) C tile:
 *   xmm0..xmm3 : C rows 0-1, columns 0..3   (one xmm = 2 doubles)
 *   xmm4..xmm7 : C rows 2-3, columns 0..3
 *   xmm8/xmm10/xmm12/xmm14 : streamed A pairs (movapd, 16B aligned)
 *   xmm9/xmm11/xmm13/xmm15 : broadcast B elements (movddup)
 *
 * NOTE(review): the function body further down is truncated in this
 * view (no epilogue visible); only this setup/macro section is
 * documented here.
 */

#define ASSEMBLER
#include "common.h"

/* Integer-register map.  On SysV these are the incoming argument
   registers; on WINDOWS_ABI the same names are reloaded from the
   stack slots OLD_A/OLD_B/OLD_C/... below.  */
#define M	%rdi		/* number of rows of C (arg 1, SysV) */
#define N	%rsi		/* number of columns of C (arg 2, SysV) */
#define K	%rdx		/* inner (dot-product) dimension (arg 3, SysV) */

#define A	%rcx		/* packed A panel */
#define B	%r8		/* packed B panel */
#define C	%r9		/* C matrix base */
#define LDC	%r10		/* leading dimension of C, scaled to bytes below */

#define I	%r11		/* row-block counter (m loop) */
#define J	%r12		/* column-block counter (n loop) */
#define AO	%r13		/* current position in A panel */
#define BO	%r14		/* current position in B panel */
#define CO1	%r15		/* &C[ j*LDC ]     (first of the column pair) */
#define CO2	%rbx		/* &C[ (j+1)*LDC ] (second of the column pair) */
#define BB	%rbp		/* prefetch cursor into the next B block */

#ifndef WINDOWS_ABI

#define STACKSIZE 128		/* room for the 6 callee-saved GPR spills */

/* Caller-pushed arguments 7 and 8 (beyond the 6 register args). */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

/* Local scratch slots. */
#define ALPHA	48(%rsp)	/* scalar alpha (spilled from xmm0) */
#define OFFSET	56(%rsp)	/* TRMM diagonal offset */
#define KKK	64(%rsp)	/* per-tile effective K for TRMM */
#define KK	72(%rsp)	/* running TRMM offset */

#else

#define STACKSIZE 256		/* GPRs + xmm6-xmm15 (callee-saved on Win64) */

/* Win64 passes only 4 args in registers; the rest live on the stack. */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#define ALPHA	224(%rsp)
#define OFFSET	232(%rsp)
#define KK	240(%rsp)
#define KKK	248(%rsp)

#endif

/* Software-prefetch tuning.  PREFETCHSIZE is in elements (doubles);
   it is the read-ahead distance into the A panel.  */
#define PREFETCH     prefetcht1
#define PREFETCHSIZE (16 * 12 + 3)
#define PREFETCH_R   (4 * 4 + 0)

/*
 * KERNEL1..KERNEL16 together form one fully unrolled chunk of 8 K
 * iterations for the 4x4 tile.  Each macro performs half a K
 * iteration: one aligned A pair (2 rows) times four broadcast B
 * elements (4 columns), i.e. 4 mulpd + 4 addpd.
 *   odd  macros accumulate into xmm0-xmm3 (rows 0-1);
 *   even macros accumulate into xmm4-xmm7 (rows 2-3).
 * Loads for the next step are interleaved with the arithmetic, and
 * KERNEL4/8/12/16 preload A/B data 32..56 elements ahead for the
 * following 8-iteration chunk.  `address` is the chunk offset used by
 * the .L1X dispatch (16 per chunk, x2 elements inside the macros).
 * No comments inside the bodies: a comment before the trailing `\`
 * would terminate the macro early.
 */

/* K iter 0, rows 0-1 (xmm8/xmm9 pair); prefetches A read-ahead. */
#define KERNEL1(address) \
	mulpd	%xmm8, %xmm9 ;\
	PREFETCH (PREFETCHSIZE +  0) * SIZE + (address) * 2 * SIZE(AO);\
	addpd	%xmm9, %xmm0;\
	movddup	 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm1;\
	movddup	 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm2;\
	movddup	 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm3;\
	movddup	 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* K iter 0, rows 2-3. */
#define KERNEL2(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm4;\
	movddup	 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm5;\
	movddup	 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm6;\
	movddup	 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm7;\
	movddup	 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* K iter 1, rows 0-1. */
#define KERNEL3(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm0;\
	movddup	 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm1;\
	movddup	 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm2;\
	movddup	 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm3;\
	movddup	 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* K iter 1, rows 2-3; preloads xmm8/xmm9 from +32 for the next chunk. */
#define KERNEL4(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm4;\
	movddup	 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm5;\
	movddup	 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm6;\
	movddup	 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm7;\
	movddup	32 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* K iter 2, rows 0-1 (xmm10/xmm11 pair). */
#define KERNEL5(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm0;\
	movddup	 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm1;\
	movddup	10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm2;\
	movddup	11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm3;\
	movddup	 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* K iter 2, rows 2-3. */
#define KERNEL6(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm4;\
	movddup	 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm5;\
	movddup	10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm6;\
	movddup	11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm7;\
	movddup	12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* K iter 3, rows 0-1. */
#define KERNEL7(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm0;\
	movddup	13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm1;\
	movddup	14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm2;\
	movddup	15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm3;\
	movddup	12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* K iter 3, rows 2-3; preloads xmm10/xmm11 from +40 for the next chunk. */
#define KERNEL8(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm4;\
	movddup	13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm5;\
	movddup	14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm6;\
	movddup	15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm7;\
	movddup	40 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* K iter 4, rows 0-1 (xmm12/xmm13 pair); second A prefetch. */
#define KERNEL9(address) \
	mulpd	%xmm12, %xmm13;\
	PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\
	addpd	%xmm13, %xmm0;\
	movddup	17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm1;\
	movddup	18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm2;\
	movddup	19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm3;\
	movddup	16 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* K iter 4, rows 2-3. */
#define KERNEL10(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm4;\
	movddup	17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm5;\
	movddup	18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm6;\
	movddup	19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm7;\
	movddup	20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* K iter 5, rows 0-1. */
#define KERNEL11(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm0;\
	movddup	21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm1;\
	movddup	22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm2;\
	movddup	23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm3;\
	movddup	20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* K iter 5, rows 2-3; preloads xmm12/xmm13 from +48 for the next chunk. */
#define KERNEL12(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm4;\
	movddup	21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm5;\
	movddup	22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm6;\
	movddup	23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm7;\
	movddup	48 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* K iter 6, rows 0-1 (xmm14/xmm15 pair). */
#define KERNEL13(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm0;\
	movddup	25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm1;\
	movddup	26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm2;\
	movddup	27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm3;\
	movddup	24 * SIZE + (address) * 2 * SIZE(BO), %xmm15

/* K iter 6, rows 2-3. */
#define KERNEL14(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm4;\
	movddup	25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm5;\
	movddup	26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm6;\
	movddup	27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm7;\
	movddup	28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

/* K iter 7, rows 0-1. */
#define KERNEL15(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm0;\
	movddup	29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm1;\
	movddup	30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm2;\
	movddup	31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm3;\
	movddup	28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

/* K iter 7, rows 2-3; preloads xmm14/xmm15 from +56 for the next chunk. */
#define KERNEL16(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm4;\
	movddup	29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm5;\
	movddup	30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm6;\
	movddup	31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm7;\
	movddup	56 * SIZE + (address) * 2 * SIZE(BO), %xmm15

	PROLOGUE
	PROFCODE

	/* Reserve the frame and spill every callee-saved GPR this kernel
	   uses (CO2=%rbx, BB=%rbp, J=%r12, AO=%r13, BO=%r14, CO1=%r15). */
	subq	$STACKSIZE, %rsp
	movq	%rbx,   0(%rsp)
	movq	%rbp,   8(%rsp)
	movq	%r12,  16(%rsp)
	movq	%r13,  24(%rsp)
	movq	%r14,  32(%rsp)
	movq	%r15,  40(%rsp)

#ifdef WINDOWS_ABI
	/* Win64 additionally treats rdi/rsi and xmm6-xmm15 as callee-saved. */
	movq	%rdi,  48(%rsp)
	movq	/* NOTE(review): operands (%rsi, 56(%rsp)) continue on the
	   next source line — instruction split by text extraction. */
%rsi, 56(%rsp) 348 movups %xmm6, 64(%rsp) 349 movups %xmm7, 80(%rsp) 350 movups %xmm8, 96(%rsp) 351 movups %xmm9, 112(%rsp) 352 movups %xmm10, 128(%rsp) 353 movups %xmm11, 144(%rsp) 354 movups %xmm12, 160(%rsp) 355 movups %xmm13, 176(%rsp) 356 movups %xmm14, 192(%rsp) 357 movups %xmm15, 208(%rsp) 358 359 movq ARG1, M 360 movq ARG2, N 361 movq ARG3, K 362 movq OLD_A, A 363 movq OLD_B, B 364 movq OLD_C, C 365 movq OLD_LDC, LDC 366#ifdef TRMMKERNEL 367 movsd OLD_OFFSET, %xmm4 368#endif 369 movaps %xmm3, %xmm0 370 371#else 372 movq OLD_LDC, LDC 373#ifdef TRMMKERNEL 374 movsd OLD_OFFSET, %xmm4 375#endif 376 377#endif 378 379 movsd %xmm0, ALPHA 380 381#ifdef TRMMKERNEL 382 movsd %xmm4, OFFSET 383 movsd %xmm4, KK 384#ifndef LEFT 385 negq KK 386#endif 387#endif 388 389 leaq (, LDC, SIZE), LDC 390 391 movq N, J 392 sarq $2, J # j = (n >> 2) 393 jle .L40 394 ALIGN_4 395 396.L10: 397#if defined(TRMMKERNEL) && defined(LEFT) 398 movq OFFSET, %rax 399 movq %rax, KK 400#endif 401 402 movq C, CO1 # coffset1 = c 403 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 404 movq A, AO # aoffset = a 405 406 movq K, %rax 407 salq $BASE_SHIFT + 2, %rax 408 leaq (B, %rax), BB 409 410 movq M, I 411 sarq $2, I # i = (m >> 2) 412 jle .L20 413 ALIGN_4 414 415.L11: 416#if !defined(TRMMKERNEL) || \ 417 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 418 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 419 420 movq B, BO 421#else 422 movq KK, %rax 423 leaq (, %rax, SIZE), %rax 424 leaq (AO, %rax, 4), AO 425 leaq (B, %rax, 4), BO 426#endif 427 428 movapd 0 * SIZE(AO), %xmm8 429 pxor %xmm0, %xmm0 430 movddup 0 * SIZE(BO), %xmm9 431 pxor %xmm1, %xmm1 432 movapd 8 * SIZE(AO), %xmm10 433 pxor %xmm2, %xmm2 434 movddup 8 * SIZE(BO), %xmm11 435 pxor %xmm3, %xmm3 436 437 movapd 16 * SIZE(AO), %xmm12 438 pxor %xmm4, %xmm4 439 movddup 16 * SIZE(BO), %xmm13 440 pxor %xmm5, %xmm5 441 movapd 24 * SIZE(AO), %xmm14 442 pxor %xmm6, %xmm6 443 movddup 24 * SIZE(BO), %xmm15 444 pxor %xmm7, %xmm7 
445 446 prefetchnta 3 * SIZE(CO1) 447 prefetchnta 3 * SIZE(CO2) 448 prefetchnta 3 * SIZE(CO1, LDC, 2) 449 prefetchnta 3 * SIZE(CO2, LDC, 2) 450 451 prefetcht0 0 * SIZE(BB) 452 subq $-8 * SIZE, BB 453 454#ifndef TRMMKERNEL 455 movq K, %rax 456#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 457 movq K, %rax 458 subq KK, %rax 459 movq %rax, KKK 460#else 461 movq KK, %rax 462#ifdef LEFT 463 addq $4, %rax 464#else 465 addq $4, %rax 466#endif 467 movq %rax, KKK 468#endif 469 470#if 1 471 andq $-8, %rax 472 salq $4, %rax 473 NOBRANCH 474 je .L15 475 476.L1X: 477 KERNEL1 (16 * 0) 478 KERNEL2 (16 * 0) 479 KERNEL3 (16 * 0) 480 KERNEL4 (16 * 0) 481 KERNEL5 (16 * 0) 482 KERNEL6 (16 * 0) 483 KERNEL7 (16 * 0) 484 KERNEL8 (16 * 0) 485 KERNEL9 (16 * 0) 486 KERNEL10(16 * 0) 487 KERNEL11(16 * 0) 488 KERNEL12(16 * 0) 489 KERNEL13(16 * 0) 490 KERNEL14(16 * 0) 491 KERNEL15(16 * 0) 492 KERNEL16(16 * 0) 493 cmpq $128 * 1, %rax 494 NOBRANCH 495 jle .L12 496 KERNEL1 (16 * 1) 497 KERNEL2 (16 * 1) 498 KERNEL3 (16 * 1) 499 KERNEL4 (16 * 1) 500 KERNEL5 (16 * 1) 501 KERNEL6 (16 * 1) 502 KERNEL7 (16 * 1) 503 KERNEL8 (16 * 1) 504 KERNEL9 (16 * 1) 505 KERNEL10(16 * 1) 506 KERNEL11(16 * 1) 507 KERNEL12(16 * 1) 508 KERNEL13(16 * 1) 509 KERNEL14(16 * 1) 510 KERNEL15(16 * 1) 511 KERNEL16(16 * 1) 512 cmpq $128 * 2, %rax 513 NOBRANCH 514 jle .L12 515 KERNEL1 (16 * 2) 516 KERNEL2 (16 * 2) 517 KERNEL3 (16 * 2) 518 KERNEL4 (16 * 2) 519 KERNEL5 (16 * 2) 520 KERNEL6 (16 * 2) 521 KERNEL7 (16 * 2) 522 KERNEL8 (16 * 2) 523 KERNEL9 (16 * 2) 524 KERNEL10(16 * 2) 525 KERNEL11(16 * 2) 526 KERNEL12(16 * 2) 527 KERNEL13(16 * 2) 528 KERNEL14(16 * 2) 529 KERNEL15(16 * 2) 530 KERNEL16(16 * 2) 531 cmpq $128 * 3, %rax 532 NOBRANCH 533 jle .L12 534 KERNEL1 (16 * 3) 535 KERNEL2 (16 * 3) 536 KERNEL3 (16 * 3) 537 KERNEL4 (16 * 3) 538 KERNEL5 (16 * 3) 539 KERNEL6 (16 * 3) 540 KERNEL7 (16 * 3) 541 KERNEL8 (16 * 3) 542 KERNEL9 (16 * 3) 543 KERNEL10(16 * 3) 544 KERNEL11(16 * 3) 545 KERNEL12(16 * 
3) 546 KERNEL13(16 * 3) 547 KERNEL14(16 * 3) 548 KERNEL15(16 * 3) 549 KERNEL16(16 * 3) 550 cmpq $128 * 4, %rax 551 NOBRANCH 552 jle .L12 553 KERNEL1 (16 * 4) 554 KERNEL2 (16 * 4) 555 KERNEL3 (16 * 4) 556 KERNEL4 (16 * 4) 557 KERNEL5 (16 * 4) 558 KERNEL6 (16 * 4) 559 KERNEL7 (16 * 4) 560 KERNEL8 (16 * 4) 561 KERNEL9 (16 * 4) 562 KERNEL10(16 * 4) 563 KERNEL11(16 * 4) 564 KERNEL12(16 * 4) 565 KERNEL13(16 * 4) 566 KERNEL14(16 * 4) 567 KERNEL15(16 * 4) 568 KERNEL16(16 * 4) 569 cmpq $128 * 5, %rax 570 NOBRANCH 571 jle .L12 572 KERNEL1 (16 * 5) 573 KERNEL2 (16 * 5) 574 KERNEL3 (16 * 5) 575 KERNEL4 (16 * 5) 576 KERNEL5 (16 * 5) 577 KERNEL6 (16 * 5) 578 KERNEL7 (16 * 5) 579 KERNEL8 (16 * 5) 580 KERNEL9 (16 * 5) 581 KERNEL10(16 * 5) 582 KERNEL11(16 * 5) 583 KERNEL12(16 * 5) 584 KERNEL13(16 * 5) 585 KERNEL14(16 * 5) 586 KERNEL15(16 * 5) 587 KERNEL16(16 * 5) 588 cmpq $128 * 6, %rax 589 NOBRANCH 590 jle .L12 591 KERNEL1 (16 * 6) 592 KERNEL2 (16 * 6) 593 KERNEL3 (16 * 6) 594 KERNEL4 (16 * 6) 595 KERNEL5 (16 * 6) 596 KERNEL6 (16 * 6) 597 KERNEL7 (16 * 6) 598 KERNEL8 (16 * 6) 599 KERNEL9 (16 * 6) 600 KERNEL10(16 * 6) 601 KERNEL11(16 * 6) 602 KERNEL12(16 * 6) 603 KERNEL13(16 * 6) 604 KERNEL14(16 * 6) 605 KERNEL15(16 * 6) 606 KERNEL16(16 * 6) 607 cmpq $128 * 7, %rax 608 NOBRANCH 609 jle .L12 610 KERNEL1 (16 * 7) 611 KERNEL2 (16 * 7) 612 KERNEL3 (16 * 7) 613 KERNEL4 (16 * 7) 614 KERNEL5 (16 * 7) 615 KERNEL6 (16 * 7) 616 KERNEL7 (16 * 7) 617 KERNEL8 (16 * 7) 618 KERNEL9 (16 * 7) 619 KERNEL10(16 * 7) 620 KERNEL11(16 * 7) 621 KERNEL12(16 * 7) 622 KERNEL13(16 * 7) 623 KERNEL14(16 * 7) 624 KERNEL15(16 * 7) 625 KERNEL16(16 * 7) 626 627 addq $32 * 8 * SIZE, AO 628 addq $32 * 8 * SIZE, BO 629 subq $128 * 8, %rax 630 BRANCH 631 jg .L1X 632 633.L12: 634 leaq (AO, %rax, 2), AO # * 16 635 leaq (BO, %rax, 2), BO # * 64 636 637#else 638 sarq $3, %rax 639 je .L15 640 ALIGN_4 641 642.L12: 643 mulpd %xmm8, %xmm9 644 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 645 addpd %xmm9, %xmm0 646 movddup 1 * 
SIZE(BO), %xmm9 647 mulpd %xmm8, %xmm9 648 addpd %xmm9, %xmm1 649 movddup 2 * SIZE(BO), %xmm9 650 mulpd %xmm8, %xmm9 651 addpd %xmm9, %xmm2 652 movddup 3 * SIZE(BO), %xmm9 653 mulpd %xmm8, %xmm9 654 movapd 2 * SIZE(AO), %xmm8 655 addpd %xmm9, %xmm3 656 movddup 0 * SIZE(BO), %xmm9 657 mulpd %xmm8, %xmm9 658 addpd %xmm9, %xmm4 659 movddup 1 * SIZE(BO), %xmm9 660 mulpd %xmm8, %xmm9 661 addpd %xmm9, %xmm5 662 movddup 2 * SIZE(BO), %xmm9 663 mulpd %xmm8, %xmm9 664 addpd %xmm9, %xmm6 665 movddup 3 * SIZE(BO), %xmm9 666 mulpd %xmm8, %xmm9 667 movapd 4 * SIZE(AO), %xmm8 668 addpd %xmm9, %xmm7 669 movddup 4 * SIZE(BO), %xmm9 670 mulpd %xmm8, %xmm9 671 addpd %xmm9, %xmm0 672 movddup 5 * SIZE(BO), %xmm9 673 mulpd %xmm8, %xmm9 674 addpd %xmm9, %xmm1 675 movddup 6 * SIZE(BO), %xmm9 676 mulpd %xmm8, %xmm9 677 addpd %xmm9, %xmm2 678 movddup 7 * SIZE(BO), %xmm9 679 mulpd %xmm8, %xmm9 680 movapd 6 * SIZE(AO), %xmm8 681 addpd %xmm9, %xmm3 682 movddup 4 * SIZE(BO), %xmm9 683 mulpd %xmm8, %xmm9 684 addpd %xmm9, %xmm4 685 movddup 5 * SIZE(BO), %xmm9 686 mulpd %xmm8, %xmm9 687 addpd %xmm9, %xmm5 688 movddup 6 * SIZE(BO), %xmm9 689 mulpd %xmm8, %xmm9 690 addpd %xmm9, %xmm6 691 movddup 7 * SIZE(BO), %xmm9 692 mulpd %xmm8, %xmm9 693 movapd 32 * SIZE(AO), %xmm8 694 addpd %xmm9, %xmm7 695 696 movddup 32 * SIZE(BO), %xmm9 697 mulpd %xmm10, %xmm11 698 addpd %xmm11, %xmm0 699 movddup 9 * SIZE(BO), %xmm11 700 mulpd %xmm10, %xmm11 701 addpd %xmm11, %xmm1 702 movddup 10 * SIZE(BO), %xmm11 703 mulpd %xmm10, %xmm11 704 addpd %xmm11, %xmm2 705 movddup 11 * SIZE(BO), %xmm11 706 mulpd %xmm10, %xmm11 707 movapd 10 * SIZE(AO), %xmm10 708 addpd %xmm11, %xmm3 709 710 movddup 8 * SIZE(BO), %xmm11 711 mulpd %xmm10, %xmm11 712 addpd %xmm11, %xmm4 713 movddup 9 * SIZE(BO), %xmm11 714 mulpd %xmm10, %xmm11 715 addpd %xmm11, %xmm5 716 movddup 10 * SIZE(BO), %xmm11 717 mulpd %xmm10, %xmm11 718 addpd %xmm11, %xmm6 719 movddup 11 * SIZE(BO), %xmm11 720 mulpd %xmm10, %xmm11 721 movapd 12 * SIZE(AO), %xmm10 722 addpd 
%xmm11, %xmm7 723 movddup 12 * SIZE(BO), %xmm11 724 mulpd %xmm10, %xmm11 725 addpd %xmm11, %xmm0 726 movddup 13 * SIZE(BO), %xmm11 727 mulpd %xmm10, %xmm11 728 addpd %xmm11, %xmm1 729 movddup 14 * SIZE(BO), %xmm11 730 mulpd %xmm10, %xmm11 731 addpd %xmm11, %xmm2 732 movddup 15 * SIZE(BO), %xmm11 733 mulpd %xmm10, %xmm11 734 movapd 14 * SIZE(AO), %xmm10 735 addpd %xmm11, %xmm3 736 737 movddup 12 * SIZE(BO), %xmm11 738 mulpd %xmm10, %xmm11 739 addpd %xmm11, %xmm4 740 movddup 13 * SIZE(BO), %xmm11 741 mulpd %xmm10, %xmm11 742 addpd %xmm11, %xmm5 743 movddup 14 * SIZE(BO), %xmm11 744 mulpd %xmm10, %xmm11 745 addpd %xmm11, %xmm6 746 movddup 15 * SIZE(BO), %xmm11 747 mulpd %xmm10, %xmm11 748 movapd 40 * SIZE(AO), %xmm10 749 addpd %xmm11, %xmm7 750 movddup 40 * SIZE(BO), %xmm11 751 752 mulpd %xmm12, %xmm13 753 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 754 addpd %xmm13, %xmm0 755 movddup 17 * SIZE(BO), %xmm13 756 mulpd %xmm12, %xmm13 757 addpd %xmm13, %xmm1 758 movddup 18 * SIZE(BO), %xmm13 759 mulpd %xmm12, %xmm13 760 addpd %xmm13, %xmm2 761 movddup 19 * SIZE(BO), %xmm13 762 mulpd %xmm12, %xmm13 763 movapd 18 * SIZE(AO), %xmm12 764 addpd %xmm13, %xmm3 765 766 movddup 16 * SIZE(BO), %xmm13 767 mulpd %xmm12, %xmm13 768 addpd %xmm13, %xmm4 769 movddup 17 * SIZE(BO), %xmm13 770 mulpd %xmm12, %xmm13 771 addpd %xmm13, %xmm5 772 movddup 18 * SIZE(BO), %xmm13 773 mulpd %xmm12, %xmm13 774 addpd %xmm13, %xmm6 775 movddup 19 * SIZE(BO), %xmm13 776 mulpd %xmm12, %xmm13 777 movapd 20 * SIZE(AO), %xmm12 778 addpd %xmm13, %xmm7 779 780 movddup 20 * SIZE(BO), %xmm13 781 mulpd %xmm12, %xmm13 782 addpd %xmm13, %xmm0 783 movddup 21 * SIZE(BO), %xmm13 784 mulpd %xmm12, %xmm13 785 addpd %xmm13, %xmm1 786 movddup 22 * SIZE(BO), %xmm13 787 mulpd %xmm12, %xmm13 788 addpd %xmm13, %xmm2 789 movddup 23 * SIZE(BO), %xmm13 790 mulpd %xmm12, %xmm13 791 movapd 22 * SIZE(AO), %xmm12 792 addpd %xmm13, %xmm3 793 794 movddup 20 * SIZE(BO), %xmm13 795 mulpd %xmm12, %xmm13 796 addpd %xmm13, %xmm4 797 movddup 
21 * SIZE(BO), %xmm13 798 mulpd %xmm12, %xmm13 799 addpd %xmm13, %xmm5 800 movddup 22 * SIZE(BO), %xmm13 801 mulpd %xmm12, %xmm13 802 addpd %xmm13, %xmm6 803 movddup 23 * SIZE(BO), %xmm13 804 mulpd %xmm12, %xmm13 805 movapd 48 * SIZE(AO), %xmm12 806 addpd %xmm13, %xmm7 807 movddup 48 * SIZE(BO), %xmm13 808 809 mulpd %xmm14, %xmm15 810 addpd %xmm15, %xmm0 811 movddup 25 * SIZE(BO), %xmm15 812 mulpd %xmm14, %xmm15 813 addpd %xmm15, %xmm1 814 movddup 26 * SIZE(BO), %xmm15 815 mulpd %xmm14, %xmm15 816 addpd %xmm15, %xmm2 817 movddup 27 * SIZE(BO), %xmm15 818 mulpd %xmm14, %xmm15 819 movapd 26 * SIZE(AO), %xmm14 820 addpd %xmm15, %xmm3 821 822 movddup 24 * SIZE(BO), %xmm15 823 mulpd %xmm14, %xmm15 824 addpd %xmm15, %xmm4 825 movddup 25 * SIZE(BO), %xmm15 826 mulpd %xmm14, %xmm15 827 addpd %xmm15, %xmm5 828 movddup 26 * SIZE(BO), %xmm15 829 mulpd %xmm14, %xmm15 830 addpd %xmm15, %xmm6 831 movddup 27 * SIZE(BO), %xmm15 832 mulpd %xmm14, %xmm15 833 movapd 28 * SIZE(AO), %xmm14 834 addpd %xmm15, %xmm7 835 836 movddup 28 * SIZE(BO), %xmm15 837 mulpd %xmm14, %xmm15 838 addpd %xmm15, %xmm0 839 movddup 29 * SIZE(BO), %xmm15 840 mulpd %xmm14, %xmm15 841 addpd %xmm15, %xmm1 842 movddup 30 * SIZE(BO), %xmm15 843 mulpd %xmm14, %xmm15 844 addpd %xmm15, %xmm2 845 movddup 31 * SIZE(BO), %xmm15 846 mulpd %xmm14, %xmm15 847 movapd 30 * SIZE(AO), %xmm14 848 addpd %xmm15, %xmm3 849 850 movddup 28 * SIZE(BO), %xmm15 851 mulpd %xmm14, %xmm15 852 addpd %xmm15, %xmm4 853 movddup 29 * SIZE(BO), %xmm15 854 mulpd %xmm14, %xmm15 855 addpd %xmm15, %xmm5 856 movddup 30 * SIZE(BO), %xmm15 857 mulpd %xmm14, %xmm15 858 addpd %xmm15, %xmm6 859 movddup 31 * SIZE(BO), %xmm15 860 mulpd %xmm14, %xmm15 861 movapd 56 * SIZE(AO), %xmm14 862 addpd %xmm15, %xmm7 863 movddup 56 * SIZE(BO), %xmm15 864 865 addq $32 * SIZE, BO 866 addq $32 * SIZE, AO 867 decq %rax 868 BRANCH 869 jne .L12 870#endif 871 ALIGN_4 872 873.L15: 874#ifndef TRMMKERNEL 875 movq K, %rax 876#else 877 movq KKK, %rax 878#endif 879 movddup 
ALPHA, %xmm15 880 andq $7, %rax # if (k & 1) 881 BRANCH 882 BRANCH 883 je .L19 884 ALIGN_4 885 886.L16: 887 mulpd %xmm8, %xmm9 888 movapd 2 * SIZE(AO), %xmm10 889 addpd %xmm9, %xmm0 890 movddup 1 * SIZE(BO), %xmm9 891 mulpd %xmm8, %xmm9 892 movddup 0 * SIZE(BO), %xmm11 893 addpd %xmm9, %xmm1 894 movddup 2 * SIZE(BO), %xmm9 895 mulpd %xmm8, %xmm9 896 addpd %xmm9, %xmm2 897 movddup 3 * SIZE(BO), %xmm9 898 mulpd %xmm8, %xmm9 899 movapd 4 * SIZE(AO), %xmm8 900 addpd %xmm9, %xmm3 901 movddup 4 * SIZE(BO), %xmm9 902 mulpd %xmm10, %xmm11 903 addpd %xmm11, %xmm4 904 movddup 1 * SIZE(BO), %xmm11 905 mulpd %xmm10, %xmm11 906 addpd %xmm11, %xmm5 907 movddup 2 * SIZE(BO), %xmm11 908 mulpd %xmm10, %xmm11 909 addpd %xmm11, %xmm6 910 movddup 3 * SIZE(BO), %xmm11 911 mulpd %xmm10, %xmm11 912 addpd %xmm11, %xmm7 913 914 addq $4 * SIZE, AO # aoffset += 4 915 addq $4 * SIZE, BO # boffset1 += 8 916 decq %rax 917 BRANCH 918 jg .L16 919 ALIGN_4 920 921.L19: 922 mulpd %xmm15, %xmm0 923 mulpd %xmm15, %xmm4 924 mulpd %xmm15, %xmm1 925 mulpd %xmm15, %xmm5 926 927 testq $15, CO1 928 NOBRANCH 929 jne .L19x 930 testq $15, LDC 931 NOBRANCH 932 jne .L19x 933 934 mulpd %xmm15, %xmm2 935 mulpd %xmm15, %xmm3 936 mulpd %xmm15, %xmm6 937 mulpd %xmm15, %xmm7 938 939#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) 940 addpd 0 * SIZE(CO1), %xmm0 941 addpd 2 * SIZE(CO1), %xmm4 942 addpd 0 * SIZE(CO2), %xmm1 943 addpd 2 * SIZE(CO2), %xmm5 944 945 addpd 0 * SIZE(CO1, LDC, 2), %xmm2 946 addpd 2 * SIZE(CO1, LDC, 2), %xmm6 947 addpd 0 * SIZE(CO2, LDC, 2), %xmm3 948 addpd 2 * SIZE(CO2, LDC, 2), %xmm7 949#endif 950 951 movapd %xmm0, 0 * SIZE(CO1) 952 movapd %xmm4, 2 * SIZE(CO1) 953 movapd %xmm1, 0 * SIZE(CO2) 954 movapd %xmm5, 2 * SIZE(CO2) 955 956 movapd %xmm2, 0 * SIZE(CO1, LDC, 2) 957 movapd %xmm6, 2 * SIZE(CO1, LDC, 2) 958 movapd %xmm3, 0 * SIZE(CO2, LDC, 2) 959 movapd %xmm7, 2 * SIZE(CO2, LDC, 2) 960 961#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 962 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 963 movq K, %rax 964 subq KKK, %rax 965 leaq (,%rax, SIZE), %rax 966 leaq (AO, %rax, 4), AO 967 leaq (BO, %rax, 4), BO 968#endif 969 970#if defined(TRMMKERNEL) && defined(LEFT) 971 addq $4, KK 972#endif 973 974 addq $4 * SIZE, CO1 # coffset += 4 975 addq $4 * SIZE, CO2 # coffset += 4 976 977 decq I # i -- 978 jg .L11 979 jmp .L20 980 ALIGN_4 981 982.L19x: 983#if! defined(TRMMKERNEL) && !defined(BETAZERO) 984 movsd 0 * SIZE(CO1), %xmm8 985 movhpd 1 * SIZE(CO1), %xmm8 986 movsd 2 * SIZE(CO1), %xmm9 987 movhpd 3 * SIZE(CO1), %xmm9 988 989 movsd 0 * SIZE(CO2), %xmm10 990 movhpd 1 * SIZE(CO2), %xmm10 991 movsd 2 * SIZE(CO2), %xmm11 992 movhpd 3 * SIZE(CO2), %xmm11 993 994 addpd %xmm8, %xmm0 995 addpd %xmm9, %xmm4 996 addpd %xmm10, %xmm1 997 addpd %xmm11, %xmm5 998#endif 999 1000 mulpd %xmm15, %xmm2 1001 mulpd %xmm15, %xmm3 1002 mulpd %xmm15, %xmm6 1003 mulpd %xmm15, %xmm7 1004 1005#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) 1006 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 1007 movhpd 1 * SIZE(CO1, LDC, 2), %xmm12 1008 movsd 2 * SIZE(CO1, LDC, 2), %xmm13 1009 movhpd 3 * SIZE(CO1, LDC, 2), %xmm13 1010 1011 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 1012 movhpd 1 * SIZE(CO2, LDC, 2), %xmm14 1013 movsd 2 * SIZE(CO2, LDC, 2), %xmm15 1014 movhpd 3 * SIZE(CO2, LDC, 2), %xmm15 1015 1016 addpd %xmm12, %xmm2 1017 addpd %xmm13, %xmm6 1018 addpd %xmm14, %xmm3 1019 addpd %xmm15, %xmm7 1020#endif 1021 1022 movsd %xmm0, 0 * SIZE(CO1) 1023 movhpd %xmm0, 1 * SIZE(CO1) 1024 movsd %xmm4, 2 * SIZE(CO1) 1025 movhpd %xmm4, 3 * SIZE(CO1) 1026 1027 movsd %xmm1, 0 * SIZE(CO2) 1028 movhpd %xmm1, 1 * SIZE(CO2) 1029 movsd %xmm5, 2 * SIZE(CO2) 1030 movhpd %xmm5, 3 * SIZE(CO2) 1031 1032 movsd %xmm2, 0 * SIZE(CO1, LDC, 2) 1033 movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) 1034 movsd %xmm6, 2 * SIZE(CO1, LDC, 2) 1035 movhpd %xmm6, 3 * SIZE(CO1, LDC, 2) 1036 1037 movsd %xmm3, 0 * SIZE(CO2, LDC, 2) 1038 movhpd %xmm3, 1 * SIZE(CO2, LDC, 2) 1039 movsd %xmm7, 2 * SIZE(CO2, LDC, 2) 1040 movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) 1041 1042#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1043 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1044 movq K, %rax 1045 subq KKK, %rax 1046 leaq (,%rax, SIZE), %rax 1047 leaq (AO, %rax, 4), AO 1048 leaq (BO, %rax, 4), BO 1049#endif 1050 1051#if defined(TRMMKERNEL) && defined(LEFT) 1052 addq $4, KK 1053#endif 1054 1055 addq $4 * SIZE, CO1 # coffset += 4 1056 addq $4 * SIZE, CO2 # coffset += 4 1057 1058 decq I # i -- 1059 jg .L11 1060 ALIGN_4 1061 1062.L20: 1063 testq $2, M 1064 BRANCH 1065 je .L30 1066 ALIGN_4 1067 1068.L21: 1069#if !defined(TRMMKERNEL) || \ 1070 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1071 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1072 1073 movq B, BO 1074#else 1075 movq KK, %rax 1076 leaq (, %rax, SIZE), %rax 1077 leaq (AO, %rax, 2), AO 1078 leaq (B, %rax, 4), BO 1079#endif 1080 1081 
movapd 0 * SIZE(AO), %xmm8 1082 pxor %xmm0, %xmm0 1083 movddup 0 * SIZE(BO), %xmm9 1084 pxor %xmm1, %xmm1 1085 movapd 8 * SIZE(AO), %xmm10 1086 pxor %xmm2, %xmm2 1087 movddup 8 * SIZE(BO), %xmm11 1088 pxor %xmm3, %xmm3 1089 1090#ifndef TRMMKERNEL 1091 movq K, %rax 1092#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1093 movq K, %rax 1094 subq KK, %rax 1095 movq %rax, KKK 1096#else 1097 movq KK, %rax 1098#ifdef LEFT 1099 addq $2, %rax 1100#else 1101 addq $4, %rax 1102#endif 1103 movq %rax, KKK 1104#endif 1105 sarq $3, %rax 1106 je .L25 1107 ALIGN_4 1108 1109.L22: 1110 mulpd %xmm8, %xmm9 1111 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1112 addpd %xmm9, %xmm0 1113 movddup 1 * SIZE(BO), %xmm9 1114 mulpd %xmm8, %xmm9 1115 addpd %xmm9, %xmm1 1116 movddup 2 * SIZE(BO), %xmm9 1117 mulpd %xmm8, %xmm9 1118 addpd %xmm9, %xmm2 1119 movddup 3 * SIZE(BO), %xmm9 1120 mulpd %xmm8, %xmm9 1121 movapd 2 * SIZE(AO), %xmm8 1122 addpd %xmm9, %xmm3 1123 movddup 4 * SIZE(BO), %xmm9 1124 mulpd %xmm8, %xmm9 1125 addpd %xmm9, %xmm0 1126 movddup 5 * SIZE(BO), %xmm9 1127 mulpd %xmm8, %xmm9 1128 addpd %xmm9, %xmm1 1129 movddup 6 * SIZE(BO), %xmm9 1130 mulpd %xmm8, %xmm9 1131 addpd %xmm9, %xmm2 1132 movddup 7 * SIZE(BO), %xmm9 1133 mulpd %xmm8, %xmm9 1134 movapd 4 * SIZE(AO), %xmm8 1135 addpd %xmm9, %xmm3 1136 movddup 16 * SIZE(BO), %xmm9 1137 mulpd %xmm8, %xmm11 1138 addpd %xmm11, %xmm0 1139 movddup 9 * SIZE(BO), %xmm11 1140 mulpd %xmm8, %xmm11 1141 addpd %xmm11, %xmm1 1142 movddup 10 * SIZE(BO), %xmm11 1143 mulpd %xmm8, %xmm11 1144 addpd %xmm11, %xmm2 1145 movddup 11 * SIZE(BO), %xmm11 1146 mulpd %xmm8, %xmm11 1147 movapd 6 * SIZE(AO), %xmm8 1148 addpd %xmm11, %xmm3 1149 movddup 12 * SIZE(BO), %xmm11 1150 mulpd %xmm8, %xmm11 1151 addpd %xmm11, %xmm0 1152 movddup 13 * SIZE(BO), %xmm11 1153 mulpd %xmm8, %xmm11 1154 addpd %xmm11, %xmm1 1155 movddup 14 * SIZE(BO), %xmm11 1156 mulpd %xmm8, %xmm11 1157 addpd %xmm11, %xmm2 1158 movddup 15 * SIZE(BO), %xmm11 1159 mulpd 
%xmm8, %xmm11 1160 movapd 16 * SIZE(AO), %xmm8 1161 addpd %xmm11, %xmm3 1162 movddup 24 * SIZE(BO), %xmm11 1163 mulpd %xmm10, %xmm9 1164 addpd %xmm9, %xmm0 1165 movddup 17 * SIZE(BO), %xmm9 1166 mulpd %xmm10, %xmm9 1167 addpd %xmm9, %xmm1 1168 movddup 18 * SIZE(BO), %xmm9 1169 mulpd %xmm10, %xmm9 1170 addpd %xmm9, %xmm2 1171 movddup 19 * SIZE(BO), %xmm9 1172 mulpd %xmm10, %xmm9 1173 movapd 10 * SIZE(AO), %xmm10 1174 addpd %xmm9, %xmm3 1175 movddup 20 * SIZE(BO), %xmm9 1176 mulpd %xmm10, %xmm9 1177 addpd %xmm9, %xmm0 1178 movddup 21 * SIZE(BO), %xmm9 1179 mulpd %xmm10, %xmm9 1180 addpd %xmm9, %xmm1 1181 movddup 22 * SIZE(BO), %xmm9 1182 mulpd %xmm10, %xmm9 1183 addpd %xmm9, %xmm2 1184 movddup 23 * SIZE(BO), %xmm9 1185 mulpd %xmm10, %xmm9 1186 movapd 12 * SIZE(AO), %xmm10 1187 addpd %xmm9, %xmm3 1188 movddup 32 * SIZE(BO), %xmm9 1189 mulpd %xmm10, %xmm11 1190 addpd %xmm11, %xmm0 1191 movddup 25 * SIZE(BO), %xmm11 1192 mulpd %xmm10, %xmm11 1193 addpd %xmm11, %xmm1 1194 movddup 26 * SIZE(BO), %xmm11 1195 mulpd %xmm10, %xmm11 1196 addpd %xmm11, %xmm2 1197 movddup 27 * SIZE(BO), %xmm11 1198 mulpd %xmm10, %xmm11 1199 movapd 14 * SIZE(AO), %xmm10 1200 addpd %xmm11, %xmm3 1201 movddup 28 * SIZE(BO), %xmm11 1202 mulpd %xmm10, %xmm11 1203 addpd %xmm11, %xmm0 1204 movddup 29 * SIZE(BO), %xmm11 1205 mulpd %xmm10, %xmm11 1206 addpd %xmm11, %xmm1 1207 movddup 30 * SIZE(BO), %xmm11 1208 mulpd %xmm10, %xmm11 1209 addpd %xmm11, %xmm2 1210 movddup 31 * SIZE(BO), %xmm11 1211 mulpd %xmm10, %xmm11 1212 movapd 24 * SIZE(AO), %xmm10 1213 addpd %xmm11, %xmm3 1214 movddup 40 * SIZE(BO), %xmm11 1215 1216 addq $16 * SIZE, AO 1217 addq $32 * SIZE, BO 1218 decq %rax 1219 jne .L22 1220 ALIGN_4 1221 1222.L25: 1223#ifndef TRMMKERNEL 1224 movq K, %rax 1225#else 1226 movq KKK, %rax 1227#endif 1228 movddup ALPHA, %xmm15 1229 andq $7, %rax # if (k & 1) 1230 BRANCH 1231 je .L29 1232 ALIGN_4 1233 1234.L26: 1235 mulpd %xmm8, %xmm9 1236 addpd %xmm9, %xmm0 1237 movddup 1 * SIZE(BO), %xmm9 1238 mulpd %xmm8, 
%xmm9 1239 addpd %xmm9, %xmm1 1240 movddup 2 * SIZE(BO), %xmm9 1241 mulpd %xmm8, %xmm9 1242 addpd %xmm9, %xmm2 1243 movddup 3 * SIZE(BO), %xmm9 1244 mulpd %xmm8, %xmm9 1245 movapd 2 * SIZE(AO), %xmm8 1246 addpd %xmm9, %xmm3 1247 movddup 4 * SIZE(BO), %xmm9 1248 1249 addq $2 * SIZE, AO # aoffset += 4 1250 addq $4 * SIZE, BO # boffset1 += 8 1251 decq %rax 1252 jg .L26 1253 ALIGN_4 1254 1255.L29: 1256#if! defined(TRMMKERNEL) && !defined(BETAZERO) 1257 movsd 0 * SIZE(CO1), %xmm8 1258 movhpd 1 * SIZE(CO1), %xmm8 1259 movsd 0 * SIZE(CO2), %xmm10 1260 movhpd 1 * SIZE(CO2), %xmm10 1261 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 1262 movhpd 1 * SIZE(CO1, LDC, 2), %xmm12 1263 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 1264 movhpd 1 * SIZE(CO2, LDC, 2), %xmm14 1265#endif 1266 1267 mulpd %xmm15, %xmm0 1268 mulpd %xmm15, %xmm1 1269 mulpd %xmm15, %xmm2 1270 mulpd %xmm15, %xmm3 1271 1272#if! defined(TRMMKERNEL) && !defined(BETAZERO) 1273 addpd %xmm8, %xmm0 1274 addpd %xmm10, %xmm1 1275 addpd %xmm12, %xmm2 1276 addpd %xmm14, %xmm3 1277#endif 1278 1279 movsd %xmm0, 0 * SIZE(CO1) 1280 movhpd %xmm0, 1 * SIZE(CO1) 1281 movsd %xmm1, 0 * SIZE(CO2) 1282 movhpd %xmm1, 1 * SIZE(CO2) 1283 movsd %xmm2, 0 * SIZE(CO1, LDC, 2) 1284 movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) 1285 movsd %xmm3, 0 * SIZE(CO2, LDC, 2) 1286 movhpd %xmm3, 1 * SIZE(CO2, LDC, 2) 1287 1288#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1289 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1290 movq K, %rax 1291 subq KKK, %rax 1292 leaq (,%rax, SIZE), %rax 1293 leaq (AO, %rax, 2), AO 1294 leaq (BO, %rax, 4), BO 1295#endif 1296 1297#if defined(TRMMKERNEL) && defined(LEFT) 1298 addq $2, KK 1299#endif 1300 1301 addq $2 * SIZE, CO1 # coffset += 4 1302 addq $2 * SIZE, CO2 # coffset += 4 1303 ALIGN_4 1304 1305.L30: 1306 testq $1, M 1307 je .L39 1308 ALIGN_4 1309 1310.L31: 1311#if !defined(TRMMKERNEL) || \ 1312 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1313 (defined(TRMMKERNEL) && !defined(LEFT) && 
!defined(TRANSA)) 1314 1315 movq B, BO 1316#else 1317 movq KK, %rax 1318 leaq (, %rax, SIZE), %rax 1319 leaq (AO, %rax, 1), AO 1320 leaq (B, %rax, 4), BO 1321#endif 1322 1323 movddup 0 * SIZE(AO), %xmm8 1324 pxor %xmm0, %xmm0 1325 movapd 0 * SIZE(BO), %xmm9 1326 pxor %xmm1, %xmm1 1327 movddup 4 * SIZE(AO), %xmm10 1328 pxor %xmm2, %xmm2 1329 movapd 8 * SIZE(BO), %xmm11 1330 pxor %xmm3, %xmm3 1331 1332#ifndef TRMMKERNEL 1333 movq K, %rax 1334#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1335 movq K, %rax 1336 subq KK, %rax 1337 movq %rax, KKK 1338#else 1339 movq KK, %rax 1340#ifdef LEFT 1341 addq $1, %rax 1342#else 1343 addq $4, %rax 1344#endif 1345 movq %rax, KKK 1346#endif 1347 sarq $3, %rax 1348 je .L35 1349 ALIGN_4 1350 1351.L32: 1352 mulpd %xmm8, %xmm9 1353 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1354 addpd %xmm9, %xmm0 1355 movapd 2 * SIZE(BO), %xmm9 1356 mulpd %xmm8, %xmm9 1357 movddup 1 * SIZE(AO), %xmm8 1358 addpd %xmm9, %xmm1 1359 movapd 4 * SIZE(BO), %xmm9 1360 mulpd %xmm8, %xmm9 1361 addpd %xmm9, %xmm0 1362 movapd 6 * SIZE(BO), %xmm9 1363 mulpd %xmm8, %xmm9 1364 movddup 2 * SIZE(AO), %xmm8 1365 addpd %xmm9, %xmm1 1366 movapd 16 * SIZE(BO), %xmm9 1367 mulpd %xmm8, %xmm11 1368 addpd %xmm11, %xmm0 1369 movapd 10 * SIZE(BO), %xmm11 1370 mulpd %xmm8, %xmm11 1371 movddup 3 * SIZE(AO), %xmm8 1372 addpd %xmm11, %xmm1 1373 movapd 12 * SIZE(BO), %xmm11 1374 mulpd %xmm8, %xmm11 1375 addpd %xmm11, %xmm0 1376 movapd 14 * SIZE(BO), %xmm11 1377 mulpd %xmm8, %xmm11 1378 movddup 8 * SIZE(AO), %xmm8 1379 addpd %xmm11, %xmm1 1380 movapd 24 * SIZE(BO), %xmm11 1381 mulpd %xmm10, %xmm9 1382 addpd %xmm9, %xmm0 1383 movapd 18 * SIZE(BO), %xmm9 1384 mulpd %xmm10, %xmm9 1385 movddup 5 * SIZE(AO), %xmm10 1386 addpd %xmm9, %xmm1 1387 movapd 20 * SIZE(BO), %xmm9 1388 mulpd %xmm10, %xmm9 1389 addpd %xmm9, %xmm0 1390 movapd 22 * SIZE(BO), %xmm9 1391 mulpd %xmm10, %xmm9 1392 movddup 6 * SIZE(AO), %xmm10 1393 addpd %xmm9, %xmm1 1394 movapd 32 * SIZE(BO), 
%xmm9 1395 mulpd %xmm10, %xmm11 1396 addpd %xmm11, %xmm0 1397 movapd 26 * SIZE(BO), %xmm11 1398 mulpd %xmm10, %xmm11 1399 movddup 7 * SIZE(AO), %xmm10 1400 addpd %xmm11, %xmm1 1401 movapd 28 * SIZE(BO), %xmm11 1402 mulpd %xmm10, %xmm11 1403 addpd %xmm11, %xmm0 1404 movapd 30 * SIZE(BO), %xmm11 1405 mulpd %xmm10, %xmm11 1406 movddup 12 * SIZE(AO), %xmm10 1407 addpd %xmm11, %xmm1 1408 movapd 40 * SIZE(BO), %xmm11 1409 1410 addq $ 8 * SIZE, AO 1411 addq $32 * SIZE, BO 1412 decq %rax 1413 jne .L32 1414 ALIGN_4 1415 1416.L35: 1417#ifndef TRMMKERNEL 1418 movq K, %rax 1419#else 1420 movq KKK, %rax 1421#endif 1422 movddup ALPHA, %xmm15 1423 andq $7, %rax # if (k & 1) 1424 BRANCH 1425 je .L38 1426 ALIGN_4 1427 1428.L36: 1429 mulpd %xmm8, %xmm9 1430 addpd %xmm9, %xmm0 1431 movapd 2 * SIZE(BO), %xmm9 1432 mulpd %xmm8, %xmm9 1433 movddup 1 * SIZE(AO), %xmm8 1434 addpd %xmm9, %xmm1 1435 movapd 4 * SIZE(BO), %xmm9 1436 1437 addq $1 * SIZE, AO # aoffset += 4 1438 addq $4 * SIZE, BO # boffset1 += 8 1439 decq %rax 1440 jg .L36 1441 ALIGN_4 1442 1443.L38: 1444#if! defined(TRMMKERNEL) && !defined(BETAZERO) 1445 movsd 0 * SIZE(CO1), %xmm8 1446 movhpd 0 * SIZE(CO2), %xmm8 1447 movsd 0 * SIZE(CO1, LDC, 2), %xmm9 1448 movhpd 0 * SIZE(CO2, LDC, 2), %xmm9 1449#endif 1450 mulpd %xmm15, %xmm0 1451 mulpd %xmm15, %xmm1 1452 1453#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) 1454 addpd %xmm8, %xmm0 1455 addpd %xmm9, %xmm1 1456#endif 1457 1458 movsd %xmm0, 0 * SIZE(CO1) 1459 movhpd %xmm0, 0 * SIZE(CO2) 1460 movsd %xmm1, 0 * SIZE(CO1, LDC, 2) 1461 movhpd %xmm1, 0 * SIZE(CO2, LDC, 2) 1462 1463#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1464 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1465 movq K, %rax 1466 subq KKK, %rax 1467 leaq (,%rax, SIZE), %rax 1468 leaq (AO, %rax, 1), AO 1469 leaq (BO, %rax, 4), BO 1470#endif 1471 1472#if defined(TRMMKERNEL) && defined(LEFT) 1473 addq $1, KK 1474#endif 1475 ALIGN_4 1476 1477.L39: 1478#if defined(TRMMKERNEL) && !defined(LEFT) 1479 addl $4, KK 1480#endif 1481 1482 leaq (C, LDC, 4), C # c += 4 * ldc 1483 movq BO, B 1484 decq J # j -- 1485 jg .L10 1486 ALIGN_4 1487 1488.L40: 1489 testq $2, N 1490 je .L80 1491 ALIGN_4 1492 1493#if defined(TRMMKERNEL) && defined(LEFT) 1494 movq OFFSET, %rax 1495 movq %rax, KK 1496#endif 1497 1498 movq C, CO1 # coffset1 = c 1499 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 1500 movq A, AO # aoffset = a 1501 1502 movq K, %rax 1503 salq $BASE_SHIFT + 1, %rax 1504 leaq (B, %rax), BB 1505 1506 movq M, I 1507 sarq $2, I # i = (m >> 2) 1508 jle .L60 1509 ALIGN_4 1510 1511.L51: 1512#if !defined(TRMMKERNEL) || \ 1513 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1514 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1515 1516 movq B, BO 1517#else 1518 movq KK, %rax 1519 leaq (, %rax, SIZE), %rax 1520 leaq (AO, %rax, 4), AO 1521 leaq (B, %rax, 2), BO 1522#endif 1523 1524 prefetcht0 0 * SIZE(BB) 1525 subq $-4 * SIZE, BB 1526 1527 movapd 0 * SIZE(AO), %xmm8 1528 pxor %xmm0, %xmm0 1529 movddup 0 * SIZE(BO), %xmm9 1530 pxor %xmm1, %xmm1 1531 movapd 8 * SIZE(AO), %xmm10 1532 pxor %xmm4, %xmm4 1533 movddup 8 * SIZE(BO), %xmm11 1534 pxor %xmm5, %xmm5 1535 1536#ifdef HAVE_3DNOW 1537 prefetchw 4 * SIZE(CO1) 1538 prefetchw 4 * SIZE(CO2) 1539#else 1540 prefetchnta 4 * SIZE(CO1) 1541 
prefetchnta 4 * SIZE(CO2) 1542#endif 1543 1544#ifndef TRMMKERNEL 1545 movq K, %rax 1546#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1547 movq K, %rax 1548 subq KK, %rax 1549 movq %rax, KKK 1550#else 1551 movq KK, %rax 1552#ifdef LEFT 1553 addq $4, %rax 1554#else 1555 addq $2, %rax 1556#endif 1557 movq %rax, KKK 1558#endif 1559 sarq $3, %rax 1560 je .L55 1561 ALIGN_4 1562 1563.L52: 1564 mulpd %xmm8, %xmm9 1565 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1566 addpd %xmm9, %xmm0 1567 movddup 1 * SIZE(BO), %xmm9 1568 mulpd %xmm8, %xmm9 1569 movapd 2 * SIZE(AO), %xmm8 1570 addpd %xmm9, %xmm1 1571 movddup 0 * SIZE(BO), %xmm9 1572 mulpd %xmm8, %xmm9 1573 addpd %xmm9, %xmm4 1574 movddup 1 * SIZE(BO), %xmm9 1575 mulpd %xmm8, %xmm9 1576 movapd 4 * SIZE(AO), %xmm8 1577 addpd %xmm9, %xmm5 1578 movddup 2 * SIZE(BO), %xmm9 1579 mulpd %xmm8, %xmm9 1580 addpd %xmm9, %xmm0 1581 movddup 3 * SIZE(BO), %xmm9 1582 mulpd %xmm8, %xmm9 1583 movapd 6 * SIZE(AO), %xmm8 1584 addpd %xmm9, %xmm1 1585 movddup 2 * SIZE(BO), %xmm9 1586 mulpd %xmm8, %xmm9 1587 addpd %xmm9, %xmm4 1588 movddup 3 * SIZE(BO), %xmm9 1589 mulpd %xmm8, %xmm9 1590 movapd 16 * SIZE(AO), %xmm8 1591 addpd %xmm9, %xmm5 1592 movddup 4 * SIZE(BO), %xmm9 1593 mulpd %xmm10, %xmm9 1594 addpd %xmm9, %xmm0 1595 movddup 5 * SIZE(BO), %xmm9 1596 mulpd %xmm10, %xmm9 1597 movapd 10 * SIZE(AO), %xmm10 1598 addpd %xmm9, %xmm1 1599 movddup 4 * SIZE(BO), %xmm9 1600 mulpd %xmm10, %xmm9 1601 addpd %xmm9, %xmm4 1602 movddup 5 * SIZE(BO), %xmm9 1603 mulpd %xmm10, %xmm9 1604 movapd 12 * SIZE(AO), %xmm10 1605 addpd %xmm9, %xmm5 1606 movddup 6 * SIZE(BO), %xmm9 1607 mulpd %xmm10, %xmm9 1608 addpd %xmm9, %xmm0 1609 movddup 7 * SIZE(BO), %xmm9 1610 mulpd %xmm10, %xmm9 1611 movapd 14 * SIZE(AO), %xmm10 1612 addpd %xmm9, %xmm1 1613 movddup 6 * SIZE(BO), %xmm9 1614 mulpd %xmm10, %xmm9 1615 addpd %xmm9, %xmm4 1616 movddup 7 * SIZE(BO), %xmm9 1617 mulpd %xmm10, %xmm9 1618 movapd 40 * SIZE(AO), %xmm10 1619 addpd %xmm9, %xmm5 
1620 movddup 16 * SIZE(BO), %xmm9 1621 mulpd %xmm8, %xmm11 1622 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 1623 addpd %xmm11, %xmm0 1624 movddup 9 * SIZE(BO), %xmm11 1625 mulpd %xmm8, %xmm11 1626 movapd 18 * SIZE(AO), %xmm8 1627 addpd %xmm11, %xmm1 1628 movddup 8 * SIZE(BO), %xmm11 1629 mulpd %xmm8, %xmm11 1630 addpd %xmm11, %xmm4 1631 movddup 9 * SIZE(BO), %xmm11 1632 mulpd %xmm8, %xmm11 1633 movapd 20 * SIZE(AO), %xmm8 1634 addpd %xmm11, %xmm5 1635 movddup 10 * SIZE(BO), %xmm11 1636 mulpd %xmm8, %xmm11 1637 addpd %xmm11, %xmm0 1638 movddup 11 * SIZE(BO), %xmm11 1639 mulpd %xmm8, %xmm11 1640 movapd 22 * SIZE(AO), %xmm8 1641 addpd %xmm11, %xmm1 1642 movddup 10 * SIZE(BO), %xmm11 1643 mulpd %xmm8, %xmm11 1644 addpd %xmm11, %xmm4 1645 movddup 11 * SIZE(BO), %xmm11 1646 mulpd %xmm8, %xmm11 1647 movapd 24 * SIZE(AO), %xmm8 1648 addpd %xmm11, %xmm5 1649 movddup 12 * SIZE(BO), %xmm11 1650 mulpd %xmm8, %xmm11 1651 addpd %xmm11, %xmm0 1652 movddup 13 * SIZE(BO), %xmm11 1653 mulpd %xmm8, %xmm11 1654 movapd 26 * SIZE(AO), %xmm8 1655 addpd %xmm11, %xmm1 1656 movddup 12 * SIZE(BO), %xmm11 1657 mulpd %xmm8, %xmm11 1658 addpd %xmm11, %xmm4 1659 movddup 13 * SIZE(BO), %xmm11 1660 mulpd %xmm8, %xmm11 1661 movapd 28 * SIZE(AO), %xmm8 1662 addpd %xmm11, %xmm5 1663 movddup 14 * SIZE(BO), %xmm11 1664 mulpd %xmm8, %xmm11 1665 addpd %xmm11, %xmm0 1666 movddup 15 * SIZE(BO), %xmm11 1667 mulpd %xmm8, %xmm11 1668 movapd 30 * SIZE(AO), %xmm8 1669 addpd %xmm11, %xmm1 1670 movddup 14 * SIZE(BO), %xmm11 1671 mulpd %xmm8, %xmm11 1672 addpd %xmm11, %xmm4 1673 movddup 15 * SIZE(BO), %xmm11 1674 mulpd %xmm8, %xmm11 1675 movapd 32 * SIZE(AO), %xmm8 1676 addpd %xmm11, %xmm5 1677 movddup 24 * SIZE(BO), %xmm11 1678 1679 addq $32 * SIZE, AO 1680 addq $16 * SIZE, BO 1681 decq %rax 1682 jne .L52 1683 ALIGN_4 1684 1685.L55: 1686#ifndef TRMMKERNEL 1687 movq K, %rax 1688#else 1689 movq KKK, %rax 1690#endif 1691 movddup ALPHA, %xmm15 1692 andq $7, %rax # if (k & 1) 1693 BRANCH 1694 je .L59 1695 ALIGN_4 1696 
1697.L56: 1698 mulpd %xmm8, %xmm9 1699 movapd 2 * SIZE(AO), %xmm10 1700 addpd %xmm9, %xmm0 1701 movddup 1 * SIZE(BO), %xmm9 1702 mulpd %xmm8, %xmm9 1703 movddup 0 * SIZE(BO), %xmm11 1704 addpd %xmm9, %xmm1 1705 movddup 2 * SIZE(BO), %xmm9 1706 mulpd %xmm10, %xmm11 1707 movapd 4 * SIZE(AO), %xmm8 1708 addpd %xmm11, %xmm4 1709 movddup 1 * SIZE(BO), %xmm11 1710 mulpd %xmm10, %xmm11 1711 addpd %xmm11, %xmm5 1712 1713 addq $4 * SIZE, AO # aoffset += 4 1714 addq $2 * SIZE, BO # boffset1 += 8 1715 decq %rax 1716 jg .L56 1717 ALIGN_4 1718 1719.L59: 1720#if! defined(TRMMKERNEL) && !defined(BETAZERO) 1721 movsd 0 * SIZE(CO1), %xmm8 1722 movhpd 1 * SIZE(CO1), %xmm8 1723 movsd 2 * SIZE(CO1), %xmm9 1724 movhpd 3 * SIZE(CO1), %xmm9 1725 movsd 0 * SIZE(CO2), %xmm10 1726 movhpd 1 * SIZE(CO2), %xmm10 1727 movsd 2 * SIZE(CO2), %xmm11 1728 movhpd 3 * SIZE(CO2), %xmm11 1729#endif 1730 1731 mulpd %xmm15, %xmm0 1732 mulpd %xmm15, %xmm1 1733 mulpd %xmm15, %xmm4 1734 mulpd %xmm15, %xmm5 1735 1736#if! defined(TRMMKERNEL) && !defined(BETAZERO) 1737 addpd %xmm8, %xmm0 1738 addpd %xmm9, %xmm4 1739 addpd %xmm10, %xmm1 1740 addpd %xmm11, %xmm5 1741#endif 1742 1743 movsd %xmm0, 0 * SIZE(CO1) 1744 movhpd %xmm0, 1 * SIZE(CO1) 1745 movsd %xmm4, 2 * SIZE(CO1) 1746 movhpd %xmm4, 3 * SIZE(CO1) 1747 movsd %xmm1, 0 * SIZE(CO2) 1748 movhpd %xmm1, 1 * SIZE(CO2) 1749 movsd %xmm5, 2 * SIZE(CO2) 1750 movhpd %xmm5, 3 * SIZE(CO2) 1751 1752#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1753 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1754 movq K, %rax 1755 subq KKK, %rax 1756 leaq (,%rax, SIZE), %rax 1757 leaq (AO, %rax, 4), AO 1758 leaq (BO, %rax, 2), BO 1759#endif 1760 1761#if defined(TRMMKERNEL) && defined(LEFT) 1762 addq $4, KK 1763#endif 1764 1765 addq $4 * SIZE, CO1 # coffset += 4 1766 addq $4 * SIZE, CO2 # coffset += 4 1767 decq I # i -- 1768 jg .L51 1769 ALIGN_4 1770 1771.L60: 1772 testq $2, M 1773 je .L70 1774 ALIGN_4 1775 1776.L61: 1777#if !defined(TRMMKERNEL) || \ 
1778 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1779 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1780 1781 movq B, BO 1782#else 1783 movq KK, %rax 1784 leaq (, %rax, SIZE), %rax 1785 leaq (AO, %rax, 2), AO 1786 leaq (B, %rax, 2), BO 1787#endif 1788 1789 movapd 0 * SIZE(AO), %xmm8 1790 pxor %xmm0, %xmm0 1791 movddup 0 * SIZE(BO), %xmm9 1792 pxor %xmm1, %xmm1 1793 movapd 8 * SIZE(AO), %xmm10 1794 pxor %xmm2, %xmm2 1795 movddup 8 * SIZE(BO), %xmm11 1796 pxor %xmm3, %xmm3 1797 1798#ifndef TRMMKERNEL 1799 movq K, %rax 1800#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1801 movq K, %rax 1802 subq KK, %rax 1803 movq %rax, KKK 1804#else 1805 movq KK, %rax 1806#ifdef LEFT 1807 addq $2, %rax 1808#else 1809 addq $2, %rax 1810#endif 1811 movq %rax, KKK 1812#endif 1813 sarq $3, %rax 1814 je .L65 1815 ALIGN_4 1816 1817.L62: 1818 mulpd %xmm8, %xmm9 1819 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1820 addpd %xmm9, %xmm0 1821 movddup 1 * SIZE(BO), %xmm9 1822 mulpd %xmm8, %xmm9 1823 movapd 2 * SIZE(AO), %xmm8 1824 addpd %xmm9, %xmm1 1825 movddup 2 * SIZE(BO), %xmm9 1826 mulpd %xmm8, %xmm9 1827 addpd %xmm9, %xmm2 1828 movddup 3 * SIZE(BO), %xmm9 1829 mulpd %xmm8, %xmm9 1830 movapd 4 * SIZE(AO), %xmm8 1831 addpd %xmm9, %xmm3 1832 movddup 4 * SIZE(BO), %xmm9 1833 mulpd %xmm8, %xmm9 1834 addpd %xmm9, %xmm0 1835 movddup 5 * SIZE(BO), %xmm9 1836 mulpd %xmm8, %xmm9 1837 movapd 6 * SIZE(AO), %xmm8 1838 addpd %xmm9, %xmm1 1839 movddup 6 * SIZE(BO), %xmm9 1840 mulpd %xmm8, %xmm9 1841 addpd %xmm9, %xmm2 1842 movddup 7 * SIZE(BO), %xmm9 1843 mulpd %xmm8, %xmm9 1844 movapd 16 * SIZE(AO), %xmm8 1845 addpd %xmm9, %xmm3 1846 movddup 16 * SIZE(BO), %xmm9 1847 mulpd %xmm10, %xmm11 1848 addpd %xmm11, %xmm0 1849 movddup 9 * SIZE(BO), %xmm11 1850 mulpd %xmm10, %xmm11 1851 movapd 10 * SIZE(AO), %xmm10 1852 addpd %xmm11, %xmm1 1853 movddup 10 * SIZE(BO), %xmm11 1854 mulpd %xmm10, %xmm11 1855 addpd %xmm11, %xmm2 1856 movddup 11 * SIZE(BO), 
%xmm11 1857 mulpd %xmm10, %xmm11 1858 movapd 12 * SIZE(AO), %xmm10 1859 addpd %xmm11, %xmm3 1860 movddup 12 * SIZE(BO), %xmm11 1861 mulpd %xmm10, %xmm11 1862 addpd %xmm11, %xmm0 1863 movddup 13 * SIZE(BO), %xmm11 1864 mulpd %xmm10, %xmm11 1865 movapd 14 * SIZE(AO), %xmm10 1866 addpd %xmm11, %xmm1 1867 movddup 14 * SIZE(BO), %xmm11 1868 mulpd %xmm10, %xmm11 1869 addpd %xmm11, %xmm2 1870 movddup 15 * SIZE(BO), %xmm11 1871 mulpd %xmm10, %xmm11 1872 movapd 24 * SIZE(AO), %xmm10 1873 addpd %xmm11, %xmm3 1874 movddup 24 * SIZE(BO), %xmm11 1875 1876 addq $16 * SIZE, AO 1877 addq $16 * SIZE, BO 1878 decq %rax 1879 jne .L62 1880 ALIGN_4 1881 1882.L65: 1883#ifndef TRMMKERNEL 1884 movq K, %rax 1885#else 1886 movq KKK, %rax 1887#endif 1888 movddup ALPHA, %xmm15 1889 andq $7, %rax # if (k & 1) 1890 BRANCH 1891 je .L69 1892 ALIGN_4 1893 1894.L66: 1895 mulpd %xmm8, %xmm9 1896 addpd %xmm9, %xmm0 1897 movddup 1 * SIZE(BO), %xmm9 1898 mulpd %xmm8, %xmm9 1899 movapd 2 * SIZE(AO), %xmm8 1900 addpd %xmm9, %xmm1 1901 movddup 2 * SIZE(BO), %xmm9 1902 1903 addq $2 * SIZE, AO # aoffset += 4 1904 addq $2 * SIZE, BO # boffset1 += 8 1905 decq %rax 1906 jg .L66 1907 ALIGN_4 1908 1909.L69: 1910#if! defined(TRMMKERNEL) && !defined(BETAZERO) 1911 movsd 0 * SIZE(CO1), %xmm8 1912 movhpd 1 * SIZE(CO1), %xmm8 1913 movsd 0 * SIZE(CO2), %xmm10 1914 movhpd 1 * SIZE(CO2), %xmm10 1915#endif 1916 1917 addpd %xmm2, %xmm0 1918 addpd %xmm3, %xmm1 1919 1920 mulpd %xmm15, %xmm0 1921 mulpd %xmm15, %xmm1 1922 1923#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) 1924 addpd %xmm8, %xmm0 1925 addpd %xmm10, %xmm1 1926#endif 1927 1928 movsd %xmm0, 0 * SIZE(CO1) 1929 movhpd %xmm0, 1 * SIZE(CO1) 1930 movsd %xmm1, 0 * SIZE(CO2) 1931 movhpd %xmm1, 1 * SIZE(CO2) 1932 1933#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1934 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1935 movq K, %rax 1936 subq KKK, %rax 1937 leaq (,%rax, SIZE), %rax 1938 leaq (AO, %rax, 2), AO 1939 leaq (BO, %rax, 2), BO 1940#endif 1941 1942#if defined(TRMMKERNEL) && defined(LEFT) 1943 addq $2, KK 1944#endif 1945 addq $2 * SIZE, CO1 # coffset += 4 1946 addq $2 * SIZE, CO2 # coffset += 4 1947 ALIGN_4 1948 1949.L70: 1950 testq $1, M 1951 je .L79 1952 ALIGN_4 1953 1954.L71: 1955#if !defined(TRMMKERNEL) || \ 1956 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1957 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1958 1959 movq B, BO 1960#else 1961 movq KK, %rax 1962 leaq (, %rax, SIZE), %rax 1963 leaq (AO, %rax, 1), AO 1964 leaq (B, %rax, 2), BO 1965#endif 1966 1967 movddup 0 * SIZE(AO), %xmm8 1968 pxor %xmm0, %xmm0 1969 movapd 0 * SIZE(BO), %xmm9 1970 pxor %xmm1, %xmm1 1971 movddup 4 * SIZE(AO), %xmm10 1972 pxor %xmm2, %xmm2 1973 movapd 8 * SIZE(BO), %xmm11 1974 pxor %xmm3, %xmm3 1975 1976#ifndef TRMMKERNEL 1977 movq K, %rax 1978#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1979 movq K, %rax 1980 subq KK, %rax 1981 movq %rax, KKK 1982#else 1983 movq KK, %rax 1984#ifdef LEFT 1985 addq $1, %rax 1986#else 1987 addq $2, %rax 1988#endif 1989 movq %rax, KKK 1990#endif 1991 sarq $3, %rax 1992 je .L75 1993 ALIGN_4 1994 1995.L72: 1996 mulpd %xmm8, %xmm9 1997 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1998 movddup 1 * SIZE(AO), %xmm8 1999 addpd %xmm9, %xmm0 2000 mulpd 2 * SIZE(BO), %xmm8 2001 movapd 16 * SIZE(BO), %xmm9 2002 addpd %xmm8, %xmm1 2003 movddup 2 * SIZE(AO), %xmm8 2004 mulpd 4 * SIZE(BO), %xmm8 2005 addpd %xmm8, %xmm2 2006 movddup 3 
* SIZE(AO), %xmm8 2007 mulpd 6 * SIZE(BO), %xmm8 2008 addpd %xmm8, %xmm3 2009 movddup 8 * SIZE(AO), %xmm8 2010 mulpd %xmm10, %xmm11 2011 movddup 5 * SIZE(AO), %xmm10 2012 addpd %xmm11, %xmm0 2013 mulpd 10 * SIZE(BO), %xmm10 2014 movapd 24 * SIZE(BO), %xmm11 2015 addpd %xmm10, %xmm1 2016 movddup 6 * SIZE(AO), %xmm10 2017 mulpd 12 * SIZE(BO), %xmm10 2018 addpd %xmm10, %xmm2 2019 movddup 7 * SIZE(AO), %xmm10 2020 mulpd 14 * SIZE(BO), %xmm10 2021 addpd %xmm10, %xmm3 2022 movddup 12 * SIZE(AO), %xmm10 2023 2024 addq $ 8 * SIZE, AO 2025 addq $16 * SIZE, BO 2026 decq %rax 2027 jne .L72 2028 ALIGN_4 2029 2030.L75: 2031#ifndef TRMMKERNEL 2032 movq K, %rax 2033#else 2034 movq KKK, %rax 2035#endif 2036 movddup ALPHA, %xmm15 2037 andq $7, %rax # if (k & 1) 2038 BRANCH 2039 je .L78 2040 ALIGN_4 2041 2042.L76: 2043 mulpd %xmm8, %xmm9 2044 movddup 1 * SIZE(AO), %xmm8 2045 addpd %xmm9, %xmm0 2046 movapd 2 * SIZE(BO), %xmm9 2047 2048 addq $1 * SIZE, AO # aoffset += 4 2049 addq $2 * SIZE, BO # boffset1 += 8 2050 decq %rax 2051 jg .L76 2052 ALIGN_4 2053 2054.L78: 2055#if! defined(TRMMKERNEL) && !defined(BETAZERO) 2056 movsd 0 * SIZE(CO1), %xmm8 2057 movhpd 0 * SIZE(CO2), %xmm8 2058#endif 2059 2060 addpd %xmm1, %xmm0 2061 addpd %xmm3, %xmm2 2062 addpd %xmm2, %xmm0 2063 2064 mulpd %xmm15, %xmm0 2065#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) 2066 addpd %xmm8, %xmm0 2067#endif 2068 2069 movsd %xmm0, 0 * SIZE(CO1) 2070 movhpd %xmm0, 0 * SIZE(CO2) 2071 2072#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2073 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2074 movq K, %rax 2075 subq KKK, %rax 2076 leaq (,%rax, SIZE), %rax 2077 leaq (AO, %rax, 1), AO 2078 leaq (BO, %rax, 2), BO 2079#endif 2080 2081#if defined(TRMMKERNEL) && defined(LEFT) 2082 addq $1, KK 2083#endif 2084 ALIGN_4 2085 2086.L79: 2087#if defined(TRMMKERNEL) && !defined(LEFT) 2088 addl $2, KK 2089#endif 2090 leaq (C, LDC, 2), C 2091 movq BO, B 2092 ALIGN_4 2093 2094.L80: 2095 testq $1, N 2096 je .L999 2097 ALIGN_4 2098 2099#if defined(TRMMKERNEL) && defined(LEFT) 2100 movq OFFSET, %rax 2101 movq %rax, KK 2102#endif 2103 2104 movq C, CO1 2105 movq A, AO 2106 2107 movq M, I 2108 sarq $2, I # i = (m >> 2) 2109 jle .L100 2110 ALIGN_4 2111 2112.L91: 2113#if !defined(TRMMKERNEL) || \ 2114 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2115 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2116 2117 movq B, BO 2118#else 2119 movq KK, %rax 2120 leaq (, %rax, SIZE), %rax 2121 leaq (AO, %rax, 4), AO 2122 leaq (B, %rax, 1), BO 2123#endif 2124 2125 movapd 0 * SIZE(AO), %xmm8 2126 pxor %xmm0, %xmm0 2127 movddup 0 * SIZE(BO), %xmm9 2128 pxor %xmm1, %xmm1 2129 movapd 8 * SIZE(AO), %xmm10 2130 pxor %xmm2, %xmm2 2131 movddup 4 * SIZE(BO), %xmm11 2132 pxor %xmm3, %xmm3 2133 2134#ifdef HAVE_3DNOW 2135 prefetchw 4 * SIZE(CO1) 2136#else 2137 prefetchnta 4 * SIZE(CO1) 2138#endif 2139 2140#ifndef TRMMKERNEL 2141 movq K, %rax 2142#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2143 movq K, %rax 2144 subq KK, %rax 2145 movq %rax, KKK 2146#else 2147 movq KK, %rax 2148#ifdef LEFT 2149 addq $4, %rax 2150#else 2151 addq $1, %rax 2152#endif 2153 movq %rax, KKK 2154#endif 2155 sarq $3, %rax 2156 je .L95 2157 ALIGN_4 2158 2159.L92: 2160 mulpd 
%xmm9, %xmm8 2161 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2162 mulpd 2 * SIZE(AO), %xmm9 2163 addpd %xmm8, %xmm0 2164 movapd 4 * SIZE(AO), %xmm8 2165 addpd %xmm9, %xmm1 2166 movddup 1 * SIZE(BO), %xmm9 2167 mulpd %xmm9, %xmm8 2168 mulpd 6 * SIZE(AO), %xmm9 2169 addpd %xmm8, %xmm2 2170 movapd 16 * SIZE(AO), %xmm8 2171 addpd %xmm9, %xmm3 2172 movddup 2 * SIZE(BO), %xmm9 2173 mulpd %xmm9, %xmm10 2174 mulpd 10 * SIZE(AO), %xmm9 2175 addpd %xmm10, %xmm0 2176 movapd 12 * SIZE(AO), %xmm10 2177 addpd %xmm9, %xmm1 2178 movddup 3 * SIZE(BO), %xmm9 2179 mulpd %xmm9, %xmm10 2180 mulpd 14 * SIZE(AO), %xmm9 2181 addpd %xmm10, %xmm2 2182 movapd 24 * SIZE(AO), %xmm10 2183 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 2184 addpd %xmm9, %xmm3 2185 movddup 8 * SIZE(BO), %xmm9 2186 mulpd %xmm11, %xmm8 2187 mulpd 18 * SIZE(AO), %xmm11 2188 addpd %xmm8, %xmm0 2189 movapd 20 * SIZE(AO), %xmm8 2190 addpd %xmm11, %xmm1 2191 movddup 5 * SIZE(BO), %xmm11 2192 mulpd %xmm11, %xmm8 2193 mulpd 22 * SIZE(AO), %xmm11 2194 addpd %xmm8, %xmm2 2195 movapd 32 * SIZE(AO), %xmm8 2196 addpd %xmm11, %xmm3 2197 movddup 6 * SIZE(BO), %xmm11 2198 mulpd %xmm11, %xmm10 2199 mulpd 26 * SIZE(AO), %xmm11 2200 addpd %xmm10, %xmm0 2201 movapd 28 * SIZE(AO), %xmm10 2202 addpd %xmm11, %xmm1 2203 movddup 7 * SIZE(BO), %xmm11 2204 mulpd %xmm11, %xmm10 2205 mulpd 30 * SIZE(AO), %xmm11 2206 addpd %xmm10, %xmm2 2207 movapd 40 * SIZE(AO), %xmm10 2208 addpd %xmm11, %xmm3 2209 movddup 12 * SIZE(BO), %xmm11 2210 2211 addq $32 * SIZE, AO 2212 addq $8 * SIZE, BO 2213 decq %rax 2214 jne .L92 2215 ALIGN_4 2216 2217.L95: 2218#ifndef TRMMKERNEL 2219 movq K, %rax 2220#else 2221 movq KKK, %rax 2222#endif 2223 movddup ALPHA, %xmm15 2224 andq $7, %rax # if (k & 1) 2225 BRANCH 2226 je .L99 2227 ALIGN_4 2228 2229.L96: 2230 mulpd %xmm9, %xmm8 2231 mulpd 2 * SIZE(AO), %xmm9 2232 addpd %xmm8, %xmm0 2233 movapd 4 * SIZE(AO), %xmm8 2234 addpd %xmm9, %xmm1 2235 movddup 1 * SIZE(BO), %xmm9 2236 2237 addq $4 * SIZE, AO # aoffset += 4 2238 addq $1 * 
SIZE, BO # boffset1 += 8 2239 decq %rax 2240 jg .L96 2241 ALIGN_4 2242 2243.L99: 2244#if! defined(TRMMKERNEL) && !defined(BETAZERO) 2245 movsd 0 * SIZE(CO1), %xmm8 2246 movhpd 1 * SIZE(CO1), %xmm8 2247 movsd 2 * SIZE(CO1), %xmm9 2248 movhpd 3 * SIZE(CO1), %xmm9 2249#endif 2250 2251 addpd %xmm2, %xmm0 2252 addpd %xmm3, %xmm1 2253 2254 mulpd %xmm15, %xmm0 2255 mulpd %xmm15, %xmm1 2256 2257#if! defined(TRMMKERNEL) && !defined(BETAZERO) 2258 addpd %xmm8, %xmm0 2259 addpd %xmm9, %xmm1 2260#endif 2261 2262 movsd %xmm0, 0 * SIZE(CO1) 2263 movhpd %xmm0, 1 * SIZE(CO1) 2264 movsd %xmm1, 2 * SIZE(CO1) 2265 movhpd %xmm1, 3 * SIZE(CO1) 2266 2267#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2268 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2269 movq K, %rax 2270 subq KKK, %rax 2271 leaq (,%rax, SIZE), %rax 2272 leaq (AO, %rax, 4), AO 2273 leaq (BO, %rax, 1), BO 2274#endif 2275 2276#if defined(TRMMKERNEL) && defined(LEFT) 2277 addq $4, KK 2278#endif 2279 2280 addq $4 * SIZE, CO1 # coffset += 4 2281 decq I # i -- 2282 jg .L91 2283 ALIGN_4 2284 2285.L100: 2286 testq $2, M 2287 je .L110 2288 ALIGN_4 2289 2290.L101: 2291#if !defined(TRMMKERNEL) || \ 2292 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2293 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2294 2295 movq B, BO 2296#else 2297 movq KK, %rax 2298 leaq (, %rax, SIZE), %rax 2299 leaq (AO, %rax, 2), AO 2300 leaq (B, %rax, 1), BO 2301#endif 2302 2303 movapd 0 * SIZE(AO), %xmm8 2304 pxor %xmm0, %xmm0 2305 movddup 0 * SIZE(BO), %xmm9 2306 pxor %xmm1, %xmm1 2307 movapd 8 * SIZE(AO), %xmm10 2308 pxor %xmm2, %xmm2 2309 movddup 4 * SIZE(BO), %xmm11 2310 pxor %xmm3, %xmm3 2311 2312#ifndef TRMMKERNEL 2313 movq K, %rax 2314#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2315 movq K, %rax 2316 subq KK, %rax 2317 movq %rax, KKK 2318#else 2319 movq KK, %rax 2320#ifdef LEFT 2321 addq $2, %rax 2322#else 2323 addq $1, %rax 2324#endif 2325 movq 
%rax, KKK 2326#endif 2327 sarq $3, %rax 2328 je .L105 2329 ALIGN_4 2330 2331.L102: 2332 mulpd %xmm9, %xmm8 2333 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2334 movddup 1 * SIZE(BO), %xmm9 2335 addpd %xmm8, %xmm0 2336 mulpd 2 * SIZE(AO), %xmm9 2337 movapd 16 * SIZE(AO), %xmm8 2338 addpd %xmm9, %xmm1 2339 movddup 2 * SIZE(BO), %xmm9 2340 mulpd 4 * SIZE(AO), %xmm9 2341 addpd %xmm9, %xmm2 2342 movddup 3 * SIZE(BO), %xmm9 2343 mulpd 6 * SIZE(AO), %xmm9 2344 addpd %xmm9, %xmm3 2345 movddup 8 * SIZE(BO), %xmm9 2346 mulpd %xmm11, %xmm10 2347 movddup 5 * SIZE(BO), %xmm11 2348 addpd %xmm10, %xmm0 2349 mulpd 10 * SIZE(AO), %xmm11 2350 movapd 24 * SIZE(AO), %xmm10 2351 addpd %xmm11, %xmm1 2352 movddup 6 * SIZE(BO), %xmm11 2353 mulpd 12 * SIZE(AO), %xmm11 2354 addpd %xmm11, %xmm2 2355 movddup 7 * SIZE(BO), %xmm11 2356 mulpd 14 * SIZE(AO), %xmm11 2357 addpd %xmm11, %xmm3 2358 movddup 12 * SIZE(BO), %xmm11 2359 2360 addq $16 * SIZE, AO 2361 addq $ 8 * SIZE, BO 2362 decq %rax 2363 jne .L102 2364 ALIGN_4 2365 2366.L105: 2367#ifndef TRMMKERNEL 2368 movq K, %rax 2369#else 2370 movq KKK, %rax 2371#endif 2372 movddup ALPHA, %xmm15 2373 andq $7, %rax # if (k & 1) 2374 BRANCH 2375 je .L109 2376 ALIGN_4 2377 2378.L106: 2379 mulpd %xmm9, %xmm8 2380 movddup 1 * SIZE(BO), %xmm9 2381 addpd %xmm8, %xmm0 2382 movapd 2 * SIZE(AO), %xmm8 2383 2384 addq $2 * SIZE, AO # aoffset += 4 2385 addq $1 * SIZE, BO # boffset1 += 8 2386 decq %rax 2387 jg .L106 2388 ALIGN_4 2389 2390.L109: 2391#if! defined(TRMMKERNEL) && !defined(BETAZERO) 2392 movsd 0 * SIZE(CO1), %xmm8 2393 movhpd 1 * SIZE(CO1), %xmm8 2394#endif 2395 2396 addpd %xmm1, %xmm0 2397 addpd %xmm3, %xmm2 2398 addpd %xmm2, %xmm0 2399 2400 mulpd %xmm15, %xmm0 2401#if! 
defined(TRMMKERNEL) && !defined(BETAZERO) 2402 addpd %xmm8, %xmm0 2403#endif 2404 2405 movsd %xmm0, 0 * SIZE(CO1) 2406 movhpd %xmm0, 1 * SIZE(CO1) 2407 2408#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2409 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2410 movq K, %rax 2411 subq KKK, %rax 2412 leaq (,%rax, SIZE), %rax 2413 leaq (AO, %rax, 2), AO 2414 leaq (BO, %rax, 1), BO 2415#endif 2416 2417#if defined(TRMMKERNEL) && defined(LEFT) 2418 addq $2, KK 2419#endif 2420 2421 addq $2 * SIZE, CO1 # coffset += 4 2422 ALIGN_4 2423 2424.L110: 2425 testq $1, M 2426 je .L999 2427 ALIGN_4 2428 2429.L111: 2430#if !defined(TRMMKERNEL) || \ 2431 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2432 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2433 2434 movq B, BO 2435#else 2436 movq KK, %rax 2437 leaq (, %rax, SIZE), %rax 2438 leaq (AO, %rax, 1), AO 2439 leaq (B, %rax, 1), BO 2440#endif 2441 2442 movsd 0 * SIZE(AO), %xmm8 2443 pxor %xmm0, %xmm0 2444 movsd 0 * SIZE(BO), %xmm9 2445 pxor %xmm1, %xmm1 2446 movsd 4 * SIZE(AO), %xmm10 2447 pxor %xmm2, %xmm2 2448 movsd 4 * SIZE(BO), %xmm11 2449 pxor %xmm3, %xmm3 2450 2451 movapd 0 * SIZE(AO), %xmm9 2452 movapd 0 * SIZE(BO), %xmm8 2453 movapd 4 * SIZE(AO), %xmm11 2454 movapd 4 * SIZE(BO), %xmm10 2455 2456#ifndef TRMMKERNEL 2457 movq K, %rax 2458#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2459 movq K, %rax 2460 subq KK, %rax 2461 movq %rax, KKK 2462#else 2463 movq KK, %rax 2464#ifdef LEFT 2465 addq $1, %rax 2466#else 2467 addq $1, %rax 2468#endif 2469 movq %rax, KKK 2470#endif 2471 sarq $3, %rax 2472 je .L115 2473 ALIGN_4 2474 2475.L112: 2476 mulpd %xmm9, %xmm8 2477 movapd 2 * SIZE(AO), %xmm9 2478 addpd %xmm8, %xmm0 2479 mulpd 2 * SIZE(BO), %xmm9 2480 movapd 8 * SIZE(BO), %xmm8 2481 addpd %xmm9, %xmm1 2482 movapd 8 * SIZE(AO), %xmm9 2483 mulpd %xmm11, %xmm10 2484 movapd 6 * SIZE(AO), %xmm11 2485 addpd %xmm10, %xmm0 2486 mulpd 6 * SIZE(BO), 
%xmm11 2487 movapd 12 * SIZE(BO), %xmm10 2488 addpd %xmm11, %xmm1 2489 movapd 12 * SIZE(AO), %xmm11 2490 2491 addq $8 * SIZE, AO 2492 addq $8 * SIZE, BO 2493 decq %rax 2494 jne .L112 2495 ALIGN_4 2496 2497.L115: 2498#ifndef TRMMKERNEL 2499 movq K, %rax 2500#else 2501 movq KKK, %rax 2502#endif 2503 movddup ALPHA, %xmm15 2504 andq $7, %rax # if (k & 1) 2505 BRANCH 2506 je .L118 2507 ALIGN_4 2508 2509.L116: 2510 mulsd 0 * SIZE(BO), %xmm9 2511 addsd %xmm9, %xmm0 2512 movsd 1 * SIZE(AO), %xmm9 2513 2514 addq $1 * SIZE, AO # aoffset += 4 2515 addq $1 * SIZE, BO # boffset1 += 8 2516 decq %rax 2517 jg .L116 2518 ALIGN_4 2519 2520.L118: 2521#if! defined(TRMMKERNEL) && !defined(BETAZERO) 2522 movsd 0 * SIZE(CO1), %xmm8 2523#endif 2524 2525 addpd %xmm1, %xmm0 2526 haddpd %xmm0, %xmm0 2527 mulsd %xmm15, %xmm0 2528#if! defined(TRMMKERNEL) && !defined(BETAZERO) 2529 addsd %xmm8, %xmm0 2530#endif 2531 2532 movsd %xmm0, 0 * SIZE(CO1) 2533 ALIGN_4 2534 2535.L999: 2536 movq 0(%rsp), %rbx 2537 movq 8(%rsp), %rbp 2538 movq 16(%rsp), %r12 2539 movq 24(%rsp), %r13 2540 movq 32(%rsp), %r14 2541 movq 40(%rsp), %r15 2542 2543#ifdef WINDOWS_ABI 2544 movq 48(%rsp), %rdi 2545 movq 56(%rsp), %rsi 2546 movups 64(%rsp), %xmm6 2547 movups 80(%rsp), %xmm7 2548 movups 96(%rsp), %xmm8 2549 movups 112(%rsp), %xmm9 2550 movups 128(%rsp), %xmm10 2551 movups 144(%rsp), %xmm11 2552 movups 160(%rsp), %xmm12 2553 movups 176(%rsp), %xmm13 2554 movups 192(%rsp), %xmm14 2555 movups 208(%rsp), %xmm15 2556#endif 2557 2558 addq $STACKSIZE, %rsp 2559 ret 2560 2561 EPILOGUE 2562