/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define M %rdi 43#define N %rsi 44#define K %rdx 45 46#define A %rcx 47#define B %r8 48#define C %r9 49#define LDC %r10 50 51#define I %r11 52#define J %r12 53#define AO %r13 54#define BO %r14 55#define CO1 %r15 56#define CO2 %rbx 57#define BB %rbp 58 59#ifndef WINDOWS_ABI 60 61#define STACKSIZE 128 62 63#define OLD_LDC 8 + STACKSIZE(%rsp) 64#define OLD_OFFSET 16 + STACKSIZE(%rsp) 65 66#define ALPHA_R 48(%rsp) 67#define ALPHA_I 56(%rsp) 68#define OFFSET 64(%rsp) 69#define KKK 72(%rsp) 70#define KK 80(%rsp) 71 72#else 73 74#define STACKSIZE 512 75 76#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) 77#define OLD_A 48 + STACKSIZE(%rsp) 78#define OLD_B 56 + STACKSIZE(%rsp) 79#define OLD_C 64 + STACKSIZE(%rsp) 80#define OLD_LDC 72 + STACKSIZE(%rsp) 81#define OLD_OFFSET 80 + STACKSIZE(%rsp) 82 83#define ALPHA_R 224(%rsp) 84#define ALPHA_I 232(%rsp) 85#define OFFSET 240(%rsp) 86#define KK 248(%rsp) 87#define KKK 256(%rsp) 88 89#endif 90 91#define PREFETCH prefetcht2 92#define PREFETCHSIZE (16 * 12 + 3) 93 94#define KERNEL1(address) \ 95 mulpd %xmm8, %xmm9 ;\ 96 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ 97 addpd %xmm9, %xmm0;\ 98 movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 99 mulpd %xmm8, %xmm9;\ 100 addpd %xmm9, %xmm1;\ 101 movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 102 mulpd %xmm8, %xmm9;\ 103 addpd %xmm9, %xmm2;\ 104 movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 105 mulpd %xmm8, %xmm9;\ 106 movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ 107 addpd %xmm9, %xmm3;\ 108 movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 109 110#define KERNEL2(address) \ 111 mulpd %xmm8, %xmm9;\ 112 addpd %xmm9, %xmm4;\ 113 movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 114 mulpd %xmm8, %xmm9;\ 115 addpd %xmm9, %xmm5;\ 116 movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 117 mulpd %xmm8, %xmm9;\ 118 addpd %xmm9, 
%xmm6;\ 119 movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 120 mulpd %xmm8, %xmm9;\ 121 movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ 122 addpd %xmm9, %xmm7;\ 123 movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 124 125#define KERNEL3(address) \ 126 mulpd %xmm8, %xmm9;\ 127 addpd %xmm9, %xmm0;\ 128 movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 129 mulpd %xmm8, %xmm9;\ 130 addpd %xmm9, %xmm1;\ 131 movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 132 mulpd %xmm8, %xmm9;\ 133 addpd %xmm9, %xmm2;\ 134 movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 135 mulpd %xmm8, %xmm9;\ 136 movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ 137 addpd %xmm9, %xmm3;\ 138 movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 139 140#define KERNEL4(address) \ 141 mulpd %xmm8, %xmm9;\ 142 addpd %xmm9, %xmm4;\ 143 movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 144 mulpd %xmm8, %xmm9;\ 145 addpd %xmm9, %xmm5;\ 146 movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 147 mulpd %xmm8, %xmm9;\ 148 addpd %xmm9, %xmm6;\ 149 movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 150 mulpd %xmm8, %xmm9;\ 151 movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ 152 addpd %xmm9, %xmm7;\ 153 movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 154 155#define KERNEL5(address) \ 156 mulpd %xmm10, %xmm11;\ 157 addpd %xmm11, %xmm0;\ 158 movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 159 mulpd %xmm10, %xmm11;\ 160 addpd %xmm11, %xmm1;\ 161 movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 162 mulpd %xmm10, %xmm11;\ 163 addpd %xmm11, %xmm2;\ 164 movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 165 mulpd %xmm10, %xmm11;\ 166 movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ 167 addpd %xmm11, %xmm3;\ 168 movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 169 170#define KERNEL6(address) \ 171 mulpd %xmm10, %xmm11;\ 172 addpd %xmm11, %xmm4;\ 173 movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 174 mulpd %xmm10, %xmm11;\ 175 addpd %xmm11, %xmm5;\ 176 
movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 177 mulpd %xmm10, %xmm11;\ 178 addpd %xmm11, %xmm6;\ 179 movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 180 mulpd %xmm10, %xmm11;\ 181 movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ 182 addpd %xmm11, %xmm7;\ 183 movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 184 185#define KERNEL7(address) \ 186 mulpd %xmm10, %xmm11;\ 187 addpd %xmm11, %xmm0;\ 188 movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 189 mulpd %xmm10, %xmm11;\ 190 addpd %xmm11, %xmm1;\ 191 movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 192 mulpd %xmm10, %xmm11;\ 193 addpd %xmm11, %xmm2;\ 194 movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 195 mulpd %xmm10, %xmm11;\ 196 movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ 197 addpd %xmm11, %xmm3;\ 198 movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 199 200#define KERNEL8(address) \ 201 mulpd %xmm10, %xmm11;\ 202 addpd %xmm11, %xmm4;\ 203 movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 204 mulpd %xmm10, %xmm11;\ 205 addpd %xmm11, %xmm5;\ 206 movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 207 mulpd %xmm10, %xmm11;\ 208 addpd %xmm11, %xmm6;\ 209 movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 210 mulpd %xmm10, %xmm11;\ 211 movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ 212 addpd %xmm11, %xmm7;\ 213 movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 214 215#define KERNEL9(address) \ 216 mulpd %xmm12, %xmm13;\ 217 PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ 218 addpd %xmm13, %xmm0;\ 219 movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 220 mulpd %xmm12, %xmm13;\ 221 addpd %xmm13, %xmm1;\ 222 movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 223 mulpd %xmm12, %xmm13;\ 224 addpd %xmm13, %xmm2;\ 225 movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 226 mulpd %xmm12, %xmm13;\ 227 movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ 228 addpd %xmm13, %xmm3;\ 229 movddup 16 * SIZE + (address) * 2 
* SIZE(BO), %xmm13 230 231#define KERNEL10(address) \ 232 mulpd %xmm12, %xmm13;\ 233 addpd %xmm13, %xmm4;\ 234 movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 235 mulpd %xmm12, %xmm13;\ 236 addpd %xmm13, %xmm5;\ 237 movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 238 mulpd %xmm12, %xmm13;\ 239 addpd %xmm13, %xmm6;\ 240 movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 241 mulpd %xmm12, %xmm13;\ 242 movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ 243 addpd %xmm13, %xmm7;\ 244 movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 245 246#define KERNEL11(address) \ 247 mulpd %xmm12, %xmm13;\ 248 addpd %xmm13, %xmm0;\ 249 movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 250 mulpd %xmm12, %xmm13;\ 251 addpd %xmm13, %xmm1;\ 252 movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 253 mulpd %xmm12, %xmm13;\ 254 addpd %xmm13, %xmm2;\ 255 movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 256 mulpd %xmm12, %xmm13;\ 257 movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ 258 addpd %xmm13, %xmm3;\ 259 movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 260 261#define KERNEL12(address) \ 262 mulpd %xmm12, %xmm13;\ 263 addpd %xmm13, %xmm4;\ 264 movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 265 mulpd %xmm12, %xmm13;\ 266 addpd %xmm13, %xmm5;\ 267 movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 268 mulpd %xmm12, %xmm13;\ 269 addpd %xmm13, %xmm6;\ 270 movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 271 mulpd %xmm12, %xmm13;\ 272 movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ 273 addpd %xmm13, %xmm7;\ 274 movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 275 276#define KERNEL13(address) \ 277 mulpd %xmm14, %xmm15;\ 278 addpd %xmm15, %xmm0;\ 279 movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 280 mulpd %xmm14, %xmm15;\ 281 addpd %xmm15, %xmm1;\ 282 movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 283 mulpd %xmm14, %xmm15;\ 284 addpd %xmm15, %xmm2;\ 285 movddup 27 * SIZE + (address) * 2 * SIZE(BO), 
%xmm15;\ 286 mulpd %xmm14, %xmm15;\ 287 movapd 26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ 288 addpd %xmm15, %xmm3;\ 289 movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 290 291#define KERNEL14(address) \ 292 mulpd %xmm14, %xmm15;\ 293 addpd %xmm15, %xmm4;\ 294 movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 295 mulpd %xmm14, %xmm15;\ 296 addpd %xmm15, %xmm5;\ 297 movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 298 mulpd %xmm14, %xmm15;\ 299 addpd %xmm15, %xmm6;\ 300 movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 301 mulpd %xmm14, %xmm15;\ 302 movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ 303 addpd %xmm15, %xmm7;\ 304 movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 305 306#define KERNEL15(address) \ 307 mulpd %xmm14, %xmm15;\ 308 addpd %xmm15, %xmm0;\ 309 movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 310 mulpd %xmm14, %xmm15;\ 311 addpd %xmm15, %xmm1;\ 312 movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 313 mulpd %xmm14, %xmm15;\ 314 addpd %xmm15, %xmm2;\ 315 movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 316 mulpd %xmm14, %xmm15;\ 317 movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ 318 addpd %xmm15, %xmm3;\ 319 movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 320 321#define KERNEL16(address) \ 322 mulpd %xmm14, %xmm15;\ 323 addpd %xmm15, %xmm4;\ 324 movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 325 mulpd %xmm14, %xmm15;\ 326 addpd %xmm15, %xmm5;\ 327 movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 328 mulpd %xmm14, %xmm15;\ 329 addpd %xmm15, %xmm6;\ 330 movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 331 mulpd %xmm14, %xmm15;\ 332 movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ 333 addpd %xmm15, %xmm7;\ 334 movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 335 336#if defined(OS_LINUX) && defined(CORE_BARCELONA) 337 .align 32768 338#endif 339 PROLOGUE 340 PROFCODE 341 342 subq $STACKSIZE, %rsp 343 movq %rbx, 0(%rsp) 344 movq %rbp, 8(%rsp) 345 movq %r12, 
16(%rsp) 346 movq %r13, 24(%rsp) 347 movq %r14, 32(%rsp) 348 movq %r15, 40(%rsp) 349 350#ifdef WINDOWS_ABI 351 movq %rdi, 48(%rsp) 352 movq %rsi, 56(%rsp) 353 movups %xmm6, 64(%rsp) 354 movups %xmm7, 80(%rsp) 355 movups %xmm8, 96(%rsp) 356 movups %xmm9, 112(%rsp) 357 movups %xmm10, 128(%rsp) 358 movups %xmm11, 144(%rsp) 359 movups %xmm12, 160(%rsp) 360 movups %xmm13, 176(%rsp) 361 movups %xmm14, 192(%rsp) 362 movups %xmm15, 208(%rsp) 363 364 movq ARG1, M 365 movq ARG2, N 366 movq ARG3, K 367 movq OLD_A, A 368 movq OLD_B, B 369 movq OLD_C, C 370 movq OLD_LDC, LDC 371 372 movaps %xmm3, %xmm0 373 movsd OLD_ALPHA_I, %xmm1 374#else 375 movq OLD_LDC, LDC 376#endif 377 378 movsd %xmm0, ALPHA_R 379 movsd %xmm1, ALPHA_I 380 381 salq $ZBASE_SHIFT, LDC 382 383 movq N, J 384 sarq $2, J # j = (n >> 2) 385 jle .L40 386 ALIGN_4 387 388.L10: 389#if defined(TRMMKERNEL) && defined(LEFT) 390 movq OFFSET, %rax 391 movq %rax, KK 392#endif 393 394 movq C, CO1 # coffset1 = c 395 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 396 movq A, AO # aoffset = a 397 398 leaq (, K, 4), BB 399 leaq (B, BB, SIZE), BB 400 401 movq M, I 402 sarq $2, I # i = (m >> 2) 403 jle .L20 404 ALIGN_4 405 406.L11: 407 prefetcht0 0 * SIZE(BB) 408 prefetcht0 8 * SIZE(BB) 409 subq $-8 * SIZE, BB 410 411#if !defined(TRMMKERNEL) || \ 412 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 413 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 414 415 movq B, BO 416#else 417 movq KK, %rax 418 leaq (, %rax, SIZE), %rax 419 leaq (AO, %rax, 4), AO 420 leaq (B, %rax, 4), BO 421#endif 422 423 movapd 0 * SIZE(AO), %xmm8 424 pxor %xmm0, %xmm0 425 movddup 0 * SIZE(BO), %xmm9 426 pxor %xmm1, %xmm1 427 movapd 8 * SIZE(AO), %xmm10 428 pxor %xmm2, %xmm2 429 movddup 8 * SIZE(BO), %xmm11 430 pxor %xmm3, %xmm3 431 432 movapd 16 * SIZE(AO), %xmm12 433 pxor %xmm4, %xmm4 434 movddup 16 * SIZE(BO), %xmm13 435 pxor %xmm5, %xmm5 436 movapd 24 * SIZE(AO), %xmm14 437 pxor %xmm6, %xmm6 438 movddup 24 * SIZE(BO), %xmm15 439 
pxor %xmm7, %xmm7 440 441 prefetchnta 7 * SIZE(CO1) 442 prefetchnta 7 * SIZE(CO2) 443 prefetchnta 7 * SIZE(CO1, LDC, 2) 444 prefetchnta 7 * SIZE(CO2, LDC, 2) 445 446#ifndef TRMMKERNEL 447 movq K, %rax 448#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 449 movq K, %rax 450 subq KK, %rax 451 movq %rax, KKK 452#else 453 movq KK, %rax 454#ifdef LEFT 455 addq $4, %rax 456#else 457 addq $4, %rax 458#endif 459 movq %rax, KKK 460#endif 461 462#if 1 463 andq $-8, %rax 464 salq $4, %rax 465 NOBRANCH 466 je .L15 467 468.L1X: 469 KERNEL1 (16 * 0) 470 KERNEL2 (16 * 0) 471 KERNEL3 (16 * 0) 472 KERNEL4 (16 * 0) 473 KERNEL5 (16 * 0) 474 KERNEL6 (16 * 0) 475 KERNEL7 (16 * 0) 476 KERNEL8 (16 * 0) 477 KERNEL9 (16 * 0) 478 KERNEL10(16 * 0) 479 KERNEL11(16 * 0) 480 KERNEL12(16 * 0) 481 KERNEL13(16 * 0) 482 KERNEL14(16 * 0) 483 KERNEL15(16 * 0) 484 KERNEL16(16 * 0) 485 cmpq $128 * 1, %rax 486 NOBRANCH 487 jle .L12 488 KERNEL1 (16 * 1) 489 KERNEL2 (16 * 1) 490 KERNEL3 (16 * 1) 491 KERNEL4 (16 * 1) 492 KERNEL5 (16 * 1) 493 KERNEL6 (16 * 1) 494 KERNEL7 (16 * 1) 495 KERNEL8 (16 * 1) 496 KERNEL9 (16 * 1) 497 KERNEL10(16 * 1) 498 KERNEL11(16 * 1) 499 KERNEL12(16 * 1) 500 KERNEL13(16 * 1) 501 KERNEL14(16 * 1) 502 KERNEL15(16 * 1) 503 KERNEL16(16 * 1) 504 cmpq $128 * 2, %rax 505 NOBRANCH 506 jle .L12 507 KERNEL1 (16 * 2) 508 KERNEL2 (16 * 2) 509 KERNEL3 (16 * 2) 510 KERNEL4 (16 * 2) 511 KERNEL5 (16 * 2) 512 KERNEL6 (16 * 2) 513 KERNEL7 (16 * 2) 514 KERNEL8 (16 * 2) 515 KERNEL9 (16 * 2) 516 KERNEL10(16 * 2) 517 KERNEL11(16 * 2) 518 KERNEL12(16 * 2) 519 KERNEL13(16 * 2) 520 KERNEL14(16 * 2) 521 KERNEL15(16 * 2) 522 KERNEL16(16 * 2) 523 cmpq $128 * 3, %rax 524 NOBRANCH 525 jle .L12 526 KERNEL1 (16 * 3) 527 KERNEL2 (16 * 3) 528 KERNEL3 (16 * 3) 529 KERNEL4 (16 * 3) 530 KERNEL5 (16 * 3) 531 KERNEL6 (16 * 3) 532 KERNEL7 (16 * 3) 533 KERNEL8 (16 * 3) 534 KERNEL9 (16 * 3) 535 KERNEL10(16 * 3) 536 KERNEL11(16 * 3) 537 KERNEL12(16 * 3) 538 KERNEL13(16 * 3) 539 KERNEL14(16 
* 3) 540 KERNEL15(16 * 3) 541 KERNEL16(16 * 3) 542 cmpq $128 * 4, %rax 543 NOBRANCH 544 jle .L12 545 KERNEL1 (16 * 4) 546 KERNEL2 (16 * 4) 547 KERNEL3 (16 * 4) 548 KERNEL4 (16 * 4) 549 KERNEL5 (16 * 4) 550 KERNEL6 (16 * 4) 551 KERNEL7 (16 * 4) 552 KERNEL8 (16 * 4) 553 KERNEL9 (16 * 4) 554 KERNEL10(16 * 4) 555 KERNEL11(16 * 4) 556 KERNEL12(16 * 4) 557 KERNEL13(16 * 4) 558 KERNEL14(16 * 4) 559 KERNEL15(16 * 4) 560 KERNEL16(16 * 4) 561 cmpq $128 * 5, %rax 562 NOBRANCH 563 jle .L12 564 KERNEL1 (16 * 5) 565 KERNEL2 (16 * 5) 566 KERNEL3 (16 * 5) 567 KERNEL4 (16 * 5) 568 KERNEL5 (16 * 5) 569 KERNEL6 (16 * 5) 570 KERNEL7 (16 * 5) 571 KERNEL8 (16 * 5) 572 KERNEL9 (16 * 5) 573 KERNEL10(16 * 5) 574 KERNEL11(16 * 5) 575 KERNEL12(16 * 5) 576 KERNEL13(16 * 5) 577 KERNEL14(16 * 5) 578 KERNEL15(16 * 5) 579 KERNEL16(16 * 5) 580 cmpq $128 * 6, %rax 581 NOBRANCH 582 jle .L12 583 KERNEL1 (16 * 6) 584 KERNEL2 (16 * 6) 585 KERNEL3 (16 * 6) 586 KERNEL4 (16 * 6) 587 KERNEL5 (16 * 6) 588 KERNEL6 (16 * 6) 589 KERNEL7 (16 * 6) 590 KERNEL8 (16 * 6) 591 KERNEL9 (16 * 6) 592 KERNEL10(16 * 6) 593 KERNEL11(16 * 6) 594 KERNEL12(16 * 6) 595 KERNEL13(16 * 6) 596 KERNEL14(16 * 6) 597 KERNEL15(16 * 6) 598 KERNEL16(16 * 6) 599 cmpq $128 * 7, %rax 600 NOBRANCH 601 jle .L12 602 KERNEL1 (16 * 7) 603 KERNEL2 (16 * 7) 604 KERNEL3 (16 * 7) 605 KERNEL4 (16 * 7) 606 KERNEL5 (16 * 7) 607 KERNEL6 (16 * 7) 608 KERNEL7 (16 * 7) 609 KERNEL8 (16 * 7) 610 KERNEL9 (16 * 7) 611 KERNEL10(16 * 7) 612 KERNEL11(16 * 7) 613 KERNEL12(16 * 7) 614 KERNEL13(16 * 7) 615 KERNEL14(16 * 7) 616 KERNEL15(16 * 7) 617 KERNEL16(16 * 7) 618 619 addq $32 * 8 * SIZE, AO 620 addq $32 * 8 * SIZE, BO 621 subq $128 * 8, %rax 622 BRANCH 623 jg .L1X 624 625.L12: 626 leaq (AO, %rax, 2), AO # * 16 627 leaq (BO, %rax, 2), BO # * 64 628 629#else 630 sarq $3, %rax 631 je .L15 632 ALIGN_4 633 634.L12: 635 mulpd %xmm8, %xmm9 636 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 637 addpd %xmm9, %xmm0 638 movddup 1 * SIZE(BO), %xmm9 639 mulpd %xmm8, %xmm9 640 
addpd %xmm9, %xmm1 641 movddup 2 * SIZE(BO), %xmm9 642 mulpd %xmm8, %xmm9 643 addpd %xmm9, %xmm2 644 movddup 3 * SIZE(BO), %xmm9 645 mulpd %xmm8, %xmm9 646 movapd 2 * SIZE(AO), %xmm8 647 addpd %xmm9, %xmm3 648 movddup 0 * SIZE(BO), %xmm9 649 mulpd %xmm8, %xmm9 650 addpd %xmm9, %xmm4 651 movddup 1 * SIZE(BO), %xmm9 652 mulpd %xmm8, %xmm9 653 addpd %xmm9, %xmm5 654 movddup 2 * SIZE(BO), %xmm9 655 mulpd %xmm8, %xmm9 656 addpd %xmm9, %xmm6 657 movddup 3 * SIZE(BO), %xmm9 658 mulpd %xmm8, %xmm9 659 movapd 4 * SIZE(AO), %xmm8 660 addpd %xmm9, %xmm7 661 movddup 4 * SIZE(BO), %xmm9 662 mulpd %xmm8, %xmm9 663 addpd %xmm9, %xmm0 664 movddup 5 * SIZE(BO), %xmm9 665 mulpd %xmm8, %xmm9 666 addpd %xmm9, %xmm1 667 movddup 6 * SIZE(BO), %xmm9 668 mulpd %xmm8, %xmm9 669 addpd %xmm9, %xmm2 670 movddup 7 * SIZE(BO), %xmm9 671 mulpd %xmm8, %xmm9 672 movapd 6 * SIZE(AO), %xmm8 673 addpd %xmm9, %xmm3 674 movddup 4 * SIZE(BO), %xmm9 675 mulpd %xmm8, %xmm9 676 addpd %xmm9, %xmm4 677 movddup 5 * SIZE(BO), %xmm9 678 mulpd %xmm8, %xmm9 679 addpd %xmm9, %xmm5 680 movddup 6 * SIZE(BO), %xmm9 681 mulpd %xmm8, %xmm9 682 addpd %xmm9, %xmm6 683 movddup 7 * SIZE(BO), %xmm9 684 mulpd %xmm8, %xmm9 685 movapd 32 * SIZE(AO), %xmm8 686 addpd %xmm9, %xmm7 687 688 movddup 32 * SIZE(BO), %xmm9 689 mulpd %xmm10, %xmm11 690 addpd %xmm11, %xmm0 691 movddup 9 * SIZE(BO), %xmm11 692 mulpd %xmm10, %xmm11 693 addpd %xmm11, %xmm1 694 movddup 10 * SIZE(BO), %xmm11 695 mulpd %xmm10, %xmm11 696 addpd %xmm11, %xmm2 697 movddup 11 * SIZE(BO), %xmm11 698 mulpd %xmm10, %xmm11 699 movapd 10 * SIZE(AO), %xmm10 700 addpd %xmm11, %xmm3 701 702 movddup 8 * SIZE(BO), %xmm11 703 mulpd %xmm10, %xmm11 704 addpd %xmm11, %xmm4 705 movddup 9 * SIZE(BO), %xmm11 706 mulpd %xmm10, %xmm11 707 addpd %xmm11, %xmm5 708 movddup 10 * SIZE(BO), %xmm11 709 mulpd %xmm10, %xmm11 710 addpd %xmm11, %xmm6 711 movddup 11 * SIZE(BO), %xmm11 712 mulpd %xmm10, %xmm11 713 movapd 12 * SIZE(AO), %xmm10 714 addpd %xmm11, %xmm7 715 movddup 12 * SIZE(BO), 
%xmm11 716 mulpd %xmm10, %xmm11 717 addpd %xmm11, %xmm0 718 movddup 13 * SIZE(BO), %xmm11 719 mulpd %xmm10, %xmm11 720 addpd %xmm11, %xmm1 721 movddup 14 * SIZE(BO), %xmm11 722 mulpd %xmm10, %xmm11 723 addpd %xmm11, %xmm2 724 movddup 15 * SIZE(BO), %xmm11 725 mulpd %xmm10, %xmm11 726 movapd 14 * SIZE(AO), %xmm10 727 addpd %xmm11, %xmm3 728 729 movddup 12 * SIZE(BO), %xmm11 730 mulpd %xmm10, %xmm11 731 addpd %xmm11, %xmm4 732 movddup 13 * SIZE(BO), %xmm11 733 mulpd %xmm10, %xmm11 734 addpd %xmm11, %xmm5 735 movddup 14 * SIZE(BO), %xmm11 736 mulpd %xmm10, %xmm11 737 addpd %xmm11, %xmm6 738 movddup 15 * SIZE(BO), %xmm11 739 mulpd %xmm10, %xmm11 740 movapd 40 * SIZE(AO), %xmm10 741 addpd %xmm11, %xmm7 742 movddup 40 * SIZE(BO), %xmm11 743 744 mulpd %xmm12, %xmm13 745 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 746 addpd %xmm13, %xmm0 747 movddup 17 * SIZE(BO), %xmm13 748 mulpd %xmm12, %xmm13 749 addpd %xmm13, %xmm1 750 movddup 18 * SIZE(BO), %xmm13 751 mulpd %xmm12, %xmm13 752 addpd %xmm13, %xmm2 753 movddup 19 * SIZE(BO), %xmm13 754 mulpd %xmm12, %xmm13 755 movapd 18 * SIZE(AO), %xmm12 756 addpd %xmm13, %xmm3 757 758 movddup 16 * SIZE(BO), %xmm13 759 mulpd %xmm12, %xmm13 760 addpd %xmm13, %xmm4 761 movddup 17 * SIZE(BO), %xmm13 762 mulpd %xmm12, %xmm13 763 addpd %xmm13, %xmm5 764 movddup 18 * SIZE(BO), %xmm13 765 mulpd %xmm12, %xmm13 766 addpd %xmm13, %xmm6 767 movddup 19 * SIZE(BO), %xmm13 768 mulpd %xmm12, %xmm13 769 movapd 20 * SIZE(AO), %xmm12 770 addpd %xmm13, %xmm7 771 772 movddup 20 * SIZE(BO), %xmm13 773 mulpd %xmm12, %xmm13 774 addpd %xmm13, %xmm0 775 movddup 21 * SIZE(BO), %xmm13 776 mulpd %xmm12, %xmm13 777 addpd %xmm13, %xmm1 778 movddup 22 * SIZE(BO), %xmm13 779 mulpd %xmm12, %xmm13 780 addpd %xmm13, %xmm2 781 movddup 23 * SIZE(BO), %xmm13 782 mulpd %xmm12, %xmm13 783 movapd 22 * SIZE(AO), %xmm12 784 addpd %xmm13, %xmm3 785 786 movddup 20 * SIZE(BO), %xmm13 787 mulpd %xmm12, %xmm13 788 addpd %xmm13, %xmm4 789 movddup 21 * SIZE(BO), %xmm13 790 mulpd %xmm12, 
%xmm13 791 addpd %xmm13, %xmm5 792 movddup 22 * SIZE(BO), %xmm13 793 mulpd %xmm12, %xmm13 794 addpd %xmm13, %xmm6 795 movddup 23 * SIZE(BO), %xmm13 796 mulpd %xmm12, %xmm13 797 movapd 48 * SIZE(AO), %xmm12 798 addpd %xmm13, %xmm7 799 movddup 48 * SIZE(BO), %xmm13 800 801 mulpd %xmm14, %xmm15 802 addpd %xmm15, %xmm0 803 movddup 25 * SIZE(BO), %xmm15 804 mulpd %xmm14, %xmm15 805 addpd %xmm15, %xmm1 806 movddup 26 * SIZE(BO), %xmm15 807 mulpd %xmm14, %xmm15 808 addpd %xmm15, %xmm2 809 movddup 27 * SIZE(BO), %xmm15 810 mulpd %xmm14, %xmm15 811 movapd 26 * SIZE(AO), %xmm14 812 addpd %xmm15, %xmm3 813 814 movddup 24 * SIZE(BO), %xmm15 815 mulpd %xmm14, %xmm15 816 addpd %xmm15, %xmm4 817 movddup 25 * SIZE(BO), %xmm15 818 mulpd %xmm14, %xmm15 819 addpd %xmm15, %xmm5 820 movddup 26 * SIZE(BO), %xmm15 821 mulpd %xmm14, %xmm15 822 addpd %xmm15, %xmm6 823 movddup 27 * SIZE(BO), %xmm15 824 mulpd %xmm14, %xmm15 825 movapd 28 * SIZE(AO), %xmm14 826 addpd %xmm15, %xmm7 827 828 movddup 28 * SIZE(BO), %xmm15 829 mulpd %xmm14, %xmm15 830 addpd %xmm15, %xmm0 831 movddup 29 * SIZE(BO), %xmm15 832 mulpd %xmm14, %xmm15 833 addpd %xmm15, %xmm1 834 movddup 30 * SIZE(BO), %xmm15 835 mulpd %xmm14, %xmm15 836 addpd %xmm15, %xmm2 837 movddup 31 * SIZE(BO), %xmm15 838 mulpd %xmm14, %xmm15 839 movapd 30 * SIZE(AO), %xmm14 840 addpd %xmm15, %xmm3 841 842 movddup 28 * SIZE(BO), %xmm15 843 mulpd %xmm14, %xmm15 844 addpd %xmm15, %xmm4 845 movddup 29 * SIZE(BO), %xmm15 846 mulpd %xmm14, %xmm15 847 addpd %xmm15, %xmm5 848 movddup 30 * SIZE(BO), %xmm15 849 mulpd %xmm14, %xmm15 850 addpd %xmm15, %xmm6 851 movddup 31 * SIZE(BO), %xmm15 852 mulpd %xmm14, %xmm15 853 movapd 56 * SIZE(AO), %xmm14 854 addpd %xmm15, %xmm7 855 movddup 56 * SIZE(BO), %xmm15 856 857 addq $32 * SIZE, BO 858 addq $32 * SIZE, AO 859 decq %rax 860 BRANCH 861 jne .L12 862#endif 863 ALIGN_4 864 865.L15: 866#ifndef TRMMKERNEL 867 movq K, %rax 868#else 869 movq KKK, %rax 870#endif 871 movsd ALPHA_R, %xmm15 872 movhpd ALPHA_I, %xmm15 873 
andq $7, %rax # if (k & 1) 874 BRANCH 875 BRANCH 876 je .L19 877 ALIGN_4 878 879.L16: 880 mulpd %xmm8, %xmm9 881 movapd 2 * SIZE(AO), %xmm10 882 addpd %xmm9, %xmm0 883 movddup 1 * SIZE(BO), %xmm9 884 mulpd %xmm8, %xmm9 885 movddup 0 * SIZE(BO), %xmm11 886 addpd %xmm9, %xmm1 887 movddup 2 * SIZE(BO), %xmm9 888 mulpd %xmm8, %xmm9 889 addpd %xmm9, %xmm2 890 movddup 3 * SIZE(BO), %xmm9 891 mulpd %xmm8, %xmm9 892 movapd 4 * SIZE(AO), %xmm8 893 addpd %xmm9, %xmm3 894 movddup 4 * SIZE(BO), %xmm9 895 mulpd %xmm10, %xmm11 896 addpd %xmm11, %xmm4 897 movddup 1 * SIZE(BO), %xmm11 898 mulpd %xmm10, %xmm11 899 addpd %xmm11, %xmm5 900 movddup 2 * SIZE(BO), %xmm11 901 mulpd %xmm10, %xmm11 902 addpd %xmm11, %xmm6 903 movddup 3 * SIZE(BO), %xmm11 904 mulpd %xmm10, %xmm11 905 addpd %xmm11, %xmm7 906 907 addq $4 * SIZE, AO # aoffset += 4 908 addq $4 * SIZE, BO # boffset1 += 8 909 decq %rax 910 BRANCH 911 jg .L16 912 ALIGN_4 913 914.L19: 915 movsd 0 * SIZE(CO1), %xmm8 916 movhpd 1 * SIZE(CO1), %xmm8 917 movsd 2 * SIZE(CO1), %xmm9 918 movhpd 3 * SIZE(CO1), %xmm9 919 920 movsd 4 * SIZE(CO1), %xmm10 921 movhpd 5 * SIZE(CO1), %xmm10 922 movsd 6 * SIZE(CO1), %xmm11 923 movhpd 7 * SIZE(CO1), %xmm11 924 925 movddup %xmm0, %xmm12 926 unpckhpd %xmm0, %xmm0 927 movddup %xmm4, %xmm13 928 unpckhpd %xmm4, %xmm4 929 930 mulpd %xmm15, %xmm12 931 mulpd %xmm15, %xmm0 932 mulpd %xmm15, %xmm13 933 mulpd %xmm15, %xmm4 934 935 addpd %xmm12, %xmm8 936 addpd %xmm0, %xmm9 937 addpd %xmm13, %xmm10 938 addpd %xmm4, %xmm11 939 940 movsd %xmm8, 0 * SIZE(CO1) 941 movhpd %xmm8, 1 * SIZE(CO1) 942 movsd %xmm9, 2 * SIZE(CO1) 943 movhpd %xmm9, 3 * SIZE(CO1) 944 945 movsd %xmm10, 4 * SIZE(CO1) 946 movhpd %xmm10, 5 * SIZE(CO1) 947 movsd %xmm11, 6 * SIZE(CO1) 948 movhpd %xmm11, 7 * SIZE(CO1) 949 950 movsd 0 * SIZE(CO2), %xmm8 951 movhpd 1 * SIZE(CO2), %xmm8 952 movsd 2 * SIZE(CO2), %xmm9 953 movhpd 3 * SIZE(CO2), %xmm9 954 955 movsd 4 * SIZE(CO2), %xmm10 956 movhpd 5 * SIZE(CO2), %xmm10 957 movsd 6 * SIZE(CO2), %xmm11 
958 movhpd 7 * SIZE(CO2), %xmm11 959 960 movddup %xmm1, %xmm12 961 unpckhpd %xmm1, %xmm1 962 movddup %xmm5, %xmm13 963 unpckhpd %xmm5, %xmm5 964 965 mulpd %xmm15, %xmm12 966 mulpd %xmm15, %xmm1 967 mulpd %xmm15, %xmm13 968 mulpd %xmm15, %xmm5 969 970 addpd %xmm12, %xmm8 971 addpd %xmm1, %xmm9 972 addpd %xmm13, %xmm10 973 addpd %xmm5, %xmm11 974 975 movsd %xmm8, 0 * SIZE(CO2) 976 movhpd %xmm8, 1 * SIZE(CO2) 977 movsd %xmm9, 2 * SIZE(CO2) 978 movhpd %xmm9, 3 * SIZE(CO2) 979 980 movsd %xmm10, 4 * SIZE(CO2) 981 movhpd %xmm10, 5 * SIZE(CO2) 982 movsd %xmm11, 6 * SIZE(CO2) 983 movhpd %xmm11, 7 * SIZE(CO2) 984 985 movsd 0 * SIZE(CO1, LDC, 2), %xmm8 986 movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 987 movsd 2 * SIZE(CO1, LDC, 2), %xmm9 988 movhpd 3 * SIZE(CO1, LDC, 2), %xmm9 989 990 movsd 4 * SIZE(CO1, LDC, 2), %xmm10 991 movhpd 5 * SIZE(CO1, LDC, 2), %xmm10 992 movsd 6 * SIZE(CO1, LDC, 2), %xmm11 993 movhpd 7 * SIZE(CO1, LDC, 2), %xmm11 994 995 movddup %xmm2, %xmm12 996 unpckhpd %xmm2, %xmm2 997 movddup %xmm6, %xmm13 998 unpckhpd %xmm6, %xmm6 999 1000 mulpd %xmm15, %xmm12 1001 mulpd %xmm15, %xmm2 1002 mulpd %xmm15, %xmm13 1003 mulpd %xmm15, %xmm6 1004 1005 addpd %xmm12, %xmm8 1006 addpd %xmm2, %xmm9 1007 addpd %xmm13, %xmm10 1008 addpd %xmm6, %xmm11 1009 1010 movsd %xmm8, 0 * SIZE(CO1, LDC, 2) 1011 movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) 1012 movsd %xmm9, 2 * SIZE(CO1, LDC, 2) 1013 movhpd %xmm9, 3 * SIZE(CO1, LDC, 2) 1014 1015 movsd %xmm10, 4 * SIZE(CO1, LDC, 2) 1016 movhpd %xmm10, 5 * SIZE(CO1, LDC, 2) 1017 movsd %xmm11, 6 * SIZE(CO1, LDC, 2) 1018 movhpd %xmm11, 7 * SIZE(CO1, LDC, 2) 1019 1020 movsd 0 * SIZE(CO2, LDC, 2), %xmm8 1021 movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 1022 movsd 2 * SIZE(CO2, LDC, 2), %xmm9 1023 movhpd 3 * SIZE(CO2, LDC, 2), %xmm9 1024 1025 movsd 4 * SIZE(CO2, LDC, 2), %xmm10 1026 movhpd 5 * SIZE(CO2, LDC, 2), %xmm10 1027 movsd 6 * SIZE(CO2, LDC, 2), %xmm11 1028 movhpd 7 * SIZE(CO2, LDC, 2), %xmm11 1029 1030 movddup %xmm3, %xmm12 1031 unpckhpd %xmm3, %xmm3 1032 
movddup %xmm7, %xmm13 1033 unpckhpd %xmm7, %xmm7 1034 1035 mulpd %xmm15, %xmm12 1036 mulpd %xmm15, %xmm3 1037 mulpd %xmm15, %xmm13 1038 mulpd %xmm15, %xmm7 1039 1040 addpd %xmm12, %xmm8 1041 addpd %xmm3, %xmm9 1042 addpd %xmm13, %xmm10 1043 addpd %xmm7, %xmm11 1044 1045 movsd %xmm8, 0 * SIZE(CO2, LDC, 2) 1046 movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) 1047 movsd %xmm9, 2 * SIZE(CO2, LDC, 2) 1048 movhpd %xmm9, 3 * SIZE(CO2, LDC, 2) 1049 1050 movsd %xmm10, 4 * SIZE(CO2, LDC, 2) 1051 movhpd %xmm10, 5 * SIZE(CO2, LDC, 2) 1052 movsd %xmm11, 6 * SIZE(CO2, LDC, 2) 1053 movhpd %xmm11, 7 * SIZE(CO2, LDC, 2) 1054 1055 addq $8 * SIZE, CO1 # coffset += 4 1056 addq $8 * SIZE, CO2 # coffset += 4 1057 1058 decq I # i -- 1059 jg .L11 1060 jmp .L20 1061 ALIGN_4 1062 1063.L20: 1064 testq $2, M 1065 BRANCH 1066 je .L30 1067 ALIGN_4 1068 1069.L21: 1070#if !defined(TRMMKERNEL) || \ 1071 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1072 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1073 1074 movq B, BO 1075#else 1076 movq KK, %rax 1077 leaq (, %rax, SIZE), %rax 1078 leaq (AO, %rax, 2), AO 1079 leaq (B, %rax, 4), BO 1080#endif 1081 1082 movapd 0 * SIZE(AO), %xmm8 1083 pxor %xmm0, %xmm0 1084 movddup 0 * SIZE(BO), %xmm9 1085 pxor %xmm1, %xmm1 1086 movapd 8 * SIZE(AO), %xmm10 1087 pxor %xmm2, %xmm2 1088 movddup 8 * SIZE(BO), %xmm11 1089 pxor %xmm3, %xmm3 1090 1091#ifndef TRMMKERNEL 1092 movq K, %rax 1093#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1094 movq K, %rax 1095 subq KK, %rax 1096 movq %rax, KKK 1097#else 1098 movq KK, %rax 1099#ifdef LEFT 1100 addq $2, %rax 1101#else 1102 addq $4, %rax 1103#endif 1104 movq %rax, KKK 1105#endif 1106 sarq $3, %rax 1107 je .L25 1108 ALIGN_4 1109 1110.L22: 1111 mulpd %xmm8, %xmm9 1112 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1113 addpd %xmm9, %xmm0 1114 movddup 1 * SIZE(BO), %xmm9 1115 mulpd %xmm8, %xmm9 1116 addpd %xmm9, %xmm1 1117 movddup 2 * SIZE(BO), %xmm9 1118 mulpd %xmm8, %xmm9 1119 addpd 
%xmm9, %xmm2 1120 movddup 3 * SIZE(BO), %xmm9 1121 mulpd %xmm8, %xmm9 1122 movapd 2 * SIZE(AO), %xmm8 1123 addpd %xmm9, %xmm3 1124 movddup 4 * SIZE(BO), %xmm9 1125 mulpd %xmm8, %xmm9 1126 addpd %xmm9, %xmm0 1127 movddup 5 * SIZE(BO), %xmm9 1128 mulpd %xmm8, %xmm9 1129 addpd %xmm9, %xmm1 1130 movddup 6 * SIZE(BO), %xmm9 1131 mulpd %xmm8, %xmm9 1132 addpd %xmm9, %xmm2 1133 movddup 7 * SIZE(BO), %xmm9 1134 mulpd %xmm8, %xmm9 1135 movapd 4 * SIZE(AO), %xmm8 1136 addpd %xmm9, %xmm3 1137 movddup 16 * SIZE(BO), %xmm9 1138 mulpd %xmm8, %xmm11 1139 addpd %xmm11, %xmm0 1140 movddup 9 * SIZE(BO), %xmm11 1141 mulpd %xmm8, %xmm11 1142 addpd %xmm11, %xmm1 1143 movddup 10 * SIZE(BO), %xmm11 1144 mulpd %xmm8, %xmm11 1145 addpd %xmm11, %xmm2 1146 movddup 11 * SIZE(BO), %xmm11 1147 mulpd %xmm8, %xmm11 1148 movapd 6 * SIZE(AO), %xmm8 1149 addpd %xmm11, %xmm3 1150 movddup 12 * SIZE(BO), %xmm11 1151 mulpd %xmm8, %xmm11 1152 addpd %xmm11, %xmm0 1153 movddup 13 * SIZE(BO), %xmm11 1154 mulpd %xmm8, %xmm11 1155 addpd %xmm11, %xmm1 1156 movddup 14 * SIZE(BO), %xmm11 1157 mulpd %xmm8, %xmm11 1158 addpd %xmm11, %xmm2 1159 movddup 15 * SIZE(BO), %xmm11 1160 mulpd %xmm8, %xmm11 1161 movapd 16 * SIZE(AO), %xmm8 1162 addpd %xmm11, %xmm3 1163 movddup 24 * SIZE(BO), %xmm11 1164 mulpd %xmm10, %xmm9 1165 addpd %xmm9, %xmm0 1166 movddup 17 * SIZE(BO), %xmm9 1167 mulpd %xmm10, %xmm9 1168 addpd %xmm9, %xmm1 1169 movddup 18 * SIZE(BO), %xmm9 1170 mulpd %xmm10, %xmm9 1171 addpd %xmm9, %xmm2 1172 movddup 19 * SIZE(BO), %xmm9 1173 mulpd %xmm10, %xmm9 1174 movapd 10 * SIZE(AO), %xmm10 1175 addpd %xmm9, %xmm3 1176 movddup 20 * SIZE(BO), %xmm9 1177 mulpd %xmm10, %xmm9 1178 addpd %xmm9, %xmm0 1179 movddup 21 * SIZE(BO), %xmm9 1180 mulpd %xmm10, %xmm9 1181 addpd %xmm9, %xmm1 1182 movddup 22 * SIZE(BO), %xmm9 1183 mulpd %xmm10, %xmm9 1184 addpd %xmm9, %xmm2 1185 movddup 23 * SIZE(BO), %xmm9 1186 mulpd %xmm10, %xmm9 1187 movapd 12 * SIZE(AO), %xmm10 1188 addpd %xmm9, %xmm3 1189 movddup 32 * SIZE(BO), %xmm9 1190 
mulpd %xmm10, %xmm11 1191 addpd %xmm11, %xmm0 1192 movddup 25 * SIZE(BO), %xmm11 1193 mulpd %xmm10, %xmm11 1194 addpd %xmm11, %xmm1 1195 movddup 26 * SIZE(BO), %xmm11 1196 mulpd %xmm10, %xmm11 1197 addpd %xmm11, %xmm2 1198 movddup 27 * SIZE(BO), %xmm11 1199 mulpd %xmm10, %xmm11 1200 movapd 14 * SIZE(AO), %xmm10 1201 addpd %xmm11, %xmm3 1202 movddup 28 * SIZE(BO), %xmm11 1203 mulpd %xmm10, %xmm11 1204 addpd %xmm11, %xmm0 1205 movddup 29 * SIZE(BO), %xmm11 1206 mulpd %xmm10, %xmm11 1207 addpd %xmm11, %xmm1 1208 movddup 30 * SIZE(BO), %xmm11 1209 mulpd %xmm10, %xmm11 1210 addpd %xmm11, %xmm2 1211 movddup 31 * SIZE(BO), %xmm11 1212 mulpd %xmm10, %xmm11 1213 movapd 24 * SIZE(AO), %xmm10 1214 addpd %xmm11, %xmm3 1215 movddup 40 * SIZE(BO), %xmm11 1216 1217 addq $16 * SIZE, AO 1218 addq $32 * SIZE, BO 1219 decq %rax 1220 jne .L22 1221 ALIGN_4 1222 1223.L25: 1224#ifndef TRMMKERNEL 1225 movq K, %rax 1226#else 1227 movq KKK, %rax 1228#endif 1229 movsd ALPHA_R, %xmm15 1230 movhpd ALPHA_I, %xmm15 1231 andq $7, %rax # if (k & 1) 1232 BRANCH 1233 je .L29 1234 ALIGN_4 1235 1236.L26: 1237 mulpd %xmm8, %xmm9 1238 addpd %xmm9, %xmm0 1239 movddup 1 * SIZE(BO), %xmm9 1240 mulpd %xmm8, %xmm9 1241 addpd %xmm9, %xmm1 1242 movddup 2 * SIZE(BO), %xmm9 1243 mulpd %xmm8, %xmm9 1244 addpd %xmm9, %xmm2 1245 movddup 3 * SIZE(BO), %xmm9 1246 mulpd %xmm8, %xmm9 1247 movapd 2 * SIZE(AO), %xmm8 1248 addpd %xmm9, %xmm3 1249 movddup 4 * SIZE(BO), %xmm9 1250 1251 addq $2 * SIZE, AO # aoffset += 4 1252 addq $4 * SIZE, BO # boffset1 += 8 1253 decq %rax 1254 jg .L26 1255 ALIGN_4 1256 1257.L29: 1258 movsd 0 * SIZE(CO1), %xmm8 1259 movhpd 1 * SIZE(CO1), %xmm8 1260 movsd 2 * SIZE(CO1), %xmm9 1261 movhpd 3 * SIZE(CO1), %xmm9 1262 1263 movddup %xmm0, %xmm12 1264 unpckhpd %xmm0, %xmm0 1265 1266 mulpd %xmm15, %xmm12 1267 mulpd %xmm15, %xmm0 1268 addpd %xmm12, %xmm8 1269 addpd %xmm0, %xmm9 1270 1271 movsd %xmm8, 0 * SIZE(CO1) 1272 movhpd %xmm8, 1 * SIZE(CO1) 1273 movsd %xmm9, 2 * SIZE(CO1) 1274 movhpd %xmm9, 3 * 
SIZE(CO1) 1275 1276 movsd 0 * SIZE(CO2), %xmm8 1277 movhpd 1 * SIZE(CO2), %xmm8 1278 movsd 2 * SIZE(CO2), %xmm9 1279 movhpd 3 * SIZE(CO2), %xmm9 1280 1281 movddup %xmm1, %xmm12 1282 unpckhpd %xmm1, %xmm1 1283 1284 mulpd %xmm15, %xmm12 1285 mulpd %xmm15, %xmm1 1286 addpd %xmm12, %xmm8 1287 addpd %xmm1, %xmm9 1288 1289 movsd %xmm8, 0 * SIZE(CO2) 1290 movhpd %xmm8, 1 * SIZE(CO2) 1291 movsd %xmm9, 2 * SIZE(CO2) 1292 movhpd %xmm9, 3 * SIZE(CO2) 1293 1294 movsd 0 * SIZE(CO1, LDC, 2), %xmm8 1295 movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 1296 movsd 2 * SIZE(CO1, LDC, 2), %xmm9 1297 movhpd 3 * SIZE(CO1, LDC, 2), %xmm9 1298 1299 movddup %xmm2, %xmm12 1300 unpckhpd %xmm2, %xmm2 1301 1302 mulpd %xmm15, %xmm12 1303 mulpd %xmm15, %xmm2 1304 addpd %xmm12, %xmm8 1305 addpd %xmm2, %xmm9 1306 1307 movsd %xmm8, 0 * SIZE(CO1, LDC, 2) 1308 movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) 1309 movsd %xmm9, 2 * SIZE(CO1, LDC, 2) 1310 movhpd %xmm9, 3 * SIZE(CO1, LDC, 2) 1311 1312 movsd 0 * SIZE(CO2, LDC, 2), %xmm8 1313 movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 1314 movsd 2 * SIZE(CO2, LDC, 2), %xmm9 1315 movhpd 3 * SIZE(CO2, LDC, 2), %xmm9 1316 1317 movddup %xmm3, %xmm12 1318 unpckhpd %xmm3, %xmm3 1319 1320 mulpd %xmm15, %xmm12 1321 mulpd %xmm15, %xmm3 1322 addpd %xmm12, %xmm8 1323 addpd %xmm3, %xmm9 1324 1325 movsd %xmm8, 0 * SIZE(CO2, LDC, 2) 1326 movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) 1327 movsd %xmm9, 2 * SIZE(CO2, LDC, 2) 1328 movhpd %xmm9, 3 * SIZE(CO2, LDC, 2) 1329 1330 addq $4 * SIZE, CO1 # coffset += 4 1331 addq $4 * SIZE, CO2 # coffset += 4 1332 ALIGN_4 1333 1334.L30: 1335 testq $1, M 1336 je .L39 1337 ALIGN_4 1338 1339.L31: 1340#if !defined(TRMMKERNEL) || \ 1341 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1342 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1343 1344 movq B, BO 1345#else 1346 movq KK, %rax 1347 leaq (, %rax, SIZE), %rax 1348 leaq (AO, %rax, 1), AO 1349 leaq (B, %rax, 4), BO 1350#endif 1351 1352 movddup 0 * SIZE(AO), %xmm8 1353 pxor %xmm0, %xmm0 1354 movapd 0 
* SIZE(BO), %xmm9 1355 pxor %xmm1, %xmm1 1356 movddup 4 * SIZE(AO), %xmm10 1357 pxor %xmm2, %xmm2 1358 movapd 8 * SIZE(BO), %xmm11 1359 pxor %xmm3, %xmm3 1360 1361#ifndef TRMMKERNEL 1362 movq K, %rax 1363#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1364 movq K, %rax 1365 subq KK, %rax 1366 movq %rax, KKK 1367#else 1368 movq KK, %rax 1369#ifdef LEFT 1370 addq $1, %rax 1371#else 1372 addq $4, %rax 1373#endif 1374 movq %rax, KKK 1375#endif 1376 sarq $3, %rax 1377 je .L35 1378 ALIGN_4 1379 1380.L32: 1381 mulpd %xmm8, %xmm9 1382 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1383 addpd %xmm9, %xmm0 1384 movapd 2 * SIZE(BO), %xmm9 1385 mulpd %xmm8, %xmm9 1386 movddup 1 * SIZE(AO), %xmm8 1387 addpd %xmm9, %xmm1 1388 movapd 4 * SIZE(BO), %xmm9 1389 mulpd %xmm8, %xmm9 1390 addpd %xmm9, %xmm0 1391 movapd 6 * SIZE(BO), %xmm9 1392 mulpd %xmm8, %xmm9 1393 movddup 2 * SIZE(AO), %xmm8 1394 addpd %xmm9, %xmm1 1395 movapd 16 * SIZE(BO), %xmm9 1396 mulpd %xmm8, %xmm11 1397 addpd %xmm11, %xmm0 1398 movapd 10 * SIZE(BO), %xmm11 1399 mulpd %xmm8, %xmm11 1400 movddup 3 * SIZE(AO), %xmm8 1401 addpd %xmm11, %xmm1 1402 movapd 12 * SIZE(BO), %xmm11 1403 mulpd %xmm8, %xmm11 1404 addpd %xmm11, %xmm0 1405 movapd 14 * SIZE(BO), %xmm11 1406 mulpd %xmm8, %xmm11 1407 movddup 8 * SIZE(AO), %xmm8 1408 addpd %xmm11, %xmm1 1409 movapd 24 * SIZE(BO), %xmm11 1410 mulpd %xmm10, %xmm9 1411 addpd %xmm9, %xmm0 1412 movapd 18 * SIZE(BO), %xmm9 1413 mulpd %xmm10, %xmm9 1414 movddup 5 * SIZE(AO), %xmm10 1415 addpd %xmm9, %xmm1 1416 movapd 20 * SIZE(BO), %xmm9 1417 mulpd %xmm10, %xmm9 1418 addpd %xmm9, %xmm0 1419 movapd 22 * SIZE(BO), %xmm9 1420 mulpd %xmm10, %xmm9 1421 movddup 6 * SIZE(AO), %xmm10 1422 addpd %xmm9, %xmm1 1423 movapd 32 * SIZE(BO), %xmm9 1424 mulpd %xmm10, %xmm11 1425 addpd %xmm11, %xmm0 1426 movapd 26 * SIZE(BO), %xmm11 1427 mulpd %xmm10, %xmm11 1428 movddup 7 * SIZE(AO), %xmm10 1429 addpd %xmm11, %xmm1 1430 movapd 28 * SIZE(BO), %xmm11 1431 mulpd %xmm10, %xmm11 
1432 addpd %xmm11, %xmm0 1433 movapd 30 * SIZE(BO), %xmm11 1434 mulpd %xmm10, %xmm11 1435 movddup 12 * SIZE(AO), %xmm10 1436 addpd %xmm11, %xmm1 1437 movapd 40 * SIZE(BO), %xmm11 1438 1439 addq $ 8 * SIZE, AO 1440 addq $32 * SIZE, BO 1441 decq %rax 1442 jne .L32 1443 ALIGN_4 1444 1445.L35: 1446#ifndef TRMMKERNEL 1447 movq K, %rax 1448#else 1449 movq KKK, %rax 1450#endif 1451 movsd ALPHA_R, %xmm15 1452 movhpd ALPHA_I, %xmm15 1453 andq $7, %rax # if (k & 1) 1454 BRANCH 1455 je .L38 1456 ALIGN_4 1457 1458.L36: 1459 mulpd %xmm8, %xmm9 1460 addpd %xmm9, %xmm0 1461 movapd 2 * SIZE(BO), %xmm9 1462 mulpd %xmm8, %xmm9 1463 movddup 1 * SIZE(AO), %xmm8 1464 addpd %xmm9, %xmm1 1465 movapd 4 * SIZE(BO), %xmm9 1466 1467 addq $1 * SIZE, AO # aoffset += 4 1468 addq $4 * SIZE, BO # boffset1 += 8 1469 decq %rax 1470 jg .L36 1471 ALIGN_4 1472 1473.L38: 1474 movsd 0 * SIZE(CO1), %xmm8 1475 movhpd 1 * SIZE(CO1), %xmm8 1476 1477 movddup %xmm0, %xmm12 1478 1479 mulpd %xmm15, %xmm12 1480 addpd %xmm12, %xmm8 1481 1482 movsd %xmm8, 0 * SIZE(CO1) 1483 movhpd %xmm8, 1 * SIZE(CO1) 1484 1485 movsd 0 * SIZE(CO2), %xmm8 1486 movhpd 1 * SIZE(CO2), %xmm8 1487 1488 unpckhpd %xmm0, %xmm0 1489 1490 mulpd %xmm15, %xmm0 1491 addpd %xmm0, %xmm8 1492 1493 movsd %xmm8, 0 * SIZE(CO2) 1494 movhpd %xmm8, 1 * SIZE(CO2) 1495 1496 movsd 0 * SIZE(CO1, LDC, 2), %xmm8 1497 movhpd 1 * SIZE(CO1, LDC, 2), %xmm8 1498 1499 movddup %xmm1, %xmm12 1500 1501 mulpd %xmm15, %xmm12 1502 addpd %xmm12, %xmm8 1503 1504 movsd %xmm8, 0 * SIZE(CO1, LDC, 2) 1505 movhpd %xmm8, 1 * SIZE(CO1, LDC, 2) 1506 1507 movsd 0 * SIZE(CO2, LDC, 2), %xmm8 1508 movhpd 1 * SIZE(CO2, LDC, 2), %xmm8 1509 1510 unpckhpd %xmm1, %xmm1 1511 1512 mulpd %xmm15, %xmm1 1513 addpd %xmm1, %xmm8 1514 1515 movsd %xmm8, 0 * SIZE(CO2, LDC, 2) 1516 movhpd %xmm8, 1 * SIZE(CO2, LDC, 2) 1517 ALIGN_4 1518 1519.L39: 1520#if defined(TRMMKERNEL) && !defined(LEFT) 1521 addl $4, KK 1522#endif 1523 1524 leaq (C, LDC, 4), C # c += 4 * ldc 1525 movq BO, B 1526 decq J # j -- 1527 
jg .L10 1528 ALIGN_4 1529 1530.L40: 1531 testq $2, N 1532 je .L80 1533 ALIGN_4 1534 1535#if defined(TRMMKERNEL) && defined(LEFT) 1536 movq OFFSET, %rax 1537 movq %rax, KK 1538#endif 1539 1540 movq C, CO1 # coffset1 = c 1541 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 1542 movq A, AO # aoffset = a 1543 1544 movq M, I 1545 sarq $2, I # i = (m >> 2) 1546 jle .L60 1547 ALIGN_4 1548 1549.L51: 1550#if !defined(TRMMKERNEL) || \ 1551 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1552 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1553 1554 movq B, BO 1555#else 1556 movq KK, %rax 1557 leaq (, %rax, SIZE), %rax 1558 leaq (AO, %rax, 4), AO 1559 leaq (B, %rax, 2), BO 1560#endif 1561 1562 movapd 0 * SIZE(AO), %xmm8 1563 pxor %xmm0, %xmm0 1564 movddup 0 * SIZE(BO), %xmm9 1565 pxor %xmm1, %xmm1 1566 movapd 8 * SIZE(AO), %xmm10 1567 pxor %xmm4, %xmm4 1568 movddup 8 * SIZE(BO), %xmm11 1569 pxor %xmm5, %xmm5 1570 1571#ifdef HAVE_3DNOW 1572 prefetchw 4 * SIZE(CO1) 1573 prefetchw 4 * SIZE(CO2) 1574#else 1575 prefetchnta 4 * SIZE(CO1) 1576 prefetchnta 4 * SIZE(CO2) 1577#endif 1578 1579#ifndef TRMMKERNEL 1580 movq K, %rax 1581#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1582 movq K, %rax 1583 subq KK, %rax 1584 movq %rax, KKK 1585#else 1586 movq KK, %rax 1587#ifdef LEFT 1588 addq $4, %rax 1589#else 1590 addq $2, %rax 1591#endif 1592 movq %rax, KKK 1593#endif 1594 sarq $3, %rax 1595 je .L55 1596 ALIGN_4 1597 1598.L52: 1599 mulpd %xmm8, %xmm9 1600 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1601 addpd %xmm9, %xmm0 1602 movddup 1 * SIZE(BO), %xmm9 1603 mulpd %xmm8, %xmm9 1604 movapd 2 * SIZE(AO), %xmm8 1605 addpd %xmm9, %xmm1 1606 movddup 0 * SIZE(BO), %xmm9 1607 mulpd %xmm8, %xmm9 1608 addpd %xmm9, %xmm4 1609 movddup 1 * SIZE(BO), %xmm9 1610 mulpd %xmm8, %xmm9 1611 movapd 4 * SIZE(AO), %xmm8 1612 addpd %xmm9, %xmm5 1613 movddup 2 * SIZE(BO), %xmm9 1614 mulpd %xmm8, %xmm9 1615 addpd %xmm9, %xmm0 1616 movddup 3 * SIZE(BO), %xmm9 
1617 mulpd %xmm8, %xmm9 1618 movapd 6 * SIZE(AO), %xmm8 1619 addpd %xmm9, %xmm1 1620 movddup 2 * SIZE(BO), %xmm9 1621 mulpd %xmm8, %xmm9 1622 addpd %xmm9, %xmm4 1623 movddup 3 * SIZE(BO), %xmm9 1624 mulpd %xmm8, %xmm9 1625 movapd 16 * SIZE(AO), %xmm8 1626 addpd %xmm9, %xmm5 1627 movddup 4 * SIZE(BO), %xmm9 1628 mulpd %xmm10, %xmm9 1629 addpd %xmm9, %xmm0 1630 movddup 5 * SIZE(BO), %xmm9 1631 mulpd %xmm10, %xmm9 1632 movapd 10 * SIZE(AO), %xmm10 1633 addpd %xmm9, %xmm1 1634 movddup 4 * SIZE(BO), %xmm9 1635 mulpd %xmm10, %xmm9 1636 addpd %xmm9, %xmm4 1637 movddup 5 * SIZE(BO), %xmm9 1638 mulpd %xmm10, %xmm9 1639 movapd 12 * SIZE(AO), %xmm10 1640 addpd %xmm9, %xmm5 1641 movddup 6 * SIZE(BO), %xmm9 1642 mulpd %xmm10, %xmm9 1643 addpd %xmm9, %xmm0 1644 movddup 7 * SIZE(BO), %xmm9 1645 mulpd %xmm10, %xmm9 1646 movapd 14 * SIZE(AO), %xmm10 1647 addpd %xmm9, %xmm1 1648 movddup 6 * SIZE(BO), %xmm9 1649 mulpd %xmm10, %xmm9 1650 addpd %xmm9, %xmm4 1651 movddup 7 * SIZE(BO), %xmm9 1652 mulpd %xmm10, %xmm9 1653 movapd 40 * SIZE(AO), %xmm10 1654 addpd %xmm9, %xmm5 1655 movddup 16 * SIZE(BO), %xmm9 1656 mulpd %xmm8, %xmm11 1657 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 1658 addpd %xmm11, %xmm0 1659 movddup 9 * SIZE(BO), %xmm11 1660 mulpd %xmm8, %xmm11 1661 movapd 18 * SIZE(AO), %xmm8 1662 addpd %xmm11, %xmm1 1663 movddup 8 * SIZE(BO), %xmm11 1664 mulpd %xmm8, %xmm11 1665 addpd %xmm11, %xmm4 1666 movddup 9 * SIZE(BO), %xmm11 1667 mulpd %xmm8, %xmm11 1668 movapd 20 * SIZE(AO), %xmm8 1669 addpd %xmm11, %xmm5 1670 movddup 10 * SIZE(BO), %xmm11 1671 mulpd %xmm8, %xmm11 1672 addpd %xmm11, %xmm0 1673 movddup 11 * SIZE(BO), %xmm11 1674 mulpd %xmm8, %xmm11 1675 movapd 22 * SIZE(AO), %xmm8 1676 addpd %xmm11, %xmm1 1677 movddup 10 * SIZE(BO), %xmm11 1678 mulpd %xmm8, %xmm11 1679 addpd %xmm11, %xmm4 1680 movddup 11 * SIZE(BO), %xmm11 1681 mulpd %xmm8, %xmm11 1682 movapd 24 * SIZE(AO), %xmm8 1683 addpd %xmm11, %xmm5 1684 movddup 12 * SIZE(BO), %xmm11 1685 mulpd %xmm8, %xmm11 1686 addpd %xmm11, 
%xmm0 1687 movddup 13 * SIZE(BO), %xmm11 1688 mulpd %xmm8, %xmm11 1689 movapd 26 * SIZE(AO), %xmm8 1690 addpd %xmm11, %xmm1 1691 movddup 12 * SIZE(BO), %xmm11 1692 mulpd %xmm8, %xmm11 1693 addpd %xmm11, %xmm4 1694 movddup 13 * SIZE(BO), %xmm11 1695 mulpd %xmm8, %xmm11 1696 movapd 28 * SIZE(AO), %xmm8 1697 addpd %xmm11, %xmm5 1698 movddup 14 * SIZE(BO), %xmm11 1699 mulpd %xmm8, %xmm11 1700 addpd %xmm11, %xmm0 1701 movddup 15 * SIZE(BO), %xmm11 1702 mulpd %xmm8, %xmm11 1703 movapd 30 * SIZE(AO), %xmm8 1704 addpd %xmm11, %xmm1 1705 movddup 14 * SIZE(BO), %xmm11 1706 mulpd %xmm8, %xmm11 1707 addpd %xmm11, %xmm4 1708 movddup 15 * SIZE(BO), %xmm11 1709 mulpd %xmm8, %xmm11 1710 movapd 32 * SIZE(AO), %xmm8 1711 addpd %xmm11, %xmm5 1712 movddup 24 * SIZE(BO), %xmm11 1713 1714 addq $32 * SIZE, AO 1715 addq $16 * SIZE, BO 1716 decq %rax 1717 jne .L52 1718 ALIGN_4 1719 1720.L55: 1721#ifndef TRMMKERNEL 1722 movq K, %rax 1723#else 1724 movq KKK, %rax 1725#endif 1726 movsd ALPHA_R, %xmm15 1727 movhpd ALPHA_I, %xmm15 1728 andq $7, %rax # if (k & 1) 1729 BRANCH 1730 je .L59 1731 ALIGN_4 1732 1733.L56: 1734 mulpd %xmm8, %xmm9 1735 movapd 2 * SIZE(AO), %xmm10 1736 addpd %xmm9, %xmm0 1737 movddup 1 * SIZE(BO), %xmm9 1738 mulpd %xmm8, %xmm9 1739 movddup 0 * SIZE(BO), %xmm11 1740 addpd %xmm9, %xmm1 1741 movddup 2 * SIZE(BO), %xmm9 1742 mulpd %xmm10, %xmm11 1743 movapd 4 * SIZE(AO), %xmm8 1744 addpd %xmm11, %xmm4 1745 movddup 1 * SIZE(BO), %xmm11 1746 mulpd %xmm10, %xmm11 1747 addpd %xmm11, %xmm5 1748 1749 addq $4 * SIZE, AO # aoffset += 4 1750 addq $2 * SIZE, BO # boffset1 += 8 1751 decq %rax 1752 jg .L56 1753 ALIGN_4 1754 1755.L59: 1756 movsd 0 * SIZE(CO1), %xmm8 1757 movhpd 1 * SIZE(CO1), %xmm8 1758 movsd 2 * SIZE(CO1), %xmm9 1759 movhpd 3 * SIZE(CO1), %xmm9 1760 1761 movsd 4 * SIZE(CO1), %xmm10 1762 movhpd 5 * SIZE(CO1), %xmm10 1763 movsd 6 * SIZE(CO1), %xmm11 1764 movhpd 7 * SIZE(CO1), %xmm11 1765 1766 movddup %xmm0, %xmm12 1767 unpckhpd %xmm0, %xmm0 1768 movddup %xmm4, %xmm13 1769 
unpckhpd %xmm4, %xmm4 1770 1771 mulpd %xmm15, %xmm12 1772 mulpd %xmm15, %xmm0 1773 mulpd %xmm15, %xmm13 1774 mulpd %xmm15, %xmm4 1775 1776 addpd %xmm12, %xmm8 1777 addpd %xmm0, %xmm9 1778 addpd %xmm13, %xmm10 1779 addpd %xmm4, %xmm11 1780 1781 movsd %xmm8, 0 * SIZE(CO1) 1782 movhpd %xmm8, 1 * SIZE(CO1) 1783 movsd %xmm9, 2 * SIZE(CO1) 1784 movhpd %xmm9, 3 * SIZE(CO1) 1785 1786 movsd %xmm10, 4 * SIZE(CO1) 1787 movhpd %xmm10, 5 * SIZE(CO1) 1788 movsd %xmm11, 6 * SIZE(CO1) 1789 movhpd %xmm11, 7 * SIZE(CO1) 1790 1791 movsd 0 * SIZE(CO2), %xmm8 1792 movhpd 1 * SIZE(CO2), %xmm8 1793 movsd 2 * SIZE(CO2), %xmm9 1794 movhpd 3 * SIZE(CO2), %xmm9 1795 1796 movsd 4 * SIZE(CO2), %xmm10 1797 movhpd 5 * SIZE(CO2), %xmm10 1798 movsd 6 * SIZE(CO2), %xmm11 1799 movhpd 7 * SIZE(CO2), %xmm11 1800 1801 movddup %xmm1, %xmm12 1802 unpckhpd %xmm1, %xmm1 1803 movddup %xmm5, %xmm13 1804 unpckhpd %xmm5, %xmm5 1805 1806 mulpd %xmm15, %xmm12 1807 mulpd %xmm15, %xmm1 1808 mulpd %xmm15, %xmm13 1809 mulpd %xmm15, %xmm5 1810 1811 addpd %xmm12, %xmm8 1812 addpd %xmm1, %xmm9 1813 addpd %xmm13, %xmm10 1814 addpd %xmm5, %xmm11 1815 1816 movsd %xmm8, 0 * SIZE(CO2) 1817 movhpd %xmm8, 1 * SIZE(CO2) 1818 movsd %xmm9, 2 * SIZE(CO2) 1819 movhpd %xmm9, 3 * SIZE(CO2) 1820 1821 movsd %xmm10, 4 * SIZE(CO2) 1822 movhpd %xmm10, 5 * SIZE(CO2) 1823 movsd %xmm11, 6 * SIZE(CO2) 1824 movhpd %xmm11, 7 * SIZE(CO2) 1825 1826 addq $8 * SIZE, CO1 # coffset += 4 1827 addq $8 * SIZE, CO2 # coffset += 4 1828 1829 decq I # i -- 1830 jg .L51 1831 ALIGN_4 1832 1833.L60: 1834 testq $2, M 1835 je .L70 1836 ALIGN_4 1837 1838.L61: 1839#if !defined(TRMMKERNEL) || \ 1840 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1841 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1842 1843 movq B, BO 1844#else 1845 movq KK, %rax 1846 leaq (, %rax, SIZE), %rax 1847 leaq (AO, %rax, 2), AO 1848 leaq (B, %rax, 2), BO 1849#endif 1850 1851 movapd 0 * SIZE(AO), %xmm8 1852 pxor %xmm0, %xmm0 1853 movddup 0 * SIZE(BO), %xmm9 
1854 pxor %xmm1, %xmm1 1855 movapd 8 * SIZE(AO), %xmm10 1856 pxor %xmm2, %xmm2 1857 movddup 8 * SIZE(BO), %xmm11 1858 pxor %xmm3, %xmm3 1859 1860#ifndef TRMMKERNEL 1861 movq K, %rax 1862#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1863 movq K, %rax 1864 subq KK, %rax 1865 movq %rax, KKK 1866#else 1867 movq KK, %rax 1868#ifdef LEFT 1869 addq $2, %rax 1870#else 1871 addq $2, %rax 1872#endif 1873 movq %rax, KKK 1874#endif 1875 sarq $3, %rax 1876 je .L65 1877 ALIGN_4 1878 1879.L62: 1880 mulpd %xmm8, %xmm9 1881 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1882 addpd %xmm9, %xmm0 1883 movddup 1 * SIZE(BO), %xmm9 1884 mulpd %xmm8, %xmm9 1885 movapd 2 * SIZE(AO), %xmm8 1886 addpd %xmm9, %xmm1 1887 movddup 2 * SIZE(BO), %xmm9 1888 mulpd %xmm8, %xmm9 1889 addpd %xmm9, %xmm2 1890 movddup 3 * SIZE(BO), %xmm9 1891 mulpd %xmm8, %xmm9 1892 movapd 4 * SIZE(AO), %xmm8 1893 addpd %xmm9, %xmm3 1894 movddup 4 * SIZE(BO), %xmm9 1895 mulpd %xmm8, %xmm9 1896 addpd %xmm9, %xmm0 1897 movddup 5 * SIZE(BO), %xmm9 1898 mulpd %xmm8, %xmm9 1899 movapd 6 * SIZE(AO), %xmm8 1900 addpd %xmm9, %xmm1 1901 movddup 6 * SIZE(BO), %xmm9 1902 mulpd %xmm8, %xmm9 1903 addpd %xmm9, %xmm2 1904 movddup 7 * SIZE(BO), %xmm9 1905 mulpd %xmm8, %xmm9 1906 movapd 16 * SIZE(AO), %xmm8 1907 addpd %xmm9, %xmm3 1908 movddup 16 * SIZE(BO), %xmm9 1909 mulpd %xmm10, %xmm11 1910 addpd %xmm11, %xmm0 1911 movddup 9 * SIZE(BO), %xmm11 1912 mulpd %xmm10, %xmm11 1913 movapd 10 * SIZE(AO), %xmm10 1914 addpd %xmm11, %xmm1 1915 movddup 10 * SIZE(BO), %xmm11 1916 mulpd %xmm10, %xmm11 1917 addpd %xmm11, %xmm2 1918 movddup 11 * SIZE(BO), %xmm11 1919 mulpd %xmm10, %xmm11 1920 movapd 12 * SIZE(AO), %xmm10 1921 addpd %xmm11, %xmm3 1922 movddup 12 * SIZE(BO), %xmm11 1923 mulpd %xmm10, %xmm11 1924 addpd %xmm11, %xmm0 1925 movddup 13 * SIZE(BO), %xmm11 1926 mulpd %xmm10, %xmm11 1927 movapd 14 * SIZE(AO), %xmm10 1928 addpd %xmm11, %xmm1 1929 movddup 14 * SIZE(BO), %xmm11 1930 mulpd %xmm10, %xmm11 1931 addpd 
%xmm11, %xmm2 1932 movddup 15 * SIZE(BO), %xmm11 1933 mulpd %xmm10, %xmm11 1934 movapd 24 * SIZE(AO), %xmm10 1935 addpd %xmm11, %xmm3 1936 movddup 24 * SIZE(BO), %xmm11 1937 1938 addq $16 * SIZE, AO 1939 addq $16 * SIZE, BO 1940 decq %rax 1941 jne .L62 1942 ALIGN_4 1943 1944.L65: 1945#ifndef TRMMKERNEL 1946 movq K, %rax 1947#else 1948 movq KKK, %rax 1949#endif 1950 movsd ALPHA_R, %xmm15 1951 movhpd ALPHA_I, %xmm15 1952 andq $7, %rax # if (k & 1) 1953 BRANCH 1954 je .L69 1955 ALIGN_4 1956 1957.L66: 1958 mulpd %xmm8, %xmm9 1959 addpd %xmm9, %xmm0 1960 movddup 1 * SIZE(BO), %xmm9 1961 mulpd %xmm8, %xmm9 1962 movapd 2 * SIZE(AO), %xmm8 1963 addpd %xmm9, %xmm1 1964 movddup 2 * SIZE(BO), %xmm9 1965 1966 addq $2 * SIZE, AO # aoffset += 4 1967 addq $2 * SIZE, BO # boffset1 += 8 1968 decq %rax 1969 jg .L66 1970 ALIGN_4 1971 1972.L69: 1973 addpd %xmm2, %xmm0 1974 addpd %xmm3, %xmm1 1975 1976 movsd 0 * SIZE(CO1), %xmm8 1977 movhpd 1 * SIZE(CO1), %xmm8 1978 movsd 2 * SIZE(CO1), %xmm9 1979 movhpd 3 * SIZE(CO1), %xmm9 1980 1981 movddup %xmm0, %xmm12 1982 unpckhpd %xmm0, %xmm0 1983 1984 mulpd %xmm15, %xmm12 1985 mulpd %xmm15, %xmm0 1986 addpd %xmm12, %xmm8 1987 addpd %xmm0, %xmm9 1988 1989 movsd %xmm8, 0 * SIZE(CO1) 1990 movhpd %xmm8, 1 * SIZE(CO1) 1991 movsd %xmm9, 2 * SIZE(CO1) 1992 movhpd %xmm9, 3 * SIZE(CO1) 1993 1994 movsd 0 * SIZE(CO2), %xmm8 1995 movhpd 1 * SIZE(CO2), %xmm8 1996 movsd 2 * SIZE(CO2), %xmm9 1997 movhpd 3 * SIZE(CO2), %xmm9 1998 1999 movddup %xmm1, %xmm12 2000 unpckhpd %xmm1, %xmm1 2001 2002 mulpd %xmm15, %xmm12 2003 mulpd %xmm15, %xmm1 2004 addpd %xmm12, %xmm8 2005 addpd %xmm1, %xmm9 2006 2007 movsd %xmm8, 0 * SIZE(CO2) 2008 movhpd %xmm8, 1 * SIZE(CO2) 2009 movsd %xmm9, 2 * SIZE(CO2) 2010 movhpd %xmm9, 3 * SIZE(CO2) 2011 2012 addq $4 * SIZE, CO1 2013 addq $4 * SIZE, CO2 2014 ALIGN_4 2015 2016.L70: 2017 testq $1, M 2018 je .L79 2019 ALIGN_4 2020 2021.L71: 2022#if !defined(TRMMKERNEL) || \ 2023 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 
2024 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2025 2026 movq B, BO 2027#else 2028 movq KK, %rax 2029 leaq (, %rax, SIZE), %rax 2030 leaq (AO, %rax, 1), AO 2031 leaq (B, %rax, 2), BO 2032#endif 2033 2034 movddup 0 * SIZE(AO), %xmm8 2035 pxor %xmm0, %xmm0 2036 movapd 0 * SIZE(BO), %xmm9 2037 pxor %xmm1, %xmm1 2038 movddup 4 * SIZE(AO), %xmm10 2039 pxor %xmm2, %xmm2 2040 movapd 8 * SIZE(BO), %xmm11 2041 pxor %xmm3, %xmm3 2042 2043#ifndef TRMMKERNEL 2044 movq K, %rax 2045#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2046 movq K, %rax 2047 subq KK, %rax 2048 movq %rax, KKK 2049#else 2050 movq KK, %rax 2051#ifdef LEFT 2052 addq $1, %rax 2053#else 2054 addq $2, %rax 2055#endif 2056 movq %rax, KKK 2057#endif 2058 sarq $3, %rax 2059 je .L75 2060 ALIGN_4 2061 2062.L72: 2063 mulpd %xmm8, %xmm9 2064 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2065 movddup 1 * SIZE(AO), %xmm8 2066 addpd %xmm9, %xmm0 2067 mulpd 2 * SIZE(BO), %xmm8 2068 movapd 16 * SIZE(BO), %xmm9 2069 addpd %xmm8, %xmm1 2070 movddup 2 * SIZE(AO), %xmm8 2071 mulpd 4 * SIZE(BO), %xmm8 2072 addpd %xmm8, %xmm2 2073 movddup 3 * SIZE(AO), %xmm8 2074 mulpd 6 * SIZE(BO), %xmm8 2075 addpd %xmm8, %xmm3 2076 movddup 8 * SIZE(AO), %xmm8 2077 mulpd %xmm10, %xmm11 2078 movddup 5 * SIZE(AO), %xmm10 2079 addpd %xmm11, %xmm0 2080 mulpd 10 * SIZE(BO), %xmm10 2081 movapd 24 * SIZE(BO), %xmm11 2082 addpd %xmm10, %xmm1 2083 movddup 6 * SIZE(AO), %xmm10 2084 mulpd 12 * SIZE(BO), %xmm10 2085 addpd %xmm10, %xmm2 2086 movddup 7 * SIZE(AO), %xmm10 2087 mulpd 14 * SIZE(BO), %xmm10 2088 addpd %xmm10, %xmm3 2089 movddup 12 * SIZE(AO), %xmm10 2090 2091 addq $ 8 * SIZE, AO 2092 addq $16 * SIZE, BO 2093 decq %rax 2094 jne .L72 2095 ALIGN_4 2096 2097.L75: 2098#ifndef TRMMKERNEL 2099 movq K, %rax 2100#else 2101 movq KKK, %rax 2102#endif 2103 movsd ALPHA_R, %xmm15 2104 movhpd ALPHA_I, %xmm15 2105 andq $7, %rax # if (k & 1) 2106 BRANCH 2107 je .L78 2108 ALIGN_4 2109 2110.L76: 2111 mulpd %xmm8, %xmm9 
2112 movddup 1 * SIZE(AO), %xmm8 2113 addpd %xmm9, %xmm0 2114 movapd 2 * SIZE(BO), %xmm9 2115 2116 addq $1 * SIZE, AO # aoffset += 4 2117 addq $2 * SIZE, BO # boffset1 += 8 2118 decq %rax 2119 jg .L76 2120 ALIGN_4 2121 2122.L78: 2123 addpd %xmm1, %xmm0 2124 addpd %xmm3, %xmm2 2125 addpd %xmm2, %xmm0 2126 2127 movsd 0 * SIZE(CO1), %xmm8 2128 movhpd 1 * SIZE(CO1), %xmm8 2129 2130 movddup %xmm0, %xmm12 2131 mulpd %xmm15, %xmm12 2132 addpd %xmm12, %xmm8 2133 2134 movsd %xmm8, 0 * SIZE(CO1) 2135 movhpd %xmm8, 1 * SIZE(CO1) 2136 2137 movsd 0 * SIZE(CO2), %xmm8 2138 movhpd 1 * SIZE(CO2), %xmm8 2139 2140 unpckhpd %xmm0, %xmm0 2141 2142 mulpd %xmm15, %xmm0 2143 addpd %xmm0, %xmm8 2144 2145 movsd %xmm8, 0 * SIZE(CO2) 2146 movhpd %xmm8, 1 * SIZE(CO2) 2147 ALIGN_4 2148 2149.L79: 2150#if defined(TRMMKERNEL) && !defined(LEFT) 2151 addl $2, KK 2152#endif 2153 leaq (C, LDC, 2), C 2154 movq BO, B 2155 ALIGN_4 2156 2157.L80: 2158 testq $1, N 2159 je .L999 2160 ALIGN_4 2161 2162#if defined(TRMMKERNEL) && defined(LEFT) 2163 movq OFFSET, %rax 2164 movq %rax, KK 2165#endif 2166 2167 movq C, CO1 2168 movq A, AO 2169 2170 movq M, I 2171 sarq $2, I # i = (m >> 2) 2172 jle .L100 2173 ALIGN_4 2174 2175.L91: 2176#if !defined(TRMMKERNEL) || \ 2177 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2178 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2179 2180 movq B, BO 2181#else 2182 movq KK, %rax 2183 leaq (, %rax, SIZE), %rax 2184 leaq (AO, %rax, 4), AO 2185 leaq (B, %rax, 1), BO 2186#endif 2187 2188 movapd 0 * SIZE(AO), %xmm8 2189 pxor %xmm0, %xmm0 2190 movddup 0 * SIZE(BO), %xmm9 2191 pxor %xmm1, %xmm1 2192 movapd 8 * SIZE(AO), %xmm10 2193 pxor %xmm2, %xmm2 2194 movddup 4 * SIZE(BO), %xmm11 2195 pxor %xmm3, %xmm3 2196 2197#ifdef HAVE_3DNOW 2198 prefetchw 4 * SIZE(CO1) 2199#else 2200 prefetchnta 4 * SIZE(CO1) 2201#endif 2202 2203#ifndef TRMMKERNEL 2204 movq K, %rax 2205#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2206 movq K, %rax 
2207 subq KK, %rax 2208 movq %rax, KKK 2209#else 2210 movq KK, %rax 2211#ifdef LEFT 2212 addq $4, %rax 2213#else 2214 addq $1, %rax 2215#endif 2216 movq %rax, KKK 2217#endif 2218 sarq $3, %rax 2219 je .L95 2220 ALIGN_4 2221 2222.L92: 2223 mulpd %xmm9, %xmm8 2224 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2225 mulpd 2 * SIZE(AO), %xmm9 2226 addpd %xmm8, %xmm0 2227 movapd 4 * SIZE(AO), %xmm8 2228 addpd %xmm9, %xmm1 2229 movddup 1 * SIZE(BO), %xmm9 2230 mulpd %xmm9, %xmm8 2231 mulpd 6 * SIZE(AO), %xmm9 2232 addpd %xmm8, %xmm2 2233 movapd 16 * SIZE(AO), %xmm8 2234 addpd %xmm9, %xmm3 2235 movddup 2 * SIZE(BO), %xmm9 2236 mulpd %xmm9, %xmm10 2237 mulpd 10 * SIZE(AO), %xmm9 2238 addpd %xmm10, %xmm0 2239 movapd 12 * SIZE(AO), %xmm10 2240 addpd %xmm9, %xmm1 2241 movddup 3 * SIZE(BO), %xmm9 2242 mulpd %xmm9, %xmm10 2243 mulpd 14 * SIZE(AO), %xmm9 2244 addpd %xmm10, %xmm2 2245 movapd 24 * SIZE(AO), %xmm10 2246 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 2247 addpd %xmm9, %xmm3 2248 movddup 8 * SIZE(BO), %xmm9 2249 mulpd %xmm11, %xmm8 2250 mulpd 18 * SIZE(AO), %xmm11 2251 addpd %xmm8, %xmm0 2252 movapd 20 * SIZE(AO), %xmm8 2253 addpd %xmm11, %xmm1 2254 movddup 5 * SIZE(BO), %xmm11 2255 mulpd %xmm11, %xmm8 2256 mulpd 22 * SIZE(AO), %xmm11 2257 addpd %xmm8, %xmm2 2258 movapd 32 * SIZE(AO), %xmm8 2259 addpd %xmm11, %xmm3 2260 movddup 6 * SIZE(BO), %xmm11 2261 mulpd %xmm11, %xmm10 2262 mulpd 26 * SIZE(AO), %xmm11 2263 addpd %xmm10, %xmm0 2264 movapd 28 * SIZE(AO), %xmm10 2265 addpd %xmm11, %xmm1 2266 movddup 7 * SIZE(BO), %xmm11 2267 mulpd %xmm11, %xmm10 2268 mulpd 30 * SIZE(AO), %xmm11 2269 addpd %xmm10, %xmm2 2270 movapd 40 * SIZE(AO), %xmm10 2271 addpd %xmm11, %xmm3 2272 movddup 12 * SIZE(BO), %xmm11 2273 2274 addq $32 * SIZE, AO 2275 addq $8 * SIZE, BO 2276 decq %rax 2277 jne .L92 2278 ALIGN_4 2279 2280.L95: 2281#ifndef TRMMKERNEL 2282 movq K, %rax 2283#else 2284 movq KKK, %rax 2285#endif 2286 movsd ALPHA_R, %xmm15 2287 movhpd ALPHA_I, %xmm15 2288 andq $7, %rax # if (k & 1) 2289 
BRANCH 2290 je .L99 2291 ALIGN_4 2292 2293.L96: 2294 mulpd %xmm9, %xmm8 2295 mulpd 2 * SIZE(AO), %xmm9 2296 addpd %xmm8, %xmm0 2297 movapd 4 * SIZE(AO), %xmm8 2298 addpd %xmm9, %xmm1 2299 movddup 1 * SIZE(BO), %xmm9 2300 2301 addq $4 * SIZE, AO # aoffset += 4 2302 addq $1 * SIZE, BO # boffset1 += 8 2303 decq %rax 2304 jg .L96 2305 ALIGN_4 2306 2307.L99: 2308 addpd %xmm2, %xmm0 2309 addpd %xmm3, %xmm1 2310 2311 movsd 0 * SIZE(CO1), %xmm8 2312 movhpd 1 * SIZE(CO1), %xmm8 2313 movsd 2 * SIZE(CO1), %xmm9 2314 movhpd 3 * SIZE(CO1), %xmm9 2315 2316 movsd 4 * SIZE(CO1), %xmm10 2317 movhpd 5 * SIZE(CO1), %xmm10 2318 movsd 6 * SIZE(CO1), %xmm11 2319 movhpd 7 * SIZE(CO1), %xmm11 2320 2321 movddup %xmm0, %xmm12 2322 unpckhpd %xmm0, %xmm0 2323 movddup %xmm1, %xmm13 2324 unpckhpd %xmm1, %xmm1 2325 2326 mulpd %xmm15, %xmm12 2327 mulpd %xmm15, %xmm0 2328 mulpd %xmm15, %xmm13 2329 mulpd %xmm15, %xmm1 2330 2331 addpd %xmm12, %xmm8 2332 addpd %xmm0, %xmm9 2333 addpd %xmm13, %xmm10 2334 addpd %xmm1, %xmm11 2335 2336 movsd %xmm8, 0 * SIZE(CO1) 2337 movhpd %xmm8, 1 * SIZE(CO1) 2338 movsd %xmm9, 2 * SIZE(CO1) 2339 movhpd %xmm9, 3 * SIZE(CO1) 2340 2341 movsd %xmm10, 4 * SIZE(CO1) 2342 movhpd %xmm10, 5 * SIZE(CO1) 2343 movsd %xmm11, 6 * SIZE(CO1) 2344 movhpd %xmm11, 7 * SIZE(CO1) 2345 2346 addq $8 * SIZE, CO1 # coffset += 4 2347 decq I # i -- 2348 jg .L91 2349 ALIGN_4 2350 2351.L100: 2352 testq $2, M 2353 je .L110 2354 ALIGN_4 2355 2356.L101: 2357#if !defined(TRMMKERNEL) || \ 2358 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2359 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2360 2361 movq B, BO 2362#else 2363 movq KK, %rax 2364 leaq (, %rax, SIZE), %rax 2365 leaq (AO, %rax, 2), AO 2366 leaq (B, %rax, 1), BO 2367#endif 2368 2369 movapd 0 * SIZE(AO), %xmm8 2370 pxor %xmm0, %xmm0 2371 movddup 0 * SIZE(BO), %xmm9 2372 pxor %xmm1, %xmm1 2373 movapd 8 * SIZE(AO), %xmm10 2374 pxor %xmm2, %xmm2 2375 movddup 4 * SIZE(BO), %xmm11 2376 pxor %xmm3, %xmm3 2377 
2378#ifndef TRMMKERNEL 2379 movq K, %rax 2380#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2381 movq K, %rax 2382 subq KK, %rax 2383 movq %rax, KKK 2384#else 2385 movq KK, %rax 2386#ifdef LEFT 2387 addq $2, %rax 2388#else 2389 addq $1, %rax 2390#endif 2391 movq %rax, KKK 2392#endif 2393 sarq $3, %rax 2394 je .L105 2395 ALIGN_4 2396 2397.L102: 2398 mulpd %xmm9, %xmm8 2399 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2400 movddup 1 * SIZE(BO), %xmm9 2401 addpd %xmm8, %xmm0 2402 mulpd 2 * SIZE(AO), %xmm9 2403 movapd 16 * SIZE(AO), %xmm8 2404 addpd %xmm9, %xmm1 2405 movddup 2 * SIZE(BO), %xmm9 2406 mulpd 4 * SIZE(AO), %xmm9 2407 addpd %xmm9, %xmm2 2408 movddup 3 * SIZE(BO), %xmm9 2409 mulpd 6 * SIZE(AO), %xmm9 2410 addpd %xmm9, %xmm3 2411 movddup 8 * SIZE(BO), %xmm9 2412 mulpd %xmm11, %xmm10 2413 movddup 5 * SIZE(BO), %xmm11 2414 addpd %xmm10, %xmm0 2415 mulpd 10 * SIZE(AO), %xmm11 2416 movapd 24 * SIZE(AO), %xmm10 2417 addpd %xmm11, %xmm1 2418 movddup 6 * SIZE(BO), %xmm11 2419 mulpd 12 * SIZE(AO), %xmm11 2420 addpd %xmm11, %xmm2 2421 movddup 7 * SIZE(BO), %xmm11 2422 mulpd 14 * SIZE(AO), %xmm11 2423 addpd %xmm11, %xmm3 2424 movddup 12 * SIZE(BO), %xmm11 2425 2426 addq $16 * SIZE, AO 2427 addq $ 8 * SIZE, BO 2428 decq %rax 2429 jne .L102 2430 ALIGN_4 2431 2432.L105: 2433#ifndef TRMMKERNEL 2434 movq K, %rax 2435#else 2436 movq KKK, %rax 2437#endif 2438 movsd ALPHA_R, %xmm15 2439 movhpd ALPHA_I, %xmm15 2440 andq $7, %rax # if (k & 1) 2441 BRANCH 2442 je .L109 2443 ALIGN_4 2444 2445.L106: 2446 mulpd %xmm9, %xmm8 2447 movddup 1 * SIZE(BO), %xmm9 2448 addpd %xmm8, %xmm0 2449 movapd 2 * SIZE(AO), %xmm8 2450 2451 addq $2 * SIZE, AO # aoffset += 4 2452 addq $1 * SIZE, BO # boffset1 += 8 2453 decq %rax 2454 jg .L106 2455 ALIGN_4 2456 2457.L109: 2458 addpd %xmm1, %xmm0 2459 addpd %xmm3, %xmm2 2460 addpd %xmm2, %xmm0 2461 2462 movsd 0 * SIZE(CO1), %xmm8 2463 movhpd 1 * SIZE(CO1), %xmm8 2464 movsd 2 * SIZE(CO1), %xmm9 2465 movhpd 3 * SIZE(CO1), %xmm9 
2466 2467 movddup %xmm0, %xmm12 2468 unpckhpd %xmm0, %xmm0 2469 2470 mulpd %xmm15, %xmm12 2471 mulpd %xmm15, %xmm0 2472 addpd %xmm12, %xmm8 2473 addpd %xmm0, %xmm9 2474 2475 movsd %xmm8, 0 * SIZE(CO1) 2476 movhpd %xmm8, 1 * SIZE(CO1) 2477 movsd %xmm9, 2 * SIZE(CO1) 2478 movhpd %xmm9, 3 * SIZE(CO1) 2479 2480 addq $4 * SIZE, CO1 2481 ALIGN_4 2482 2483.L110: 2484 testq $1, M 2485 je .L999 2486 ALIGN_4 2487 2488.L111: 2489#if !defined(TRMMKERNEL) || \ 2490 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2491 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2492 2493 movq B, BO 2494#else 2495 movq KK, %rax 2496 leaq (, %rax, SIZE), %rax 2497 leaq (AO, %rax, 1), AO 2498 leaq (B, %rax, 1), BO 2499#endif 2500 2501 movsd 0 * SIZE(AO), %xmm8 2502 pxor %xmm0, %xmm0 2503 movsd 0 * SIZE(BO), %xmm9 2504 pxor %xmm1, %xmm1 2505 movsd 4 * SIZE(AO), %xmm10 2506 pxor %xmm2, %xmm2 2507 movsd 4 * SIZE(BO), %xmm11 2508 pxor %xmm3, %xmm3 2509 2510 movapd 0 * SIZE(AO), %xmm9 2511 movapd 0 * SIZE(BO), %xmm8 2512 movapd 4 * SIZE(AO), %xmm11 2513 movapd 4 * SIZE(BO), %xmm10 2514 2515#ifndef TRMMKERNEL 2516 movq K, %rax 2517#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2518 movq K, %rax 2519 subq KK, %rax 2520 movq %rax, KKK 2521#else 2522 movq KK, %rax 2523#ifdef LEFT 2524 addq $1, %rax 2525#else 2526 addq $1, %rax 2527#endif 2528 movq %rax, KKK 2529#endif 2530 sarq $3, %rax 2531 je .L115 2532 ALIGN_4 2533 2534.L112: 2535 mulpd %xmm9, %xmm8 2536 movapd 2 * SIZE(AO), %xmm9 2537 addpd %xmm8, %xmm0 2538 mulpd 2 * SIZE(BO), %xmm9 2539 movapd 8 * SIZE(BO), %xmm8 2540 addpd %xmm9, %xmm1 2541 movapd 8 * SIZE(AO), %xmm9 2542 mulpd %xmm11, %xmm10 2543 movapd 6 * SIZE(AO), %xmm11 2544 addpd %xmm10, %xmm0 2545 mulpd 6 * SIZE(BO), %xmm11 2546 movapd 12 * SIZE(BO), %xmm10 2547 addpd %xmm11, %xmm1 2548 movapd 12 * SIZE(AO), %xmm11 2549 2550 addq $8 * SIZE, AO 2551 addq $8 * SIZE, BO 2552 decq %rax 2553 jne .L112 2554 ALIGN_4 2555 2556.L115: 
2557#ifndef TRMMKERNEL 2558 movq K, %rax 2559#else 2560 movq KKK, %rax 2561#endif 2562 movsd ALPHA_R, %xmm15 2563 movhpd ALPHA_I, %xmm15 2564 andq $7, %rax # if (k & 1) 2565 BRANCH 2566 je .L118 2567 ALIGN_4 2568 2569.L116: 2570 mulsd 0 * SIZE(BO), %xmm9 2571 addsd %xmm9, %xmm0 2572 movsd 1 * SIZE(AO), %xmm9 2573 2574 addq $1 * SIZE, AO # aoffset += 4 2575 addq $1 * SIZE, BO # boffset1 += 8 2576 decq %rax 2577 jg .L116 2578 ALIGN_4 2579 2580.L118: 2581 addpd %xmm1, %xmm0 2582 haddpd %xmm0, %xmm0 2583 2584 movsd 0 * SIZE(CO1), %xmm8 2585 movhpd 1 * SIZE(CO1), %xmm8 2586 2587 movddup %xmm0, %xmm12 2588 2589 mulpd %xmm15, %xmm12 2590 addpd %xmm12, %xmm8 2591 2592 movsd %xmm8, 0 * SIZE(CO1) 2593 movhpd %xmm8, 1 * SIZE(CO1) 2594 ALIGN_4 2595 2596.L999: 2597 movq 0(%rsp), %rbx 2598 movq 8(%rsp), %rbp 2599 movq 16(%rsp), %r12 2600 movq 24(%rsp), %r13 2601 movq 32(%rsp), %r14 2602 movq 40(%rsp), %r15 2603 2604#ifdef WINDOWS_ABI 2605 movq 48(%rsp), %rdi 2606 movq 56(%rsp), %rsi 2607 movups 64(%rsp), %xmm6 2608 movups 80(%rsp), %xmm7 2609 movups 96(%rsp), %xmm8 2610 movups 112(%rsp), %xmm9 2611 movups 128(%rsp), %xmm10 2612 movups 144(%rsp), %xmm11 2613 movups 160(%rsp), %xmm12 2614 movups 176(%rsp), %xmm13 2615 movups 192(%rsp), %xmm14 2616 movups 208(%rsp), %xmm15 2617#endif 2618 2619 addq $STACKSIZE, %rsp 2620 ret 2621 2622 EPILOGUE 2623