1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define M %rdi 43#define N %rsi 44#define K %rdx 45 46#define A %rcx 47#define B %r8 48#define C %r9 49#define LDC %r10 50 51#define I %r11 52#define AO %r13 53#define BO %r14 54#define CO1 %r15 55#define CO2 %rbx 56#define KK %rbp 57#define BB %r12 58 59#ifndef WINDOWS_ABI 60 61#define STACKSIZE 128 62 63#define OLD_LDC 8 + STACKSIZE(%rsp) 64#define OLD_OFFSET 16 + STACKSIZE(%rsp) 65 66#define OFFSET 48(%rsp) 67#define J 56(%rsp) 68#define KKK 64(%rsp) 69#define AORIG 72(%rsp) 70 71#else 72 73#define STACKSIZE 256 74 75#define OLD_A 40 + STACKSIZE(%rsp) 76#define OLD_B 48 + STACKSIZE(%rsp) 77#define OLD_C 56 + STACKSIZE(%rsp) 78#define OLD_LDC 64 + STACKSIZE(%rsp) 79#define OLD_OFFSET 72 + STACKSIZE(%rsp) 80 81#define OFFSET 224(%rsp) 82#define J 232(%rsp) 83#define KKK 240(%rsp) 84#define AORIG 248(%rsp) 85 86#endif 87 88#define PREFETCH prefetcht1 89#define PREFETCHSIZE (16 * 12 + 3) 90#define PREFETCH_R (4 * 4 + 0) 91 92#define KERNEL1(address) \ 93 mulpd %xmm8, %xmm9 ;\ 94 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 2 * SIZE(AO);\ 95 addpd %xmm9, %xmm0;\ 96 movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 97 mulpd %xmm8, %xmm9;\ 98 addpd %xmm9, %xmm1;\ 99 movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 100 mulpd %xmm8, %xmm9;\ 101 addpd %xmm9, %xmm2;\ 102 movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 103 mulpd %xmm8, %xmm9;\ 104 movapd 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ 105 addpd %xmm9, %xmm3;\ 106 movddup 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9 107 108#define KERNEL2(address) \ 109 mulpd %xmm8, %xmm9;\ 110 addpd %xmm9, %xmm4;\ 111 movddup 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 112 mulpd %xmm8, %xmm9;\ 113 addpd %xmm9, %xmm5;\ 114 movddup 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 115 mulpd %xmm8, %xmm9;\ 116 addpd %xmm9, %xmm6;\ 117 movddup 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 118 
mulpd %xmm8, %xmm9;\ 119 movapd 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ 120 addpd %xmm9, %xmm7;\ 121 movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 122 123#define KERNEL3(address) \ 124 mulpd %xmm8, %xmm9;\ 125 addpd %xmm9, %xmm0;\ 126 movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 127 mulpd %xmm8, %xmm9;\ 128 addpd %xmm9, %xmm1;\ 129 movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 130 mulpd %xmm8, %xmm9;\ 131 addpd %xmm9, %xmm2;\ 132 movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 133 mulpd %xmm8, %xmm9;\ 134 movapd 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ 135 addpd %xmm9, %xmm3;\ 136 movddup 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9 137 138#define KERNEL4(address) \ 139 mulpd %xmm8, %xmm9;\ 140 addpd %xmm9, %xmm4;\ 141 movddup 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 142 mulpd %xmm8, %xmm9;\ 143 addpd %xmm9, %xmm5;\ 144 movddup 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 145 mulpd %xmm8, %xmm9;\ 146 addpd %xmm9, %xmm6;\ 147 movddup 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\ 148 mulpd %xmm8, %xmm9;\ 149 movapd 32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\ 150 addpd %xmm9, %xmm7;\ 151 movddup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm9 152 153#define KERNEL5(address) \ 154 mulpd %xmm10, %xmm11;\ 155 addpd %xmm11, %xmm0;\ 156 movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 157 mulpd %xmm10, %xmm11;\ 158 addpd %xmm11, %xmm1;\ 159 movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 160 mulpd %xmm10, %xmm11;\ 161 addpd %xmm11, %xmm2;\ 162 movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 163 mulpd %xmm10, %xmm11;\ 164 movapd 10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ 165 addpd %xmm11, %xmm3;\ 166 movddup 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11 167 168#define KERNEL6(address) \ 169 mulpd %xmm10, %xmm11;\ 170 addpd %xmm11, %xmm4;\ 171 movddup 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 172 mulpd %xmm10, %xmm11;\ 173 addpd %xmm11, %xmm5;\ 174 movddup 10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 175 mulpd 
%xmm10, %xmm11;\ 176 addpd %xmm11, %xmm6;\ 177 movddup 11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 178 mulpd %xmm10, %xmm11;\ 179 movapd 12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ 180 addpd %xmm11, %xmm7;\ 181 movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 182 183#define KERNEL7(address) \ 184 mulpd %xmm10, %xmm11;\ 185 addpd %xmm11, %xmm0;\ 186 movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 187 mulpd %xmm10, %xmm11;\ 188 addpd %xmm11, %xmm1;\ 189 movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 190 mulpd %xmm10, %xmm11;\ 191 addpd %xmm11, %xmm2;\ 192 movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 193 mulpd %xmm10, %xmm11;\ 194 movapd 14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ 195 addpd %xmm11, %xmm3;\ 196 movddup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm11 197 198#define KERNEL8(address) \ 199 mulpd %xmm10, %xmm11;\ 200 addpd %xmm11, %xmm4;\ 201 movddup 13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 202 mulpd %xmm10, %xmm11;\ 203 addpd %xmm11, %xmm5;\ 204 movddup 14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 205 mulpd %xmm10, %xmm11;\ 206 addpd %xmm11, %xmm6;\ 207 movddup 15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\ 208 mulpd %xmm10, %xmm11;\ 209 movapd 40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\ 210 addpd %xmm11, %xmm7;\ 211 movddup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm11 212 213#define KERNEL9(address) \ 214 mulpd %xmm12, %xmm13;\ 215 PREFETCH (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\ 216 addpd %xmm13, %xmm0;\ 217 movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 218 mulpd %xmm12, %xmm13;\ 219 addpd %xmm13, %xmm1;\ 220 movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 221 mulpd %xmm12, %xmm13;\ 222 addpd %xmm13, %xmm2;\ 223 movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 224 mulpd %xmm12, %xmm13;\ 225 movapd 18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ 226 addpd %xmm13, %xmm3;\ 227 movddup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm13 228 229#define KERNEL10(address) \ 230 mulpd 
%xmm12, %xmm13;\ 231 addpd %xmm13, %xmm4;\ 232 movddup 17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 233 mulpd %xmm12, %xmm13;\ 234 addpd %xmm13, %xmm5;\ 235 movddup 18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 236 mulpd %xmm12, %xmm13;\ 237 addpd %xmm13, %xmm6;\ 238 movddup 19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 239 mulpd %xmm12, %xmm13;\ 240 movapd 20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ 241 addpd %xmm13, %xmm7;\ 242 movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 243 244#define KERNEL11(address) \ 245 mulpd %xmm12, %xmm13;\ 246 addpd %xmm13, %xmm0;\ 247 movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 248 mulpd %xmm12, %xmm13;\ 249 addpd %xmm13, %xmm1;\ 250 movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 251 mulpd %xmm12, %xmm13;\ 252 addpd %xmm13, %xmm2;\ 253 movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 254 mulpd %xmm12, %xmm13;\ 255 movapd 22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ 256 addpd %xmm13, %xmm3;\ 257 movddup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm13 258 259#define KERNEL12(address) \ 260 mulpd %xmm12, %xmm13;\ 261 addpd %xmm13, %xmm4;\ 262 movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 263 mulpd %xmm12, %xmm13;\ 264 addpd %xmm13, %xmm5;\ 265 movddup 22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 266 mulpd %xmm12, %xmm13;\ 267 addpd %xmm13, %xmm6;\ 268 movddup 23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\ 269 mulpd %xmm12, %xmm13;\ 270 movapd 48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\ 271 addpd %xmm13, %xmm7;\ 272 movddup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm13 273 274#define KERNEL13(address) \ 275 mulpd %xmm14, %xmm15;\ 276 addpd %xmm15, %xmm0;\ 277 movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 278 mulpd %xmm14, %xmm15;\ 279 addpd %xmm15, %xmm1;\ 280 movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 281 mulpd %xmm14, %xmm15;\ 282 addpd %xmm15, %xmm2;\ 283 movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 284 mulpd %xmm14, %xmm15;\ 285 movapd 26 * SIZE + 
(address) * 2 * SIZE(AO), %xmm14;\ 286 addpd %xmm15, %xmm3;\ 287 movddup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm15 288 289#define KERNEL14(address) \ 290 mulpd %xmm14, %xmm15;\ 291 addpd %xmm15, %xmm4;\ 292 movddup 25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 293 mulpd %xmm14, %xmm15;\ 294 addpd %xmm15, %xmm5;\ 295 movddup 26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 296 mulpd %xmm14, %xmm15;\ 297 addpd %xmm15, %xmm6;\ 298 movddup 27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 299 mulpd %xmm14, %xmm15;\ 300 movapd 28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ 301 addpd %xmm15, %xmm7;\ 302 movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 303 304#define KERNEL15(address) \ 305 mulpd %xmm14, %xmm15;\ 306 addpd %xmm15, %xmm0;\ 307 movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 308 mulpd %xmm14, %xmm15;\ 309 addpd %xmm15, %xmm1;\ 310 movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 311 mulpd %xmm14, %xmm15;\ 312 addpd %xmm15, %xmm2;\ 313 movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 314 mulpd %xmm14, %xmm15;\ 315 movapd 30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ 316 addpd %xmm15, %xmm3;\ 317 movddup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm15 318 319#define KERNEL16(address) \ 320 mulpd %xmm14, %xmm15;\ 321 addpd %xmm15, %xmm4;\ 322 movddup 29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 323 mulpd %xmm14, %xmm15;\ 324 addpd %xmm15, %xmm5;\ 325 movddup 30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 326 mulpd %xmm14, %xmm15;\ 327 addpd %xmm15, %xmm6;\ 328 movddup 31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\ 329 mulpd %xmm14, %xmm15;\ 330 movapd 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\ 331 addpd %xmm15, %xmm7;\ 332 movddup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15 333 334 PROLOGUE 335 PROFCODE 336 337 subq $STACKSIZE, %rsp 338 movq %rbx, 0(%rsp) 339 movq %rbp, 8(%rsp) 340 movq %r12, 16(%rsp) 341 movq %r13, 24(%rsp) 342 movq %r14, 32(%rsp) 343 movq %r15, 40(%rsp) 344 345#ifdef WINDOWS_ABI 346 movq %rdi, 48(%rsp) 347 movq 
%rsi, 56(%rsp) 348 movups %xmm6, 64(%rsp) 349 movups %xmm7, 80(%rsp) 350 movups %xmm8, 96(%rsp) 351 movups %xmm9, 112(%rsp) 352 movups %xmm10, 128(%rsp) 353 movups %xmm11, 144(%rsp) 354 movups %xmm12, 160(%rsp) 355 movups %xmm13, 176(%rsp) 356 movups %xmm14, 192(%rsp) 357 movups %xmm15, 208(%rsp) 358 359 movq ARG1, M 360 movq ARG2, N 361 movq ARG3, K 362 movq OLD_A, A 363 movq OLD_B, B 364 movq OLD_C, C 365#endif 366 367 movq OLD_LDC, LDC 368 movq OLD_OFFSET, KK 369 370 movq KK, OFFSET 371 372 leaq (, LDC, SIZE), LDC 373 374#ifdef LN 375 leaq (, M, SIZE), %rax 376 addq %rax, C 377 imulq K, %rax 378 addq %rax, A 379#endif 380 381#ifdef RT 382 leaq (, N, SIZE), %rax 383 imulq K, %rax 384 addq %rax, B 385 movq N, %rax 386 imulq LDC, %rax 387 addq %rax, C 388#endif 389 390#ifdef RN 391 negq KK 392#endif 393 394#ifdef RT 395 movq N, %rax 396 subq OFFSET, %rax 397 movq %rax, KK 398#endif 399 400 movq N, J 401 sarq $2, J # j = (n >> 2) 402 jle .L40 403 ALIGN_4 404 405.L10: 406#if defined(LT) || defined(RN) 407 movq A, AO 408#else 409 movq A, AORIG 410#endif 411 412#ifdef RT 413 movq K, %rax 414 salq $2 + BASE_SHIFT, %rax 415 subq %rax, B 416 417 leaq (, LDC, 4), %rax 418 subq %rax, C 419#endif 420 421 movq C, CO1 422 leaq (C, LDC, 1), CO2 423#ifndef RT 424 leaq (C, LDC, 4), C 425#endif 426 427#ifdef LN 428 movq OFFSET, %rax 429 addq M, %rax 430 movq %rax, KK 431#endif 432 433 movq K, %rax 434 salq $BASE_SHIFT + 2, %rax 435 leaq (B, %rax), BB 436 437#ifdef LT 438 movq OFFSET, %rax 439 movq %rax, KK 440#endif 441 442 movq M, I 443 sarq $2, I # i = (m >> 2) 444 jle .L20 445 ALIGN_4 446 447.L11: 448#ifdef LN 449 movq K, %rax 450 salq $2 + BASE_SHIFT, %rax 451 subq %rax, AORIG 452#endif 453 454#if defined(LN) || defined(RT) 455 movq KK, %rax 456 leaq (, %rax, SIZE), %rax 457 movq AORIG, AO 458 leaq (AO, %rax, 4), AO 459 leaq (B, %rax, 4), BO 460#else 461 movq B, BO 462#endif 463 464 prefetcht0 0 * SIZE(BB) 465 subq $-8 * SIZE, BB 466 467 movapd 0 * SIZE(AO), %xmm8 468 pxor 
%xmm0, %xmm0 469 movddup 0 * SIZE(BO), %xmm9 470 pxor %xmm1, %xmm1 471 movapd 8 * SIZE(AO), %xmm10 472 pxor %xmm2, %xmm2 473 movddup 8 * SIZE(BO), %xmm11 474 pxor %xmm3, %xmm3 475 476 movapd 16 * SIZE(AO), %xmm12 477 movddup 16 * SIZE(BO), %xmm13 478 movapd 24 * SIZE(AO), %xmm14 479 movddup 24 * SIZE(BO), %xmm15 480 481 prefetchnta 4 * SIZE(CO1) 482 pxor %xmm4, %xmm4 483 prefetchnta 4 * SIZE(CO2) 484 pxor %xmm5, %xmm5 485 prefetchnta 4 * SIZE(CO1, LDC, 2) 486 pxor %xmm6, %xmm6 487 prefetchnta 4 * SIZE(CO2, LDC, 2) 488 pxor %xmm7, %xmm7 489 490#if defined(LT) || defined(RN) 491 movq KK, %rax 492#else 493 movq K, %rax 494 subq KK, %rax 495#endif 496 497#if 1 498 andq $-8, %rax 499 salq $4, %rax 500 je .L15 501.L1X: 502 KERNEL1 (16 * 0) 503 KERNEL2 (16 * 0) 504 KERNEL3 (16 * 0) 505 KERNEL4 (16 * 0) 506 KERNEL5 (16 * 0) 507 KERNEL6 (16 * 0) 508 KERNEL7 (16 * 0) 509 KERNEL8 (16 * 0) 510 KERNEL9 (16 * 0) 511 KERNEL10(16 * 0) 512 KERNEL11(16 * 0) 513 KERNEL12(16 * 0) 514 KERNEL13(16 * 0) 515 KERNEL14(16 * 0) 516 KERNEL15(16 * 0) 517 KERNEL16(16 * 0) 518 cmpq $128 * 1, %rax 519 NOBRANCH 520 jle .L12 521 KERNEL1 (16 * 1) 522 KERNEL2 (16 * 1) 523 KERNEL3 (16 * 1) 524 KERNEL4 (16 * 1) 525 KERNEL5 (16 * 1) 526 KERNEL6 (16 * 1) 527 KERNEL7 (16 * 1) 528 KERNEL8 (16 * 1) 529 KERNEL9 (16 * 1) 530 KERNEL10(16 * 1) 531 KERNEL11(16 * 1) 532 KERNEL12(16 * 1) 533 KERNEL13(16 * 1) 534 KERNEL14(16 * 1) 535 KERNEL15(16 * 1) 536 KERNEL16(16 * 1) 537 cmpq $128 * 2, %rax 538 NOBRANCH 539 jle .L12 540 KERNEL1 (16 * 2) 541 KERNEL2 (16 * 2) 542 KERNEL3 (16 * 2) 543 KERNEL4 (16 * 2) 544 KERNEL5 (16 * 2) 545 KERNEL6 (16 * 2) 546 KERNEL7 (16 * 2) 547 KERNEL8 (16 * 2) 548 KERNEL9 (16 * 2) 549 KERNEL10(16 * 2) 550 KERNEL11(16 * 2) 551 KERNEL12(16 * 2) 552 KERNEL13(16 * 2) 553 KERNEL14(16 * 2) 554 KERNEL15(16 * 2) 555 KERNEL16(16 * 2) 556 cmpq $128 * 3, %rax 557 NOBRANCH 558 jle .L12 559 KERNEL1 (16 * 3) 560 KERNEL2 (16 * 3) 561 KERNEL3 (16 * 3) 562 KERNEL4 (16 * 3) 563 KERNEL5 (16 * 3) 564 KERNEL6 
(16 * 3) 565 KERNEL7 (16 * 3) 566 KERNEL8 (16 * 3) 567 KERNEL9 (16 * 3) 568 KERNEL10(16 * 3) 569 KERNEL11(16 * 3) 570 KERNEL12(16 * 3) 571 KERNEL13(16 * 3) 572 KERNEL14(16 * 3) 573 KERNEL15(16 * 3) 574 KERNEL16(16 * 3) 575 cmpq $128 * 4, %rax 576 NOBRANCH 577 jle .L12 578 KERNEL1 (16 * 4) 579 KERNEL2 (16 * 4) 580 KERNEL3 (16 * 4) 581 KERNEL4 (16 * 4) 582 KERNEL5 (16 * 4) 583 KERNEL6 (16 * 4) 584 KERNEL7 (16 * 4) 585 KERNEL8 (16 * 4) 586 KERNEL9 (16 * 4) 587 KERNEL10(16 * 4) 588 KERNEL11(16 * 4) 589 KERNEL12(16 * 4) 590 KERNEL13(16 * 4) 591 KERNEL14(16 * 4) 592 KERNEL15(16 * 4) 593 KERNEL16(16 * 4) 594 cmpq $128 * 5, %rax 595 NOBRANCH 596 jle .L12 597 KERNEL1 (16 * 5) 598 KERNEL2 (16 * 5) 599 KERNEL3 (16 * 5) 600 KERNEL4 (16 * 5) 601 KERNEL5 (16 * 5) 602 KERNEL6 (16 * 5) 603 KERNEL7 (16 * 5) 604 KERNEL8 (16 * 5) 605 KERNEL9 (16 * 5) 606 KERNEL10(16 * 5) 607 KERNEL11(16 * 5) 608 KERNEL12(16 * 5) 609 KERNEL13(16 * 5) 610 KERNEL14(16 * 5) 611 KERNEL15(16 * 5) 612 KERNEL16(16 * 5) 613 cmpq $128 * 6, %rax 614 NOBRANCH 615 jle .L12 616 KERNEL1 (16 * 6) 617 KERNEL2 (16 * 6) 618 KERNEL3 (16 * 6) 619 KERNEL4 (16 * 6) 620 KERNEL5 (16 * 6) 621 KERNEL6 (16 * 6) 622 KERNEL7 (16 * 6) 623 KERNEL8 (16 * 6) 624 KERNEL9 (16 * 6) 625 KERNEL10(16 * 6) 626 KERNEL11(16 * 6) 627 KERNEL12(16 * 6) 628 KERNEL13(16 * 6) 629 KERNEL14(16 * 6) 630 KERNEL15(16 * 6) 631 KERNEL16(16 * 6) 632 cmpq $128 * 7, %rax 633 NOBRANCH 634 jle .L12 635 KERNEL1 (16 * 7) 636 KERNEL2 (16 * 7) 637 KERNEL3 (16 * 7) 638 KERNEL4 (16 * 7) 639 KERNEL5 (16 * 7) 640 KERNEL6 (16 * 7) 641 KERNEL7 (16 * 7) 642 KERNEL8 (16 * 7) 643 KERNEL9 (16 * 7) 644 KERNEL10(16 * 7) 645 KERNEL11(16 * 7) 646 KERNEL12(16 * 7) 647 KERNEL13(16 * 7) 648 KERNEL14(16 * 7) 649 KERNEL15(16 * 7) 650 KERNEL16(16 * 7) 651 652 addq $32 * 8 * SIZE, AO 653 addq $32 * 8 * SIZE, BO 654 subq $128 * 8, %rax 655 jg .L1X 656 657.L12: 658 leaq (AO, %rax, 2), AO # * 16 659 leaq (BO, %rax, 2), BO # * 64 660#else 661 sarq $3, %rax 662 je .L15 663 ALIGN_4 664 
665.L12: 666 mulpd %xmm8, %xmm9 667 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 668 addpd %xmm9, %xmm0 669 movddup 1 * SIZE(BO), %xmm9 670 mulpd %xmm8, %xmm9 671 addpd %xmm9, %xmm1 672 movddup 2 * SIZE(BO), %xmm9 673 mulpd %xmm8, %xmm9 674 addpd %xmm9, %xmm2 675 movddup 3 * SIZE(BO), %xmm9 676 mulpd %xmm8, %xmm9 677 movapd 2 * SIZE(AO), %xmm8 678 addpd %xmm9, %xmm3 679 movddup 0 * SIZE(BO), %xmm9 680 mulpd %xmm8, %xmm9 681 addpd %xmm9, %xmm4 682 movddup 1 * SIZE(BO), %xmm9 683 mulpd %xmm8, %xmm9 684 addpd %xmm9, %xmm5 685 movddup 2 * SIZE(BO), %xmm9 686 mulpd %xmm8, %xmm9 687 addpd %xmm9, %xmm6 688 movddup 3 * SIZE(BO), %xmm9 689 mulpd %xmm8, %xmm9 690 movapd 4 * SIZE(AO), %xmm8 691 addpd %xmm9, %xmm7 692 movddup 4 * SIZE(BO), %xmm9 693 mulpd %xmm8, %xmm9 694 addpd %xmm9, %xmm0 695 movddup 5 * SIZE(BO), %xmm9 696 mulpd %xmm8, %xmm9 697 addpd %xmm9, %xmm1 698 movddup 6 * SIZE(BO), %xmm9 699 mulpd %xmm8, %xmm9 700 addpd %xmm9, %xmm2 701 movddup 7 * SIZE(BO), %xmm9 702 mulpd %xmm8, %xmm9 703 movapd 6 * SIZE(AO), %xmm8 704 addpd %xmm9, %xmm3 705 movddup 4 * SIZE(BO), %xmm9 706 mulpd %xmm8, %xmm9 707 addpd %xmm9, %xmm4 708 movddup 5 * SIZE(BO), %xmm9 709 mulpd %xmm8, %xmm9 710 addpd %xmm9, %xmm5 711 movddup 6 * SIZE(BO), %xmm9 712 mulpd %xmm8, %xmm9 713 addpd %xmm9, %xmm6 714 movddup 7 * SIZE(BO), %xmm9 715 mulpd %xmm8, %xmm9 716 movapd 32 * SIZE(AO), %xmm8 717 addpd %xmm9, %xmm7 718 719 movddup 32 * SIZE(BO), %xmm9 720 mulpd %xmm10, %xmm11 721 addpd %xmm11, %xmm0 722 movddup 9 * SIZE(BO), %xmm11 723 mulpd %xmm10, %xmm11 724 addpd %xmm11, %xmm1 725 movddup 10 * SIZE(BO), %xmm11 726 mulpd %xmm10, %xmm11 727 addpd %xmm11, %xmm2 728 movddup 11 * SIZE(BO), %xmm11 729 mulpd %xmm10, %xmm11 730 movapd 10 * SIZE(AO), %xmm10 731 addpd %xmm11, %xmm3 732 733 movddup 8 * SIZE(BO), %xmm11 734 mulpd %xmm10, %xmm11 735 addpd %xmm11, %xmm4 736 movddup 9 * SIZE(BO), %xmm11 737 mulpd %xmm10, %xmm11 738 addpd %xmm11, %xmm5 739 movddup 10 * SIZE(BO), %xmm11 740 mulpd %xmm10, %xmm11 741 addpd 
%xmm11, %xmm6 742 movddup 11 * SIZE(BO), %xmm11 743 mulpd %xmm10, %xmm11 744 movapd 12 * SIZE(AO), %xmm10 745 addpd %xmm11, %xmm7 746 movddup 12 * SIZE(BO), %xmm11 747 mulpd %xmm10, %xmm11 748 addpd %xmm11, %xmm0 749 movddup 13 * SIZE(BO), %xmm11 750 mulpd %xmm10, %xmm11 751 addpd %xmm11, %xmm1 752 movddup 14 * SIZE(BO), %xmm11 753 mulpd %xmm10, %xmm11 754 addpd %xmm11, %xmm2 755 movddup 15 * SIZE(BO), %xmm11 756 mulpd %xmm10, %xmm11 757 movapd 14 * SIZE(AO), %xmm10 758 addpd %xmm11, %xmm3 759 760 movddup 12 * SIZE(BO), %xmm11 761 mulpd %xmm10, %xmm11 762 addpd %xmm11, %xmm4 763 movddup 13 * SIZE(BO), %xmm11 764 mulpd %xmm10, %xmm11 765 addpd %xmm11, %xmm5 766 movddup 14 * SIZE(BO), %xmm11 767 mulpd %xmm10, %xmm11 768 addpd %xmm11, %xmm6 769 movddup 15 * SIZE(BO), %xmm11 770 mulpd %xmm10, %xmm11 771 movapd 40 * SIZE(AO), %xmm10 772 addpd %xmm11, %xmm7 773 movddup 40 * SIZE(BO), %xmm11 774 775 mulpd %xmm12, %xmm13 776 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 777 addpd %xmm13, %xmm0 778 movddup 17 * SIZE(BO), %xmm13 779 mulpd %xmm12, %xmm13 780 addpd %xmm13, %xmm1 781 movddup 18 * SIZE(BO), %xmm13 782 mulpd %xmm12, %xmm13 783 addpd %xmm13, %xmm2 784 movddup 19 * SIZE(BO), %xmm13 785 mulpd %xmm12, %xmm13 786 movapd 18 * SIZE(AO), %xmm12 787 addpd %xmm13, %xmm3 788 789 movddup 16 * SIZE(BO), %xmm13 790 mulpd %xmm12, %xmm13 791 addpd %xmm13, %xmm4 792 movddup 17 * SIZE(BO), %xmm13 793 mulpd %xmm12, %xmm13 794 addpd %xmm13, %xmm5 795 movddup 18 * SIZE(BO), %xmm13 796 mulpd %xmm12, %xmm13 797 addpd %xmm13, %xmm6 798 movddup 19 * SIZE(BO), %xmm13 799 mulpd %xmm12, %xmm13 800 movapd 20 * SIZE(AO), %xmm12 801 addpd %xmm13, %xmm7 802 803 movddup 20 * SIZE(BO), %xmm13 804 mulpd %xmm12, %xmm13 805 addpd %xmm13, %xmm0 806 movddup 21 * SIZE(BO), %xmm13 807 mulpd %xmm12, %xmm13 808 addpd %xmm13, %xmm1 809 movddup 22 * SIZE(BO), %xmm13 810 mulpd %xmm12, %xmm13 811 addpd %xmm13, %xmm2 812 movddup 23 * SIZE(BO), %xmm13 813 mulpd %xmm12, %xmm13 814 movapd 22 * SIZE(AO), %xmm12 815 
addpd %xmm13, %xmm3 816 817 movddup 20 * SIZE(BO), %xmm13 818 mulpd %xmm12, %xmm13 819 addpd %xmm13, %xmm4 820 movddup 21 * SIZE(BO), %xmm13 821 mulpd %xmm12, %xmm13 822 addpd %xmm13, %xmm5 823 movddup 22 * SIZE(BO), %xmm13 824 mulpd %xmm12, %xmm13 825 addpd %xmm13, %xmm6 826 movddup 23 * SIZE(BO), %xmm13 827 mulpd %xmm12, %xmm13 828 movapd 48 * SIZE(AO), %xmm12 829 addpd %xmm13, %xmm7 830 movddup 48 * SIZE(BO), %xmm13 831 832 mulpd %xmm14, %xmm15 833 addpd %xmm15, %xmm0 834 movddup 25 * SIZE(BO), %xmm15 835 mulpd %xmm14, %xmm15 836 addpd %xmm15, %xmm1 837 movddup 26 * SIZE(BO), %xmm15 838 mulpd %xmm14, %xmm15 839 addpd %xmm15, %xmm2 840 movddup 27 * SIZE(BO), %xmm15 841 mulpd %xmm14, %xmm15 842 movapd 26 * SIZE(AO), %xmm14 843 addpd %xmm15, %xmm3 844 845 movddup 24 * SIZE(BO), %xmm15 846 mulpd %xmm14, %xmm15 847 addpd %xmm15, %xmm4 848 movddup 25 * SIZE(BO), %xmm15 849 mulpd %xmm14, %xmm15 850 addpd %xmm15, %xmm5 851 movddup 26 * SIZE(BO), %xmm15 852 mulpd %xmm14, %xmm15 853 addpd %xmm15, %xmm6 854 movddup 27 * SIZE(BO), %xmm15 855 mulpd %xmm14, %xmm15 856 movapd 28 * SIZE(AO), %xmm14 857 addpd %xmm15, %xmm7 858 859 movddup 28 * SIZE(BO), %xmm15 860 mulpd %xmm14, %xmm15 861 addpd %xmm15, %xmm0 862 movddup 29 * SIZE(BO), %xmm15 863 mulpd %xmm14, %xmm15 864 addpd %xmm15, %xmm1 865 movddup 30 * SIZE(BO), %xmm15 866 mulpd %xmm14, %xmm15 867 addpd %xmm15, %xmm2 868 movddup 31 * SIZE(BO), %xmm15 869 mulpd %xmm14, %xmm15 870 movapd 30 * SIZE(AO), %xmm14 871 addpd %xmm15, %xmm3 872 873 movddup 28 * SIZE(BO), %xmm15 874 mulpd %xmm14, %xmm15 875 addpd %xmm15, %xmm4 876 movddup 29 * SIZE(BO), %xmm15 877 mulpd %xmm14, %xmm15 878 addpd %xmm15, %xmm5 879 movddup 30 * SIZE(BO), %xmm15 880 mulpd %xmm14, %xmm15 881 addpd %xmm15, %xmm6 882 movddup 31 * SIZE(BO), %xmm15 883 mulpd %xmm14, %xmm15 884 movapd 56 * SIZE(AO), %xmm14 885 addpd %xmm15, %xmm7 886 movddup 56 * SIZE(BO), %xmm15 887 888 addq $32 * SIZE, BO 889 addq $32 * SIZE, AO 890 decq %rax 891 BRANCH 892 jne .L12 893#endif 
894 ALIGN_4 895 896.L15: 897#if defined(LT) || defined(RN) 898 movq KK, %rax 899#else 900 movq K, %rax 901 subq KK, %rax 902#endif 903 andq $7, %rax # if (k & 1) 904 BRANCH 905 je .L19 906 ALIGN_4 907 908.L16: 909 mulpd %xmm8, %xmm9 910 movapd 2 * SIZE(AO), %xmm10 911 addpd %xmm9, %xmm0 912 movddup 1 * SIZE(BO), %xmm9 913 mulpd %xmm8, %xmm9 914 movddup 0 * SIZE(BO), %xmm11 915 addpd %xmm9, %xmm1 916 movddup 2 * SIZE(BO), %xmm9 917 mulpd %xmm8, %xmm9 918 addpd %xmm9, %xmm2 919 movddup 3 * SIZE(BO), %xmm9 920 mulpd %xmm8, %xmm9 921 movapd 4 * SIZE(AO), %xmm8 922 addpd %xmm9, %xmm3 923 movddup 4 * SIZE(BO), %xmm9 924 mulpd %xmm10, %xmm11 925 addpd %xmm11, %xmm4 926 movddup 1 * SIZE(BO), %xmm11 927 mulpd %xmm10, %xmm11 928 addpd %xmm11, %xmm5 929 movddup 2 * SIZE(BO), %xmm11 930 mulpd %xmm10, %xmm11 931 addpd %xmm11, %xmm6 932 movddup 3 * SIZE(BO), %xmm11 933 mulpd %xmm10, %xmm11 934 addpd %xmm11, %xmm7 935 936 addq $4 * SIZE, AO # aoffset += 4 937 addq $4 * SIZE, BO # boffset1 += 8 938 decq %rax 939 jg .L16 940 ALIGN_4 941 942.L19: 943#if defined(LN) || defined(RT) 944 movq KK, %rax 945 subq $4, %rax 946 947 leaq (, %rax, SIZE), %rax 948 949 movq AORIG, AO 950 leaq (AO, %rax, 4), AO 951 leaq (B, %rax, 4), BO 952#endif 953 954#if defined(LN) || defined(LT) 955 movapd %xmm0, %xmm8 956 unpcklpd %xmm1, %xmm0 957 unpckhpd %xmm1, %xmm8 958 959 movapd %xmm2, %xmm10 960 unpcklpd %xmm3, %xmm2 961 unpckhpd %xmm3, %xmm10 962 963 movapd %xmm4, %xmm12 964 unpcklpd %xmm5, %xmm4 965 unpckhpd %xmm5, %xmm12 966 967 movapd %xmm6, %xmm14 968 unpcklpd %xmm7, %xmm6 969 unpckhpd %xmm7, %xmm14 970 971 movapd 0 * SIZE(BO), %xmm1 972 movapd 2 * SIZE(BO), %xmm3 973 movapd 4 * SIZE(BO), %xmm5 974 movapd 6 * SIZE(BO), %xmm7 975 movapd 8 * SIZE(BO), %xmm9 976 movapd 10 * SIZE(BO), %xmm11 977 movapd 12 * SIZE(BO), %xmm13 978 movapd 14 * SIZE(BO), %xmm15 979 980 subpd %xmm0, %xmm1 981 subpd %xmm2, %xmm3 982 subpd %xmm8, %xmm5 983 subpd %xmm10, %xmm7 984 subpd %xmm4, %xmm9 985 subpd %xmm6, %xmm11 
986 subpd %xmm12, %xmm13 987 subpd %xmm14, %xmm15 988#else 989 990 movapd 0 * SIZE(AO), %xmm8 991 movapd 2 * SIZE(AO), %xmm9 992 movapd 4 * SIZE(AO), %xmm10 993 movapd 6 * SIZE(AO), %xmm11 994 995 movapd 8 * SIZE(AO), %xmm12 996 movapd 10 * SIZE(AO), %xmm13 997 movapd 12 * SIZE(AO), %xmm14 998 movapd 14 * SIZE(AO), %xmm15 999 1000 subpd %xmm0, %xmm8 1001 subpd %xmm4, %xmm9 1002 subpd %xmm1, %xmm10 1003 subpd %xmm5, %xmm11 1004 subpd %xmm2, %xmm12 1005 subpd %xmm6, %xmm13 1006 subpd %xmm3, %xmm14 1007 subpd %xmm7, %xmm15 1008#endif 1009 1010 1011#ifdef LN 1012 movddup 15 * SIZE(AO), %xmm0 1013 mulpd %xmm0, %xmm13 1014 mulpd %xmm0, %xmm15 1015 1016 movddup 14 * SIZE(AO), %xmm2 1017 mulpd %xmm13, %xmm2 1018 subpd %xmm2, %xmm9 1019 movddup 14 * SIZE(AO), %xmm2 1020 mulpd %xmm15, %xmm2 1021 subpd %xmm2, %xmm11 1022 1023 movddup 13 * SIZE(AO), %xmm4 1024 mulpd %xmm13, %xmm4 1025 subpd %xmm4, %xmm5 1026 movddup 13 * SIZE(AO), %xmm4 1027 mulpd %xmm15, %xmm4 1028 subpd %xmm4, %xmm7 1029 1030 movddup 12 * SIZE(AO), %xmm6 1031 mulpd %xmm13, %xmm6 1032 subpd %xmm6, %xmm1 1033 movddup 12 * SIZE(AO), %xmm6 1034 mulpd %xmm15, %xmm6 1035 subpd %xmm6, %xmm3 1036 1037 movddup 10 * SIZE(AO), %xmm0 1038 mulpd %xmm0, %xmm9 1039 mulpd %xmm0, %xmm11 1040 1041 movddup 9 * SIZE(AO), %xmm2 1042 mulpd %xmm9, %xmm2 1043 subpd %xmm2, %xmm5 1044 movddup 9 * SIZE(AO), %xmm2 1045 mulpd %xmm11, %xmm2 1046 subpd %xmm2, %xmm7 1047 1048 movddup 8 * SIZE(AO), %xmm4 1049 mulpd %xmm9, %xmm4 1050 subpd %xmm4, %xmm1 1051 movddup 8 * SIZE(AO), %xmm4 1052 mulpd %xmm11, %xmm4 1053 subpd %xmm4, %xmm3 1054 1055 movddup 5 * SIZE(AO), %xmm0 1056 mulpd %xmm0, %xmm5 1057 mulpd %xmm0, %xmm7 1058 1059 movddup 4 * SIZE(AO), %xmm2 1060 mulpd %xmm5, %xmm2 1061 subpd %xmm2, %xmm1 1062 movddup 4 * SIZE(AO), %xmm2 1063 mulpd %xmm7, %xmm2 1064 subpd %xmm2, %xmm3 1065 1066 movddup 0 * SIZE(AO), %xmm0 1067 mulpd %xmm0, %xmm1 1068 mulpd %xmm0, %xmm3 1069#endif 1070 1071#ifdef LT 1072 movddup 0 * SIZE(AO), %xmm0 1073 mulpd 
%xmm0, %xmm1 1074 mulpd %xmm0, %xmm3 1075 1076 movddup 1 * SIZE(AO), %xmm2 1077 mulpd %xmm1, %xmm2 1078 subpd %xmm2, %xmm5 1079 movddup 1 * SIZE(AO), %xmm2 1080 mulpd %xmm3, %xmm2 1081 subpd %xmm2, %xmm7 1082 1083 movddup 2 * SIZE(AO), %xmm4 1084 mulpd %xmm1, %xmm4 1085 subpd %xmm4, %xmm9 1086 movddup 2 * SIZE(AO), %xmm4 1087 mulpd %xmm3, %xmm4 1088 subpd %xmm4, %xmm11 1089 1090 movddup 3 * SIZE(AO), %xmm6 1091 mulpd %xmm1, %xmm6 1092 subpd %xmm6, %xmm13 1093 movddup 3 * SIZE(AO), %xmm6 1094 mulpd %xmm3, %xmm6 1095 subpd %xmm6, %xmm15 1096 1097 movddup 5 * SIZE(AO), %xmm0 1098 mulpd %xmm0, %xmm5 1099 mulpd %xmm0, %xmm7 1100 1101 movddup 6 * SIZE(AO), %xmm2 1102 mulpd %xmm5, %xmm2 1103 subpd %xmm2, %xmm9 1104 movddup 6 * SIZE(AO), %xmm2 1105 mulpd %xmm7, %xmm2 1106 subpd %xmm2, %xmm11 1107 1108 movddup 7 * SIZE(AO), %xmm4 1109 mulpd %xmm5, %xmm4 1110 subpd %xmm4, %xmm13 1111 movddup 7 * SIZE(AO), %xmm4 1112 mulpd %xmm7, %xmm4 1113 subpd %xmm4, %xmm15 1114 1115 movddup 10 * SIZE(AO), %xmm0 1116 mulpd %xmm0, %xmm9 1117 mulpd %xmm0, %xmm11 1118 1119 movddup 11 * SIZE(AO), %xmm2 1120 mulpd %xmm9, %xmm2 1121 subpd %xmm2, %xmm13 1122 movddup 11 * SIZE(AO), %xmm2 1123 mulpd %xmm11, %xmm2 1124 subpd %xmm2, %xmm15 1125 1126 movddup 15 * SIZE(AO), %xmm0 1127 mulpd %xmm0, %xmm13 1128 mulpd %xmm0, %xmm15 1129#endif 1130 1131 1132#ifdef RN 1133 movddup 0 * SIZE(BO), %xmm0 1134 mulpd %xmm0, %xmm8 1135 mulpd %xmm0, %xmm9 1136 1137 movddup 1 * SIZE(BO), %xmm1 1138 mulpd %xmm8, %xmm1 1139 subpd %xmm1, %xmm10 1140 movddup 1 * SIZE(BO), %xmm1 1141 mulpd %xmm9, %xmm1 1142 subpd %xmm1, %xmm11 1143 1144 movddup 2 * SIZE(BO), %xmm2 1145 mulpd %xmm8, %xmm2 1146 subpd %xmm2, %xmm12 1147 movddup 2 * SIZE(BO), %xmm2 1148 mulpd %xmm9, %xmm2 1149 subpd %xmm2, %xmm13 1150 1151 movddup 3 * SIZE(BO), %xmm3 1152 mulpd %xmm8, %xmm3 1153 subpd %xmm3, %xmm14 1154 movddup 3 * SIZE(BO), %xmm3 1155 mulpd %xmm9, %xmm3 1156 subpd %xmm3, %xmm15 1157 1158 movddup 5 * SIZE(BO), %xmm0 1159 mulpd %xmm0, %xmm10 
/* ==================================================================
 * NOTE(review): this chunk is the interior of one kernel routine; its
 * entry/exit are outside this view.  Register roles (M, N, K, A, B, C,
 * LDC, AO, BO, CO1, CO2, I, J, KK, BB, AORIG) are #defined at the top
 * of the file.  LN/LT/RN/RT select the four triangular-solve variants.
 * Below: finish the solve of the 4x4 accumulator tile (rows in
 * xmm8..xmm15 for the R* cases, columns in xmm1..xmm15-odd for L*),
 * store it to C, write the solved tile back into the packed A/B
 * buffers, update pointers/KK, then begin the M&2 (2-row) tile.
 * The #if region closed by the first #endif opens before this chunk
 * (by symmetry with the RT branch it appears to be the RN
 * forward-substitution — confirm upstream).
 * ================================================================== */
        mulpd   %xmm0, %xmm11

        /* eliminate column 2 contribution from columns 3 and 4 */
        movddup  6 * SIZE(BO), %xmm1
        mulpd   %xmm10, %xmm1
        subpd   %xmm1, %xmm12
        movddup  6 * SIZE(BO), %xmm1
        mulpd   %xmm11, %xmm1
        subpd   %xmm1, %xmm13

        movddup  7 * SIZE(BO), %xmm2
        mulpd   %xmm10, %xmm2
        subpd   %xmm2, %xmm14
        movddup  7 * SIZE(BO), %xmm2
        mulpd   %xmm11, %xmm2
        subpd   %xmm2, %xmm15

        /* scale column 3 by diagonal entry B[2][2] (presumably stored
           pre-inverted by the packing routine — TODO confirm) */
        movddup 10 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm12
        mulpd   %xmm0, %xmm13

        movddup 11 * SIZE(BO), %xmm1
        mulpd   %xmm12, %xmm1
        subpd   %xmm1, %xmm14
        movddup 11 * SIZE(BO), %xmm1
        mulpd   %xmm13, %xmm1
        subpd   %xmm1, %xmm15

        /* scale final column by B[3][3] */
        movddup 15 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm14
        mulpd   %xmm0, %xmm15
#endif

#ifdef RT
        /* backward substitution: upper-triangular solve from the last
           column (B entries read in 4x4 row-major order from BO) */
        movddup 15 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm14
        mulpd   %xmm0, %xmm15

        movddup 14 * SIZE(BO), %xmm1
        mulpd   %xmm14, %xmm1
        subpd   %xmm1, %xmm12
        movddup 14 * SIZE(BO), %xmm1
        mulpd   %xmm15, %xmm1
        subpd   %xmm1, %xmm13

        movddup 13 * SIZE(BO), %xmm2
        mulpd   %xmm14, %xmm2
        subpd   %xmm2, %xmm10
        movddup 13 * SIZE(BO), %xmm2
        mulpd   %xmm15, %xmm2
        subpd   %xmm2, %xmm11

        movddup 12 * SIZE(BO), %xmm3
        mulpd   %xmm14, %xmm3
        subpd   %xmm3, %xmm8
        movddup 12 * SIZE(BO), %xmm3
        mulpd   %xmm15, %xmm3
        subpd   %xmm3, %xmm9

        movddup 10 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm12
        mulpd   %xmm0, %xmm13

        movddup  9 * SIZE(BO), %xmm1
        mulpd   %xmm12, %xmm1
        subpd   %xmm1, %xmm10
        movddup  9 * SIZE(BO), %xmm1
        mulpd   %xmm13, %xmm1
        subpd   %xmm1, %xmm11

        movddup  8 * SIZE(BO), %xmm2
        mulpd   %xmm12, %xmm2
        subpd   %xmm2, %xmm8
        movddup  8 * SIZE(BO), %xmm2
        mulpd   %xmm13, %xmm2
        subpd   %xmm2, %xmm9

        movddup  5 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm10
        mulpd   %xmm0, %xmm11

        movddup  4 * SIZE(BO), %xmm1
        mulpd   %xmm10, %xmm1
        subpd   %xmm1, %xmm8
        movddup  4 * SIZE(BO), %xmm1
        mulpd   %xmm11, %xmm1
        subpd   %xmm1, %xmm9

        movddup  0 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm8
        mulpd   %xmm0, %xmm9
#endif

#ifdef LN
        /* LN walks C right-to-left: step back 4 columns of the tile */
        subq    $4 * SIZE, CO1
        subq    $4 * SIZE, CO2
#endif

        /* store the solved 4x4 tile to the four C columns
           (CO1, CO2, CO1+2*LDC, CO2+2*LDC); L* variants keep the tile
           transposed in xmm1/5/9/13 + xmm3/7/11/15, R* variants keep
           rows in xmm8..xmm15 */
#if defined(LN) || defined(LT)
        movsd   %xmm1, 0 * SIZE(CO1)
        movsd   %xmm5, 1 * SIZE(CO1)
        movsd   %xmm9, 2 * SIZE(CO1)
        movsd   %xmm13, 3 * SIZE(CO1)

        movhpd  %xmm1, 0 * SIZE(CO2)
        movhpd  %xmm5, 1 * SIZE(CO2)
        movhpd  %xmm9, 2 * SIZE(CO2)
        movhpd  %xmm13, 3 * SIZE(CO2)

        movsd   %xmm3, 0 * SIZE(CO1, LDC, 2)
        movsd   %xmm7, 1 * SIZE(CO1, LDC, 2)
        movsd   %xmm11, 2 * SIZE(CO1, LDC, 2)
        movsd   %xmm15, 3 * SIZE(CO1, LDC, 2)

        movhpd  %xmm3, 0 * SIZE(CO2, LDC, 2)
        movhpd  %xmm7, 1 * SIZE(CO2, LDC, 2)
        movhpd  %xmm11, 2 * SIZE(CO2, LDC, 2)
        movhpd  %xmm15, 3 * SIZE(CO2, LDC, 2)
#else
        movsd   %xmm8, 0 * SIZE(CO1)
        movhpd  %xmm8, 1 * SIZE(CO1)
        movsd   %xmm9, 2 * SIZE(CO1)
        movhpd  %xmm9, 3 * SIZE(CO1)

        movsd   %xmm10, 0 * SIZE(CO2)
        movhpd  %xmm10, 1 * SIZE(CO2)
        movsd   %xmm11, 2 * SIZE(CO2)
        movhpd  %xmm11, 3 * SIZE(CO2)

        movsd   %xmm12, 0 * SIZE(CO1, LDC, 2)
        movhpd  %xmm12, 1 * SIZE(CO1, LDC, 2)
        movsd   %xmm13, 2 * SIZE(CO1, LDC, 2)
        movhpd  %xmm13, 3 * SIZE(CO1, LDC, 2)

        movsd   %xmm14, 0 * SIZE(CO2, LDC, 2)
        movhpd  %xmm14, 1 * SIZE(CO2, LDC, 2)
        movsd   %xmm15, 2 * SIZE(CO2, LDC, 2)
        movhpd  %xmm15, 3 * SIZE(CO2, LDC, 2)
#endif

        /* write the solved tile back into the packed panel so later
           tiles of this solve see updated values */
#if defined(LN) || defined(LT)
        movapd  %xmm1, 0 * SIZE(BO)
        movapd  %xmm3, 2 * SIZE(BO)
        movapd  %xmm5, 4 * SIZE(BO)
        movapd  %xmm7, 6 * SIZE(BO)
        movapd  %xmm9, 8 * SIZE(BO)
        movapd  %xmm11, 10 * SIZE(BO)
        movapd  %xmm13, 12 * SIZE(BO)
        movapd  %xmm14, 12 * SIZE(AO)
#else
        movapd  %xmm8, 0 * SIZE(AO)
        movapd  %xmm9, 2 * SIZE(AO)
        movapd  %xmm10, 4 * SIZE(AO)
        movapd  %xmm11, 6 * SIZE(AO)
        movapd  %xmm12, 8 * SIZE(AO)
        movapd  %xmm13, 10 * SIZE(AO)
        movapd  %xmm14, 12 * SIZE(AO)
        movapd  %xmm15, 14 * SIZE(AO)
#endif

#ifndef LN
        addq    $4 * SIZE, CO1
        addq    $4 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
        /* advance AO/BO past the K-KK remaining depth */
        movq    K, %rax
        subq    KK, %rax
        leaq    (,%rax, SIZE), %rax
        leaq    (AO, %rax, 4), AO
        leaq    (BO, %rax, 4), BO
#endif

#ifdef LN
        subq    $4, KK
#endif

#ifdef LT
        addq    $4, KK
#endif

#ifdef RT
        movq    K, %rax
        salq    $2 + BASE_SHIFT, %rax
        addq    %rax, AORIG
#endif

        decq    I                       # i --
        jg      .L11
        ALIGN_4

/* ---- M & 2: one 2x4 tile (two rows of C, four columns) ---- */
.L20:
        testq   $2, M
        BRANCH
        je      .L30
        ALIGN_4

.L21:
#ifdef LN
        movq    K, %rax
        salq    $1 + BASE_SHIFT, %rax
        subq    %rax, AORIG
#endif

#if defined(LN) || defined(RT)
        /* reposition AO/BO at depth KK within the packed panels */
        movq    KK, %rax
        leaq    (, %rax, SIZE), %rax
        movq    AORIG, AO
        leaq    (AO, %rax, 2), AO
        leaq    (B, %rax, 4), BO
#else
        movq    B, BO
#endif

        movapd  0 * SIZE(AO), %xmm8
        pxor    %xmm0, %xmm0            # accumulators for the 2x4 tile
        movddup 0 * SIZE(BO), %xmm9
        pxor    %xmm1, %xmm1
        movapd  8 * SIZE(AO), %xmm10
        pxor    %xmm2, %xmm2
        movddup 8 * SIZE(BO), %xmm11
        pxor    %xmm3, %xmm3

#if defined(LT) || defined(RN)
        movq    KK, %rax
#else
        movq    K, %rax
        subq    KK, %rax
#endif
        sarq    $3, %rax                # depth loop unrolled by 8
        je      .L25
        ALIGN_4

/* inner product loop: rank-1 updates, A pair in xmm8/xmm10,
   B scalars broadcast with movddup */
.L22:
        mulpd   %xmm8, %xmm9
        PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
        addpd   %xmm9, %xmm0
        movddup  1 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm1
        movddup  2 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm2
        movddup  3 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movapd   2 * SIZE(AO), %xmm8
        addpd   %xmm9, %xmm3
        movddup  4 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm0
        movddup  5 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm1
        movddup  6 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm2
        /* continuation of the .L22 depth-8 unrolled loop (see above):
           alternating broadcast registers xmm9/xmm11 hide load latency */
        movddup  7 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movapd   4 * SIZE(AO), %xmm8
        addpd   %xmm9, %xmm3
        movddup 16 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm0
        movddup  9 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm1
        movddup 10 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm2
        movddup 11 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd   6 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm3
        movddup 12 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm0
        movddup 13 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm1
        movddup 14 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm2
        movddup 15 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd  16 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm3
        movddup 24 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm9
        addpd   %xmm9, %xmm0
        movddup 17 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        addpd   %xmm9, %xmm1
        movddup 18 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        addpd   %xmm9, %xmm2
        movddup 19 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        movapd  10 * SIZE(AO), %xmm10
        addpd   %xmm9, %xmm3
        movddup 20 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        addpd   %xmm9, %xmm0
        movddup 21 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        addpd   %xmm9, %xmm1
        movddup 22 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        addpd   %xmm9, %xmm2
        movddup 23 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        movapd  12 * SIZE(AO), %xmm10
        addpd   %xmm9, %xmm3
        movddup 32 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm0
        movddup 25 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm1
        movddup 26 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm2
        movddup 27 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        movapd  14 * SIZE(AO), %xmm10
        addpd   %xmm11, %xmm3
        movddup 28 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm0
        movddup 29 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm1
        movddup 30 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm2
        movddup 31 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        movapd  24 * SIZE(AO), %xmm10
        addpd   %xmm11, %xmm3
        movddup 40 * SIZE(BO), %xmm11

        addq    $16 * SIZE, AO          # 2 rows x 8 depths consumed
        addq    $32 * SIZE, BO          # 4 cols x 8 depths consumed
        decq    %rax
        jne     .L22
        ALIGN_4

/* tail: handle remaining K % 8 depth iterations one at a time */
.L25:
#if defined(LT) || defined(RN)
        movq    KK, %rax
#else
        movq    K, %rax
        subq    KK, %rax
#endif
        andq    $7, %rax                # if (k & 1)
        BRANCH
        je      .L29
        ALIGN_4

.L26:
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm0
        movddup  1 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm1
        movddup  2 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm2
        movddup  3 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movapd   2 * SIZE(AO), %xmm8
        addpd   %xmm9, %xmm3
        movddup  4 * SIZE(BO), %xmm9

        addq    $2 * SIZE, AO           # aoffset += 4
        addq    $4 * SIZE, BO           # boffset1 += 8
        decq    %rax
        jg      .L26
        ALIGN_4

/* ---- triangular solve of the 2x4 tile ---- */
.L29:
#if defined(LN) || defined(RT)
        /* step AO/BO back to this tile's own data */
        movq    KK, %rax
#ifdef LN
        subq    $2, %rax
#else
        subq    $4, %rax
#endif

        leaq    (, %rax, SIZE), %rax
        movq    AORIG, AO
        leaq    (AO, %rax, 2), AO
        leaq    (B, %rax, 4), BO
#endif

#if defined(LN) || defined(LT)
        /* transpose accumulators, then form (packed B) - accum */
        movapd  %xmm0, %xmm8
        unpcklpd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm8

        movapd  %xmm2, %xmm10
        unpcklpd %xmm3, %xmm2
        unpckhpd %xmm3, %xmm10

        movapd   0 * SIZE(BO), %xmm1
        movapd   2 * SIZE(BO), %xmm3
        movapd   4 * SIZE(BO), %xmm5
        movapd   6 * SIZE(BO), %xmm7

        subpd   %xmm0, %xmm1
        subpd   %xmm2, %xmm3
        subpd   %xmm8, %xmm5
        subpd   %xmm10, %xmm7
#else

        /* R* cases: (packed A) - accum, rows stay in xmm8..xmm14 */
        movapd   0 * SIZE(AO), %xmm8
        movapd   2 * SIZE(AO), %xmm10
        movapd   4 * SIZE(AO), %xmm12
        movapd   6 * SIZE(AO), %xmm14

        subpd   %xmm0, %xmm8
        subpd   %xmm1, %xmm10
        subpd   %xmm2, %xmm12
        subpd   %xmm3, %xmm14
#endif

#ifdef LN
        /* 2x2 lower-triangular solve from row 1 backwards;
           diagonal entries presumably pre-inverted (TODO confirm) */
        movddup  3 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm5
        mulpd   %xmm0, %xmm7

        movddup  2 * SIZE(AO), %xmm2
        mulpd   %xmm5, %xmm2
        subpd   %xmm2, %xmm1
        movddup  2 * SIZE(AO), %xmm2
        mulpd   %xmm7, %xmm2
        subpd   %xmm2, %xmm3

        movddup  0 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm1
        mulpd   %xmm0, %xmm3
#endif

#ifdef LT
        /* 2x2 upper-triangular solve, forward */
        movddup  0 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm1
        mulpd   %xmm0, %xmm3

        movddup  1 * SIZE(AO), %xmm2
        mulpd   %xmm1, %xmm2
        subpd   %xmm2, %xmm5
        movddup  1 * SIZE(AO), %xmm2
        mulpd   %xmm3, %xmm2
        subpd   %xmm2, %xmm7

        movddup  3 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm5
        mulpd   %xmm0, %xmm7
#endif

#ifdef RN
        /* 4x4 forward substitution against B (row-major offsets) */
        movddup  0 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm8

        movddup  1 * SIZE(BO), %xmm1
        mulpd   %xmm8, %xmm1
        subpd   %xmm1, %xmm10
        movddup  2 * SIZE(BO), %xmm2
        mulpd   %xmm8, %xmm2
        subpd   %xmm2, %xmm12
        movddup  3 * SIZE(BO), %xmm3
        mulpd   %xmm8, %xmm3
        subpd   %xmm3, %xmm14

        movddup  5 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm10
        movddup  6 * SIZE(BO), %xmm1
        mulpd   %xmm10, %xmm1
        subpd   %xmm1, %xmm12
        movddup  7 * SIZE(BO), %xmm2
        mulpd   %xmm10, %xmm2
        subpd   %xmm2, %xmm14

        movddup 10 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm12

        movddup 11 * SIZE(BO), %xmm1
        mulpd   %xmm12, %xmm1
        subpd   %xmm1, %xmm14

        movddup 15 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm14
#endif

#ifdef RT
        /* 4x4 backward substitution against B */
        movddup 15 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm14

        movddup 14 * SIZE(BO), %xmm1
        mulpd   %xmm14, %xmm1
        subpd   %xmm1, %xmm12
        movddup 13 * SIZE(BO), %xmm2
        mulpd   %xmm14, %xmm2
        subpd   %xmm2, %xmm10
        movddup 12 * SIZE(BO), %xmm3
        mulpd   %xmm14, %xmm3
        subpd   %xmm3, %xmm8

        movddup 10 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm12
        movddup  9 * SIZE(BO), %xmm1
        mulpd   %xmm12, %xmm1
        subpd   %xmm1, %xmm10
        movddup  8 * SIZE(BO), %xmm2
        mulpd   %xmm12, %xmm2
        subpd   %xmm2, %xmm8

        movddup  5 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm10
        movddup  4 * SIZE(BO), %xmm1
        mulpd   %xmm10, %xmm1
        subpd   %xmm1, %xmm8

        movddup  0 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm8
#endif

#ifdef LN
        subq    $2 * SIZE, CO1
        subq    $2 * SIZE, CO2
#endif

        /* store the solved 2x4 tile to C */
#if defined(LN) || defined(LT)
        movsd   %xmm1, 0 * SIZE(CO1)
        movsd   %xmm5, 1 * SIZE(CO1)
        movhpd  %xmm1, 0 * SIZE(CO2)
        movhpd  %xmm5, 1 * SIZE(CO2)

        movsd   %xmm3, 0 * SIZE(CO1, LDC, 2)
        movsd   %xmm7, 1 * SIZE(CO1, LDC, 2)
        movhpd  %xmm3, 0 * SIZE(CO2, LDC, 2)
        movhpd  %xmm7, 1 * SIZE(CO2, LDC, 2)
#else
        movsd   %xmm8, 0 * SIZE(CO1)
        movhpd  %xmm8, 1 * SIZE(CO1)
        movsd   %xmm10, 0 * SIZE(CO2)
        movhpd  %xmm10, 1 * SIZE(CO2)

        movsd   %xmm12, 0 * SIZE(CO1, LDC, 2)
        movhpd  %xmm12, 1 * SIZE(CO1, LDC, 2)
        movsd   %xmm14, 0 * SIZE(CO2, LDC, 2)
        movhpd  %xmm14, 1 * SIZE(CO2, LDC, 2)
#endif

        /* write back to the packed panel */
#if defined(LN) || defined(LT)
        movapd  %xmm1, 0 * SIZE(BO)
        movapd  %xmm3, 2 * SIZE(BO)
        movapd  %xmm5, 4 * SIZE(BO)
        movapd  %xmm7, 6 * SIZE(BO)
#else
        movapd  %xmm8, 0 * SIZE(AO)
        movapd  %xmm10, 2 * SIZE(AO)
        movapd  %xmm12, 4 * SIZE(AO)
        movapd  %xmm14, 6 * SIZE(AO)
#endif

#ifndef LN
        addq    $2 * SIZE, CO1
        addq    $2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
        movq    K, %rax
        subq    KK, %rax
        leaq    (,%rax, SIZE), %rax
        leaq    (AO, %rax, 2), AO
        leaq    (BO, %rax, 4), BO
#endif

#ifdef LN
        subq    $2, KK
#endif

#ifdef LT
        addq    $2, KK
#endif

#ifdef RT
        movq    K, %rax
        salq    $1 + BASE_SHIFT, %rax
        addq    %rax, AORIG
#endif
        ALIGN_4

/* ---- M & 1: one 1x4 tile ---- */
.L30:
        testq   $1, M
        je      .L39
1756 ALIGN_4 1757 1758.L31: 1759#ifdef LN 1760 movq K, %rax 1761 salq $0 + BASE_SHIFT, %rax 1762 subq %rax, AORIG 1763#endif 1764 1765 1766#if defined(LN) || defined(RT) 1767 movq KK, %rax 1768 leaq (, %rax, SIZE), %rax 1769 movq AORIG, AO 1770 leaq (AO, %rax, 1), AO 1771 leaq (B, %rax, 4), BO 1772#else 1773 movq B, BO 1774#endif 1775 1776 movddup 0 * SIZE(AO), %xmm8 1777 pxor %xmm0, %xmm0 1778 movapd 0 * SIZE(BO), %xmm9 1779 pxor %xmm1, %xmm1 1780 movddup 4 * SIZE(AO), %xmm10 1781 pxor %xmm2, %xmm2 1782 movapd 8 * SIZE(BO), %xmm11 1783 pxor %xmm3, %xmm3 1784 1785#if defined(LT) || defined(RN) 1786 movq KK, %rax 1787#else 1788 movq K, %rax 1789 subq KK, %rax 1790#endif 1791 sarq $3, %rax 1792 je .L35 1793 ALIGN_4 1794 1795.L32: 1796 mulpd %xmm8, %xmm9 1797 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1798 addpd %xmm9, %xmm0 1799 movapd 2 * SIZE(BO), %xmm9 1800 mulpd %xmm8, %xmm9 1801 movddup 1 * SIZE(AO), %xmm8 1802 addpd %xmm9, %xmm1 1803 movapd 4 * SIZE(BO), %xmm9 1804 mulpd %xmm8, %xmm9 1805 addpd %xmm9, %xmm0 1806 movapd 6 * SIZE(BO), %xmm9 1807 mulpd %xmm8, %xmm9 1808 movddup 2 * SIZE(AO), %xmm8 1809 addpd %xmm9, %xmm1 1810 movapd 16 * SIZE(BO), %xmm9 1811 mulpd %xmm8, %xmm11 1812 addpd %xmm11, %xmm0 1813 movapd 10 * SIZE(BO), %xmm11 1814 mulpd %xmm8, %xmm11 1815 movddup 3 * SIZE(AO), %xmm8 1816 addpd %xmm11, %xmm1 1817 movapd 12 * SIZE(BO), %xmm11 1818 mulpd %xmm8, %xmm11 1819 addpd %xmm11, %xmm0 1820 movapd 14 * SIZE(BO), %xmm11 1821 mulpd %xmm8, %xmm11 1822 movddup 8 * SIZE(AO), %xmm8 1823 addpd %xmm11, %xmm1 1824 movapd 24 * SIZE(BO), %xmm11 1825 mulpd %xmm10, %xmm9 1826 addpd %xmm9, %xmm0 1827 movapd 18 * SIZE(BO), %xmm9 1828 mulpd %xmm10, %xmm9 1829 movddup 5 * SIZE(AO), %xmm10 1830 addpd %xmm9, %xmm1 1831 movapd 20 * SIZE(BO), %xmm9 1832 mulpd %xmm10, %xmm9 1833 addpd %xmm9, %xmm0 1834 movapd 22 * SIZE(BO), %xmm9 1835 mulpd %xmm10, %xmm9 1836 movddup 6 * SIZE(AO), %xmm10 1837 addpd %xmm9, %xmm1 1838 movapd 32 * SIZE(BO), %xmm9 1839 mulpd %xmm10, %xmm11 1840 
        /* continuation of the .L32 depth-8 loop */
        addpd   %xmm11, %xmm0
        movapd  26 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        movddup  7 * SIZE(AO), %xmm10
        addpd   %xmm11, %xmm1
        movapd  28 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm0
        movapd  30 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        movddup 12 * SIZE(AO), %xmm10
        addpd   %xmm11, %xmm1
        movapd  40 * SIZE(BO), %xmm11

        addq    $ 8 * SIZE, AO
        addq    $32 * SIZE, BO
        decq    %rax
        jne     .L32
        ALIGN_4

.L35:
#if defined(LT) || defined(RN)
        movq    KK, %rax
#else
        movq    K, %rax
        subq    KK, %rax
#endif
        andq    $7, %rax                # if (k & 1)
        BRANCH
        je      .L38
        ALIGN_4

/* K % 8 tail, one depth per iteration */
.L36:
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm0
        movapd   2 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movddup  1 * SIZE(AO), %xmm8
        addpd   %xmm9, %xmm1
        movapd   4 * SIZE(BO), %xmm9

        addq    $1 * SIZE, AO           # aoffset += 4
        addq    $4 * SIZE, BO           # boffset1 += 8
        decq    %rax
        jg      .L36
        ALIGN_4

/* ---- solve the 1x4 tile ---- */
.L38:

#if defined(LN) || defined(RT)
        movq    KK, %rax
#ifdef LN
        subq    $1, %rax
#else
        subq    $4, %rax
#endif

        leaq    (, %rax, SIZE), %rax
        movq    AORIG, AO
        leaq    (AO, %rax, 1), AO
        leaq    (B, %rax, 4), BO
#endif


#if defined(LN) || defined(LT)
        movapd   0 * SIZE(BO), %xmm2
        movapd   2 * SIZE(BO), %xmm3

        subpd   %xmm0, %xmm2
        subpd   %xmm1, %xmm3
#else
        movapd   0 * SIZE(AO), %xmm2
        movapd   2 * SIZE(AO), %xmm3

        subpd   %xmm0, %xmm2
        subpd   %xmm1, %xmm3
#endif

#ifdef LN
        /* 1x1 "triangle": just scale by the diagonal entry */
        movddup  0 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm2
        mulpd   %xmm0, %xmm3
#endif

#ifdef LT
        movddup  0 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm2
        mulpd   %xmm0, %xmm3
#endif

#ifdef RN
        /* scalar 4x4 forward substitution: split the two pairs into
           scalars (xmm2,xmm0,xmm3,xmm1 = cols 0..3), solve, repack */
        movapd  %xmm2, %xmm0
        unpckhpd %xmm0, %xmm0

        movapd  %xmm3, %xmm1
        unpckhpd %xmm1, %xmm1

        movsd    0 * SIZE(BO), %xmm4
        mulsd   %xmm4, %xmm2

        movsd    1 * SIZE(BO), %xmm5
        mulsd   %xmm2, %xmm5
        subsd   %xmm5, %xmm0
        movsd    2 * SIZE(BO), %xmm6
        mulsd   %xmm2, %xmm6
        subsd   %xmm6, %xmm3
        movsd    3 * SIZE(BO), %xmm7
        mulsd   %xmm2, %xmm7
        subsd   %xmm7, %xmm1

        movsd    5 * SIZE(BO), %xmm4
        mulsd   %xmm4, %xmm0

        movsd    6 * SIZE(BO), %xmm5
        mulsd   %xmm0, %xmm5
        subsd   %xmm5, %xmm3
        movsd    7 * SIZE(BO), %xmm6
        mulsd   %xmm0, %xmm6
        subsd   %xmm6, %xmm1

        movsd   10 * SIZE(BO), %xmm4
        mulsd   %xmm4, %xmm3

        movsd   11 * SIZE(BO), %xmm5
        mulsd   %xmm3, %xmm5
        subsd   %xmm5, %xmm1

        movsd   15 * SIZE(BO), %xmm4
        mulsd   %xmm4, %xmm1

        unpcklpd %xmm0, %xmm2
        unpcklpd %xmm1, %xmm3
#endif

#ifdef RT
        /* scalar 4x4 backward substitution (mirror of RN above) */
        movapd  %xmm2, %xmm0
        unpckhpd %xmm0, %xmm0

        movapd  %xmm3, %xmm1
        unpckhpd %xmm1, %xmm1

        movsd   15 * SIZE(BO), %xmm4
        mulsd   %xmm4, %xmm1

        movsd   14 * SIZE(BO), %xmm5
        mulsd   %xmm1, %xmm5
        subsd   %xmm5, %xmm3
        movsd   13 * SIZE(BO), %xmm6
        mulsd   %xmm1, %xmm6
        subsd   %xmm6, %xmm0
        movsd   12 * SIZE(BO), %xmm7
        mulsd   %xmm1, %xmm7
        subsd   %xmm7, %xmm2

        movsd   10 * SIZE(BO), %xmm4
        mulsd   %xmm4, %xmm3

        movsd    9 * SIZE(BO), %xmm5
        mulsd   %xmm3, %xmm5
        subsd   %xmm5, %xmm0
        movsd    8 * SIZE(BO), %xmm6
        mulsd   %xmm3, %xmm6
        subsd   %xmm6, %xmm2

        movsd    5 * SIZE(BO), %xmm4
        mulsd   %xmm4, %xmm0

        movsd    4 * SIZE(BO), %xmm5
        mulsd   %xmm0, %xmm5
        subsd   %xmm5, %xmm2

        movsd    0 * SIZE(BO), %xmm4
        mulsd   %xmm4, %xmm2

        unpcklpd %xmm0, %xmm2
        unpcklpd %xmm1, %xmm3

#endif

#ifdef LN
        subq    $1 * SIZE, CO1
        subq    $1 * SIZE, CO2
#endif

        /* store the 1x4 tile (both branches are identical here) */
#if defined(LN) || defined(LT)
        movsd   %xmm2, 0 * SIZE(CO1)
        movhpd  %xmm2, 0 * SIZE(CO2)
        movsd   %xmm3, 0 * SIZE(CO1, LDC, 2)
        movhpd  %xmm3, 0 * SIZE(CO2, LDC, 2)
#else
        movsd   %xmm2, 0 * SIZE(CO1)
        movhpd  %xmm2, 0 * SIZE(CO2)
        movsd   %xmm3, 0 * SIZE(CO1, LDC, 2)
        movhpd  %xmm3, 0 * SIZE(CO2, LDC, 2)
#endif

#if defined(LN) || defined(LT)
        movapd  %xmm2, 0 * SIZE(BO)
        movapd  %xmm3, 2 * SIZE(BO)
#else
        movapd  %xmm2, 0 * SIZE(AO)
        movapd  %xmm3, 2 * SIZE(AO)
#endif

#ifndef LN
        addq    $1 * SIZE, CO1
        addq    $1 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
        movq    K, %rax
        subq    KK, %rax
        leaq    (,%rax, SIZE), %rax
        leaq    (AO, %rax, 1), AO
        leaq    (BO, %rax, 4), BO
#endif

#ifdef LN
        subq    $1, KK
#endif

#ifdef LT
        addq    $1, KK
#endif

#ifdef RT
        movq    K, %rax
        salq    $0 + BASE_SHIFT, %rax
        addq    %rax, AORIG
#endif
        ALIGN_4

/* ---- end of one N=4 column block: advance B/KK, next j ---- */
.L39:
#ifdef LN
        leaq    (, K, SIZE), %rax
        leaq    (B, %rax, 4), B
#endif
#if defined(LT) || defined(RN)
        movq    BO, B
#endif

#ifdef RN
        addq    $4, KK
#endif

#ifdef RT
        subq    $4, KK
#endif

        decq    J                       # j --
        jg      .L10
        ALIGN_4

/* ==================== N & 2: two-column block ==================== */
.L40:
        testq   $2, N
        je      .L80
        ALIGN_4

#if defined(LT) || defined(RN)
        movq    A, AO
#else
        movq    A, AORIG
#endif

#ifdef RT
        /* RT walks B and C backwards by two columns */
        movq    K, %rax
        salq    $1 + BASE_SHIFT, %rax
        subq    %rax, B

        leaq    (, LDC, 2), %rax
        subq    %rax, C
#endif

        movq    C, CO1
        leaq    (C, LDC, 1), CO2
#ifndef RT
        leaq    (C, LDC, 2), C
#endif

#ifdef LN
        movq    OFFSET, %rax
        addq    M, %rax
        movq    %rax, KK
#endif

        movq    K, %rax
        salq    $BASE_SHIFT + 1, %rax
        leaq    (B, %rax), BB           # BB = next B panel, for prefetch

#ifdef LT
        movq    OFFSET, %rax
        movq    %rax, KK
#endif

        movq    M, I
        sarq    $2, I                   # i = (m >> 2)
        jle     .L60
        ALIGN_4

/* 4x2 tile */
.L51:
#ifdef LN
        movq    K, %rax
        salq    $2 + BASE_SHIFT, %rax
        subq    %rax, AORIG
#endif

#if defined(LN) || defined(RT)
        movq    KK, %rax
        leaq    (, %rax, SIZE), %rax
        movq    AORIG, AO
2150 leaq (AO, %rax, 4), AO 2151 leaq (B, %rax, 2), BO 2152#else 2153 movq B, BO 2154#endif 2155 2156 prefetcht0 0 * SIZE(BB) 2157 subq $-4 * SIZE, BB 2158 2159 movapd 0 * SIZE(AO), %xmm8 2160 pxor %xmm0, %xmm0 2161 movddup 0 * SIZE(BO), %xmm9 2162 pxor %xmm1, %xmm1 2163 movapd 8 * SIZE(AO), %xmm10 2164 pxor %xmm4, %xmm4 2165 movddup 8 * SIZE(BO), %xmm11 2166 pxor %xmm5, %xmm5 2167 2168#ifdef HAVE_3DNOW 2169 prefetchw 4 * SIZE(CO1) 2170 prefetchw 4 * SIZE(CO2) 2171#else 2172 prefetchnta 4 * SIZE(CO1) 2173 prefetchnta 4 * SIZE(CO2) 2174#endif 2175 2176#if defined(LT) || defined(RN) 2177 movq KK, %rax 2178#else 2179 movq K, %rax 2180 subq KK, %rax 2181#endif 2182 sarq $3, %rax 2183 je .L55 2184 ALIGN_4 2185 2186.L52: 2187 mulpd %xmm8, %xmm9 2188 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2189 addpd %xmm9, %xmm0 2190 movddup 1 * SIZE(BO), %xmm9 2191 mulpd %xmm8, %xmm9 2192 movapd 2 * SIZE(AO), %xmm8 2193 addpd %xmm9, %xmm1 2194 movddup 0 * SIZE(BO), %xmm9 2195 mulpd %xmm8, %xmm9 2196 addpd %xmm9, %xmm4 2197 movddup 1 * SIZE(BO), %xmm9 2198 mulpd %xmm8, %xmm9 2199 movapd 4 * SIZE(AO), %xmm8 2200 addpd %xmm9, %xmm5 2201 movddup 2 * SIZE(BO), %xmm9 2202 mulpd %xmm8, %xmm9 2203 addpd %xmm9, %xmm0 2204 movddup 3 * SIZE(BO), %xmm9 2205 mulpd %xmm8, %xmm9 2206 movapd 6 * SIZE(AO), %xmm8 2207 addpd %xmm9, %xmm1 2208 movddup 2 * SIZE(BO), %xmm9 2209 mulpd %xmm8, %xmm9 2210 addpd %xmm9, %xmm4 2211 movddup 3 * SIZE(BO), %xmm9 2212 mulpd %xmm8, %xmm9 2213 movapd 16 * SIZE(AO), %xmm8 2214 addpd %xmm9, %xmm5 2215 movddup 4 * SIZE(BO), %xmm9 2216 mulpd %xmm10, %xmm9 2217 addpd %xmm9, %xmm0 2218 movddup 5 * SIZE(BO), %xmm9 2219 mulpd %xmm10, %xmm9 2220 movapd 10 * SIZE(AO), %xmm10 2221 addpd %xmm9, %xmm1 2222 movddup 4 * SIZE(BO), %xmm9 2223 mulpd %xmm10, %xmm9 2224 addpd %xmm9, %xmm4 2225 movddup 5 * SIZE(BO), %xmm9 2226 mulpd %xmm10, %xmm9 2227 movapd 12 * SIZE(AO), %xmm10 2228 addpd %xmm9, %xmm5 2229 movddup 6 * SIZE(BO), %xmm9 2230 mulpd %xmm10, %xmm9 2231 addpd %xmm9, %xmm0 2232 
        /* continuation of the .L52 depth-8 loop */
        movddup  7 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        movapd  14 * SIZE(AO), %xmm10
        addpd   %xmm9, %xmm1
        movddup  6 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        addpd   %xmm9, %xmm4
        movddup  7 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm9
        movapd  40 * SIZE(AO), %xmm10
        addpd   %xmm9, %xmm5
        movddup 16 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm11
        PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
        addpd   %xmm11, %xmm0
        movddup  9 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd  18 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm1
        movddup  8 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm4
        movddup  9 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd  20 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm5
        movddup 10 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm0
        movddup 11 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd  22 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm1
        movddup 10 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm4
        movddup 11 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd  24 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm5
        movddup 12 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm0
        movddup 13 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd  26 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm1
        movddup 12 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm4
        movddup 13 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd  28 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm5
        movddup 14 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm0
        movddup 15 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd  30 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm1
        movddup 14 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        addpd   %xmm11, %xmm4
        movddup 15 * SIZE(BO), %xmm11
        mulpd   %xmm8, %xmm11
        movapd  32 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm5
        movddup 24 * SIZE(BO), %xmm11

        addq    $32 * SIZE, AO          # 4 rows x 8 depths
        addq    $16 * SIZE, BO          # 2 cols x 8 depths
        decq    %rax
        jne     .L52
        ALIGN_4

.L55:
#if defined(LT) || defined(RN)
        movq    KK, %rax
#else
        movq    K, %rax
        subq    KK, %rax
#endif
        andq    $7, %rax                # if (k & 1)
        BRANCH
        je      .L59
        ALIGN_4

/* K % 8 tail for the 4x2 tile */
.L56:
        mulpd   %xmm8, %xmm9
        movapd   2 * SIZE(AO), %xmm10
        addpd   %xmm9, %xmm0
        movddup  1 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movddup  0 * SIZE(BO), %xmm11
        addpd   %xmm9, %xmm1
        movddup  2 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm11
        movapd   4 * SIZE(AO), %xmm8
        addpd   %xmm11, %xmm4
        movddup  1 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm5

        addq    $4 * SIZE, AO           # aoffset += 4
        addq    $2 * SIZE, BO           # boffset1 += 8
        decq    %rax
        jg      .L56
        ALIGN_4

/* ---- triangular solve of the 4x2 tile ---- */
.L59:
#if defined(LN) || defined(RT)
        movq    KK, %rax
#ifdef LN
        subq    $4, %rax
#else
        subq    $2, %rax
#endif
        leaq    (, %rax, SIZE), %rax

        movq    AORIG, AO
        leaq    (AO, %rax, 4), AO
        leaq    (B, %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
        /* transpose accumulators, form (packed B) - accum */
        movapd  %xmm0, %xmm8
        unpcklpd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm8

        movapd  %xmm4, %xmm12
        unpcklpd %xmm5, %xmm4
        unpckhpd %xmm5, %xmm12

        movapd   0 * SIZE(BO), %xmm1
        movapd   2 * SIZE(BO), %xmm5
        movapd   4 * SIZE(BO), %xmm9
        movapd   6 * SIZE(BO), %xmm13

        subpd   %xmm0, %xmm1
        subpd   %xmm8, %xmm5
        subpd   %xmm4, %xmm9
        subpd   %xmm12, %xmm13
#else

        movapd   0 * SIZE(AO), %xmm8
        movapd   2 * SIZE(AO), %xmm9
        movapd   4 * SIZE(AO), %xmm10
        movapd   6 * SIZE(AO), %xmm11

        subpd   %xmm0, %xmm8
        subpd   %xmm4, %xmm9
        subpd   %xmm1, %xmm10
        subpd   %xmm5, %xmm11
#endif


#ifdef LN
        /* 4x4 backward substitution against A (lower triangle) */
        movddup 15 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm13

        movddup 14 * SIZE(AO), %xmm2
        mulpd   %xmm13, %xmm2
        subpd   %xmm2, %xmm9
        movddup 13 * SIZE(AO), %xmm4
        mulpd   %xmm13, %xmm4
        subpd   %xmm4, %xmm5
        movddup 12 * SIZE(AO), %xmm6
        mulpd   %xmm13, %xmm6
        subpd   %xmm6, %xmm1

        movddup 10 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm9
        movddup  9 * SIZE(AO), %xmm2
        mulpd   %xmm9, %xmm2
        subpd   %xmm2, %xmm5
        movddup  8 * SIZE(AO), %xmm4
        mulpd   %xmm9, %xmm4
        subpd   %xmm4, %xmm1

        movddup  5 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm5
        movddup  4 * SIZE(AO), %xmm2
        mulpd   %xmm5, %xmm2
        subpd   %xmm2, %xmm1

        movddup  0 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm1
#endif


#ifdef LT
        /* 4x4 forward substitution against A (upper triangle) */
        movddup  0 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm1

        movddup  1 * SIZE(AO), %xmm2
        mulpd   %xmm1, %xmm2
        subpd   %xmm2, %xmm5
        movddup  2 * SIZE(AO), %xmm4
        mulpd   %xmm1, %xmm4
        subpd   %xmm4, %xmm9
        movddup  3 * SIZE(AO), %xmm6
        mulpd   %xmm1, %xmm6
        subpd   %xmm6, %xmm13

        movddup  5 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm5

        movddup  6 * SIZE(AO), %xmm2
        mulpd   %xmm5, %xmm2
        subpd   %xmm2, %xmm9
        movddup  7 * SIZE(AO), %xmm4
        mulpd   %xmm5, %xmm4
        subpd   %xmm4, %xmm13

        movddup 10 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm9

        movddup 11 * SIZE(AO), %xmm2
        mulpd   %xmm9, %xmm2
        subpd   %xmm2, %xmm13

        movddup 15 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm13
#endif

#ifdef RN
        /* 2x2 forward substitution against B */
        movddup  0 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm8
        mulpd   %xmm0, %xmm9

        movddup  1 * SIZE(BO), %xmm1
        mulpd   %xmm8, %xmm1
        subpd   %xmm1, %xmm10
        movddup  1 * SIZE(BO), %xmm1
        mulpd   %xmm9, %xmm1
        subpd   %xmm1, %xmm11

        movddup  3 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm10
        mulpd   %xmm0, %xmm11
#endif

#ifdef RT
        /* 2x2 backward substitution against B */
        movddup  3 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm10
        mulpd   %xmm0, %xmm11

        movddup  2 * SIZE(BO), %xmm1
        mulpd   %xmm10, %xmm1
        subpd   %xmm1, %xmm8
        movddup  2 * SIZE(BO), %xmm1
        mulpd   %xmm11, %xmm1
subpd %xmm1, %xmm9 2486 2487 movddup 0 * SIZE(BO), %xmm0 2488 mulpd %xmm0, %xmm8 2489 mulpd %xmm0, %xmm9 2490#endif 2491 2492#ifdef LN 2493 subq $4 * SIZE, CO1 2494 subq $4 * SIZE, CO2 2495#endif 2496 2497#if defined(LN) || defined(LT) 2498 movsd %xmm1, 0 * SIZE(CO1) 2499 movsd %xmm5, 1 * SIZE(CO1) 2500 movsd %xmm9, 2 * SIZE(CO1) 2501 movsd %xmm13, 3 * SIZE(CO1) 2502 2503 movhpd %xmm1, 0 * SIZE(CO2) 2504 movhpd %xmm5, 1 * SIZE(CO2) 2505 movhpd %xmm9, 2 * SIZE(CO2) 2506 movhpd %xmm13, 3 * SIZE(CO2) 2507#else 2508 movsd %xmm8, 0 * SIZE(CO1) 2509 movhpd %xmm8, 1 * SIZE(CO1) 2510 movsd %xmm9, 2 * SIZE(CO1) 2511 movhpd %xmm9, 3 * SIZE(CO1) 2512 2513 movsd %xmm10, 0 * SIZE(CO2) 2514 movhpd %xmm10, 1 * SIZE(CO2) 2515 movsd %xmm11, 2 * SIZE(CO2) 2516 movhpd %xmm11, 3 * SIZE(CO2) 2517#endif 2518 2519#if defined(LN) || defined(LT) 2520 movapd %xmm1, 0 * SIZE(BO) 2521 movapd %xmm5, 2 * SIZE(BO) 2522 movapd %xmm9, 4 * SIZE(BO) 2523 movapd %xmm13, 6 * SIZE(BO) 2524#else 2525 movapd %xmm8, 0 * SIZE(AO) 2526 movapd %xmm9, 2 * SIZE(AO) 2527 movapd %xmm10, 4 * SIZE(AO) 2528 movapd %xmm11, 6 * SIZE(AO) 2529#endif 2530 2531#ifndef LN 2532 addq $4 * SIZE, CO1 2533 addq $4 * SIZE, CO2 2534#endif 2535 2536#if defined(LT) || defined(RN) 2537 movq K, %rax 2538 subq KK, %rax 2539 leaq (,%rax, SIZE), %rax 2540 leaq (AO, %rax, 4), AO 2541 leaq (BO, %rax, 2), BO 2542#endif 2543 2544#ifdef LN 2545 subq $4, KK 2546#endif 2547 2548#ifdef LT 2549 addq $4, KK 2550#endif 2551 2552#ifdef RT 2553 movq K, %rax 2554 salq $2 + BASE_SHIFT, %rax 2555 addq %rax, AORIG 2556#endif 2557 2558 decq I # i -- 2559 jg .L51 2560 ALIGN_4 2561 2562.L60: 2563 testq $2, M 2564 je .L70 2565 ALIGN_4 2566 2567.L61: 2568#ifdef LN 2569 movq K, %rax 2570 salq $1 + BASE_SHIFT, %rax 2571 subq %rax, AORIG 2572#endif 2573 2574#if defined(LN) || defined(RT) 2575 movq KK, %rax 2576 leaq (, %rax, SIZE), %rax 2577 movq AORIG, AO 2578 leaq (AO, %rax, 2), AO 2579 leaq (B, %rax, 2), BO 2580#else 2581 movq B, BO 2582#endif 2583 2584 
        /* 2x2 tile accumulators: xmm0/xmm1 plus xmm2/xmm3 (folded
           together at .L69) */
        movapd   0 * SIZE(AO), %xmm8
        pxor    %xmm0, %xmm0
        movddup  0 * SIZE(BO), %xmm9
        pxor    %xmm1, %xmm1
        movapd   8 * SIZE(AO), %xmm10
        pxor    %xmm2, %xmm2
        movddup  8 * SIZE(BO), %xmm11
        pxor    %xmm3, %xmm3

#if defined(LT) || defined(RN)
        movq    KK, %rax
#else
        movq    K, %rax
        subq    KK, %rax
#endif
        sarq    $3, %rax
        je      .L65
        ALIGN_4

.L62:
        mulpd   %xmm8, %xmm9
        PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
        addpd   %xmm9, %xmm0
        movddup  1 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movapd   2 * SIZE(AO), %xmm8
        addpd   %xmm9, %xmm1
        movddup  2 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm2
        movddup  3 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movapd   4 * SIZE(AO), %xmm8
        addpd   %xmm9, %xmm3
        movddup  4 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm0
        movddup  5 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movapd   6 * SIZE(AO), %xmm8
        addpd   %xmm9, %xmm1
        movddup  6 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm2
        movddup  7 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movapd  16 * SIZE(AO), %xmm8
        addpd   %xmm9, %xmm3
        movddup 16 * SIZE(BO), %xmm9
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm0
        movddup  9 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        movapd  10 * SIZE(AO), %xmm10
        addpd   %xmm11, %xmm1
        movddup 10 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm2
        movddup 11 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        movapd  12 * SIZE(AO), %xmm10
        addpd   %xmm11, %xmm3
        movddup 12 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm0
        movddup 13 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        movapd  14 * SIZE(AO), %xmm10
        addpd   %xmm11, %xmm1
        movddup 14 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        addpd   %xmm11, %xmm2
        movddup 15 * SIZE(BO), %xmm11
        mulpd   %xmm10, %xmm11
        movapd  24 * SIZE(AO), %xmm10
        addpd   %xmm11, %xmm3
        movddup 24 * SIZE(BO), %xmm11

        addq    $16 * SIZE, AO
        addq    $16 * SIZE, BO
        decq    %rax
        jne     .L62
        ALIGN_4

.L65:
#if defined(LT) || defined(RN)
        movq    KK, %rax
#else
        movq    K, %rax
        subq    KK, %rax
#endif
        andq    $7, %rax                # if (k & 1)
        BRANCH
        je      .L69
        ALIGN_4

/* K % 8 tail for the 2x2 tile */
.L66:
        mulpd   %xmm8, %xmm9
        addpd   %xmm9, %xmm0
        movddup  1 * SIZE(BO), %xmm9
        mulpd   %xmm8, %xmm9
        movapd   2 * SIZE(AO), %xmm8
        addpd   %xmm9, %xmm1
        movddup  2 * SIZE(BO), %xmm9

        addq    $2 * SIZE, AO           # aoffset += 4
        addq    $2 * SIZE, BO           # boffset1 += 8
        decq    %rax
        jg      .L66
        ALIGN_4

/* ---- fold the duplicate accumulators, then solve the 2x2 tile ---- */
.L69:
        addpd   %xmm2, %xmm0
        addpd   %xmm3, %xmm1

#if defined(LN) || defined(RT)
        movq    KK, %rax
#ifdef LN
        subq    $2, %rax                # both branches subtract 2 (2x2 tile)
#else
        subq    $2, %rax
#endif
        leaq    (, %rax, SIZE), %rax
        movq    AORIG, AO
        leaq    (AO, %rax, 2), AO
        leaq    (B, %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
        movapd  %xmm0, %xmm8
        unpcklpd %xmm1, %xmm0
        unpckhpd %xmm1, %xmm8

        movapd   0 * SIZE(BO), %xmm1
        movapd   2 * SIZE(BO), %xmm5

        subpd   %xmm0, %xmm1
        subpd   %xmm8, %xmm5
#else

        movapd   0 * SIZE(AO), %xmm8
        movapd   2 * SIZE(AO), %xmm10

        subpd   %xmm0, %xmm8
        subpd   %xmm1, %xmm10
#endif

#ifdef LN
        movddup  3 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm5
        movddup  2 * SIZE(AO), %xmm2
        mulpd   %xmm5, %xmm2
        subpd   %xmm2, %xmm1

        movddup  0 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm1
#endif

#ifdef LT
        movddup  0 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm1

        movddup  1 * SIZE(AO), %xmm2
        mulpd   %xmm1, %xmm2
        subpd   %xmm2, %xmm5

        movddup  3 * SIZE(AO), %xmm0
        mulpd   %xmm0, %xmm5
#endif

#ifdef RN
        movddup  0 * SIZE(BO), %xmm0
        mulpd   %xmm0, %xmm8

        movddup  1 * SIZE(BO), %xmm1
        mulpd   %xmm8, %xmm1
        subpd   %xmm1, %xmm10
	/* --- continuation: finish the RN solve for the 2x2 tail tile --- */
	movddup	 3 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm10
#endif

#ifdef RT
	/* solve against the 2x2 lower-triangular block of B, last column first.
	   NOTE(review): diagonal entries appear to be stored pre-inverted by the
	   packing routine (multiply, never divide) — confirm. */
	movddup	 3 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm10

	movddup	 2 * SIZE(BO), %xmm1
	mulpd	 %xmm10, %xmm1
	subpd	 %xmm1, %xmm8

	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm8
#endif

#ifdef LN
	subq	$2 * SIZE, CO1		# LN walks C right-to-left
	subq	$2 * SIZE, CO2
#endif

	/* write the solved 2x2 tile to C (both columns) ... */
#if defined(LN) || defined(LT)
	movsd	%xmm1,  0 * SIZE(CO1)
	movsd	%xmm5,  1 * SIZE(CO1)
	movhpd	%xmm1,  0 * SIZE(CO2)
	movhpd	%xmm5,  1 * SIZE(CO2)
#else
	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm10,  0 * SIZE(CO2)
	movhpd	%xmm10,  1 * SIZE(CO2)
#endif

	/* ... and back into the packed buffer so later tiles see solved values */
#if defined(LN) || defined(LT)
	movapd	%xmm1,   0 * SIZE(BO)
	movapd	%xmm5,   2 * SIZE(BO)
#else
	movapd	%xmm8,   0 * SIZE(AO)
	movapd	%xmm10,   2 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
	addq	$2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* M & 1 remainder of the 2-column pass: one row of A against 2 columns of B */
.L70:
	testq	$1, M
	je	.L79
	ALIGN_4

.L71:
#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO	# A advances 1 element per k
	leaq	(B,  %rax, 2), BO	# B advances 2 elements per k
#else
	movq	B, BO
#endif

	movddup	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0		# clear the four accumulators
	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movddup	 4 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movapd	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# main loop is unrolled 8x in k
	je	.L75
	ALIGN_4

.L72:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm0
	mulpd	 2 * SIZE(BO), %xmm8
	movapd	16 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm1
	movddup	 2 * SIZE(AO), %xmm8
	mulpd	 4 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm2
	movddup	 3 * SIZE(AO), %xmm8
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm3
	movddup	 8 * SIZE(AO), %xmm8
	mulpd	%xmm10, %xmm11
	movddup	 5 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm0
	mulpd	10 * SIZE(BO), %xmm10
	movapd	24 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm1
	movddup	 6 * SIZE(AO), %xmm10
	mulpd	12 * SIZE(BO), %xmm10
	addpd	%xmm10, %xmm2
	movddup	 7 * SIZE(AO), %xmm10
	mulpd	14 * SIZE(BO), %xmm10
	addpd	%xmm10, %xmm3
	movddup	12 * SIZE(AO), %xmm10

	addq	$ 8 * SIZE, AO
	addq	$16 * SIZE, BO
	decq	%rax
	jne	.L72
	ALIGN_4

.L75:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remaining iterations = k & 7
	BRANCH
	je	.L78
	ALIGN_4

.L76:
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# one k step: 1 A element
	addq	$2 * SIZE, BO		# one k step: 2 B elements
	decq	%rax
	jg	.L76
	ALIGN_4

/* solve the 1x2 tile */
.L78:
	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0		# fold all accumulators into xmm0

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(BO), %xmm2
	subpd	%xmm0, %xmm2
#else
	movapd	 0 * SIZE(AO), %xmm2
	subpd	%xmm0, %xmm2
#endif

#ifdef LN
	/* 1x1 diagonal block of A: a single scale */
	movddup	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm2
#endif

#ifdef LT
	movddup	 0 * SIZE(AO), %xmm0
	mulpd	 %xmm0, %xmm2
#endif

#ifdef RN
	/* scalar solve against the 2x2 upper-triangular block of B */
	movapd	%xmm2, %xmm0
	unpckhpd %xmm0, %xmm0		# xmm0 = second column's element

	movsd	 0 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm2

	movsd	 1 * SIZE(BO), %xmm5
	mulsd	 %xmm2, %xmm5
	subsd	 %xmm5, %xmm0

	movsd	 3 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm0

	unpcklpd %xmm0, %xmm2		# repack both columns into xmm2
#endif

#ifdef RT
	/* scalar solve against the 2x2 lower-triangular block of B */
	movapd	%xmm2, %xmm0
	unpckhpd %xmm0, %xmm0

	movsd	 3 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	 2 * SIZE(BO), %xmm5
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm2

	movsd	 0 * SIZE(BO), %xmm4
	mulsd	 %xmm4, %xmm2

	unpcklpd %xmm0, %xmm2
#endif

#ifdef LN
	subq	$1 * SIZE, CO1
	subq	$1 * SIZE, CO2
#endif

	/* both branches store identically; kept for symmetry with other tiles */
#if defined(LN) || defined(LT)
	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  0 * SIZE(CO2)
#else
	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  0 * SIZE(CO2)
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm2,  0 * SIZE(BO)
#else
	movapd	%xmm2,  0 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* end of the 2-column pass: advance B and KK for the next column pair */
.L79:
#ifdef LN
	leaq	(, K, SIZE), %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif
	ALIGN_4

/* N & 1: one remaining column of C */
.L80:
	testq	$1, N
	je	.L999
	ALIGN_4

	/* --- single-column pass (N & 1) --- */
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, B			# RT walks B backwards one column

	subq	LDC, C
#endif

	movq	C, CO1			# only one C column pointer needed
#ifndef RT
	addq	LDC, C
#endif

#ifdef LN
	movq	OFFSET, %rax
	addq	M, %rax
	movq	%rax, KK
#endif

#ifdef LT
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	M, I
	sarq	$2, I			# i = (m >> 2): 4-row tiles first
	jle	.L100
	ALIGN_4

.L91:
#ifdef LN
	movq	K, %rax
	salq	$2 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 4), AO	# A advances 4 elements per k
	leaq	(B,  %rax, 1), BO	# B advances 1 element per k
#else
	movq	B, BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0		# clear accumulators
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 4 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifdef HAVE_3DNOW
	prefetchw	4 * SIZE(CO1)
#else
	prefetchnta	4 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# main loop is unrolled 8x in k
	je	.L95
	ALIGN_4

.L92:
	mulpd	%xmm9, %xmm8
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	mulpd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm8
	mulpd	 6 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm2
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm10
	mulpd	10 * SIZE(AO), %xmm9
	addpd	%xmm10, %xmm0
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm10
	mulpd	14 * SIZE(AO), %xmm9
	addpd	%xmm10, %xmm2
	movapd	24 * SIZE(AO), %xmm10
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm9, %xmm3
	movddup	 8 * SIZE(BO), %xmm9
	mulpd	%xmm11, %xmm8
	mulpd	18 * SIZE(AO), %xmm11
	addpd	%xmm8, %xmm0
	movapd	20 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	 5 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm8
	mulpd	22 * SIZE(AO), %xmm11
	addpd	%xmm8, %xmm2
	movapd	32 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm3
	movddup	 6 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm10
	mulpd	26 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm0
	movapd	28 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	 7 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm10
	mulpd	30 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm2
	movapd	40 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11

	addq	$32 * SIZE, AO
	addq	$ 8 * SIZE, BO
	decq	%rax
	jne	.L92
	ALIGN_4

.L95:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remaining iterations = k & 7
	BRANCH
	je	.L99
	ALIGN_4

.L96:
	mulpd	%xmm9, %xmm8
	mulpd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 1 * SIZE(BO), %xmm9

	addq	$4 * SIZE, AO		# one k step: 4 A elements
	addq	$1 * SIZE, BO		# one k step: 1 B element
	decq	%rax
	jg	.L96
	ALIGN_4

/* triangular solve for the 4x1 tile */
.L99:
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$4, %rax
#else
	subq	$1, %rax
#endif
	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 4), AO
	leaq	(B,  %rax, 1), BO
#endif

	/* both sides load/subtract identically but against different buffers */
#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(BO), %xmm2
	movapd	 2 * SIZE(BO), %xmm3

	subpd	%xmm0, %xmm2
	subpd	%xmm1, %xmm3
#else
	movapd	 0 * SIZE(AO), %xmm2
	movapd	 2 * SIZE(AO), %xmm3

	subpd	%xmm0, %xmm2
	subpd	%xmm1, %xmm3
#endif

#ifdef LN
	/* scalar back-substitution through the packed 4x4 lower factor,
	   last row (offset 15 = [3][3] of the 4x4 tile) first.
	   NOTE(review): diagonal appears pre-inverted by packing — confirm. */
	movapd	%xmm2, %xmm0
	unpckhpd %xmm0, %xmm0

	movapd	%xmm3, %xmm1
	unpckhpd %xmm1, %xmm1

	movsd	15 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm1

	movsd	14 * SIZE(AO), %xmm5
	mulsd	 %xmm1, %xmm5
	subsd	 %xmm5, %xmm3
	movsd	13 * SIZE(AO), %xmm6
	mulsd	 %xmm1, %xmm6
	subsd	 %xmm6, %xmm0
	movsd	12 * SIZE(AO), %xmm7
	mulsd	 %xmm1, %xmm7
	subsd	 %xmm7, %xmm2

	movsd	10 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm3

	movsd	 9 * SIZE(AO), %xmm5
	mulsd	 %xmm3, %xmm5
	subsd	 %xmm5, %xmm0
	movsd	 8 * SIZE(AO), %xmm6
	mulsd	 %xmm3, %xmm6
	subsd	 %xmm6, %xmm2

	movsd	 5 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	 4 * SIZE(AO), %xmm5
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm2

	movsd	 0 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm2

	unpcklpd %xmm0, %xmm2		# repack the four solved scalars
	unpcklpd %xmm1, %xmm3
#endif

#ifdef LT
	/* scalar forward substitution, first row (offset 0) first */
	movapd	%xmm2, %xmm0
	unpckhpd %xmm0, %xmm0

	movapd	%xmm3, %xmm1
	unpckhpd %xmm1, %xmm1

	movsd	 0 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm2

	movsd	 1 * SIZE(AO), %xmm5
	mulsd	 %xmm2, %xmm5
	subsd	 %xmm5, %xmm0
	movsd	 2 * SIZE(AO), %xmm6
	mulsd	 %xmm2, %xmm6
	subsd	 %xmm6, %xmm3
	movsd	 3 * SIZE(AO), %xmm7
	mulsd	 %xmm2, %xmm7
	subsd	 %xmm7, %xmm1

	movsd	 5 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	 6 * SIZE(AO), %xmm5
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm3
	movsd	 7 * SIZE(AO), %xmm6
	mulsd	 %xmm0, %xmm6
	subsd	 %xmm6, %xmm1

	movsd	10 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm3

	movsd	11 * SIZE(AO), %xmm5
	mulsd	 %xmm3, %xmm5
	subsd	 %xmm5, %xmm1

	movsd	15 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm1

	unpcklpd %xmm0, %xmm2
	unpcklpd %xmm1, %xmm3
#endif

#ifdef RN
	/* 1x1 diagonal block of B: a single scale of all four rows */
	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	 %xmm0, %xmm3
#endif

#ifdef RT
	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	 %xmm0, %xmm3
#endif

#ifdef LN
	subq	$4 * SIZE, CO1		# LN walks C right-to-left
#endif

	/* both branches store identically; kept for symmetry with other tiles */
#if defined(LN) || defined(LT)
	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  1 * SIZE(CO1)
	movsd	%xmm3,  2 * SIZE(CO1)
	movhpd	%xmm3,  3 * SIZE(CO1)
#else
	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  1 * SIZE(CO1)
	movsd	%xmm3,  2 * SIZE(CO1)
	movhpd	%xmm3,  3 * SIZE(CO1)
#endif

	/* write solved values back into the packed buffer */
#if defined(LN) || defined(LT)
	movapd	%xmm2,  0 * SIZE(BO)
	movapd	%xmm3,  2 * SIZE(BO)
#else
	movapd	%xmm2,  0 * SIZE(AO)
	movapd	%xmm3,  2 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$4, KK
#endif

#ifdef LT
	addq	$4, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$2 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L91
	ALIGN_4

/* M & 2 remainder: 2 rows against the single column */
.L100:
	testq	$2, M
	je	.L110
	ALIGN_4

.L101:
#ifdef LN
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 4 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# 8x-unrolled main loop
	je	.L105
	ALIGN_4

.L102:
	mulpd	%xmm9, %xmm8
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	movddup	 1 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm0
	mulpd	 2 * SIZE(AO), %xmm9
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	 4 * SIZE(AO), %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	 6 * SIZE(AO), %xmm9
	addpd	%xmm9, %xmm3
	movddup	 8 * SIZE(BO), %xmm9
	mulpd	%xmm11, %xmm10
	movddup	 5 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm0
	mulpd	10 * SIZE(AO), %xmm11
	movapd	24 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	 6 * SIZE(BO), %xmm11
	mulpd	12 * SIZE(AO), %xmm11
	addpd	%xmm11, %xmm2
	movddup	 7 * SIZE(BO), %xmm11
	mulpd	14 * SIZE(AO), %xmm11
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11

	addq	$16 * SIZE, AO
	addq	$ 8 * SIZE, BO
	decq	%rax
	jne	.L102
	ALIGN_4

.L105:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remaining iterations = k & 7
	BRANCH
	je	.L109
	ALIGN_4

.L106:
	mulpd	%xmm9, %xmm8
	movddup	 1 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 2 * SIZE(AO), %xmm8

	addq	$2 * SIZE, AO		# one k step: 2 A elements
	addq	$1 * SIZE, BO		# one k step: 1 B element
	decq	%rax
	jg	.L106
	ALIGN_4

/* solve the 2x1 tile */
.L109:
	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0		# fold all accumulators

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$1, %rax
#endif
	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(BO), %xmm2
	subpd	%xmm0, %xmm2
#else
	movapd	 0 * SIZE(AO), %xmm2
	subpd	%xmm0, %xmm2
#endif

#ifdef LN
	/* scalar back-substitution through the packed 2x2 lower factor */
	movapd	%xmm2, %xmm0
	unpckhpd %xmm0, %xmm0

	movsd	 3 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	 2 * SIZE(AO), %xmm5
	mulsd	 %xmm0, %xmm5
	subsd	 %xmm5, %xmm2

	movsd	 0 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm2

	unpcklpd %xmm0, %xmm2
#endif

#ifdef LT
	/* scalar forward substitution through the packed 2x2 factor */
	movapd	%xmm2, %xmm0
	unpckhpd %xmm0, %xmm0

	movsd	 0 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm2

	movsd	 1 * SIZE(AO), %xmm5
	mulsd	 %xmm2, %xmm5
	subsd	 %xmm5, %xmm0

	movsd	 3 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm0

	unpcklpd %xmm0, %xmm2
#endif

#ifdef RN
	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm2
#endif

#ifdef RT
	movddup	 0 * SIZE(BO), %xmm0
	mulpd	 %xmm0, %xmm2
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  1 * SIZE(CO1)
#else
	movsd	%xmm2,  0 * SIZE(CO1)
	movhpd	%xmm2,  1 * SIZE(CO1)
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm2,  0 * SIZE(BO)
#else
	movapd	%xmm2,  0 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* M & 1 remainder: scalar 1x1 tile */
.L110:
	testq	$1, M
	je	.L119
	ALIGN_4

.L111:
#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#else
	movq	B, BO
#endif

	movapd	 0 * SIZE(AO), %xmm9
	pxor	%xmm0, %xmm0
	movapd	 0 * SIZE(BO), %xmm8
	pxor	%xmm1, %xmm1
	movapd	 4 * SIZE(AO), %xmm11
	pxor	%xmm2, %xmm2
	movapd	 4 * SIZE(BO), %xmm10
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$3, %rax		# 8x-unrolled (2 doubles per packed mul, 4 muls)
	je	.L115
	ALIGN_4

.L112:
	mulpd	%xmm9, %xmm8
	movapd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	mulpd	 2 * SIZE(BO), %xmm9
	movapd	 8 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	 8 * SIZE(AO), %xmm9
	mulpd	%xmm11, %xmm10
	movapd	 6 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm0
	mulpd	 6 * SIZE(BO), %xmm11
	movapd	12 * SIZE(BO), %xmm10
	addpd	%xmm11, %xmm1
	movapd	12 * SIZE(AO), %xmm11

	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO
	decq	%rax
	jne	.L112
	ALIGN_4

.L115:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$7, %rax		# remaining iterations = k & 7
	BRANCH
	je	.L118
	ALIGN_4

.L116:
	mulsd	 0 * SIZE(BO), %xmm9
	addsd	%xmm9, %xmm0
	movsd	 1 * SIZE(AO), %xmm9

	addq	$1 * SIZE, AO		# one k step: 1 A element
	addq	$1 * SIZE, BO		# one k step: 1 B element
	decq	%rax
	jg	.L116
	ALIGN_4

/* solve the 1x1 tile (single dot product) */
.L118:
	addpd	%xmm1, %xmm0
	haddpd	%xmm0, %xmm0		# horizontal sum of the packed accumulator

#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$1, %rax
#endif
	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

#if defined(LN) || defined(LT)
	movsd	 0 * SIZE(BO), %xmm2
	subsd	%xmm0, %xmm2
#else
	movsd	 0 * SIZE(AO), %xmm2
	subsd	%xmm0, %xmm2
#endif

	/* all four variants reduce to one diagonal scale here */
#ifdef LN
	movsd	 0 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm2
#endif

#ifdef LT
	movsd	 0 * SIZE(AO), %xmm4
	mulsd	 %xmm4, %xmm2
#endif

#ifdef RN
	movsd	 0 * SIZE(BO), %xmm0
	mulsd	 %xmm0, %xmm2
#endif

#ifdef RT
	movsd	 0 * SIZE(BO), %xmm0
	mulsd	 %xmm0, %xmm2
#endif

#ifdef LN
	subq	$1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movsd	%xmm2,  0 * SIZE(CO1)
#else
	movsd	%xmm2,  0 * SIZE(CO1)
#endif

#if defined(LN) || defined(LT)
	movsd	%xmm2,  0 * SIZE(BO)
#else
	movsd	%xmm2,  0 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 1), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* end of the single-column pass: advance B and KK */
.L119:
#ifdef LN
	leaq	(, K, SIZE), %rax
	leaq	(B, %rax, 1), B
#endif

#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_2

/* function epilogue: restore callee-saved registers spilled by the
   prologue (rbx, rbp, r12-r15; plus rdi/rsi/xmm6-xmm15 on Windows,
   which the Microsoft x64 ABI marks callee-saved), pop the frame, return */
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE