1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* NOTE(review): register aliases for a double-precision solver/GEMM-style
   inner kernel specialized by the LN/LT/RN/RT preprocessor switches.
   "common.h" supplies PROLOGUE/PROFCODE, SIZE, BASE_SHIFT, ARGn and the
   ALIGN_/BRANCH/NOBRANCH macros used below -- confirm against the build
   that includes this file. */

#define OLD_M %rdi		/* incoming arg registers (SysV); copied ... */
#define OLD_N %rsi		/* ... to r13/r14 so rdi/rsi can be reused  */
#define M %r13
#define N %r14
#define K %rdx

#define A %rcx			/* base pointers of the three matrices */
#define B %r8
#define C %r9
#define LDC %r10		/* leading dimension of C (scaled to bytes below) */

#define I %r11			/* column-block loop counter (m >> 2) */
#define AO %rdi			/* current panel pointer into A (aliases OLD_M) */
#define BO %rsi			/* current panel pointer into B (aliases OLD_N) */
#define CO1 %r15		/* current C column pointers */
#define CO2 %r12
#define BB %rbp
#define J %rbx

#ifndef WINDOWS_ABI

#define STACKSIZE 96

/* spill slots inside the reserved stack area */
#define OFFSET 48(%rsp)
#define AORIG 56(%rsp)
#define KK 64(%rsp)
#define KKK 72(%rsp)

#else

#define STACKSIZE 256

/* Windows x64: stack-passed arguments live above the frame */
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)
#define OLD_C 56 + STACKSIZE(%rsp)
#define OLD_LDC 64 + STACKSIZE(%rsp)
#define OLD_OFFSET 72 + STACKSIZE(%rsp)

#define OFFSET 224(%rsp)
#define AORIG 232(%rsp)
#define KK 240(%rsp)
#define KKK 248(%rsp)

#endif

#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 7 + 0)

/* movlpd/movapd/movupd are deliberately remapped: movapd becomes the
   unaligned movups, so "aligned" loads below tolerate unaligned panels. */
#define movlpd movsd
#define movapd movups
#define movupd movups

/* KERNEL1..KERNEL8: one 8-step unrolled pass of the inner product loop.
   A is read two doubles at a time through (AO,%rax,4) and B one double at
   a time via movddup broadcasts through (BO,%rax,4); products accumulate
   into xmm8..xmm15.  Displacements are in elements, relative to the
   -16*SIZE bias applied to A and B in the prologue.  %rax indexes both
   streams and is advanced by 8*SIZE only at the end of KERNEL8. */

/* step 1: also issues the software prefetch on the A stream */
#define KERNEL1(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm0, %xmm2

/* step 2: the /**-marked loads prefetch operands for later steps */
#define KERNEL2(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	addpd	%xmm1, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
/**/	movddup	      (BO, %rax, 4), %xmm1 ;\
	movapd	%xmm4, %xmm2

/* step 3 */
#define KERNEL3(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm10 ;\
	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
	addpd	%xmm5, %xmm14 ;\
	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm4, %xmm2

/* step 4 */
#define KERNEL4(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
/***/	movapd	      (AO, %rax, 4), %xmm6 ;\
	addpd	%xmm4, %xmm10 ;\
	addpd	%xmm5, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
	movddup	  8 * SIZE(BO, %rax, 4), %xmm5 ;\
	movapd	%xmm6, %xmm2

/* step 5 */
#define KERNEL5(xx) \
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm8 ;\
	movapd	%xmm2, %xmm6 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	  2 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
/**/	movapd	  8 * SIZE(AO, %rax, 4), %xmm7 ;\
	movapd	%xmm6, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	  3 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm10 ;\
	movapd	  4 * SIZE(AO, %rax, 4), %xmm6 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	  4 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  5 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm6, %xmm2

/* step 6 */
#define KERNEL6(xx) \
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm8 ;\
	movapd	%xmm2, %xmm6 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	  6 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm6, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	  7 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm10 ;\
/***/	movapd	 16 * SIZE(AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  9 * SIZE(BO, %rax, 4), %xmm3 ;\
	movddup	 16 * SIZE(BO, %rax, 4), %xmm1 ;\
	movapd	%xmm7, %xmm2

/* step 7 */
#define KERNEL7(xx) \
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	%xmm2, %xmm7 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 10 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm7, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 11 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm10 ;\
	movapd	 12 * SIZE(AO, %rax, 4), %xmm7 ;\
	addpd	%xmm5, %xmm14 ;\
	movddup	 12 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 13 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm7, %xmm2

/* step 8: preloads operands for the next unrolled pass and advances %rax */
#define KERNEL8(xx) \
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	%xmm2, %xmm7 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 14 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm7, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 15 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm10 ;\
	addpd	%xmm5, %xmm14 ;\
/**/	movapd	 24 * SIZE(AO, %rax, 4), %xmm4 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 17 * SIZE(BO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	movddup	 24 * SIZE(BO, %rax, 4), %xmm5 ;\
	movapd	%xmm0, %xmm2 ;\
	addq	$8 * SIZE, %rax

/* KERNEL_SUB1..4: the same multiply/accumulate steps without the
   prefetch and without the trailing pointer advance -- presumably the
   variant used for a shorter (4-step) unrolled pass; the call sites are
   outside this view, so confirm before relying on that. */

#define KERNEL_SUB1(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm0, %xmm2

#define KERNEL_SUB2(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	movapd	      (AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	      (BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm4, %xmm2

#define KERNEL_SUB3(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm10 ;\
	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
	addpd	%xmm5, %xmm14 ;\
	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm4, %xmm2

#define KERNEL_SUB4(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm10 ;\
	addpd	%xmm5, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm0, %xmm2

	PROLOGUE
	PROFCODE

	/* Reserve the frame and save the SysV callee-saved GPRs into it. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,   (%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* Windows x64: rdi/rsi and xmm6-xmm15 are callee-saved too; spill
	   them, then fetch the arguments from the MS calling convention
	   registers/stack slots. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
	movsd	OLD_OFFSET, %xmm12
#else
	/* SysV: remaining arguments arrive on the stack above the frame. */
	movq	STACKSIZE +  8(%rsp), LDC
	movsd	STACKSIZE + 16(%rsp), %xmm12
#endif

	/* Free rdi/rsi for reuse as the AO/BO panel pointers. */
	movq	OLD_M, M
	movq	OLD_N, N

	/* Bias A and B by +16*SIZE so the kernels above can address
	   operands with small negative displacements. */
	subq	$-16 * SIZE, A
	subq	$-16 * SIZE, B

	movsd	%xmm12, OFFSET		# stash the offset argument ...
	movsd	%xmm12, KK		# ... and use it as the running KK

	leaq	(, LDC, SIZE), LDC	# LDC: elements -> bytes

#ifdef LN
	/* LN: advance C past the last row block (C += M*SIZE) and A past
	   its end (A += M*K*SIZE) so both are walked backwards. */
	leaq	(, M, SIZE), %rax
	addq	%rax, C
	imulq	K, %rax
	addq	%rax, A
460#endif 461 462#ifdef RT 463 leaq (, N, SIZE), %rax 464 imulq K, %rax 465 addq %rax, B 466 movq N, %rax 467 imulq LDC, %rax 468 addq %rax, C 469#endif 470 471#ifdef RN 472 negq KK 473#endif 474 475#ifdef RT 476 movq N, %rax 477 subq OFFSET, %rax 478 movq %rax, KK 479#endif 480 481 testq $1, N 482 je .L40 483 484#if defined(LT) || defined(RN) 485 movq A, AO 486#else 487 movq A, AORIG 488#endif 489 490#ifdef RT 491 movq K, %rax 492 salq $0 + BASE_SHIFT, %rax 493 subq %rax, B 494 495 subq LDC, C 496#endif 497 498 movq C, CO1 # coffset1 = c 499#ifndef RT 500 addq LDC, C 501#endif 502 503#ifdef LN 504 movq OFFSET, %rax 505 addq M, %rax 506 movq %rax, KK 507#endif 508 509#ifdef LT 510 movq OFFSET, %rax 511 movq %rax, KK 512#endif 513 514 movq M, I 515 sarq $2, I # i = (m >> 2) 516 jle .L100 517 ALIGN_4 518 519.L91: 520#ifdef LN 521 movq K, %rax 522 salq $2 + BASE_SHIFT, %rax 523 subq %rax, AORIG 524#endif 525 526#if defined(LN) || defined(RT) 527 movq KK, %rax 528 movq AORIG, AO 529 leaq (, %rax, SIZE), %rax 530 leaq (AO, %rax, 4), AO 531#endif 532 533 movq B, BO 534 535#if defined(LN) || defined(RT) 536 movq KK, %rax 537 leaq (BO, %rax, SIZE), BO 538#endif 539 540 movapd -16 * SIZE(AO), %xmm0 541 pxor %xmm8, %xmm8 542 movapd -8 * SIZE(AO), %xmm2 543 pxor %xmm9, %xmm9 544 movddup -16 * SIZE(BO), %xmm1 545 pxor %xmm10, %xmm10 546 movddup -15 * SIZE(BO), %xmm5 547 pxor %xmm11, %xmm11 548 movddup -14 * SIZE(BO), %xmm3 549 550#ifndef LN 551 prefetchw 3 * SIZE(CO1) 552#else 553 prefetchw -8 * SIZE(CO1) 554#endif 555 556#if defined(LT) || defined(RN) 557 movq KK, %rax 558#else 559 movq K, %rax 560 subq KK, %rax 561#endif 562 andq $-4, %rax 563 leaq (, %rax, SIZE), %rax 564 leaq (AO, %rax, 4), AO 565 leaq (BO, %rax, 1), BO 566 negq %rax 567 NOBRANCH 568 je .L96 569 ALIGN_4 570 571.L92: 572 mulpd %xmm1, %xmm0 573 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 574 addpd %xmm0, %xmm8 575 movapd -12 * SIZE(AO, %rax, 4), %xmm0 576 addpd %xmm1, %xmm9 577 movddup -12 * SIZE(BO, %rax, 1), 
%xmm1 578 mulpd %xmm5, %xmm0 579 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 580 addpd %xmm0, %xmm10 581 movapd (AO, %rax, 4), %xmm0 582 addpd %xmm5, %xmm11 583 movddup -13 * SIZE(BO, %rax, 1), %xmm5 584 mulpd %xmm3, %xmm2 585 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 586 addpd %xmm2, %xmm8 587 movapd -4 * SIZE(AO, %rax, 4), %xmm2 588 addpd %xmm3, %xmm9 589 movddup -10 * SIZE(BO, %rax, 1), %xmm3 590 mulpd %xmm5, %xmm2 591 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 592 addpd %xmm2, %xmm10 593 movapd 8 * SIZE(AO, %rax, 4), %xmm2 594 addpd %xmm5, %xmm11 595 movddup -11 * SIZE(BO, %rax, 1), %xmm5 596 597 addq $4 * SIZE, %rax 598 BRANCH 599 jl .L92 600 ALIGN_4 601 602.L96: 603#if defined(LT) || defined(RN) 604 movq KK, %rax 605#else 606 movq K, %rax 607 subq KK, %rax 608#endif 609 andq $3, %rax # if (k & 1) 610 je .L99 611 612 leaq (, %rax, SIZE), %rax 613 leaq (AO, %rax, 4), AO 614 leaq (BO, %rax, 1), BO 615 negq %rax 616 ALIGN_4 617 618.L97: 619 mulpd %xmm1, %xmm0 620 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 621 addpd %xmm0, %xmm8 622 movapd -12 * SIZE(AO, %rax, 4), %xmm0 623 addpd %xmm1, %xmm9 624 movddup -15 * SIZE(BO, %rax, 1), %xmm1 625 626 addq $SIZE, %rax 627 jl .L97 628 ALIGN_4 629.L99: 630 addpd %xmm10, %xmm8 631 addpd %xmm11, %xmm9 632 633#if defined(LN) || defined(RT) 634 movq KK, %rax 635#ifdef LN 636 subq $4, %rax 637#else 638 subq $1, %rax 639#endif 640 641 leaq (, %rax, SIZE), %rax 642 643 movq AORIG, AO 644 leaq (AO, %rax, 4), AO 645 leaq (B, %rax, 1), BO 646#endif 647 648#if defined(LN) || defined(LT) 649 movapd -16 * SIZE(BO), %xmm10 650 movapd -14 * SIZE(BO), %xmm11 651 652 subpd %xmm8, %xmm10 653 subpd %xmm9, %xmm11 654#else 655 movapd -16 * SIZE(AO), %xmm10 656 movapd -14 * SIZE(AO), %xmm11 657 658 subpd %xmm8, %xmm10 659 subpd %xmm9, %xmm11 660#endif 661 662#ifdef LN 663 movapd %xmm10, %xmm8 664 unpckhpd %xmm8, %xmm8 665 666 movapd %xmm11, %xmm9 667 unpckhpd %xmm9, %xmm9 668 669 movsd -1 * SIZE(AO), %xmm12 670 mulsd %xmm12, %xmm9 671 672 movsd -2 * SIZE(AO), %xmm13 673 
mulsd %xmm9, %xmm13 674 subsd %xmm13, %xmm11 675 movsd -3 * SIZE(AO), %xmm14 676 mulsd %xmm9, %xmm14 677 subsd %xmm14, %xmm8 678 movsd -4 * SIZE(AO), %xmm15 679 mulsd %xmm9, %xmm15 680 subsd %xmm15, %xmm10 681 682 movsd -6 * SIZE(AO), %xmm12 683 mulsd %xmm12, %xmm11 684 685 movsd -7 * SIZE(AO), %xmm13 686 mulsd %xmm11, %xmm13 687 subsd %xmm13, %xmm8 688 movsd -8 * SIZE(AO), %xmm14 689 mulsd %xmm11, %xmm14 690 subsd %xmm14, %xmm10 691 692 movsd -11 * SIZE(AO), %xmm12 693 mulsd %xmm12, %xmm8 694 695 movsd -12 * SIZE(AO), %xmm13 696 mulsd %xmm8, %xmm13 697 subsd %xmm13, %xmm10 698 699 movsd -16 * SIZE(AO), %xmm12 700 mulsd %xmm12, %xmm10 701 702 unpcklpd %xmm8, %xmm10 703 unpcklpd %xmm9, %xmm11 704#endif 705 706#ifdef LT 707 movapd %xmm10, %xmm8 708 unpckhpd %xmm8, %xmm8 709 710 movapd %xmm11, %xmm9 711 unpckhpd %xmm9, %xmm9 712 713 movsd -16 * SIZE(AO), %xmm12 714 mulsd %xmm12, %xmm10 715 716 movsd -15 * SIZE(AO), %xmm13 717 mulsd %xmm10, %xmm13 718 subsd %xmm13, %xmm8 719 movsd -14 * SIZE(AO), %xmm14 720 mulsd %xmm10, %xmm14 721 subsd %xmm14, %xmm11 722 movsd -13 * SIZE(AO), %xmm15 723 mulsd %xmm10, %xmm15 724 subsd %xmm15, %xmm9 725 726 movsd -11 * SIZE(AO), %xmm12 727 mulsd %xmm12, %xmm8 728 729 movsd -10 * SIZE(AO), %xmm13 730 mulsd %xmm8, %xmm13 731 subsd %xmm13, %xmm11 732 movsd -9 * SIZE(AO), %xmm14 733 mulsd %xmm8, %xmm14 734 subsd %xmm14, %xmm9 735 736 movsd -6 * SIZE(AO), %xmm12 737 mulsd %xmm12, %xmm11 738 739 movsd -5 * SIZE(AO), %xmm13 740 mulsd %xmm11, %xmm13 741 subsd %xmm13, %xmm9 742 743 movsd -1 * SIZE(AO), %xmm12 744 mulsd %xmm12, %xmm9 745 746 unpcklpd %xmm8, %xmm10 747 unpcklpd %xmm9, %xmm11 748#endif 749 750#ifdef RN 751 movddup -16 * SIZE(BO), %xmm8 752 mulpd %xmm8, %xmm10 753 mulpd %xmm8, %xmm11 754#endif 755 756#ifdef RT 757 movddup -16 * SIZE(BO), %xmm8 758 mulpd %xmm8, %xmm10 759 mulpd %xmm8, %xmm11 760#endif 761 762#ifdef LN 763 subq $4 * SIZE, CO1 764#endif 765 766 movlpd %xmm10, 0 * SIZE(CO1) 767 movhpd %xmm10, 1 * SIZE(CO1) 768 movlpd 
%xmm11, 2 * SIZE(CO1) 769 movhpd %xmm11, 3 * SIZE(CO1) 770 771#if defined(LN) || defined(LT) 772 movaps %xmm10, -16 * SIZE(BO) 773 movaps %xmm11, -14 * SIZE(BO) 774#else 775 movaps %xmm10, -16 * SIZE(AO) 776 movaps %xmm11, -14 * SIZE(AO) 777#endif 778 779#ifndef LN 780 addq $4 * SIZE, CO1 781#endif 782 783#if defined(LT) || defined(RN) 784 movq K, %rax 785 subq KK, %rax 786 leaq (,%rax, SIZE), %rax 787 leaq (AO, %rax, 4), AO 788 addq %rax, BO 789#endif 790 791#ifdef LN 792 subq $4, KK 793#endif 794 795#ifdef LT 796 addq $4, KK 797#endif 798 799#ifdef RT 800 movq K, %rax 801 salq $2 + BASE_SHIFT, %rax 802 addq %rax, AORIG 803#endif 804 805 decq I # i -- 806 jg .L91 807 ALIGN_4 808 809.L100: 810 testq $2, M 811 je .L110 812 813#ifdef LN 814 movq K, %rax 815 salq $1 + BASE_SHIFT, %rax 816 subq %rax, AORIG 817#endif 818 819#if defined(LN) || defined(RT) 820 movq KK, %rax 821 movq AORIG, AO 822 leaq (, %rax, SIZE), %rax 823 leaq (AO, %rax, 2), AO 824#endif 825 826 movq B, BO 827 828#if defined(LN) || defined(RT) 829 movq KK, %rax 830 leaq (BO, %rax, SIZE), BO 831#endif 832 833 movddup -16 * SIZE(BO), %xmm0 834 pxor %xmm8, %xmm8 835 movddup -15 * SIZE(BO), %xmm1 836 pxor %xmm9, %xmm9 837 movddup -14 * SIZE(BO), %xmm2 838 pxor %xmm10, %xmm10 839 movddup -13 * SIZE(BO), %xmm3 840 pxor %xmm11, %xmm11 841 842#if defined(LT) || defined(RN) 843 movq KK, %rax 844#else 845 movq K, %rax 846 subq KK, %rax 847#endif 848 andq $-4, %rax 849 leaq (, %rax, SIZE), %rax 850 leaq (AO, %rax, 2), AO 851 leaq (BO, %rax, 1), BO 852 negq %rax 853 NOBRANCH 854 je .L106 855 ALIGN_4 856 857.L102: 858 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 859 addpd %xmm0, %xmm8 860 movddup -12 * SIZE(BO, %rax, 1), %xmm0 861 862 mulpd -14 * SIZE(AO, %rax, 2), %xmm1 863 addpd %xmm1, %xmm9 864 movddup -11 * SIZE(BO, %rax, 1), %xmm1 865 866 mulpd -12 * SIZE(AO, %rax, 2), %xmm2 867 addpd %xmm2, %xmm10 868 movddup -10 * SIZE(BO, %rax, 1), %xmm2 869 870 mulpd -10 * SIZE(AO, %rax, 2), %xmm3 871 addpd %xmm3, %xmm11 872 
movddup -9 * SIZE(BO, %rax, 1), %xmm3 873 874 addq $4 * SIZE, %rax 875 BRANCH 876 jl .L102 877 ALIGN_4 878 879.L106: 880#if defined(LT) || defined(RN) 881 movq KK, %rax 882#else 883 movq K, %rax 884 subq KK, %rax 885#endif 886 andq $3, %rax # if (k & 1) 887 je .L109 888 889 leaq (, %rax, SIZE), %rax 890 leaq (AO, %rax, 2), AO 891 leaq (BO, %rax, 1), BO 892 negq %rax 893 ALIGN_4 894 895.L107: 896 movddup -16 * SIZE(BO, %rax, 1), %xmm0 897 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 898 addpd %xmm0, %xmm8 899 900 addq $SIZE, %rax 901 jl .L107 902 ALIGN_4 903 904.L109: 905 addpd %xmm9, %xmm8 906 addpd %xmm11, %xmm10 907 addpd %xmm10, %xmm8 908 909#if defined(LN) || defined(RT) 910 movq KK, %rax 911#ifdef LN 912 subq $2, %rax 913#else 914 subq $1, %rax 915#endif 916 917 leaq (, %rax, SIZE), %rax 918 919 movq AORIG, AO 920 leaq (AO, %rax, 2), AO 921 leaq (B, %rax, 1), BO 922#endif 923 924#if defined(LN) || defined(LT) 925 movapd -16 * SIZE(BO), %xmm10 926 subpd %xmm8, %xmm10 927#else 928 movapd -16 * SIZE(AO), %xmm10 929 subpd %xmm8, %xmm10 930#endif 931 932#ifdef LN 933 movapd %xmm10, %xmm8 934 unpckhpd %xmm8, %xmm8 935 936 movsd -13 * SIZE(AO), %xmm12 937 mulsd %xmm12, %xmm8 938 939 movsd -14 * SIZE(AO), %xmm13 940 mulsd %xmm8, %xmm13 941 subsd %xmm13, %xmm10 942 943 movsd -16 * SIZE(AO), %xmm12 944 mulsd %xmm12, %xmm10 945 946 unpcklpd %xmm8, %xmm10 947#endif 948 949#ifdef LT 950 movapd %xmm10, %xmm8 951 unpckhpd %xmm8, %xmm8 952 953 movsd -16 * SIZE(AO), %xmm12 954 mulsd %xmm12, %xmm10 955 956 movsd -15 * SIZE(AO), %xmm13 957 mulsd %xmm10, %xmm13 958 subsd %xmm13, %xmm8 959 960 movsd -13 * SIZE(AO), %xmm12 961 mulsd %xmm12, %xmm8 962 963 unpcklpd %xmm8, %xmm10 964#endif 965 966#ifdef RN 967 movddup -16 * SIZE(BO), %xmm8 968 mulpd %xmm8, %xmm10 969#endif 970 971#ifdef RT 972 movddup -16 * SIZE(BO), %xmm8 973 mulpd %xmm8, %xmm10 974#endif 975 976#ifdef LN 977 subq $2 * SIZE, CO1 978#endif 979 980#if defined(LN) || defined(LT) 981 movlpd %xmm10, 0 * SIZE(CO1) 982 movhpd 
%xmm10, 1 * SIZE(CO1) 983#else 984 movlpd %xmm10, 0 * SIZE(CO1) 985 movhpd %xmm10, 1 * SIZE(CO1) 986#endif 987 988#if defined(LN) || defined(LT) 989 movaps %xmm10, -16 * SIZE(BO) 990#else 991 movaps %xmm10, -16 * SIZE(AO) 992#endif 993 994#ifndef LN 995 addq $2 * SIZE, CO1 996#endif 997 998#if defined(LT) || defined(RN) 999 movq K, %rax 1000 subq KK, %rax 1001 leaq (,%rax, SIZE), %rax 1002 leaq (AO, %rax, 2), AO 1003 addq %rax, BO 1004#endif 1005 1006#ifdef LN 1007 subq $2, KK 1008#endif 1009 1010#ifdef LT 1011 addq $2, KK 1012#endif 1013 1014#ifdef RT 1015 movq K, %rax 1016 salq $1 + BASE_SHIFT, %rax 1017 addq %rax, AORIG 1018#endif 1019 ALIGN_4 1020 1021.L110: 1022 testq $1, M 1023 je .L119 1024 1025#ifdef LN 1026 movq K, %rax 1027 salq $0 + BASE_SHIFT, %rax 1028 subq %rax, AORIG 1029#endif 1030 1031#if defined(LN) || defined(RT) 1032 movq KK, %rax 1033 movq AORIG, AO 1034 leaq (, %rax, SIZE), %rax 1035 leaq (AO, %rax, 1), AO 1036#endif 1037 1038 movq B, BO 1039 1040#if defined(LN) || defined(RT) 1041 movq KK, %rax 1042 leaq (BO, %rax, SIZE), BO 1043#endif 1044 1045 movapd -16 * SIZE(AO), %xmm0 1046 pxor %xmm8, %xmm8 1047 movapd -14 * SIZE(AO), %xmm1 1048 pxor %xmm9, %xmm9 1049 1050#if defined(LT) || defined(RN) 1051 movq KK, %rax 1052#else 1053 movq K, %rax 1054 subq KK, %rax 1055#endif 1056 andq $-4, %rax 1057 leaq (, %rax, SIZE), %rax 1058 leaq (AO, %rax, 1), AO 1059 leaq (BO, %rax, 1), BO 1060 negq %rax 1061 NOBRANCH 1062 je .L116 1063 ALIGN_4 1064 1065.L112: 1066 mulpd -16 * SIZE(BO, %rax, 1), %xmm0 1067 addpd %xmm0, %xmm8 1068 movapd -12 * SIZE(AO, %rax, 1), %xmm0 1069 1070 mulpd -14 * SIZE(BO, %rax, 1), %xmm1 1071 addpd %xmm1, %xmm9 1072 movapd -10 * SIZE(AO, %rax, 1), %xmm1 1073 1074 addq $4 * SIZE, %rax 1075 BRANCH 1076 jl .L112 1077 ALIGN_4 1078 1079.L116: 1080#if defined(LT) || defined(RN) 1081 movq KK, %rax 1082#else 1083 movq K, %rax 1084 subq KK, %rax 1085#endif 1086 andq $3, %rax # if (k & 1) 1087 je .L118 1088 1089 leaq (, %rax, SIZE), %rax 1090 
leaq (AO, %rax, 1), AO 1091 leaq (BO, %rax, 1), BO 1092 negq %rax 1093 ALIGN_4 1094 1095.L117: 1096 mulsd -16 * SIZE(BO, %rax, 1), %xmm0 1097 addsd %xmm0, %xmm8 1098 movsd -15 * SIZE(AO, %rax, 1), %xmm0 1099 1100 addq $SIZE, %rax 1101 jl .L117 1102 ALIGN_4 1103 1104.L118: 1105 addpd %xmm9, %xmm8 1106 haddpd %xmm8, %xmm8 1107 1108#if defined(LN) || defined(RT) 1109 movq KK, %rax 1110#ifdef LN 1111 subq $1, %rax 1112#else 1113 subq $1, %rax 1114#endif 1115 1116 leaq (, %rax, SIZE), %rax 1117 1118 movq AORIG, AO 1119 leaq (AO, %rax, 1), AO 1120 leaq (B, %rax, 1), BO 1121#endif 1122 1123#if defined(LN) || defined(LT) 1124 movsd -16 * SIZE(BO), %xmm10 1125 subsd %xmm8, %xmm10 1126#else 1127 movsd -16 * SIZE(AO), %xmm10 1128 subsd %xmm8, %xmm10 1129#endif 1130 1131#if defined(LN) || defined(LT) 1132 movsd -16 * SIZE(AO), %xmm12 1133 mulsd %xmm12, %xmm10 1134#endif 1135 1136#if defined(RN) || defined(RT) 1137 movsd -16 * SIZE(BO), %xmm8 1138 mulsd %xmm8, %xmm10 1139#endif 1140 1141#ifdef LN 1142 subq $1 * SIZE, CO1 1143#endif 1144 1145 movsd %xmm10, 0 * SIZE(CO1) 1146 1147#if defined(LN) || defined(LT) 1148 movlpd %xmm10, -16 * SIZE(BO) 1149#else 1150 movlpd %xmm10, -16 * SIZE(AO) 1151#endif 1152 1153#ifndef LN 1154 addq $1 * SIZE, CO1 1155#endif 1156 1157#if defined(LT) || defined(RN) 1158 movq K, %rax 1159 subq KK, %rax 1160 leaq (,%rax, SIZE), %rax 1161 addq %rax, AO 1162 addq %rax, BO 1163#endif 1164 1165#ifdef LN 1166 subq $1, KK 1167#endif 1168 1169#ifdef LT 1170 addq $1, KK 1171#endif 1172 1173#ifdef RT 1174 movq K, %rax 1175 salq $0 + BASE_SHIFT, %rax 1176 addq %rax, AORIG 1177#endif 1178 ALIGN_4 1179 1180.L119: 1181#ifdef LN 1182 leaq (B, K, SIZE), B 1183#endif 1184 1185#if defined(LT) || defined(RN) 1186 movq BO, B 1187#endif 1188 1189#ifdef RN 1190 addq $1, KK 1191#endif 1192 1193#ifdef RT 1194 subq $1, KK 1195#endif 1196 ALIGN_4 1197 1198.L40: 1199 testq $2, N 1200 je .L80 1201 1202#if defined(LT) || defined(RN) 1203 movq A, AO 1204#else 1205 movq A, AORIG 
1206#endif 1207 1208#ifdef RT 1209 movq K, %rax 1210 salq $1 + BASE_SHIFT, %rax 1211 subq %rax, B 1212 1213 leaq (, LDC, 2), %rax 1214 subq %rax, C 1215#endif 1216 1217 movq C, CO1 # coffset1 = c 1218 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 1219#ifndef RT 1220 leaq (C, LDC, 2), C 1221#endif 1222 1223#ifdef LN 1224 movq OFFSET, %rax 1225 addq M, %rax 1226 movq %rax, KK 1227#endif 1228 1229#if defined(LT) 1230 movq OFFSET, %rax 1231 movq %rax, KK 1232#endif 1233 1234 movq M, I 1235 sarq $2, I # i = (m >> 2) 1236 jle .L60 1237 ALIGN_4 1238 1239.L51: 1240#ifdef LN 1241 movq K, %rax 1242 salq $2 + BASE_SHIFT, %rax 1243 subq %rax, AORIG 1244#endif 1245 1246#if defined(LN) || defined(RT) 1247 movq KK, %rax 1248 movq AORIG, AO 1249 leaq (, %rax, SIZE), %rax 1250 leaq (AO, %rax, 4), AO 1251#endif 1252 1253 movq B, BO 1254 1255#if defined(LN) || defined(RT) 1256 movq KK, %rax 1257 leaq (, %rax, SIZE), %rax 1258 leaq (BO, %rax, 2), BO 1259#endif 1260 1261 movddup -16 * SIZE(BO), %xmm1 1262 movddup -15 * SIZE(BO), %xmm5 1263 pxor %xmm8, %xmm8 1264 movddup -12 * SIZE(BO), %xmm3 1265 pxor %xmm9, %xmm9 1266 movapd -16 * SIZE(AO), %xmm0 1267 pxor %xmm12, %xmm12 1268 movapd -8 * SIZE(AO), %xmm4 1269 pxor %xmm13, %xmm13 1270 1271#ifndef LN 1272 prefetchw 3 * SIZE(CO1) 1273 movapd %xmm0, %xmm2 1274 prefetchw 5 * SIZE(CO2) 1275#else 1276 prefetchw -4 * SIZE(CO1) 1277 movapd %xmm0, %xmm2 1278 prefetchw -4 * SIZE(CO2) 1279#endif 1280 1281 1282#if defined(LT) || defined(RN) 1283 movq KK, %rax 1284#else 1285 movq K, %rax 1286 subq KK, %rax 1287#endif 1288 andq $-4, %rax 1289 leaq (, %rax, SIZE), %rax 1290 leaq (AO, %rax, 4), AO 1291 leaq (BO, %rax, 2), BO 1292 negq %rax 1293 NOBRANCH 1294 je .L56 1295 ALIGN_4 1296 1297.L52: 1298 mulpd %xmm1, %xmm0 1299 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 1300 addpd %xmm0, %xmm8 1301 movapd -12 * SIZE(AO, %rax, 4), %xmm0 1302 addpd %xmm1, %xmm12 1303 movddup -14 * SIZE(BO, %rax, 2), %xmm1 1304 mulpd %xmm5, %xmm2 1305 mulpd -14 * SIZE(AO, %rax, 4), 
%xmm5 1306 addpd %xmm2, %xmm9 1307 addpd %xmm5, %xmm13 1308 movddup -13 * SIZE(BO, %rax, 2), %xmm5 1309 movapd %xmm0, %xmm2 1310 mulpd %xmm1, %xmm0 1311 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 1312 addpd %xmm0, %xmm8 1313 movapd (AO, %rax, 4), %xmm0 1314 addpd %xmm1, %xmm12 1315 movddup -8 * SIZE(BO, %rax, 2), %xmm1 1316 mulpd %xmm5, %xmm2 1317 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 1318 addpd %xmm2, %xmm9 1319 addpd %xmm5, %xmm13 1320 movddup -11 * SIZE(BO, %rax, 2), %xmm5 1321 movapd %xmm4, %xmm2 1322 mulpd %xmm3, %xmm4 1323 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 1324 addpd %xmm4, %xmm8 1325 movapd -4 * SIZE(AO, %rax, 4), %xmm4 1326 addpd %xmm3, %xmm12 1327 movddup -10 * SIZE(BO, %rax, 2), %xmm3 1328 mulpd %xmm5, %xmm2 1329 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 1330 addpd %xmm2, %xmm9 1331 addpd %xmm5, %xmm13 1332 movddup -9 * SIZE(BO, %rax, 2), %xmm5 1333 movapd %xmm4, %xmm2 1334 mulpd %xmm3, %xmm4 1335 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 1336 addpd %xmm4, %xmm8 1337 movapd 8 * SIZE(AO, %rax, 4), %xmm4 1338 addpd %xmm3, %xmm12 1339 movddup -4 * SIZE(BO, %rax, 2), %xmm3 1340 mulpd %xmm5, %xmm2 1341 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 1342 addpd %xmm2, %xmm9 1343 addpd %xmm5, %xmm13 1344 movddup -7 * SIZE(BO, %rax, 2), %xmm5 1345 movapd %xmm0, %xmm2 1346 1347 addq $4 * SIZE, %rax 1348 BRANCH 1349 jl .L52 1350 ALIGN_4 1351 1352.L56: 1353#if defined(LT) || defined(RN) 1354 movq KK, %rax 1355#else 1356 movq K, %rax 1357 subq KK, %rax 1358#endif 1359 andq $3, %rax # if (k & 1) 1360 je .L59 1361 1362 leaq (, %rax, SIZE), %rax 1363 leaq (AO, %rax, 4), AO 1364 leaq (BO, %rax, 2), BO 1365 negq %rax 1366 ALIGN_4 1367 1368.L57: 1369 mulpd %xmm1, %xmm0 1370 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 1371 addpd %xmm0, %xmm8 1372 movapd -12 * SIZE(AO, %rax, 4), %xmm0 1373 addpd %xmm1, %xmm12 1374 movddup -14 * SIZE(BO, %rax, 2), %xmm1 1375 mulpd %xmm5, %xmm2 1376 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 1377 addpd %xmm2, %xmm9 1378 addpd %xmm5, %xmm13 1379 movddup -13 * SIZE(BO, %rax, 2), %xmm5 
1380 movapd %xmm0, %xmm2 1381 1382 addq $SIZE, %rax 1383 jl .L57 1384 ALIGN_4 1385 1386.L59: 1387#if defined(LN) || defined(RT) 1388 movq KK, %rax 1389#ifdef LN 1390 subq $4, %rax 1391#else 1392 subq $2, %rax 1393#endif 1394 1395 leaq (, %rax, SIZE), %rax 1396 1397 movq AORIG, AO 1398 leaq (AO, %rax, 4), AO 1399 leaq (B, %rax, 2), BO 1400#endif 1401 1402#if defined(LN) || defined(LT) 1403 movapd %xmm8, %xmm0 1404 unpcklpd %xmm9, %xmm8 1405 unpckhpd %xmm9, %xmm0 1406 1407 movapd %xmm12, %xmm4 1408 unpcklpd %xmm13, %xmm12 1409 unpckhpd %xmm13, %xmm4 1410 1411 movapd -16 * SIZE(BO), %xmm9 1412 movapd -14 * SIZE(BO), %xmm13 1413 movapd -12 * SIZE(BO), %xmm1 1414 movapd -10 * SIZE(BO), %xmm5 1415 1416 subpd %xmm8, %xmm9 1417 subpd %xmm0, %xmm13 1418 subpd %xmm12, %xmm1 1419 subpd %xmm4, %xmm5 1420#else 1421 movapd -16 * SIZE(AO), %xmm0 1422 movapd -14 * SIZE(AO), %xmm1 1423 movapd -12 * SIZE(AO), %xmm2 1424 movapd -10 * SIZE(AO), %xmm3 1425 1426 subpd %xmm8, %xmm0 1427 subpd %xmm12, %xmm1 1428 subpd %xmm9, %xmm2 1429 subpd %xmm13, %xmm3 1430#endif 1431 1432#ifdef LN 1433 movddup -1 * SIZE(AO), %xmm8 1434 mulpd %xmm8, %xmm5 1435 movddup -2 * SIZE(AO), %xmm10 1436 mulpd %xmm5, %xmm10 1437 subpd %xmm10, %xmm1 1438 movddup -3 * SIZE(AO), %xmm12 1439 mulpd %xmm5, %xmm12 1440 subpd %xmm12, %xmm13 1441 movddup -4 * SIZE(AO), %xmm14 1442 mulpd %xmm5, %xmm14 1443 subpd %xmm14, %xmm9 1444 1445 movddup -6 * SIZE(AO), %xmm8 1446 mulpd %xmm8, %xmm1 1447 movddup -7 * SIZE(AO), %xmm10 1448 mulpd %xmm1, %xmm10 1449 subpd %xmm10, %xmm13 1450 movddup -8 * SIZE(AO), %xmm12 1451 mulpd %xmm1, %xmm12 1452 subpd %xmm12, %xmm9 1453 1454 movddup -11 * SIZE(AO), %xmm8 1455 mulpd %xmm8, %xmm13 1456 movddup -12 * SIZE(AO), %xmm10 1457 mulpd %xmm13, %xmm10 1458 subpd %xmm10, %xmm9 1459 1460 movddup -16 * SIZE(AO), %xmm8 1461 mulpd %xmm8, %xmm9 1462#endif 1463 1464#ifdef LT 1465 movddup -16 * SIZE(AO), %xmm8 1466 mulpd %xmm8, %xmm9 1467 movddup -15 * SIZE(AO), %xmm10 1468 mulpd %xmm9, %xmm10 1469 
subpd %xmm10, %xmm13 1470 movddup -14 * SIZE(AO), %xmm12 1471 mulpd %xmm9, %xmm12 1472 subpd %xmm12, %xmm1 1473 movddup -13 * SIZE(AO), %xmm14 1474 mulpd %xmm9, %xmm14 1475 subpd %xmm14, %xmm5 1476 1477 1478 movddup -11 * SIZE(AO), %xmm8 1479 mulpd %xmm8, %xmm13 1480 1481 movddup -10 * SIZE(AO), %xmm10 1482 mulpd %xmm13, %xmm10 1483 subpd %xmm10, %xmm1 1484 movddup -9 * SIZE(AO), %xmm12 1485 mulpd %xmm13, %xmm12 1486 subpd %xmm12, %xmm5 1487 1488 movddup -6 * SIZE(AO), %xmm8 1489 mulpd %xmm8, %xmm1 1490 movddup -5 * SIZE(AO), %xmm10 1491 mulpd %xmm1, %xmm10 1492 subpd %xmm10, %xmm5 1493 1494 movddup -1 * SIZE(AO), %xmm8 1495 mulpd %xmm8, %xmm5 1496#endif 1497 1498#ifdef RN 1499 movddup -16 * SIZE(BO), %xmm8 1500 mulpd %xmm8, %xmm0 1501 mulpd %xmm8, %xmm1 1502 1503 movddup -15 * SIZE(BO), %xmm9 1504 mulpd %xmm0, %xmm9 1505 subpd %xmm9, %xmm2 1506 movddup -15 * SIZE(BO), %xmm9 1507 mulpd %xmm1, %xmm9 1508 subpd %xmm9, %xmm3 1509 1510 movddup -13 * SIZE(BO), %xmm8 1511 mulpd %xmm8, %xmm2 1512 mulpd %xmm8, %xmm3 1513#endif 1514 1515#ifdef RT 1516 movddup -13 * SIZE(BO), %xmm8 1517 mulpd %xmm8, %xmm2 1518 mulpd %xmm8, %xmm3 1519 1520 movddup -14 * SIZE(BO), %xmm9 1521 mulpd %xmm2, %xmm9 1522 subpd %xmm9, %xmm0 1523 movddup -14 * SIZE(BO), %xmm9 1524 mulpd %xmm3, %xmm9 1525 subpd %xmm9, %xmm1 1526 1527 movddup -16 * SIZE(BO), %xmm8 1528 mulpd %xmm8, %xmm0 1529 mulpd %xmm8, %xmm1 1530#endif 1531 1532#ifdef LN 1533 subq $4 * SIZE, CO1 1534 subq $4 * SIZE, CO2 1535#endif 1536 1537#if defined(LN) || defined(LT) 1538 movlpd %xmm9, 0 * SIZE(CO1) 1539 movlpd %xmm13, 1 * SIZE(CO1) 1540 movlpd %xmm1, 2 * SIZE(CO1) 1541 movlpd %xmm5, 3 * SIZE(CO1) 1542 1543 movhpd %xmm9, 0 * SIZE(CO2) 1544 movhpd %xmm13, 1 * SIZE(CO2) 1545 movhpd %xmm1, 2 * SIZE(CO2) 1546 movhpd %xmm5, 3 * SIZE(CO2) 1547#else 1548 movlpd %xmm0, 0 * SIZE(CO1) 1549 movhpd %xmm0, 1 * SIZE(CO1) 1550 movlpd %xmm1, 2 * SIZE(CO1) 1551 movhpd %xmm1, 3 * SIZE(CO1) 1552 1553 movlpd %xmm2, 0 * SIZE(CO2) 1554 movhpd %xmm2, 1 
* SIZE(CO2) 1555 movlpd %xmm3, 2 * SIZE(CO2) 1556 movhpd %xmm3, 3 * SIZE(CO2) 1557#endif 1558 1559#if defined(LN) || defined(LT) 1560 movaps %xmm9, -16 * SIZE(BO) 1561 movaps %xmm13,-14 * SIZE(BO) 1562 movaps %xmm1, -12 * SIZE(BO) 1563 movaps %xmm5, -10 * SIZE(BO) 1564#else 1565 movaps %xmm0, -16 * SIZE(AO) 1566 movaps %xmm1, -14 * SIZE(AO) 1567 movaps %xmm2, -12 * SIZE(AO) 1568 movaps %xmm3, -10 * SIZE(AO) 1569#endif 1570 1571#ifndef LN 1572 addq $4 * SIZE, CO1 1573 addq $4 * SIZE, CO2 1574#endif 1575 1576#if defined(LT) || defined(RN) 1577 movq K, %rax 1578 subq KK, %rax 1579 leaq (,%rax, SIZE), %rax 1580 leaq (AO, %rax, 4), AO 1581 leaq (BO, %rax, 2), BO 1582#endif 1583 1584#ifdef LN 1585 subq $4, KK 1586#endif 1587 1588#ifdef LT 1589 addq $4, KK 1590#endif 1591 1592#ifdef RT 1593 movq K, %rax 1594 salq $2 + BASE_SHIFT, %rax 1595 addq %rax, AORIG 1596#endif 1597 1598 decq I # i -- 1599 jg .L51 1600 ALIGN_4 1601 1602.L60: 1603 testq $2, M 1604 je .L70 1605 1606#ifdef LN 1607 movq K, %rax 1608 salq $1 + BASE_SHIFT, %rax 1609 subq %rax, AORIG 1610#endif 1611 1612#if defined(LN) || defined(RT) 1613 movq KK, %rax 1614 movq AORIG, AO 1615 leaq (, %rax, SIZE), %rax 1616 leaq (AO, %rax, 2), AO 1617#endif 1618 1619 movq B, BO 1620 1621#if defined(LN) || defined(RT) 1622 movq KK, %rax 1623 leaq (, %rax, SIZE), %rax 1624 leaq (BO, %rax, 2), BO 1625#endif 1626 1627 movapd -16 * SIZE(AO), %xmm0 1628 pxor %xmm8, %xmm8 1629 movapd -12 * SIZE(AO), %xmm2 1630 pxor %xmm9, %xmm9 1631 movddup -16 * SIZE(BO), %xmm1 1632 pxor %xmm10, %xmm10 1633 movddup -15 * SIZE(BO), %xmm3 1634 pxor %xmm11, %xmm11 1635 1636#if defined(LT) || defined(RN) 1637 movq KK, %rax 1638#else 1639 movq K, %rax 1640 subq KK, %rax 1641#endif 1642 andq $-4, %rax 1643 leaq (, %rax, SIZE), %rax 1644 leaq (AO, %rax, 2), AO 1645 leaq (BO, %rax, 2), BO 1646 negq %rax 1647 NOBRANCH 1648 je .L66 1649 ALIGN_4 1650 1651.L62: 1652 mulpd %xmm0, %xmm1 1653 addpd %xmm1, %xmm8 1654 movddup -14 * SIZE(BO, %rax, 2), %xmm1 1655 
mulpd %xmm0, %xmm3 1656 movapd -14 * SIZE(AO, %rax, 2), %xmm0 1657 addpd %xmm3, %xmm9 1658 movddup -13 * SIZE(BO, %rax, 2), %xmm3 1659 mulpd %xmm0, %xmm1 1660 addpd %xmm1, %xmm10 1661 movddup -12 * SIZE(BO, %rax, 2), %xmm1 1662 mulpd %xmm0, %xmm3 1663 movapd -8 * SIZE(AO, %rax, 2), %xmm0 1664 addpd %xmm3, %xmm11 1665 movddup -11 * SIZE(BO, %rax, 2), %xmm3 1666 mulpd %xmm2, %xmm1 1667 addpd %xmm1, %xmm8 1668 movddup -10 * SIZE(BO, %rax, 2), %xmm1 1669 mulpd %xmm2, %xmm3 1670 movapd -10 * SIZE(AO, %rax, 2), %xmm2 1671 addpd %xmm3, %xmm9 1672 movddup -9 * SIZE(BO, %rax, 2), %xmm3 1673 mulpd %xmm2, %xmm1 1674 addpd %xmm1, %xmm10 1675 movddup -8 * SIZE(BO, %rax, 2), %xmm1 1676 mulpd %xmm2, %xmm3 1677 movapd -4 * SIZE(AO, %rax, 2), %xmm2 1678 addpd %xmm3, %xmm11 1679 movddup -7 * SIZE(BO, %rax, 2), %xmm3 1680 1681 addq $4 * SIZE, %rax 1682 BRANCH 1683 jl .L62 1684 ALIGN_4 1685 1686.L66: 1687#if defined(LT) || defined(RN) 1688 movq KK, %rax 1689#else 1690 movq K, %rax 1691 subq KK, %rax 1692#endif 1693 andq $3, %rax # if (k & 1) 1694 je .L69 1695 1696 leaq (, %rax, SIZE), %rax 1697 leaq (AO, %rax, 2), AO 1698 leaq (BO, %rax, 2), BO 1699 negq %rax 1700 ALIGN_4 1701 1702.L67: 1703 mulpd %xmm0, %xmm1 1704 addpd %xmm1, %xmm8 1705 movddup -14 * SIZE(BO, %rax, 2), %xmm1 1706 mulpd %xmm0, %xmm3 1707 movapd -14 * SIZE(AO, %rax, 2), %xmm0 1708 addpd %xmm3, %xmm9 1709 movddup -13 * SIZE(BO, %rax, 2), %xmm3 1710 1711 addq $SIZE, %rax 1712 jl .L67 1713 ALIGN_4 1714 1715.L69: 1716 addpd %xmm10, %xmm8 1717 addpd %xmm11, %xmm9 1718 1719#if defined(LN) || defined(RT) 1720 movq KK, %rax 1721#ifdef LN 1722 subq $2, %rax 1723#else 1724 subq $2, %rax 1725#endif 1726 1727 leaq (, %rax, SIZE), %rax 1728 1729 movq AORIG, AO 1730 leaq (AO, %rax, 2), AO 1731 leaq (B, %rax, 2), BO 1732#endif 1733 1734#if defined(LN) || defined(LT) 1735 movapd %xmm8, %xmm0 1736 unpcklpd %xmm9, %xmm8 1737 unpckhpd %xmm9, %xmm0 1738 1739 movapd -16 * SIZE(BO), %xmm9 1740 movapd -14 * SIZE(BO), %xmm13 1741 1742 subpd 
%xmm8, %xmm9 1743 subpd %xmm0, %xmm13 1744#else 1745 movapd -16 * SIZE(AO), %xmm0 1746 movapd -14 * SIZE(AO), %xmm2 1747 1748 subpd %xmm8, %xmm0 1749 subpd %xmm9, %xmm2 1750#endif 1751 1752 1753#ifdef LN 1754 movddup -13 * SIZE(AO), %xmm8 1755 mulpd %xmm8, %xmm13 1756 1757 movddup -14 * SIZE(AO), %xmm10 1758 mulpd %xmm13, %xmm10 1759 subpd %xmm10, %xmm9 1760 1761 movddup -16 * SIZE(AO), %xmm8 1762 mulpd %xmm8, %xmm9 1763#endif 1764 1765#ifdef LT 1766 movddup -16 * SIZE(AO), %xmm8 1767 mulpd %xmm8, %xmm9 1768 1769 movddup -15 * SIZE(AO), %xmm10 1770 mulpd %xmm9, %xmm10 1771 subpd %xmm10, %xmm13 1772 1773 movddup -13 * SIZE(AO), %xmm8 1774 mulpd %xmm8, %xmm13 1775#endif 1776 1777#ifdef RN 1778 movddup -16 * SIZE(BO), %xmm8 1779 mulpd %xmm8, %xmm0 1780 1781 movddup -15 * SIZE(BO), %xmm9 1782 mulpd %xmm0, %xmm9 1783 subpd %xmm9, %xmm2 1784 1785 movddup -13 * SIZE(BO), %xmm8 1786 mulpd %xmm8, %xmm2 1787#endif 1788 1789#ifdef RT 1790 movddup -13 * SIZE(BO), %xmm8 1791 mulpd %xmm8, %xmm2 1792 1793 movddup -14 * SIZE(BO), %xmm9 1794 mulpd %xmm2, %xmm9 1795 subpd %xmm9, %xmm0 1796 1797 movddup -16 * SIZE(BO), %xmm8 1798 mulpd %xmm8, %xmm0 1799#endif 1800 1801#ifdef LN 1802 subq $2 * SIZE, CO1 1803 subq $2 * SIZE, CO2 1804#endif 1805 1806#if defined(LN) || defined(LT) 1807 movlpd %xmm9, 0 * SIZE(CO1) 1808 movlpd %xmm13, 1 * SIZE(CO1) 1809 1810 movhpd %xmm9, 0 * SIZE(CO2) 1811 movhpd %xmm13, 1 * SIZE(CO2) 1812#else 1813 movlpd %xmm0, 0 * SIZE(CO1) 1814 movhpd %xmm0, 1 * SIZE(CO1) 1815 1816 movlpd %xmm2, 0 * SIZE(CO2) 1817 movhpd %xmm2, 1 * SIZE(CO2) 1818#endif 1819 1820#if defined(LN) || defined(LT) 1821 movaps %xmm9, -16 * SIZE(BO) 1822 movaps %xmm13, -14 * SIZE(BO) 1823#else 1824 movaps %xmm0, -16 * SIZE(AO) 1825 movaps %xmm2, -14 * SIZE(AO) 1826#endif 1827 1828#ifndef LN 1829 addq $2 * SIZE, CO1 1830 addq $2 * SIZE, CO2 1831#endif 1832 1833#if defined(LT) || defined(RN) 1834 movq K, %rax 1835 subq KK, %rax 1836 leaq (,%rax, SIZE), %rax 1837 leaq (AO, %rax, 2), AO 1838 leaq 
(BO, %rax, 2), BO 1839#endif 1840 1841#ifdef LN 1842 subq $2, KK 1843#endif 1844 1845#ifdef LT 1846 addq $2, KK 1847#endif 1848 1849#ifdef RT 1850 movq K, %rax 1851 salq $1 + BASE_SHIFT, %rax 1852 addq %rax, AORIG 1853#endif 1854 ALIGN_4 1855 1856.L70: 1857 testq $1, M 1858 je .L79 1859 ALIGN_4 1860 1861.L71: 1862#ifdef LN 1863 movq K, %rax 1864 salq $0 + BASE_SHIFT, %rax 1865 subq %rax, AORIG 1866#endif 1867 1868#if defined(LN) || defined(RT) 1869 movq KK, %rax 1870 movq AORIG, AO 1871 leaq (, %rax, SIZE), %rax 1872 leaq (AO, %rax, 1), AO 1873#endif 1874 1875 movq B, BO 1876 1877#if defined(LN) || defined(RT) 1878 movq KK, %rax 1879 salq $1 + BASE_SHIFT, %rax 1880 leaq (BO, %rax, 1), BO 1881#endif 1882 1883 movddup -16 * SIZE(AO), %xmm0 1884 pxor %xmm8, %xmm8 1885 movddup -15 * SIZE(AO), %xmm1 1886 pxor %xmm9, %xmm9 1887 movddup -14 * SIZE(AO), %xmm2 1888 pxor %xmm10, %xmm10 1889 movddup -13 * SIZE(AO), %xmm3 1890 pxor %xmm11, %xmm11 1891 1892#if defined(LT) || defined(RN) 1893 movq KK, %rax 1894#else 1895 movq K, %rax 1896 subq KK, %rax 1897#endif 1898 andq $-4, %rax 1899 leaq (, %rax, SIZE), %rax 1900 leaq (AO, %rax, 1), AO 1901 leaq (BO, %rax, 2), BO 1902 negq %rax 1903 NOBRANCH 1904 je .L76 1905 ALIGN_4 1906 1907.L72: 1908 mulpd -16 * SIZE(BO, %rax, 2), %xmm0 1909 addpd %xmm0, %xmm8 1910 movddup -12 * SIZE(AO, %rax, 1), %xmm0 1911 1912 mulpd -14 * SIZE(BO, %rax, 2), %xmm1 1913 addpd %xmm1, %xmm9 1914 movddup -11 * SIZE(AO, %rax, 1), %xmm1 1915 1916 mulpd -12 * SIZE(BO, %rax, 2), %xmm2 1917 addpd %xmm2, %xmm10 1918 movddup -10 * SIZE(AO, %rax, 1), %xmm2 1919 1920 mulpd -10 * SIZE(BO, %rax, 2), %xmm3 1921 addpd %xmm3, %xmm11 1922 movddup -9 * SIZE(AO, %rax, 1), %xmm3 1923 1924 addq $4 * SIZE, %rax 1925 BRANCH 1926 jl .L72 1927 ALIGN_4 1928 1929.L76: 1930#if defined(LT) || defined(RN) 1931 movq KK, %rax 1932#else 1933 movq K, %rax 1934 subq KK, %rax 1935#endif 1936 andq $3, %rax # if (k & 1) 1937 je .L78 1938 1939 leaq (, %rax, SIZE), %rax 1940 leaq (AO, %rax, 
1), AO 1941 leaq (BO, %rax, 2), BO 1942 negq %rax 1943 ALIGN_4 1944 1945.L77: 1946 mulpd -16 * SIZE(BO, %rax, 2), %xmm0 1947 addpd %xmm0, %xmm8 1948 movddup -15 * SIZE(AO, %rax, 1), %xmm0 1949 1950 addq $SIZE, %rax 1951 jl .L77 1952 ALIGN_4 1953 1954.L78: 1955 addpd %xmm9, %xmm8 1956 addpd %xmm11, %xmm10 1957 addpd %xmm10, %xmm8 1958 1959#if defined(LN) || defined(RT) 1960 movq KK, %rax 1961#ifdef LN 1962 subq $1, %rax 1963#else 1964 subq $2, %rax 1965#endif 1966 1967 leaq (, %rax, SIZE), %rax 1968 1969 movq AORIG, AO 1970 leaq (AO, %rax, 1), AO 1971 leaq (B, %rax, 2), BO 1972#endif 1973 1974#if defined(LN) || defined(LT) 1975 movapd -16 * SIZE(BO), %xmm2 1976#else 1977 movapd -16 * SIZE(AO), %xmm2 1978#endif 1979 1980 subpd %xmm8, %xmm2 1981 1982#if defined(LN) || defined(LT) 1983 movddup -16 * SIZE(AO), %xmm0 1984 1985 mulpd %xmm0, %xmm2 1986#endif 1987 1988#ifdef RN 1989 movapd %xmm2, %xmm0 1990 unpckhpd %xmm0, %xmm0 1991 1992 mulsd -16 * SIZE(BO), %xmm2 1993 movsd -15 * SIZE(BO), %xmm4 1994 mulsd %xmm2, %xmm4 1995 subsd %xmm4, %xmm0 1996 1997 mulsd -13 * SIZE(BO), %xmm0 1998 unpcklpd %xmm0, %xmm2 1999#endif 2000 2001#ifdef RT 2002 movapd %xmm2, %xmm0 2003 unpckhpd %xmm0, %xmm0 2004 2005 mulsd -13 * SIZE(BO), %xmm0 2006 2007 movlpd -14 * SIZE(BO), %xmm4 2008 mulsd %xmm0, %xmm4 2009 subsd %xmm4, %xmm2 2010 2011 mulsd -16 * SIZE(BO), %xmm2 2012 unpcklpd %xmm0, %xmm2 2013#endif 2014 2015#ifdef LN 2016 subq $1 * SIZE, CO1 2017 subq $1 * SIZE, CO2 2018#endif 2019 2020 movlpd %xmm2, 0 * SIZE(CO1) 2021 movhpd %xmm2, 0 * SIZE(CO2) 2022 2023#if defined(LN) || defined(LT) 2024 movaps %xmm2, -16 * SIZE(BO) 2025#else 2026 movaps %xmm2, -16 * SIZE(AO) 2027#endif 2028 2029#ifndef LN 2030 addq $1 * SIZE, CO1 2031 addq $1 * SIZE, CO2 2032#endif 2033 2034#if defined(LT) || defined(RN) 2035 movq K, %rax 2036 subq KK, %rax 2037 leaq (,%rax, SIZE), %rax 2038 leaq (AO, %rax, 1), AO 2039 leaq (BO, %rax, 2), BO 2040#endif 2041 2042#ifdef LN 2043 subq $1, KK 2044#endif 2045 2046#ifdef 
LT 2047 addq $1, KK 2048#endif 2049 2050#ifdef RT 2051 movq K, %rax 2052 salq $0 + BASE_SHIFT, %rax 2053 addq %rax, AORIG 2054#endif 2055 ALIGN_4 2056 2057.L79: 2058#ifdef LN 2059 leaq (, K, SIZE), %rax 2060 leaq (B, %rax, 2), B 2061#endif 2062 2063#if defined(LT) || defined(RN) 2064 movq BO, B 2065#endif 2066 2067#ifdef RN 2068 addq $2, KK 2069#endif 2070 2071#ifdef RT 2072 subq $2, KK 2073#endif 2074 ALIGN_4 2075 2076.L80: 2077 movq N, J 2078 sarq $2, J # j = (n >> 2) 2079 jle .L999 2080 2081.L01: 2082#if defined(LT) || defined(RN) 2083 movq A, AO 2084#else 2085 movq A, AORIG 2086#endif 2087 2088#ifdef RT 2089 movq K, %rax 2090 salq $2 + BASE_SHIFT, %rax 2091 subq %rax, B 2092 2093 leaq (, LDC, 4), %rax 2094 subq %rax, C 2095#endif 2096 2097 movq C, CO1 # coffset1 = c 2098 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 2099#ifndef RT 2100 leaq (C, LDC, 4), C 2101#endif 2102 2103#ifdef LN 2104 movq OFFSET, %rax 2105 addq M, %rax 2106 movq %rax, KK 2107#endif 2108 2109 movq K, %rax 2110 salq $BASE_SHIFT + 2, %rax 2111 movq B, BB 2112 subq %rax, BB 2113 2114#if defined(LT) 2115 movq OFFSET, %rax 2116 movq %rax, KK 2117#endif 2118 2119 movq M, I 2120 sarq $2, I # i = (m >> 2) 2121 jle .L20 2122 ALIGN_4 2123 2124.L11: 2125#ifdef LN 2126 movq K, %rax 2127 salq $2 + BASE_SHIFT, %rax 2128 subq %rax, AORIG 2129#endif 2130 2131#if defined(LN) || defined(RT) 2132 movq KK, %rax 2133 movq AORIG, AO 2134 leaq (, %rax, SIZE), %rax 2135 leaq (AO, %rax, 4), AO 2136#endif 2137 2138 movq B, BO 2139 2140#if defined(LN) || defined(RT) 2141 movq KK, %rax 2142 leaq (, %rax, SIZE), %rax 2143 leaq (BO, %rax, 4), BO 2144#endif 2145 2146 movapd -16 * SIZE(AO), %xmm0 2147 movddup -16 * SIZE(BO), %xmm1 2148 pxor %xmm8, %xmm8 2149 movddup -15 * SIZE(BO), %xmm3 2150 pxor %xmm9, %xmm9 2151 movapd -8 * SIZE(AO), %xmm4 2152 pxor %xmm10, %xmm10 2153 movddup -8 * SIZE(BO), %xmm5 2154 pxor %xmm11, %xmm11 2155 2156#ifndef LN 2157 prefetchw 3 * SIZE(CO1) 2158 pxor %xmm12, %xmm12 2159 prefetchw 5 * 
SIZE(CO2) 2160 pxor %xmm13, %xmm13 2161 prefetchw 3 * SIZE(CO1, LDC, 2) 2162 pxor %xmm14, %xmm14 2163 prefetchw 5 * SIZE(CO2, LDC, 2) 2164 pxor %xmm15, %xmm15 2165 movapd %xmm0, %xmm2 2166#else 2167 prefetchw -8 * SIZE(CO1) 2168 pxor %xmm12, %xmm12 2169 prefetchw -8 * SIZE(CO2) 2170 pxor %xmm13, %xmm13 2171 prefetchw -8 * SIZE(CO1, LDC, 2) 2172 pxor %xmm14, %xmm14 2173 prefetchw -8 * SIZE(CO2, LDC, 2) 2174 pxor %xmm15, %xmm15 2175 movapd %xmm0, %xmm2 2176#endif 2177 2178 prefetch -16 * SIZE(BB) 2179 prefetch -8 * SIZE(BB) 2180 subq $-16 * SIZE, BB 2181 2182#if defined(LT) || defined(RN) 2183 movq KK, %rax 2184#else 2185 movq K, %rax 2186 subq KK, %rax 2187#endif 2188 2189 andq $-8, %rax 2190 leaq (, %rax, SIZE), %rax 2191 leaq (AO, %rax, 4), AO 2192 leaq (BO, %rax, 4), BO 2193 negq %rax 2194 NOBRANCH 2195 je .L15 2196 ALIGN_4 2197 2198.L12: 2199 KERNEL1(16 * 0) 2200 KERNEL2(16 * 0) 2201 KERNEL3(16 * 0) 2202 KERNEL4(16 * 0) 2203 KERNEL5(16 * 0) 2204 KERNEL6(16 * 0) 2205 KERNEL7(16 * 0) 2206 KERNEL8(16 * 0) 2207 BRANCH 2208 jl .L12 2209 ALIGN_4 2210 2211.L15: 2212#if defined(LT) || defined(RN) 2213 movq KK, %rax 2214#else 2215 movq K, %rax 2216 subq KK, %rax 2217#endif 2218 testq $4, %rax 2219 je .L16 2220 xorq %rax, %rax 2221 ALIGN_4 2222 2223 KERNEL_SUB1(16 * 0) 2224 KERNEL_SUB2(16 * 0) 2225 KERNEL_SUB3(16 * 0) 2226 KERNEL_SUB4(16 * 0) 2227 2228 subq $-16 * SIZE, BO 2229 subq $-16 * SIZE, AO 2230 ALIGN_4 2231 2232.L16: 2233#if defined(LT) || defined(RN) 2234 movq KK, %rax 2235#else 2236 movq K, %rax 2237 subq KK, %rax 2238#endif 2239 andq $3, %rax # if (k & 1) 2240 je .L19 2241 2242 leaq (, %rax, SIZE), %rax 2243 leaq (AO, %rax, 4), AO 2244 leaq (BO, %rax, 4), BO 2245 negq %rax 2246 ALIGN_4 2247 2248.L17: 2249 mulpd %xmm1, %xmm0 2250 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 2251 addpd %xmm0, %xmm8 2252 movapd %xmm2, %xmm0 2253 addpd %xmm1, %xmm12 2254 movddup -14 * SIZE(BO, %rax, 4), %xmm1 2255 mulpd %xmm3, %xmm2 2256 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 2257 addpd 
%xmm2, %xmm9 2258 movapd %xmm0, %xmm2 2259 addpd %xmm3, %xmm13 2260 movddup -13 * SIZE(BO, %rax, 4), %xmm3 2261 mulpd %xmm1, %xmm0 2262 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 2263 addpd %xmm0, %xmm10 2264 movapd -12 * SIZE(AO, %rax, 4), %xmm0 2265 addpd %xmm1, %xmm14 2266 movddup -12 * SIZE(BO, %rax, 4), %xmm1 2267 mulpd %xmm3, %xmm2 2268 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 2269 addpd %xmm2, %xmm11 2270 addpd %xmm3, %xmm15 2271 movddup -11 * SIZE(BO, %rax, 4), %xmm3 2272 movapd %xmm0, %xmm2 2273 2274 addq $SIZE, %rax 2275 jl .L17 2276 ALIGN_4 2277 2278.L19: 2279#if defined(LN) || defined(RT) 2280 movq KK, %rax 2281#ifdef LN 2282 subq $4, %rax 2283#else 2284 subq $4, %rax 2285#endif 2286 2287 leaq (, %rax, SIZE), %rax 2288 2289 movq AORIG, AO 2290 leaq (AO, %rax, 4), AO 2291 leaq (B, %rax, 4), BO 2292#endif 2293 2294#if defined(LN) || defined(LT) 2295 movapd %xmm8, %xmm0 2296 unpcklpd %xmm9, %xmm8 2297 unpckhpd %xmm9, %xmm0 2298 2299 movapd %xmm10, %xmm2 2300 unpcklpd %xmm11, %xmm10 2301 unpckhpd %xmm11, %xmm2 2302 2303 movapd %xmm12, %xmm4 2304 unpcklpd %xmm13, %xmm12 2305 unpckhpd %xmm13, %xmm4 2306 2307 movapd %xmm14, %xmm6 2308 unpcklpd %xmm15, %xmm14 2309 unpckhpd %xmm15, %xmm6 2310 2311 movapd -16 * SIZE(BO), %xmm9 2312 movapd -14 * SIZE(BO), %xmm11 2313 movapd -12 * SIZE(BO), %xmm13 2314 movapd -10 * SIZE(BO), %xmm15 2315 movapd -8 * SIZE(BO), %xmm1 2316 movapd -6 * SIZE(BO), %xmm3 2317 movapd -4 * SIZE(BO), %xmm5 2318 movapd -2 * SIZE(BO), %xmm7 2319 2320 subpd %xmm8, %xmm9 2321 subpd %xmm10, %xmm11 2322 subpd %xmm0, %xmm13 2323 subpd %xmm2, %xmm15 2324 subpd %xmm12, %xmm1 2325 subpd %xmm14, %xmm3 2326 subpd %xmm4, %xmm5 2327 subpd %xmm6, %xmm7 2328#else 2329 movapd -16 * SIZE(AO), %xmm0 2330 movapd -14 * SIZE(AO), %xmm1 2331 movapd -12 * SIZE(AO), %xmm2 2332 movapd -10 * SIZE(AO), %xmm3 2333 2334 movapd -8 * SIZE(AO), %xmm4 2335 movapd -6 * SIZE(AO), %xmm5 2336 movapd -4 * SIZE(AO), %xmm6 2337 movapd -2 * SIZE(AO), %xmm7 2338 2339 subpd %xmm8, %xmm0 2340 
subpd %xmm12, %xmm1 2341 subpd %xmm9, %xmm2 2342 subpd %xmm13, %xmm3 2343 subpd %xmm10, %xmm4 2344 subpd %xmm14, %xmm5 2345 subpd %xmm11, %xmm6 2346 subpd %xmm15, %xmm7 2347#endif 2348 2349#ifdef LN 2350 movddup -1 * SIZE(AO), %xmm8 2351 mulpd %xmm8, %xmm5 2352 mulpd %xmm8, %xmm7 2353 2354 movddup -2 * SIZE(AO), %xmm10 2355 mulpd %xmm5, %xmm10 2356 subpd %xmm10, %xmm1 2357 movddup -2 * SIZE(AO), %xmm10 2358 mulpd %xmm7, %xmm10 2359 subpd %xmm10, %xmm3 2360 2361 movddup -3 * SIZE(AO), %xmm12 2362 mulpd %xmm5, %xmm12 2363 subpd %xmm12, %xmm13 2364 movddup -3 * SIZE(AO), %xmm12 2365 mulpd %xmm7, %xmm12 2366 subpd %xmm12, %xmm15 2367 2368 movddup -4 * SIZE(AO), %xmm14 2369 mulpd %xmm5, %xmm14 2370 subpd %xmm14, %xmm9 2371 movddup -4 * SIZE(AO), %xmm14 2372 mulpd %xmm7, %xmm14 2373 subpd %xmm14, %xmm11 2374 2375 movddup -6 * SIZE(AO), %xmm8 2376 mulpd %xmm8, %xmm1 2377 mulpd %xmm8, %xmm3 2378 2379 movddup -7 * SIZE(AO), %xmm10 2380 mulpd %xmm1, %xmm10 2381 subpd %xmm10, %xmm13 2382 movddup -7 * SIZE(AO), %xmm10 2383 mulpd %xmm3, %xmm10 2384 subpd %xmm10, %xmm15 2385 2386 movddup -8 * SIZE(AO), %xmm12 2387 mulpd %xmm1, %xmm12 2388 subpd %xmm12, %xmm9 2389 movddup -8 * SIZE(AO), %xmm12 2390 mulpd %xmm3, %xmm12 2391 subpd %xmm12, %xmm11 2392 2393 movddup -11 * SIZE(AO), %xmm8 2394 mulpd %xmm8, %xmm13 2395 mulpd %xmm8, %xmm15 2396 2397 movddup -12 * SIZE(AO), %xmm10 2398 mulpd %xmm13, %xmm10 2399 subpd %xmm10, %xmm9 2400 movddup -12 * SIZE(AO), %xmm10 2401 mulpd %xmm15, %xmm10 2402 subpd %xmm10, %xmm11 2403 2404 movddup -16 * SIZE(AO), %xmm8 2405 mulpd %xmm8, %xmm9 2406 mulpd %xmm8, %xmm11 2407#endif 2408 2409#ifdef LT 2410 movddup -16 * SIZE(AO), %xmm8 2411 mulpd %xmm8, %xmm9 2412 mulpd %xmm8, %xmm11 2413 2414 movddup -15 * SIZE(AO), %xmm10 2415 mulpd %xmm9, %xmm10 2416 subpd %xmm10, %xmm13 2417 2418 movddup -15 * SIZE(AO), %xmm10 2419 mulpd %xmm11, %xmm10 2420 subpd %xmm10, %xmm15 2421 2422 movddup -14 * SIZE(AO), %xmm12 2423 mulpd %xmm9, %xmm12 2424 subpd %xmm12, %xmm1 
2425 movddup -14 * SIZE(AO), %xmm12 2426 mulpd %xmm11, %xmm12 2427 subpd %xmm12, %xmm3 2428 2429 movddup -13 * SIZE(AO), %xmm14 2430 mulpd %xmm9, %xmm14 2431 subpd %xmm14, %xmm5 2432 movddup -13 * SIZE(AO), %xmm14 2433 mulpd %xmm11, %xmm14 2434 subpd %xmm14, %xmm7 2435 2436 movddup -11 * SIZE(AO), %xmm8 2437 mulpd %xmm8, %xmm13 2438 mulpd %xmm8, %xmm15 2439 2440 movddup -10 * SIZE(AO), %xmm10 2441 mulpd %xmm13, %xmm10 2442 subpd %xmm10, %xmm1 2443 movddup -10 * SIZE(AO), %xmm10 2444 mulpd %xmm15, %xmm10 2445 subpd %xmm10, %xmm3 2446 2447 movddup -9 * SIZE(AO), %xmm12 2448 mulpd %xmm13, %xmm12 2449 subpd %xmm12, %xmm5 2450 movddup -9 * SIZE(AO), %xmm12 2451 mulpd %xmm15, %xmm12 2452 subpd %xmm12, %xmm7 2453 2454 movddup -6 * SIZE(AO), %xmm8 2455 mulpd %xmm8, %xmm1 2456 mulpd %xmm8, %xmm3 2457 2458 movddup -5 * SIZE(AO), %xmm10 2459 mulpd %xmm1, %xmm10 2460 subpd %xmm10, %xmm5 2461 movddup -5 * SIZE(AO), %xmm10 2462 mulpd %xmm3, %xmm10 2463 subpd %xmm10, %xmm7 2464 2465 movddup -1 * SIZE(AO), %xmm8 2466 mulpd %xmm8, %xmm5 2467 mulpd %xmm8, %xmm7 2468#endif 2469 2470#ifdef RN 2471 movddup -16 * SIZE(BO), %xmm8 2472 mulpd %xmm8, %xmm0 2473 mulpd %xmm8, %xmm1 2474 2475 movddup -15 * SIZE(BO), %xmm9 2476 mulpd %xmm0, %xmm9 2477 subpd %xmm9, %xmm2 2478 movddup -15 * SIZE(BO), %xmm9 2479 mulpd %xmm1, %xmm9 2480 subpd %xmm9, %xmm3 2481 2482 movddup -14 * SIZE(BO), %xmm10 2483 mulpd %xmm0, %xmm10 2484 subpd %xmm10, %xmm4 2485 movddup -14 * SIZE(BO), %xmm10 2486 mulpd %xmm1, %xmm10 2487 subpd %xmm10, %xmm5 2488 2489 movddup -13 * SIZE(BO), %xmm11 2490 mulpd %xmm0, %xmm11 2491 subpd %xmm11, %xmm6 2492 movddup -13 * SIZE(BO), %xmm11 2493 mulpd %xmm1, %xmm11 2494 subpd %xmm11, %xmm7 2495 2496 movddup -11 * SIZE(BO), %xmm8 2497 mulpd %xmm8, %xmm2 2498 mulpd %xmm8, %xmm3 2499 2500 movddup -10 * SIZE(BO), %xmm9 2501 mulpd %xmm2, %xmm9 2502 subpd %xmm9, %xmm4 2503 movddup -10 * SIZE(BO), %xmm9 2504 mulpd %xmm3, %xmm9 2505 subpd %xmm9, %xmm5 2506 2507 movddup -9 * SIZE(BO), %xmm10 
2508 mulpd %xmm2, %xmm10 2509 subpd %xmm10, %xmm6 2510 movddup -9 * SIZE(BO), %xmm10 2511 mulpd %xmm3, %xmm10 2512 subpd %xmm10, %xmm7 2513 2514 movddup -6 * SIZE(BO), %xmm8 2515 mulpd %xmm8, %xmm4 2516 mulpd %xmm8, %xmm5 2517 2518 movddup -5 * SIZE(BO), %xmm9 2519 mulpd %xmm4, %xmm9 2520 subpd %xmm9, %xmm6 2521 movddup -5 * SIZE(BO), %xmm9 2522 mulpd %xmm5, %xmm9 2523 subpd %xmm9, %xmm7 2524 2525 movddup -1 * SIZE(BO), %xmm8 2526 mulpd %xmm8, %xmm6 2527 mulpd %xmm8, %xmm7 2528#endif 2529 2530#ifdef RT 2531 movddup -1 * SIZE(BO), %xmm8 2532 mulpd %xmm8, %xmm6 2533 mulpd %xmm8, %xmm7 2534 2535 movddup -2 * SIZE(BO), %xmm9 2536 mulpd %xmm6, %xmm9 2537 subpd %xmm9, %xmm4 2538 movddup -2 * SIZE(BO), %xmm9 2539 mulpd %xmm7, %xmm9 2540 subpd %xmm9, %xmm5 2541 2542 movddup -3 * SIZE(BO), %xmm10 2543 mulpd %xmm6, %xmm10 2544 subpd %xmm10, %xmm2 2545 movddup -3 * SIZE(BO), %xmm10 2546 mulpd %xmm7, %xmm10 2547 subpd %xmm10, %xmm3 2548 2549 movddup -4 * SIZE(BO), %xmm11 2550 mulpd %xmm6, %xmm11 2551 subpd %xmm11, %xmm0 2552 movddup -4 * SIZE(BO), %xmm11 2553 mulpd %xmm7, %xmm11 2554 subpd %xmm11, %xmm1 2555 2556 movddup -6 * SIZE(BO), %xmm8 2557 mulpd %xmm8, %xmm4 2558 mulpd %xmm8, %xmm5 2559 2560 movddup -7 * SIZE(BO), %xmm9 2561 mulpd %xmm4, %xmm9 2562 subpd %xmm9, %xmm2 2563 movddup -7 * SIZE(BO), %xmm9 2564 mulpd %xmm5, %xmm9 2565 subpd %xmm9, %xmm3 2566 2567 movddup -8 * SIZE(BO), %xmm10 2568 mulpd %xmm4, %xmm10 2569 subpd %xmm10, %xmm0 2570 movddup -8 * SIZE(BO), %xmm10 2571 mulpd %xmm5, %xmm10 2572 subpd %xmm10, %xmm1 2573 2574 movddup -11 * SIZE(BO), %xmm8 2575 mulpd %xmm8, %xmm2 2576 mulpd %xmm8, %xmm3 2577 2578 movddup -12 * SIZE(BO), %xmm9 2579 mulpd %xmm2, %xmm9 2580 subpd %xmm9, %xmm0 2581 movddup -12 * SIZE(BO), %xmm9 2582 mulpd %xmm3, %xmm9 2583 subpd %xmm9, %xmm1 2584 2585 movddup -16 * SIZE(BO), %xmm8 2586 mulpd %xmm8, %xmm0 2587 mulpd %xmm8, %xmm1 2588#endif 2589 2590#ifdef LN 2591 subq $4 * SIZE, CO1 2592 subq $4 * SIZE, CO2 2593#endif 2594 2595#if 
defined(LN) || defined(LT) 2596 movlpd %xmm9, 0 * SIZE(CO1) 2597 movlpd %xmm13, 1 * SIZE(CO1) 2598 movlpd %xmm1, 2 * SIZE(CO1) 2599 movlpd %xmm5, 3 * SIZE(CO1) 2600 2601 movhpd %xmm9, 0 * SIZE(CO2) 2602 movhpd %xmm13, 1 * SIZE(CO2) 2603 movhpd %xmm1, 2 * SIZE(CO2) 2604 movhpd %xmm5, 3 * SIZE(CO2) 2605 2606 movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) 2607 movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) 2608 movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) 2609 movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) 2610 2611 movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) 2612 movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) 2613 movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) 2614 movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) 2615#else 2616 movlpd %xmm0, 0 * SIZE(CO1) 2617 movhpd %xmm0, 1 * SIZE(CO1) 2618 movlpd %xmm1, 2 * SIZE(CO1) 2619 movhpd %xmm1, 3 * SIZE(CO1) 2620 2621 movlpd %xmm2, 0 * SIZE(CO2) 2622 movhpd %xmm2, 1 * SIZE(CO2) 2623 movlpd %xmm3, 2 * SIZE(CO2) 2624 movhpd %xmm3, 3 * SIZE(CO2) 2625 2626 movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) 2627 movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) 2628 movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) 2629 movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) 2630 2631 movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) 2632 movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) 2633 movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) 2634 movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) 2635#endif 2636 2637#if defined(LN) || defined(LT) 2638 movaps %xmm9, -16 * SIZE(BO) 2639 movaps %xmm11, -14 * SIZE(BO) 2640 movaps %xmm13, -12 * SIZE(BO) 2641 movaps %xmm15, -10 * SIZE(BO) 2642 movaps %xmm1, -8 * SIZE(BO) 2643 movaps %xmm3, -6 * SIZE(BO) 2644 movaps %xmm5, -4 * SIZE(BO) 2645 movaps %xmm7, -2 * SIZE(BO) 2646#else 2647 movaps %xmm0, -16 * SIZE(AO) 2648 movaps %xmm1, -14 * SIZE(AO) 2649 movaps %xmm2, -12 * SIZE(AO) 2650 movaps %xmm3, -10 * SIZE(AO) 2651 movaps %xmm4, -8 * SIZE(AO) 2652 movaps %xmm5, -6 * SIZE(AO) 2653 movaps %xmm6, -4 * SIZE(AO) 2654 movaps %xmm7, -2 * SIZE(AO) 2655#endif 2656 2657#ifndef LN 2658 addq $4 * SIZE, CO1 2659 addq $4 * SIZE, CO2 2660#endif 2661 2662#if defined(LT) || defined(RN) 2663 movq 
K, %rax 2664 subq KK, %rax 2665 leaq (,%rax, SIZE), %rax 2666 leaq (AO, %rax, 4), AO 2667 leaq (BO, %rax, 4), BO 2668#endif 2669 2670#ifdef LN 2671 subq $4, KK 2672#endif 2673 2674#ifdef LT 2675 addq $4, KK 2676#endif 2677 2678#ifdef RT 2679 movq K, %rax 2680 salq $2 + BASE_SHIFT, %rax 2681 addq %rax, AORIG 2682#endif 2683 2684 decq I # i -- 2685 jg .L11 2686 ALIGN_4 2687 2688.L20: 2689 testq $3, M 2690 je .L39 2691 2692 testq $2, M 2693 je .L30 2694 ALIGN_4 2695 2696.L21: 2697#ifdef LN 2698 movq K, %rax 2699 salq $1 + BASE_SHIFT, %rax 2700 subq %rax, AORIG 2701#endif 2702 2703#if defined(LN) || defined(RT) 2704 movq KK, %rax 2705 movq AORIG, AO 2706 leaq (, %rax, SIZE), %rax 2707 leaq (AO, %rax, 2), AO 2708#endif 2709 2710 movq B, BO 2711 2712#if defined(LN) || defined(RT) 2713 movq KK, %rax 2714 leaq (, %rax, SIZE), %rax 2715 leaq (BO, %rax, 4), BO 2716#endif 2717 2718 movapd -16 * SIZE(AO), %xmm0 2719 pxor %xmm8, %xmm8 2720 movapd -12 * SIZE(AO), %xmm2 2721 pxor %xmm9, %xmm9 2722 movddup -16 * SIZE(BO), %xmm1 2723 pxor %xmm10, %xmm10 2724 movddup -15 * SIZE(BO), %xmm5 2725 pxor %xmm11, %xmm11 2726 movddup -8 * SIZE(BO), %xmm3 2727 2728#if defined(LT) || defined(RN) 2729 movq KK, %rax 2730#else 2731 movq K, %rax 2732 subq KK, %rax 2733#endif 2734 andq $-4, %rax 2735 leaq (, %rax, SIZE), %rax 2736 leaq (AO, %rax, 2), AO 2737 leaq (BO, %rax, 4), BO 2738 negq %rax 2739 NOBRANCH 2740 je .L26 2741 ALIGN_4 2742 2743.L22: 2744 mulpd %xmm0, %xmm1 2745 addpd %xmm1, %xmm8 2746 movddup -14 * SIZE(BO, %rax, 4), %xmm1 2747 mulpd %xmm0, %xmm5 2748 addpd %xmm5, %xmm9 2749 movddup -13 * SIZE(BO, %rax, 4), %xmm5 2750 mulpd %xmm0, %xmm1 2751 addpd %xmm1, %xmm10 2752 movddup -12 * SIZE(BO, %rax, 4), %xmm1 2753 mulpd %xmm0, %xmm5 2754 movapd -14 * SIZE(AO, %rax, 2), %xmm0 2755 addpd %xmm5, %xmm11 2756 movddup -11 * SIZE(BO, %rax, 4), %xmm5 2757 mulpd %xmm0, %xmm1 2758 addpd %xmm1, %xmm8 2759 movddup -10 * SIZE(BO, %rax, 4), %xmm1 2760 mulpd %xmm0, %xmm5 2761 addpd %xmm5, %xmm9 2762 
movddup -9 * SIZE(BO, %rax, 4), %xmm5 2763 mulpd %xmm0, %xmm1 2764 addpd %xmm1, %xmm10 2765 movddup (BO, %rax, 4), %xmm1 2766 mulpd %xmm0, %xmm5 2767 movapd -8 * SIZE(AO, %rax, 2), %xmm0 2768 addpd %xmm5, %xmm11 2769 movddup -7 * SIZE(BO, %rax, 4), %xmm5 2770 mulpd %xmm2, %xmm3 2771 addpd %xmm3, %xmm8 2772 movddup -6 * SIZE(BO, %rax, 4), %xmm3 2773 mulpd %xmm2, %xmm5 2774 addpd %xmm5, %xmm9 2775 movddup -5 * SIZE(BO, %rax, 4), %xmm5 2776 mulpd %xmm2, %xmm3 2777 addpd %xmm3, %xmm10 2778 movddup -4 * SIZE(BO, %rax, 4), %xmm3 2779 mulpd %xmm2, %xmm5 2780 movapd -10 * SIZE(AO, %rax, 2), %xmm2 2781 addpd %xmm5, %xmm11 2782 movddup -3 * SIZE(BO, %rax, 4), %xmm5 2783 mulpd %xmm2, %xmm3 2784 addpd %xmm3, %xmm8 2785 movddup -2 * SIZE(BO, %rax, 4), %xmm3 2786 mulpd %xmm2, %xmm5 2787 addpd %xmm5, %xmm9 2788 movddup -1 * SIZE(BO, %rax, 4), %xmm5 2789 mulpd %xmm2, %xmm3 2790 addpd %xmm3, %xmm10 2791 movddup 8 * SIZE(BO, %rax, 4), %xmm3 2792 mulpd %xmm2, %xmm5 2793 movapd -4 * SIZE(AO, %rax, 2), %xmm2 2794 addpd %xmm5, %xmm11 2795 movddup 1 * SIZE(BO, %rax, 4), %xmm5 2796 2797 addq $4 * SIZE, %rax 2798 BRANCH 2799 jl .L22 2800 ALIGN_4 2801 2802.L26: 2803#if defined(LT) || defined(RN) 2804 movq KK, %rax 2805#else 2806 movq K, %rax 2807 subq KK, %rax 2808#endif 2809 andq $3, %rax # if (k & 1) 2810 je .L29 2811 2812 leaq (, %rax, SIZE), %rax 2813 leaq (AO, %rax, 2), AO 2814 leaq (BO, %rax, 4), BO 2815 negq %rax 2816 ALIGN_4 2817 2818.L27: 2819 mulpd %xmm0, %xmm1 2820 addpd %xmm1, %xmm8 2821 movddup -14 * SIZE(BO, %rax, 4), %xmm1 2822 mulpd %xmm0, %xmm5 2823 addpd %xmm5, %xmm9 2824 movddup -13 * SIZE(BO, %rax, 4), %xmm5 2825 mulpd %xmm0, %xmm1 2826 addpd %xmm1, %xmm10 2827 movddup -12 * SIZE(BO, %rax, 4), %xmm1 2828 mulpd %xmm0, %xmm5 2829 movapd -14 * SIZE(AO, %rax, 2), %xmm0 2830 addpd %xmm5, %xmm11 2831 movddup -11 * SIZE(BO, %rax, 4), %xmm5 2832 2833 addq $SIZE, %rax 2834 jl .L27 2835 ALIGN_4 2836 2837.L29: 2838#if defined(LN) || defined(RT) 2839 movq KK, %rax 2840#ifdef LN 2841 
/* Tail of the M%4==2 (two-row) cell of the current 4-column panel.
   NOTE(review): the #if that owns this first #else starts above this
   chunk; it appears to rebase %rax by the number of solved rows/cols
   (2 for one side, 4 for the other) — confirm against the full file.  */
	subq	$2, %rax
#else
	subq	$4, %rax
#endif

	leaq	(, %rax, SIZE), %rax	/* element count -> byte offset */

	/* Re-derive AO/BO from AORIG/B for the write-back phase:
	   AO steps 2 elements per K (two rows), BO steps 4 (four cols). */
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 4), BO
#endif

#if defined(LN) || defined(LT)
	/* Left-side cases: transpose the 2x4 accumulators.
	   unpcklpd/unpckhpd turn row-major pairs (xmm8..11) into
	   column pairs, then subtract from the packed B panel:
	   xmmN := B_packed - accumulated product.                    */
	movapd	%xmm8, %xmm0
	unpcklpd %xmm9, %xmm8
	unpckhpd %xmm9, %xmm0

	movapd	%xmm10, %xmm2
	unpcklpd %xmm11, %xmm10
	unpckhpd %xmm11, %xmm2

	movapd	-16 * SIZE(BO), %xmm9
	movapd	-14 * SIZE(BO), %xmm11
	movapd	-12 * SIZE(BO), %xmm13
	movapd	-10 * SIZE(BO), %xmm15

	subpd	%xmm8,  %xmm9
	subpd	%xmm10, %xmm11
	subpd	%xmm0,  %xmm13
	subpd	%xmm2,  %xmm15
#else
	/* Right-side cases: no transpose needed; subtract the
	   accumulators from the packed A panel instead.             */
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm2
	movapd	-12 * SIZE(AO), %xmm4
	movapd	-10 * SIZE(AO), %xmm6

	subpd	%xmm8,  %xmm0
	subpd	%xmm9,  %xmm2
	subpd	%xmm10, %xmm4
	subpd	%xmm11, %xmm6
#endif

#ifdef LN
	/* 2x2 lower-triangular solve, backward order (row 1 then row 0).
	   The mulpd by the diagonal entry implies the packed diagonal is
	   presumably pre-inverted (1/a_ii) — TODO confirm in the packer. */
	movddup	-13 * SIZE(AO), %xmm8	/* a11 broadcast */
	mulpd	%xmm8, %xmm13
	mulpd	%xmm8, %xmm15

	movddup	-14 * SIZE(AO), %xmm10	/* off-diagonal a10 */
	mulpd	%xmm13, %xmm10
	subpd	%xmm10, %xmm9
	movddup	-14 * SIZE(AO), %xmm10
	mulpd	%xmm15, %xmm10
	subpd	%xmm10, %xmm11

	movddup	-16 * SIZE(AO), %xmm8	/* a00 broadcast */
	mulpd	%xmm8, %xmm9
	mulpd	%xmm8, %xmm11
#endif

#ifdef LT
	/* 2x2 upper-triangular (transposed-lower) solve, forward order. */
	movddup	-16 * SIZE(AO), %xmm8
	mulpd	%xmm8, %xmm9
	mulpd	%xmm8, %xmm11

	movddup	-15 * SIZE(AO), %xmm10
	mulpd	%xmm9, %xmm10
	subpd	%xmm10, %xmm13
	movddup	-15 * SIZE(AO), %xmm10
	mulpd	%xmm11, %xmm10
	subpd	%xmm10, %xmm15

	movddup	-13 * SIZE(AO), %xmm8
	mulpd	%xmm8, %xmm13
	mulpd	%xmm8, %xmm15
#endif

#ifdef RN
	/* 4x4 right-side solve, forward substitution over the four
	   columns held in xmm0/xmm2/xmm4/xmm6 (two rows per register).
	   BO offsets walk the packed 4x4 triangle: -16..-13 row 0,
	   -11..-9 row 1, -6/-5 row 2, -1 row 3.                       */
	movddup	-16 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm0

	movddup	-15 * SIZE(BO), %xmm9
	mulpd	%xmm0, %xmm9
	subpd	%xmm9, %xmm2
	movddup	-14 * SIZE(BO), %xmm10
	mulpd	%xmm0, %xmm10
	subpd	%xmm10, %xmm4
	movddup	-13 * SIZE(BO), %xmm11
	mulpd	%xmm0, %xmm11
	subpd	%xmm11, %xmm6

	movddup	-11 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm2
	movddup	-10 * SIZE(BO), %xmm9
	mulpd	%xmm2, %xmm9
	subpd	%xmm9, %xmm4
	movddup	 -9 * SIZE(BO), %xmm10
	mulpd	%xmm2, %xmm10
	subpd	%xmm10, %xmm6

	movddup	 -6 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm4

	movddup	 -5 * SIZE(BO), %xmm9
	mulpd	%xmm4, %xmm9
	subpd	%xmm9, %xmm6

	movddup	 -1 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm6
#endif

#ifdef RT
	/* 4x4 right-side solve, backward substitution (column 3 first);
	   mirror image of the RN sequence above.                      */
	movddup	-1 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm6

	movddup	-2 * SIZE(BO), %xmm9
	mulpd	%xmm6, %xmm9
	subpd	%xmm9, %xmm4
	movddup	-3 * SIZE(BO), %xmm10
	mulpd	%xmm6, %xmm10
	subpd	%xmm10, %xmm2
	movddup	-4 * SIZE(BO), %xmm11
	mulpd	%xmm6, %xmm11
	subpd	%xmm11, %xmm0

	movddup	-6 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm4
	movddup	-7 * SIZE(BO), %xmm9
	mulpd	%xmm4, %xmm9
	subpd	%xmm9, %xmm2
	movddup	-8 * SIZE(BO), %xmm10
	mulpd	%xmm4, %xmm10
	subpd	%xmm10, %xmm0

	movddup	-11 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm2
	movddup	-12 * SIZE(BO), %xmm9
	mulpd	%xmm2, %xmm9
	subpd	%xmm9, %xmm0

	movddup	-16 * SIZE(BO), %xmm8
	mulpd	%xmm8, %xmm0
#endif

#ifdef LN
	/* LN walks C right-to-left: step back 2 columns' worth first. */
	subq	$2 * SIZE, CO1
	subq	$2 * SIZE, CO2
#endif

	/* Scatter the solved 2x4 cell into C.  Columns live at CO1,
	   CO2, CO1+2*LDC, CO2+2*LDC; movlpd/movhpd split each xmm
	   register into its two row elements.                         */
#if defined(LN) || defined(LT)
	movlpd	%xmm9,  0 * SIZE(CO1)
	movlpd	%xmm13, 1 * SIZE(CO1)

	movhpd	%xmm9,  0 * SIZE(CO2)
	movhpd	%xmm13, 1 * SIZE(CO2)

	movlpd	%xmm11, 0 * SIZE(CO1, LDC, 2)
	movlpd	%xmm15, 1 * SIZE(CO1, LDC, 2)

	movhpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm15, 1 * SIZE(CO2, LDC, 2)
#else
	movlpd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)

	movlpd	%xmm2, 0 * SIZE(CO2)
	movhpd	%xmm2, 1 * SIZE(CO2)

	movlpd	%xmm4, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm4, 1 * SIZE(CO1, LDC, 2)

	movlpd	%xmm6, 0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm6, 1 * SIZE(CO2, LDC, 2)
#endif

	/* Write the solved values back into the packed panel so later
	   cells of the substitution see updated data.                 */
#if defined(LN) || defined(LT)
	movaps	%xmm9,  -16 * SIZE(BO)
	movaps	%xmm11, -14 * SIZE(BO)
	movaps	%xmm13, -12 * SIZE(BO)
	movaps	%xmm15, -10 * SIZE(BO)
#else
	movaps	%xmm0, -16 * SIZE(AO)
	movaps	%xmm2, -14 * SIZE(AO)
	movaps	%xmm4, -12 * SIZE(AO)
	movaps	%xmm6, -10 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1	/* advance C by the 2 columns just done */
	addq	$2 * SIZE, CO2
#endif

#if defined(LT) || defined(RN)
	/* Skip AO/BO over the K-KK elements not consumed this cell.   */
	movq	K, %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#ifdef LN
	subq	$2, KK
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$1 + BASE_SHIFT, %rax	/* K * 2 elements, in bytes */
	addq	%rax, AORIG
#endif
	ALIGN_4

.L30:
	/* M%2==1: solve the final single row of this 4-column panel.  */
	testq	$1, M
	je	.L39

#ifdef LN
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax	/* K * 1 element, in bytes */
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO	/* AO += KK rows (stride 1)  */
#endif

	movq	B, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(BO, %rax, 4), BO	/* BO += KK cols (stride 4)  */
#endif

	/* Prime the 1x4 multiply: xmm0/2/4 hold broadcast A elements,
	   xmm1/3 hold B column pairs; clear the four accumulators.   */
	movddup	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movddup	-14 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movddup	-15 * SIZE(AO), %xmm4
	pxor	%xmm10, %xmm10
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm11, %xmm11
	movapd	 -8 * SIZE(BO), %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	/* Run the unrolled-by-4 K loop; %rax counts up from -4n to 0
	   so the same index serves both AO (x1) and BO (x4).          */
	andq	$-4, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
	negq	%rax
	NOBRANCH
	je	.L36
	ALIGN_4

.L32:
	/* Iteration 0 of 4: accumulate a[k] * b[k][0..3].             */
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm9
	/* Remainder of the .L32 body: iterations 1-3 of the 4-way
	   unrolled 1x4 K loop.  Accumulators alternate between
	   xmm8/xmm9 and xmm10/xmm11 to break dependency chains.       */
	movddup	-12 * SIZE(AO, %rax, 1), %xmm0
	mulpd	%xmm4, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 4), %xmm4
	addpd	%xmm1, %xmm10
	movapd	(BO, %rax, 4), %xmm1
	addpd	%xmm4, %xmm11
	movddup	-11 * SIZE(AO, %rax, 1), %xmm4
	mulpd	%xmm2, %xmm3
	mulpd	-6 * SIZE(BO, %rax, 4), %xmm2
	addpd	%xmm3, %xmm8
	movapd	-4 * SIZE(BO, %rax, 4), %xmm3
	addpd	%xmm2, %xmm9
	movddup	-13 * SIZE(AO, %rax, 1), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	-2 * SIZE(BO, %rax, 4), %xmm2
	addpd	%xmm3, %xmm10
	movapd	8 * SIZE(BO, %rax, 4), %xmm3
	addpd	%xmm2, %xmm11
	movddup	-10 * SIZE(AO, %rax, 1), %xmm2

	addq	$4 * SIZE, %rax
	BRANCH
	jl	.L32
	ALIGN_4

.L36:
	/* Handle the K%4 leftover iterations one at a time.           */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# if (k & 1)
	je	.L38

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
	negq	%rax
	ALIGN_4

.L37:
	/* One K step: a[k] * b[k][0..3] into xmm8/xmm9.               */
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO, %rax, 4), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO, %rax, 4), %xmm1
	addpd	%xmm0, %xmm9
	movddup	-15 * SIZE(AO, %rax, 1), %xmm0

	addq	$SIZE, %rax
	jl	.L37
	ALIGN_4

.L38:
	/* Fold the dual accumulators into xmm8/xmm9 (4 col results).  */
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#if defined(LN) || defined(RT)
	/* Rebase AO/BO for the write-back: back up by the rows (LN: 1)
	   or columns (RT: 4) solved in this cell.                     */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$4, %rax
#endif

	leaq	(, %rax, SIZE), %rax

	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 4), BO
#endif

	/* xmm2/xmm3 := packed_panel - accumulated product.            */
#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3

	subpd	%xmm8, %xmm2
	subpd	%xmm9, %xmm3
#else
	movapd	-16 * SIZE(AO), %xmm2
	movapd	-14 * SIZE(AO), %xmm3

	subpd	%xmm8, %xmm2
	subpd	%xmm9, %xmm3
#endif

#if defined(LN) || defined(LT)
	/* 1x1 triangle on the left: just scale all four columns by the
	   single diagonal entry (presumably pre-inverted — TODO confirm). */
	movddup	-16 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm2
	mulpd	%xmm0, %xmm3
#endif

#ifdef RN
	/* 4x4 right-side solve for one row, done in scalar doubles.
	   Split the column pairs apart first: xmm2 lo/xmm0 hi,
	   xmm3 lo/xmm1 hi, then forward-substitute through the packed
	   upper triangle at BO (-16..-13, -11..-9, -6/-5, -1).        */
	movapd	%xmm2, %xmm0
	unpckhpd %xmm0, %xmm0

	movapd	%xmm3, %xmm1
	unpckhpd %xmm1, %xmm1

	movsd	-16 * SIZE(BO), %xmm4
	mulsd	%xmm4, %xmm2

	movsd	-15 * SIZE(BO), %xmm5
	mulsd	%xmm2, %xmm5
	subsd	%xmm5, %xmm0
	movsd	-14 * SIZE(BO), %xmm6
	mulsd	%xmm2, %xmm6
	subsd	%xmm6, %xmm3
	movsd	-13 * SIZE(BO), %xmm7
	mulsd	%xmm2, %xmm7
	subsd	%xmm7, %xmm1

	movsd	-11 * SIZE(BO), %xmm4
	mulsd	%xmm4, %xmm0

	movsd	-10 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	subsd	%xmm5, %xmm3
	movsd	 -9 * SIZE(BO), %xmm6
	mulsd	%xmm0, %xmm6
	subsd	%xmm6, %xmm1

	movsd	 -6 * SIZE(BO), %xmm4
	mulsd	%xmm4, %xmm3

	movsd	 -5 * SIZE(BO), %xmm5
	mulsd	%xmm3, %xmm5
	subsd	%xmm5, %xmm1

	movsd	 -1 * SIZE(BO), %xmm4
	mulsd	%xmm4, %xmm1

	/* Re-pack columns 0/1 and 2/3 into xmm2 and xmm3.             */
	unpcklpd %xmm0, %xmm2
	unpcklpd %xmm1, %xmm3
#endif

#ifdef RT
	/* Backward-substitution mirror of the RN sequence above.      */
	movapd	%xmm2, %xmm0
	unpckhpd %xmm0, %xmm0

	movapd	%xmm3, %xmm1
	unpckhpd %xmm1, %xmm1

	movsd	-1 * SIZE(BO), %xmm4
	mulsd	%xmm4, %xmm1

	movsd	-2 * SIZE(BO), %xmm5
	mulsd	%xmm1, %xmm5
	subsd	%xmm5, %xmm3
	movsd	-3 * SIZE(BO), %xmm6
	mulsd	%xmm1, %xmm6
	subsd	%xmm6, %xmm0
	movsd	-4 * SIZE(BO), %xmm7
	mulsd	%xmm1, %xmm7
	subsd	%xmm7, %xmm2

	movsd	-6 * SIZE(BO), %xmm4
	mulsd	%xmm4, %xmm3

	movsd	-7 * SIZE(BO), %xmm5
	mulsd	%xmm3, %xmm5
	subsd	%xmm5, %xmm0
	movsd	-8 * SIZE(BO), %xmm6
	mulsd	%xmm3, %xmm6
	subsd	%xmm6, %xmm2

	movsd	-11 * SIZE(BO), %xmm4
	mulsd	%xmm4, %xmm0

	movsd	-12 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	subsd	%xmm5, %xmm2

	movsd	-16 * SIZE(BO), %xmm4
	mulsd	%xmm4, %xmm2

	unpcklpd %xmm0, %xmm2
	unpcklpd %xmm1, %xmm3

#endif

#ifdef LN
	subq	$1 * SIZE, CO1	/* LN walks C right-to-left */
	subq	$1 * SIZE, CO2
#endif
3297#if defined(LN) || defined(LT) 3298 movlpd %xmm2, 0 * SIZE(CO1) 3299 movhpd %xmm2, 0 * SIZE(CO2) 3300 movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) 3301 movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) 3302#else 3303 movlpd %xmm2, 0 * SIZE(CO1) 3304 movhpd %xmm2, 0 * SIZE(CO2) 3305 movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) 3306 movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) 3307#endif 3308 3309#if defined(LN) || defined(LT) 3310 movaps %xmm2, -16 * SIZE(BO) 3311 movaps %xmm3, -14 * SIZE(BO) 3312#else 3313 movaps %xmm2, -16 * SIZE(AO) 3314 movaps %xmm3, -14 * SIZE(AO) 3315#endif 3316 3317#ifndef LN 3318 addq $1 * SIZE, CO1 3319 addq $1 * SIZE, CO2 3320#endif 3321 3322#if defined(LT) || defined(RN) 3323 movq K, %rax 3324 subq KK, %rax 3325 leaq (,%rax, SIZE), %rax 3326 leaq (AO, %rax, 1), AO 3327 leaq (BO, %rax, 4), BO 3328#endif 3329 3330#ifdef LN 3331 subq $1, KK 3332#endif 3333 3334#ifdef LT 3335 addq $1, KK 3336#endif 3337 3338#ifdef RT 3339 movq K, %rax 3340 salq $0 + BASE_SHIFT, %rax 3341 addq %rax, AORIG 3342#endif 3343 ALIGN_4 3344 3345.L39: 3346#ifdef LN 3347 leaq (, K, SIZE), %rax 3348 leaq (B, %rax, 4), B 3349#endif 3350 3351#if defined(LT) || defined(RN) 3352 movq BO, B 3353#endif 3354 3355#ifdef RN 3356 addq $4, KK 3357#endif 3358 3359#ifdef RT 3360 subq $4, KK 3361#endif 3362 3363 decq J # j -- 3364 jg .L01 3365 ALIGN_4 3366 3367.L999: 3368 movq (%rsp), %rbx 3369 movq 8(%rsp), %rbp 3370 movq 16(%rsp), %r12 3371 movq 24(%rsp), %r13 3372 movq 32(%rsp), %r14 3373 movq 40(%rsp), %r15 3374 3375#ifdef WINDOWS_ABI 3376 movq 48(%rsp), %rdi 3377 movq 56(%rsp), %rsi 3378 movups 64(%rsp), %xmm6 3379 movups 80(%rsp), %xmm7 3380 movups 96(%rsp), %xmm8 3381 movups 112(%rsp), %xmm9 3382 movups 128(%rsp), %xmm10 3383 movups 144(%rsp), %xmm11 3384 movups 160(%rsp), %xmm12 3385 movups 176(%rsp), %xmm13 3386 movups 192(%rsp), %xmm14 3387 movups 208(%rsp), %xmm15 3388#endif 3389 3390 addq $STACKSIZE, %rsp 3391 ret 3392 3393 EPILOGUE 3394