1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Layout note: one preprocessor directive / one instruction per line.
 * The multi-line KERNEL* macros below rely on backslash-newline
 * continuation, so every `;\` must be the last thing on its line.
 *
 * NOTE(review): the LN / LT / RN / RT build switches suggest this is a
 * triangular-solve style kernel working on 4x4 register tiles; confirm
 * against the build rules that select this file.
 */

/*
 * Register roles.
 *
 * OLD_M / OLD_N name the registers the first two arguments arrive in.
 * They alias AO / BO (%rdi / %rsi), so the prologue copies them into
 * M / N (%r13 / %r14) before AO / BO are used as running pointers.
 */
#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define CO2	%r12
#define BB	%rbp
#define J	%rbx

/*
 * Stack frame (offsets from %rsp after `subq $STACKSIZE, %rsp`).
 *
 * Non-Windows, STACKSIZE = 96:
 *     0.. 40  saved %rbx, %rbp, %r12-%r15
 *    48       OFFSET
 *    56       AORIG
 *    64       KK
 *    72       KKK
 *
 * Windows, STACKSIZE = 256:
 *     0.. 40  saved %rbx, %rbp, %r12-%r15
 *    48.. 56  saved %rdi, %rsi
 *    64..208  saved %xmm6-%xmm15 (16 bytes each)
 *   224       OFFSET
 *   232       AORIG
 *   240       KK
 *   248       KKK
 * On Windows the arguments beyond the register-passed ones are read
 * from the caller's frame through the OLD_* slots.
 */
#ifndef WINDOWS_ABI

#define STACKSIZE 96

#define OFFSET	 48(%rsp)
#define AORIG	 56(%rsp)
#define KK	 64(%rsp)
#define KKK	 72(%rsp)

#else

#define STACKSIZE 256

#define OLD_A		 40 + STACKSIZE(%rsp)
#define OLD_B		 48 + STACKSIZE(%rsp)
#define OLD_C		 56 + STACKSIZE(%rsp)
#define OLD_LDC		 64 + STACKSIZE(%rsp)
#define OLD_OFFSET	 72 + STACKSIZE(%rsp)

#define OFFSET	224(%rsp)
#define AORIG	232(%rsp)
#define KK	240(%rsp)
#define KKK	248(%rsp)

#endif

#define PREFETCH     prefetch
#define PREFETCHSIZE (8 * 7 + 0)

/*
 * Mnemonic overrides: from here on every `movlpd` assembles as `movsd`
 * and every `movapd` / `movupd` assembles as `movups`, i.e. none of the
 * `movapd` loads/stores in this file is an alignment-checking move.
 */
#define movlpd	movsd
#define movapd	movups
#define movupd	movups

/*
 * KERNEL1 .. KERNEL8 -- eight consecutive steps of the unrolled
 * accumulation loop.  The `xx` argument is not referenced by any body.
 *
 * Common shape of every step, as written in the bodies below:
 *   - one 2-wide vector of A is held in a register, the next 2-wide
 *     vector is used straight from memory at disp(AO, %rax, 4);
 *   - four B values are broadcast with movddup from disp(BO, %rax, 4);
 *   - products are added into %xmm8..%xmm11 (register half of A) and
 *     %xmm12..%xmm15 (memory half of A), one pair per B value.
 * Successive steps move the A and B displacements forward by 4 * SIZE.
 *
 * Register hand-off between steps (values are loaded one or more steps
 * ahead of their use, so the macros must be expanded in order):
 *   KERNEL1/2 work in %xmm0/%xmm1, KERNEL3/4 in %xmm4/%xmm5,
 *   KERNEL5/6 in %xmm6/%xmm1,      KERNEL7/8 in %xmm7/%xmm5,
 *   with %xmm2/%xmm3 as the shared copy / second-broadcast registers.
 * Entry state expected by KERNEL1: %xmm0 == %xmm2 == first A vector,
 * %xmm1 and %xmm3 == first two B broadcasts, %xmm4/%xmm5 preloaded for
 * KERNEL3.  KERNEL8 reloads that state for the next round and is the
 * only step that advances %rax (by 8 * SIZE).
 *
 * Only KERNEL1 issues a PREFETCH on the A stream.
 */
#define KERNEL1(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm0, %xmm2

#define KERNEL2(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	addpd	%xmm1, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
/**/	movddup	  (BO, %rax, 4), %xmm1 ;\
	movapd	%xmm4, %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm10 ;\
	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
	addpd	%xmm5, %xmm14 ;\
	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm4, %xmm2

#define KERNEL4(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
/***/	movapd	  (AO, %rax, 4), %xmm6 ;\
	addpd	%xmm4, %xmm10 ;\
	addpd	%xmm5, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
	movddup	  8 * SIZE(BO, %rax, 4), %xmm5 ;\
	movapd	%xmm6, %xmm2

#define KERNEL5(xx) \
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm8 ;\
	movapd	%xmm2, %xmm6 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	  2 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
/**/	movapd	  8 * SIZE(AO, %rax, 4), %xmm7 ;\
	movapd	%xmm6, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	  3 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm10 ;\
	movapd	  4 * SIZE(AO, %rax, 4), %xmm6 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	  4 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  5 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm6, %xmm2

#define KERNEL6(xx) \
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm8 ;\
	movapd	%xmm2, %xmm6 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	  6 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm6, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	  7 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm6 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm6, %xmm10 ;\
/***/	movapd	 16 * SIZE(AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	  6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  9 * SIZE(BO, %rax, 4), %xmm3 ;\
	movddup	 16 * SIZE(BO, %rax, 4), %xmm1 ;\
	movapd	%xmm7, %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	%xmm2, %xmm7 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 10 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm7, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 11 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm10 ;\
	movapd	 12 * SIZE(AO, %rax, 4), %xmm7 ;\
	addpd	%xmm5, %xmm14 ;\
	movddup	 12 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 13 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm7, %xmm2

#define KERNEL8(xx) \
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	%xmm2, %xmm7 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 14 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm7, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 15 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm7 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm7, %xmm10 ;\
	addpd	%xmm5, %xmm14 ;\
/**/	movapd	 24 * SIZE(AO, %rax, 4), %xmm4 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 17 * SIZE(BO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	movddup	 24 * SIZE(BO, %rax, 4), %xmm5 ;\
	movapd	%xmm0, %xmm2 ;\
	addq	$8 * SIZE, %rax

/*
 * KERNEL_SUB1 .. KERNEL_SUB4 -- a four-step sequence with the same
 * multiply/accumulate pattern and accumulator assignment as
 * KERNEL1..KERNEL4, but with no PREFETCH and no update of %rax.
 * KERNEL_SUB2 and KERNEL_SUB4 leave %xmm0/%xmm1/%xmm2/%xmm3 loaded from
 * displacements 0 and 1 * SIZE, i.e. the data 16 elements past the
 * first step.  The `xx` argument is unused here as well.
 */
#define KERNEL_SUB1(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-14 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	-13 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	-12 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-14 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	-11 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm0, %xmm2

#define KERNEL_SUB2(xx) \
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm8 ;\
	movapd	%xmm2, %xmm0 ;\
	addpd	%xmm1, %xmm12 ;\
	movddup	-10 * SIZE(BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm0, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -9 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm1, %xmm0 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm1 ;\
	addpd	%xmm0, %xmm10 ;\
	movapd	  (AO, %rax, 4), %xmm0 ;\
	addpd	%xmm1, %xmm14 ;\
	movddup	  (BO, %rax, 4), %xmm1 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	-10 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -7 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm4, %xmm2

#define KERNEL_SUB3(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -6 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -5 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm10 ;\
	movapd	 -4 * SIZE(AO, %rax, 4), %xmm4 ;\
	addpd	%xmm5, %xmm14 ;\
	movddup	 -4 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -6 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	 -3 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm4, %xmm2

#define KERNEL_SUB4(xx) \
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm8 ;\
	movapd	%xmm2, %xmm4 ;\
	addpd	%xmm5, %xmm12 ;\
	movddup	 -2 * SIZE(BO, %rax, 4), %xmm5 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm9 ;\
	movapd	%xmm4, %xmm2 ;\
	addpd	%xmm3, %xmm13 ;\
	movddup	 -1 * SIZE(BO, %rax, 4), %xmm3 ;\
	mulpd	%xmm5, %xmm4 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm5 ;\
	addpd	%xmm4, %xmm10 ;\
	addpd	%xmm5, %xmm14 ;\
	mulpd	%xmm3, %xmm2 ;\
	mulpd	 -2 * SIZE(AO, %rax, 4), %xmm3 ;\
	addpd	%xmm2, %xmm11 ;\
	addpd	%xmm3, %xmm15 ;\
	movddup	  1 * SIZE(BO, %rax, 4), %xmm3 ;\
	movapd	%xmm0, %xmm2

/*
 * Routine entry.  PROLOGUE and PROFCODE are supplied by common.h.
 */
	PROLOGUE
	PROFCODE

	/* Reserve the frame and save the callee-saved integer registers. */
	subq	$STACKSIZE, %rsp
	movq	%rbx,   (%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* Windows additionally preserves %rdi, %rsi and %xmm6-%xmm15. */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	/* Move the arguments into the registers the non-Windows path uses. */
	movq	ARG1,      OLD_M
	movq	ARG2,      OLD_N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
	movsd	OLD_OFFSET, %xmm12
#else
	/* ldc and offset are the two arguments read from the caller's stack. */
	movq	STACKSIZE +  8(%rsp), LDC
	movsd	STACKSIZE + 16(%rsp), %xmm12
#endif

	/* Free %rdi / %rsi (OLD_M / OLD_N) for use as AO / BO. */
	movq	OLD_M, M
	movq	OLD_N, N

	/* Bias A and B by +16 * SIZE; later accesses use matching negative
	   displacements such as -16 * SIZE(AO). */
	subq	$-16 * SIZE, A
	subq	$-16 * SIZE, B

	/* The 8-byte offset argument is kept in both OFFSET and KK. */
	movsd	%xmm12, OFFSET
	movsd	%xmm12, KK

	/* LDC = LDC * SIZE */
	leaq	(, LDC, SIZE), LDC

	/* LN only: C += M * SIZE, A += M * SIZE * K */
#ifdef LN
       leaq	(, M, SIZE), %rax
       addq	%rax, C
       imulq	K, %rax
       addq	%rax, A
460#endif 461 462#ifdef RT 463 leaq (, N, SIZE), %rax 464 imulq K, %rax 465 addq %rax, B 466 movq N, %rax 467 imulq LDC, %rax 468 addq %rax, C 469#endif 470 471#ifdef RN 472 negq KK 473#endif 474 475#ifdef RT 476 movq N, %rax 477 subq OFFSET, %rax 478 movq %rax, KK 479#endif 480 481 movq N, J 482 sarq $2, J # j = (n >> 2) 483 jle .L40 484 485.L01: 486#if defined(LT) || defined(RN) 487 movq A, AO 488#else 489 movq A, AORIG 490#endif 491 492#ifdef RT 493 movq K, %rax 494 salq $2 + BASE_SHIFT, %rax 495 subq %rax, B 496 497 leaq (, LDC, 4), %rax 498 subq %rax, C 499#endif 500 501 movq C, CO1 # coffset1 = c 502 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 503#ifndef RT 504 leaq (C, LDC, 4), C 505#endif 506 507#ifdef LN 508 movq OFFSET, %rax 509 addq M, %rax 510 movq %rax, KK 511#endif 512 513 movq K, %rax 514 salq $BASE_SHIFT + 2, %rax 515 leaq (B, %rax), BB 516 517#if defined(LT) 518 movq OFFSET, %rax 519 movq %rax, KK 520#endif 521 522 testq $1, M 523 je .L20 524 525#ifdef LN 526 movq K, %rax 527 salq $0 + BASE_SHIFT, %rax 528 subq %rax, AORIG 529#endif 530 531#if defined(LN) || defined(RT) 532 movq KK, %rax 533 movq AORIG, AO 534 leaq (, %rax, SIZE), %rax 535 leaq (AO, %rax, 1), AO 536#endif 537 538 movq B, BO 539 540#if defined(LN) || defined(RT) 541 movq KK, %rax 542 leaq (, %rax, SIZE), %rax 543 leaq (BO, %rax, 4), BO 544#endif 545 546 movddup -16 * SIZE(AO), %xmm0 547 pxor %xmm8, %xmm8 548 movddup -14 * SIZE(AO), %xmm2 549 pxor %xmm9, %xmm9 550 movddup -15 * SIZE(AO), %xmm4 551 pxor %xmm10, %xmm10 552 movapd -16 * SIZE(BO), %xmm1 553 pxor %xmm11, %xmm11 554 movapd -8 * SIZE(BO), %xmm3 555 556#if defined(LT) || defined(RN) 557 movq KK, %rax 558#else 559 movq K, %rax 560 subq KK, %rax 561#endif 562 andq $-4, %rax 563 leaq (, %rax, SIZE), %rax 564 leaq (AO, %rax, 1), AO 565 leaq (BO, %rax, 4), BO 566 negq %rax 567 NOBRANCH 568 je .L36 569 ALIGN_4 570 571.L32: 572 mulpd %xmm0, %xmm1 573 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 574 addpd %xmm1, %xmm8 575 movapd -12 * 
SIZE(BO, %rax, 4), %xmm1 576 addpd %xmm0, %xmm9 577 movddup -12 * SIZE(AO, %rax, 1), %xmm0 578 mulpd %xmm4, %xmm1 579 mulpd -10 * SIZE(BO, %rax, 4), %xmm4 580 addpd %xmm1, %xmm10 581 movapd (BO, %rax, 4), %xmm1 582 addpd %xmm4, %xmm11 583 movddup -11 * SIZE(AO, %rax, 1), %xmm4 584 mulpd %xmm2, %xmm3 585 mulpd -6 * SIZE(BO, %rax, 4), %xmm2 586 addpd %xmm3, %xmm8 587 movapd -4 * SIZE(BO, %rax, 4), %xmm3 588 addpd %xmm2, %xmm9 589 movddup -13 * SIZE(AO, %rax, 1), %xmm2 590 mulpd %xmm2, %xmm3 591 mulpd -2 * SIZE(BO, %rax, 4), %xmm2 592 addpd %xmm3, %xmm10 593 movapd 8 * SIZE(BO, %rax, 4), %xmm3 594 addpd %xmm2, %xmm11 595 movddup -10 * SIZE(AO, %rax, 1), %xmm2 596 597 addq $4 * SIZE, %rax 598 BRANCH 599 jl .L32 600 ALIGN_4 601 602.L36: 603#if defined(LT) || defined(RN) 604 movq KK, %rax 605#else 606 movq K, %rax 607 subq KK, %rax 608#endif 609 andq $3, %rax # if (k & 1) 610 je .L38 611 612 leaq (, %rax, SIZE), %rax 613 leaq (AO, %rax, 1), AO 614 leaq (BO, %rax, 4), BO 615 negq %rax 616 ALIGN_4 617 618.L37: 619 mulpd %xmm0, %xmm1 620 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 621 addpd %xmm1, %xmm8 622 movapd -12 * SIZE(BO, %rax, 4), %xmm1 623 addpd %xmm0, %xmm9 624 movddup -15 * SIZE(AO, %rax, 1), %xmm0 625 626 addq $SIZE, %rax 627 jl .L37 628 ALIGN_4 629 630.L38: 631 addpd %xmm10, %xmm8 632 addpd %xmm11, %xmm9 633 634#if defined(LN) || defined(RT) 635 movq KK, %rax 636#ifdef LN 637 subq $1, %rax 638#else 639 subq $4, %rax 640#endif 641 642 leaq (, %rax, SIZE), %rax 643 644 movq AORIG, AO 645 leaq (AO, %rax, 1), AO 646 leaq (B, %rax, 4), BO 647#endif 648 649#if defined(LN) || defined(LT) 650 movapd -16 * SIZE(BO), %xmm2 651 movapd -14 * SIZE(BO), %xmm3 652 653 subpd %xmm8, %xmm2 654 subpd %xmm9, %xmm3 655#else 656 movapd -16 * SIZE(AO), %xmm2 657 movapd -14 * SIZE(AO), %xmm3 658 659 subpd %xmm8, %xmm2 660 subpd %xmm9, %xmm3 661#endif 662 663#if defined(LN) || defined(LT) 664 movddup -16 * SIZE(AO), %xmm0 665 mulpd %xmm0, %xmm2 666 mulpd %xmm0, %xmm3 667#endif 668 669#ifdef 
RN 670 movapd %xmm2, %xmm0 671 unpckhpd %xmm0, %xmm0 672 673 movapd %xmm3, %xmm1 674 unpckhpd %xmm1, %xmm1 675 676 movsd -16 * SIZE(BO), %xmm4 677 mulsd %xmm4, %xmm2 678 679 movsd -15 * SIZE(BO), %xmm5 680 mulsd %xmm2, %xmm5 681 subsd %xmm5, %xmm0 682 movsd -14 * SIZE(BO), %xmm6 683 mulsd %xmm2, %xmm6 684 subsd %xmm6, %xmm3 685 movsd -13 * SIZE(BO), %xmm7 686 mulsd %xmm2, %xmm7 687 subsd %xmm7, %xmm1 688 689 movsd -11 * SIZE(BO), %xmm4 690 mulsd %xmm4, %xmm0 691 692 movsd -10 * SIZE(BO), %xmm5 693 mulsd %xmm0, %xmm5 694 subsd %xmm5, %xmm3 695 movsd -9 * SIZE(BO), %xmm6 696 mulsd %xmm0, %xmm6 697 subsd %xmm6, %xmm1 698 699 movsd -6 * SIZE(BO), %xmm4 700 mulsd %xmm4, %xmm3 701 702 movsd -5 * SIZE(BO), %xmm5 703 mulsd %xmm3, %xmm5 704 subsd %xmm5, %xmm1 705 706 movsd -1 * SIZE(BO), %xmm4 707 mulsd %xmm4, %xmm1 708 709 unpcklpd %xmm0, %xmm2 710 unpcklpd %xmm1, %xmm3 711#endif 712 713#ifdef RT 714 movapd %xmm2, %xmm0 715 unpckhpd %xmm0, %xmm0 716 717 movapd %xmm3, %xmm1 718 unpckhpd %xmm1, %xmm1 719 720 movsd -1 * SIZE(BO), %xmm4 721 mulsd %xmm4, %xmm1 722 723 movsd -2 * SIZE(BO), %xmm5 724 mulsd %xmm1, %xmm5 725 subsd %xmm5, %xmm3 726 movsd -3 * SIZE(BO), %xmm6 727 mulsd %xmm1, %xmm6 728 subsd %xmm6, %xmm0 729 movsd -4 * SIZE(BO), %xmm7 730 mulsd %xmm1, %xmm7 731 subsd %xmm7, %xmm2 732 733 movsd -6 * SIZE(BO), %xmm4 734 mulsd %xmm4, %xmm3 735 736 movsd -7 * SIZE(BO), %xmm5 737 mulsd %xmm3, %xmm5 738 subsd %xmm5, %xmm0 739 movsd -8 * SIZE(BO), %xmm6 740 mulsd %xmm3, %xmm6 741 subsd %xmm6, %xmm2 742 743 movsd -11 * SIZE(BO), %xmm4 744 mulsd %xmm4, %xmm0 745 746 movsd -12 * SIZE(BO), %xmm5 747 mulsd %xmm0, %xmm5 748 subsd %xmm5, %xmm2 749 750 movsd -16 * SIZE(BO), %xmm4 751 mulsd %xmm4, %xmm2 752 753 unpcklpd %xmm0, %xmm2 754 unpcklpd %xmm1, %xmm3 755 756#endif 757 758#ifdef LN 759 subq $1 * SIZE, CO1 760 subq $1 * SIZE, CO2 761#endif 762 763#if defined(LN) || defined(LT) 764 movlpd %xmm2, 0 * SIZE(CO1) 765 movhpd %xmm2, 0 * SIZE(CO2) 766 movlpd %xmm3, 0 * SIZE(CO1, LDC, 
2) 767 movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) 768#else 769 movlpd %xmm2, 0 * SIZE(CO1) 770 movhpd %xmm2, 0 * SIZE(CO2) 771 movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) 772 movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) 773#endif 774 775#if defined(LN) || defined(LT) 776 movaps %xmm2, -16 * SIZE(BO) 777 movaps %xmm3, -14 * SIZE(BO) 778#else 779 movaps %xmm2, -16 * SIZE(AO) 780 movaps %xmm3, -14 * SIZE(AO) 781#endif 782 783#ifndef LN 784 addq $1 * SIZE, CO1 785 addq $1 * SIZE, CO2 786#endif 787 788#if defined(LT) || defined(RN) 789 movq K, %rax 790 subq KK, %rax 791 leaq (,%rax, SIZE), %rax 792 leaq (AO, %rax, 1), AO 793 leaq (BO, %rax, 4), BO 794#endif 795 796#ifdef LN 797 subq $1, KK 798#endif 799 800#ifdef LT 801 addq $1, KK 802#endif 803 804#ifdef RT 805 movq K, %rax 806 salq $0 + BASE_SHIFT, %rax 807 addq %rax, AORIG 808#endif 809 ALIGN_4 810 811.L20: 812 testq $2, M 813 je .L30 814 ALIGN_4 815 816.L21: 817#ifdef LN 818 movq K, %rax 819 salq $1 + BASE_SHIFT, %rax 820 subq %rax, AORIG 821#endif 822 823#if defined(LN) || defined(RT) 824 movq KK, %rax 825 movq AORIG, AO 826 leaq (, %rax, SIZE), %rax 827 leaq (AO, %rax, 2), AO 828#endif 829 830 movq B, BO 831 832#if defined(LN) || defined(RT) 833 movq KK, %rax 834 leaq (, %rax, SIZE), %rax 835 leaq (BO, %rax, 4), BO 836#endif 837 838 movapd -16 * SIZE(AO), %xmm0 839 pxor %xmm8, %xmm8 840 movapd -12 * SIZE(AO), %xmm2 841 pxor %xmm9, %xmm9 842 movddup -16 * SIZE(BO), %xmm1 843 pxor %xmm10, %xmm10 844 movddup -15 * SIZE(BO), %xmm5 845 pxor %xmm11, %xmm11 846 movddup -8 * SIZE(BO), %xmm3 847 848#if defined(LT) || defined(RN) 849 movq KK, %rax 850#else 851 movq K, %rax 852 subq KK, %rax 853#endif 854 andq $-4, %rax 855 leaq (, %rax, SIZE), %rax 856 leaq (AO, %rax, 2), AO 857 leaq (BO, %rax, 4), BO 858 negq %rax 859 NOBRANCH 860 je .L26 861 ALIGN_4 862 863.L22: 864 mulpd %xmm0, %xmm1 865 addpd %xmm1, %xmm8 866 movddup -14 * SIZE(BO, %rax, 4), %xmm1 867 mulpd %xmm0, %xmm5 868 addpd %xmm5, %xmm9 869 movddup -13 * SIZE(BO, %rax, 4), %xmm5 870 
mulpd %xmm0, %xmm1 871 addpd %xmm1, %xmm10 872 movddup -12 * SIZE(BO, %rax, 4), %xmm1 873 mulpd %xmm0, %xmm5 874 movapd -14 * SIZE(AO, %rax, 2), %xmm0 875 addpd %xmm5, %xmm11 876 movddup -11 * SIZE(BO, %rax, 4), %xmm5 877 mulpd %xmm0, %xmm1 878 addpd %xmm1, %xmm8 879 movddup -10 * SIZE(BO, %rax, 4), %xmm1 880 mulpd %xmm0, %xmm5 881 addpd %xmm5, %xmm9 882 movddup -9 * SIZE(BO, %rax, 4), %xmm5 883 mulpd %xmm0, %xmm1 884 addpd %xmm1, %xmm10 885 movddup (BO, %rax, 4), %xmm1 886 mulpd %xmm0, %xmm5 887 movapd -8 * SIZE(AO, %rax, 2), %xmm0 888 addpd %xmm5, %xmm11 889 movddup -7 * SIZE(BO, %rax, 4), %xmm5 890 mulpd %xmm2, %xmm3 891 addpd %xmm3, %xmm8 892 movddup -6 * SIZE(BO, %rax, 4), %xmm3 893 mulpd %xmm2, %xmm5 894 addpd %xmm5, %xmm9 895 movddup -5 * SIZE(BO, %rax, 4), %xmm5 896 mulpd %xmm2, %xmm3 897 addpd %xmm3, %xmm10 898 movddup -4 * SIZE(BO, %rax, 4), %xmm3 899 mulpd %xmm2, %xmm5 900 movapd -10 * SIZE(AO, %rax, 2), %xmm2 901 addpd %xmm5, %xmm11 902 movddup -3 * SIZE(BO, %rax, 4), %xmm5 903 mulpd %xmm2, %xmm3 904 addpd %xmm3, %xmm8 905 movddup -2 * SIZE(BO, %rax, 4), %xmm3 906 mulpd %xmm2, %xmm5 907 addpd %xmm5, %xmm9 908 movddup -1 * SIZE(BO, %rax, 4), %xmm5 909 mulpd %xmm2, %xmm3 910 addpd %xmm3, %xmm10 911 movddup 8 * SIZE(BO, %rax, 4), %xmm3 912 mulpd %xmm2, %xmm5 913 movapd -4 * SIZE(AO, %rax, 2), %xmm2 914 addpd %xmm5, %xmm11 915 movddup 1 * SIZE(BO, %rax, 4), %xmm5 916 917 addq $4 * SIZE, %rax 918 BRANCH 919 jl .L22 920 ALIGN_4 921 922.L26: 923#if defined(LT) || defined(RN) 924 movq KK, %rax 925#else 926 movq K, %rax 927 subq KK, %rax 928#endif 929 andq $3, %rax # if (k & 1) 930 je .L29 931 932 leaq (, %rax, SIZE), %rax 933 leaq (AO, %rax, 2), AO 934 leaq (BO, %rax, 4), BO 935 negq %rax 936 ALIGN_4 937 938.L27: 939 mulpd %xmm0, %xmm1 940 addpd %xmm1, %xmm8 941 movddup -14 * SIZE(BO, %rax, 4), %xmm1 942 mulpd %xmm0, %xmm5 943 addpd %xmm5, %xmm9 944 movddup -13 * SIZE(BO, %rax, 4), %xmm5 945 mulpd %xmm0, %xmm1 946 addpd %xmm1, %xmm10 947 movddup -12 * SIZE(BO, 
%rax, 4), %xmm1 948 mulpd %xmm0, %xmm5 949 movapd -14 * SIZE(AO, %rax, 2), %xmm0 950 addpd %xmm5, %xmm11 951 movddup -11 * SIZE(BO, %rax, 4), %xmm5 952 953 addq $SIZE, %rax 954 jl .L27 955 ALIGN_4 956 957.L29: 958#if defined(LN) || defined(RT) 959 movq KK, %rax 960#ifdef LN 961 subq $2, %rax 962#else 963 subq $4, %rax 964#endif 965 966 leaq (, %rax, SIZE), %rax 967 968 movq AORIG, AO 969 leaq (AO, %rax, 2), AO 970 leaq (B, %rax, 4), BO 971#endif 972 973#if defined(LN) || defined(LT) 974 movapd %xmm8, %xmm0 975 unpcklpd %xmm9, %xmm8 976 unpckhpd %xmm9, %xmm0 977 978 movapd %xmm10, %xmm2 979 unpcklpd %xmm11, %xmm10 980 unpckhpd %xmm11, %xmm2 981 982 movapd -16 * SIZE(BO), %xmm9 983 movapd -14 * SIZE(BO), %xmm11 984 movapd -12 * SIZE(BO), %xmm13 985 movapd -10 * SIZE(BO), %xmm15 986 987 subpd %xmm8, %xmm9 988 subpd %xmm10, %xmm11 989 subpd %xmm0, %xmm13 990 subpd %xmm2, %xmm15 991#else 992 movapd -16 * SIZE(AO), %xmm0 993 movapd -14 * SIZE(AO), %xmm2 994 movapd -12 * SIZE(AO), %xmm4 995 movapd -10 * SIZE(AO), %xmm6 996 997 subpd %xmm8, %xmm0 998 subpd %xmm9, %xmm2 999 subpd %xmm10, %xmm4 1000 subpd %xmm11, %xmm6 1001#endif 1002 1003#ifdef LN 1004 movddup -13 * SIZE(AO), %xmm8 1005 mulpd %xmm8, %xmm13 1006 mulpd %xmm8, %xmm15 1007 1008 movddup -14 * SIZE(AO), %xmm10 1009 mulpd %xmm13, %xmm10 1010 subpd %xmm10, %xmm9 1011 movddup -14 * SIZE(AO), %xmm10 1012 mulpd %xmm15, %xmm10 1013 subpd %xmm10, %xmm11 1014 1015 movddup -16 * SIZE(AO), %xmm8 1016 mulpd %xmm8, %xmm9 1017 mulpd %xmm8, %xmm11 1018#endif 1019 1020#ifdef LT 1021 movddup -16 * SIZE(AO), %xmm8 1022 mulpd %xmm8, %xmm9 1023 mulpd %xmm8, %xmm11 1024 1025 movddup -15 * SIZE(AO), %xmm10 1026 mulpd %xmm9, %xmm10 1027 subpd %xmm10, %xmm13 1028 movddup -15 * SIZE(AO), %xmm10 1029 mulpd %xmm11, %xmm10 1030 subpd %xmm10, %xmm15 1031 1032 movddup -13 * SIZE(AO), %xmm8 1033 mulpd %xmm8, %xmm13 1034 mulpd %xmm8, %xmm15 1035#endif 1036 1037#ifdef RN 1038 movddup -16 * SIZE(BO), %xmm8 1039 mulpd %xmm8, %xmm0 1040 1041 
movddup -15 * SIZE(BO), %xmm9 1042 mulpd %xmm0, %xmm9 1043 subpd %xmm9, %xmm2 1044 movddup -14 * SIZE(BO), %xmm10 1045 mulpd %xmm0, %xmm10 1046 subpd %xmm10, %xmm4 1047 movddup -13 * SIZE(BO), %xmm11 1048 mulpd %xmm0, %xmm11 1049 subpd %xmm11, %xmm6 1050 1051 movddup -11 * SIZE(BO), %xmm8 1052 mulpd %xmm8, %xmm2 1053 movddup -10 * SIZE(BO), %xmm9 1054 mulpd %xmm2, %xmm9 1055 subpd %xmm9, %xmm4 1056 movddup -9 * SIZE(BO), %xmm10 1057 mulpd %xmm2, %xmm10 1058 subpd %xmm10, %xmm6 1059 1060 movddup -6 * SIZE(BO), %xmm8 1061 mulpd %xmm8, %xmm4 1062 1063 movddup -5 * SIZE(BO), %xmm9 1064 mulpd %xmm4, %xmm9 1065 subpd %xmm9, %xmm6 1066 1067 movddup -1 * SIZE(BO), %xmm8 1068 mulpd %xmm8, %xmm6 1069#endif 1070 1071#ifdef RT 1072 movddup -1 * SIZE(BO), %xmm8 1073 mulpd %xmm8, %xmm6 1074 1075 movddup -2 * SIZE(BO), %xmm9 1076 mulpd %xmm6, %xmm9 1077 subpd %xmm9, %xmm4 1078 movddup -3 * SIZE(BO), %xmm10 1079 mulpd %xmm6, %xmm10 1080 subpd %xmm10, %xmm2 1081 movddup -4 * SIZE(BO), %xmm11 1082 mulpd %xmm6, %xmm11 1083 subpd %xmm11, %xmm0 1084 1085 movddup -6 * SIZE(BO), %xmm8 1086 mulpd %xmm8, %xmm4 1087 movddup -7 * SIZE(BO), %xmm9 1088 mulpd %xmm4, %xmm9 1089 subpd %xmm9, %xmm2 1090 movddup -8 * SIZE(BO), %xmm10 1091 mulpd %xmm4, %xmm10 1092 subpd %xmm10, %xmm0 1093 1094 movddup -11 * SIZE(BO), %xmm8 1095 mulpd %xmm8, %xmm2 1096 movddup -12 * SIZE(BO), %xmm9 1097 mulpd %xmm2, %xmm9 1098 subpd %xmm9, %xmm0 1099 1100 movddup -16 * SIZE(BO), %xmm8 1101 mulpd %xmm8, %xmm0 1102#endif 1103 1104#ifdef LN 1105 subq $2 * SIZE, CO1 1106 subq $2 * SIZE, CO2 1107#endif 1108 1109#if defined(LN) || defined(LT) 1110 movlpd %xmm9, 0 * SIZE(CO1) 1111 movlpd %xmm13, 1 * SIZE(CO1) 1112 1113 movhpd %xmm9, 0 * SIZE(CO2) 1114 movhpd %xmm13, 1 * SIZE(CO2) 1115 1116 movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) 1117 movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) 1118 1119 movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) 1120 movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) 1121#else 1122 movlpd %xmm0, 0 * SIZE(CO1) 1123 movhpd %xmm0, 1 * 
SIZE(CO1) 1124 1125 movlpd %xmm2, 0 * SIZE(CO2) 1126 movhpd %xmm2, 1 * SIZE(CO2) 1127 1128 movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) 1129 movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) 1130 1131 movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) 1132 movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) 1133#endif 1134 1135#if defined(LN) || defined(LT) 1136 movaps %xmm9, -16 * SIZE(BO) 1137 movaps %xmm11, -14 * SIZE(BO) 1138 movaps %xmm13, -12 * SIZE(BO) 1139 movaps %xmm15, -10 * SIZE(BO) 1140#else 1141 movaps %xmm0, -16 * SIZE(AO) 1142 movaps %xmm2, -14 * SIZE(AO) 1143 movaps %xmm4, -12 * SIZE(AO) 1144 movaps %xmm6, -10 * SIZE(AO) 1145#endif 1146 1147#ifndef LN 1148 addq $2 * SIZE, CO1 1149 addq $2 * SIZE, CO2 1150#endif 1151 1152#if defined(LT) || defined(RN) 1153 movq K, %rax 1154 subq KK, %rax 1155 leaq (,%rax, SIZE), %rax 1156 leaq (AO, %rax, 2), AO 1157 leaq (BO, %rax, 4), BO 1158#endif 1159 1160#ifdef LN 1161 subq $2, KK 1162#endif 1163 1164#ifdef LT 1165 addq $2, KK 1166#endif 1167 1168#ifdef RT 1169 movq K, %rax 1170 salq $1 + BASE_SHIFT, %rax 1171 addq %rax, AORIG 1172#endif 1173 ALIGN_4 1174 1175.L30: 1176 movq M, I 1177 sarq $2, I # i = (m >> 2) 1178 jle .L39 1179 ALIGN_4 1180 1181.L11: 1182#ifdef LN 1183 movq K, %rax 1184 salq $2 + BASE_SHIFT, %rax 1185 subq %rax, AORIG 1186#endif 1187 1188#if defined(LN) || defined(RT) 1189 movq KK, %rax 1190 movq AORIG, AO 1191 leaq (, %rax, SIZE), %rax 1192 leaq (AO, %rax, 4), AO 1193#endif 1194 1195 movq B, BO 1196 1197#if defined(LN) || defined(RT) 1198 movq KK, %rax 1199 leaq (, %rax, SIZE), %rax 1200 leaq (BO, %rax, 4), BO 1201#endif 1202 1203 movapd -16 * SIZE(AO), %xmm0 1204 movddup -16 * SIZE(BO), %xmm1 1205 pxor %xmm8, %xmm8 1206 movddup -15 * SIZE(BO), %xmm3 1207 pxor %xmm9, %xmm9 1208 movapd -8 * SIZE(AO), %xmm4 1209 pxor %xmm10, %xmm10 1210 movddup -8 * SIZE(BO), %xmm5 1211 pxor %xmm11, %xmm11 1212 1213#ifndef LN 1214 prefetchw 3 * SIZE(CO1) 1215 pxor %xmm12, %xmm12 1216 prefetchw 3 * SIZE(CO2) 1217 pxor %xmm13, %xmm13 1218 prefetchw 3 * SIZE(CO1, LDC, 
2) 1219 pxor %xmm14, %xmm14 1220 prefetchw 3 * SIZE(CO2, LDC, 2) 1221 pxor %xmm15, %xmm15 1222 movapd %xmm0, %xmm2 1223#else 1224 prefetchw -8 * SIZE(CO1) 1225 pxor %xmm12, %xmm12 1226 prefetchw -8 * SIZE(CO2) 1227 pxor %xmm13, %xmm13 1228 prefetchw -8 * SIZE(CO1, LDC, 2) 1229 pxor %xmm14, %xmm14 1230 prefetchw -8 * SIZE(CO2, LDC, 2) 1231 pxor %xmm15, %xmm15 1232 movapd %xmm0, %xmm2 1233#endif 1234 1235 prefetch -10 * SIZE(BB) 1236 1237#if defined(LT) || defined(RN) 1238 movq KK, %rax 1239#else 1240 movq K, %rax 1241 subq KK, %rax 1242#endif 1243 1244 andq $-8, %rax 1245 leaq (, %rax, SIZE), %rax 1246 leaq (AO, %rax, 4), AO 1247 leaq (BO, %rax, 4), BO 1248 negq %rax 1249 NOBRANCH 1250 je .L15 1251 ALIGN_4 1252 1253.L12: 1254 KERNEL1(16 * 0) 1255 KERNEL2(16 * 0) 1256 KERNEL3(16 * 0) 1257 KERNEL4(16 * 0) 1258 KERNEL5(16 * 0) 1259 KERNEL6(16 * 0) 1260 KERNEL7(16 * 0) 1261 KERNEL8(16 * 0) 1262 BRANCH 1263 jl .L12 1264 ALIGN_4 1265 1266.L15: 1267 prefetch 14 * SIZE(BB) 1268 subq $-16 * SIZE, BB 1269 1270#if defined(LT) || defined(RN) 1271 movq KK, %rax 1272#else 1273 movq K, %rax 1274 subq KK, %rax 1275#endif 1276 testq $4, %rax 1277 je .L16 1278 xorq %rax, %rax 1279 ALIGN_4 1280 1281 KERNEL_SUB1(16 * 0) 1282 KERNEL_SUB2(16 * 0) 1283 KERNEL_SUB3(16 * 0) 1284 KERNEL_SUB4(16 * 0) 1285 1286 subq $-16 * SIZE, BO 1287 subq $-16 * SIZE, AO 1288 ALIGN_4 1289 1290.L16: 1291#if defined(LT) || defined(RN) 1292 movq KK, %rax 1293#else 1294 movq K, %rax 1295 subq KK, %rax 1296#endif 1297 andq $3, %rax # if (k & 1) 1298 je .L19 1299 1300 leaq (, %rax, SIZE), %rax 1301 leaq (AO, %rax, 4), AO 1302 leaq (BO, %rax, 4), BO 1303 negq %rax 1304 ALIGN_4 1305 1306.L17: 1307 mulpd %xmm1, %xmm0 1308 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 1309 addpd %xmm0, %xmm8 1310 movapd %xmm2, %xmm0 1311 addpd %xmm1, %xmm12 1312 movddup -14 * SIZE(BO, %rax, 4), %xmm1 1313 mulpd %xmm3, %xmm2 1314 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 1315 addpd %xmm2, %xmm9 1316 movapd %xmm0, %xmm2 1317 addpd %xmm3, %xmm13 1318 
movddup -13 * SIZE(BO, %rax, 4), %xmm3 1319 mulpd %xmm1, %xmm0 1320 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 1321 addpd %xmm0, %xmm10 1322 movapd -12 * SIZE(AO, %rax, 4), %xmm0 1323 addpd %xmm1, %xmm14 1324 movddup -12 * SIZE(BO, %rax, 4), %xmm1 1325 mulpd %xmm3, %xmm2 1326 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 1327 addpd %xmm2, %xmm11 1328 addpd %xmm3, %xmm15 1329 movddup -11 * SIZE(BO, %rax, 4), %xmm3 1330 movapd %xmm0, %xmm2 1331 1332 addq $SIZE, %rax 1333 jl .L17 1334 ALIGN_4 1335 1336.L19: 1337#if defined(LN) || defined(RT) 1338 movq KK, %rax 1339#ifdef LN 1340 subq $4, %rax 1341#else 1342 subq $4, %rax 1343#endif 1344 1345 leaq (, %rax, SIZE), %rax 1346 1347 movq AORIG, AO 1348 leaq (AO, %rax, 4), AO 1349 leaq (B, %rax, 4), BO 1350#endif 1351 1352#if defined(LN) || defined(LT) 1353 movapd %xmm8, %xmm0 1354 unpcklpd %xmm9, %xmm8 1355 unpckhpd %xmm9, %xmm0 1356 1357 movapd %xmm10, %xmm2 1358 unpcklpd %xmm11, %xmm10 1359 unpckhpd %xmm11, %xmm2 1360 1361 movapd %xmm12, %xmm4 1362 unpcklpd %xmm13, %xmm12 1363 unpckhpd %xmm13, %xmm4 1364 1365 movapd %xmm14, %xmm6 1366 unpcklpd %xmm15, %xmm14 1367 unpckhpd %xmm15, %xmm6 1368 1369 movapd -16 * SIZE(BO), %xmm9 1370 movapd -14 * SIZE(BO), %xmm11 1371 movapd -12 * SIZE(BO), %xmm13 1372 movapd -10 * SIZE(BO), %xmm15 1373 movapd -8 * SIZE(BO), %xmm1 1374 movapd -6 * SIZE(BO), %xmm3 1375 movapd -4 * SIZE(BO), %xmm5 1376 movapd -2 * SIZE(BO), %xmm7 1377 1378 subpd %xmm8, %xmm9 1379 subpd %xmm10, %xmm11 1380 subpd %xmm0, %xmm13 1381 subpd %xmm2, %xmm15 1382 subpd %xmm12, %xmm1 1383 subpd %xmm14, %xmm3 1384 subpd %xmm4, %xmm5 1385 subpd %xmm6, %xmm7 1386#else 1387 movapd -16 * SIZE(AO), %xmm0 1388 movapd -14 * SIZE(AO), %xmm1 1389 movapd -12 * SIZE(AO), %xmm2 1390 movapd -10 * SIZE(AO), %xmm3 1391 1392 movapd -8 * SIZE(AO), %xmm4 1393 movapd -6 * SIZE(AO), %xmm5 1394 movapd -4 * SIZE(AO), %xmm6 1395 movapd -2 * SIZE(AO), %xmm7 1396 1397 subpd %xmm8, %xmm0 1398 subpd %xmm12, %xmm1 1399 subpd %xmm9, %xmm2 1400 subpd %xmm13, %xmm3 
1401 subpd %xmm10, %xmm4 1402 subpd %xmm14, %xmm5 1403 subpd %xmm11, %xmm6 1404 subpd %xmm15, %xmm7 1405#endif 1406 1407#ifdef LN 1408 movddup -1 * SIZE(AO), %xmm8 1409 mulpd %xmm8, %xmm5 1410 mulpd %xmm8, %xmm7 1411 1412 movddup -2 * SIZE(AO), %xmm10 1413 mulpd %xmm5, %xmm10 1414 subpd %xmm10, %xmm1 1415 movddup -2 * SIZE(AO), %xmm10 1416 mulpd %xmm7, %xmm10 1417 subpd %xmm10, %xmm3 1418 1419 movddup -3 * SIZE(AO), %xmm12 1420 mulpd %xmm5, %xmm12 1421 subpd %xmm12, %xmm13 1422 movddup -3 * SIZE(AO), %xmm12 1423 mulpd %xmm7, %xmm12 1424 subpd %xmm12, %xmm15 1425 1426 movddup -4 * SIZE(AO), %xmm14 1427 mulpd %xmm5, %xmm14 1428 subpd %xmm14, %xmm9 1429 movddup -4 * SIZE(AO), %xmm14 1430 mulpd %xmm7, %xmm14 1431 subpd %xmm14, %xmm11 1432 1433 movddup -6 * SIZE(AO), %xmm8 1434 mulpd %xmm8, %xmm1 1435 mulpd %xmm8, %xmm3 1436 1437 movddup -7 * SIZE(AO), %xmm10 1438 mulpd %xmm1, %xmm10 1439 subpd %xmm10, %xmm13 1440 movddup -7 * SIZE(AO), %xmm10 1441 mulpd %xmm3, %xmm10 1442 subpd %xmm10, %xmm15 1443 1444 movddup -8 * SIZE(AO), %xmm12 1445 mulpd %xmm1, %xmm12 1446 subpd %xmm12, %xmm9 1447 movddup -8 * SIZE(AO), %xmm12 1448 mulpd %xmm3, %xmm12 1449 subpd %xmm12, %xmm11 1450 1451 movddup -11 * SIZE(AO), %xmm8 1452 mulpd %xmm8, %xmm13 1453 mulpd %xmm8, %xmm15 1454 1455 movddup -12 * SIZE(AO), %xmm10 1456 mulpd %xmm13, %xmm10 1457 subpd %xmm10, %xmm9 1458 movddup -12 * SIZE(AO), %xmm10 1459 mulpd %xmm15, %xmm10 1460 subpd %xmm10, %xmm11 1461 1462 movddup -16 * SIZE(AO), %xmm8 1463 mulpd %xmm8, %xmm9 1464 mulpd %xmm8, %xmm11 1465#endif 1466 1467#ifdef LT 1468 movddup -16 * SIZE(AO), %xmm8 1469 mulpd %xmm8, %xmm9 1470 mulpd %xmm8, %xmm11 1471 1472 movddup -15 * SIZE(AO), %xmm10 1473 mulpd %xmm9, %xmm10 1474 subpd %xmm10, %xmm13 1475 1476 movddup -15 * SIZE(AO), %xmm10 1477 mulpd %xmm11, %xmm10 1478 subpd %xmm10, %xmm15 1479 1480 movddup -14 * SIZE(AO), %xmm12 1481 mulpd %xmm9, %xmm12 1482 subpd %xmm12, %xmm1 1483 movddup -14 * SIZE(AO), %xmm12 1484 mulpd %xmm11, %xmm12 1485 
subpd %xmm12, %xmm3 1486 1487 movddup -13 * SIZE(AO), %xmm14 1488 mulpd %xmm9, %xmm14 1489 subpd %xmm14, %xmm5 1490 movddup -13 * SIZE(AO), %xmm14 1491 mulpd %xmm11, %xmm14 1492 subpd %xmm14, %xmm7 1493 1494 movddup -11 * SIZE(AO), %xmm8 1495 mulpd %xmm8, %xmm13 1496 mulpd %xmm8, %xmm15 1497 1498 movddup -10 * SIZE(AO), %xmm10 1499 mulpd %xmm13, %xmm10 1500 subpd %xmm10, %xmm1 1501 movddup -10 * SIZE(AO), %xmm10 1502 mulpd %xmm15, %xmm10 1503 subpd %xmm10, %xmm3 1504 1505 movddup -9 * SIZE(AO), %xmm12 1506 mulpd %xmm13, %xmm12 1507 subpd %xmm12, %xmm5 1508 movddup -9 * SIZE(AO), %xmm12 1509 mulpd %xmm15, %xmm12 1510 subpd %xmm12, %xmm7 1511 1512 movddup -6 * SIZE(AO), %xmm8 1513 mulpd %xmm8, %xmm1 1514 mulpd %xmm8, %xmm3 1515 1516 movddup -5 * SIZE(AO), %xmm10 1517 mulpd %xmm1, %xmm10 1518 subpd %xmm10, %xmm5 1519 movddup -5 * SIZE(AO), %xmm10 1520 mulpd %xmm3, %xmm10 1521 subpd %xmm10, %xmm7 1522 1523 movddup -1 * SIZE(AO), %xmm8 1524 mulpd %xmm8, %xmm5 1525 mulpd %xmm8, %xmm7 1526#endif 1527 1528#ifdef RN 1529 movddup -16 * SIZE(BO), %xmm8 1530 mulpd %xmm8, %xmm0 1531 mulpd %xmm8, %xmm1 1532 1533 movddup -15 * SIZE(BO), %xmm9 1534 mulpd %xmm0, %xmm9 1535 subpd %xmm9, %xmm2 1536 movddup -15 * SIZE(BO), %xmm9 1537 mulpd %xmm1, %xmm9 1538 subpd %xmm9, %xmm3 1539 1540 movddup -14 * SIZE(BO), %xmm10 1541 mulpd %xmm0, %xmm10 1542 subpd %xmm10, %xmm4 1543 movddup -14 * SIZE(BO), %xmm10 1544 mulpd %xmm1, %xmm10 1545 subpd %xmm10, %xmm5 1546 1547 movddup -13 * SIZE(BO), %xmm11 1548 mulpd %xmm0, %xmm11 1549 subpd %xmm11, %xmm6 1550 movddup -13 * SIZE(BO), %xmm11 1551 mulpd %xmm1, %xmm11 1552 subpd %xmm11, %xmm7 1553 1554 movddup -11 * SIZE(BO), %xmm8 1555 mulpd %xmm8, %xmm2 1556 mulpd %xmm8, %xmm3 1557 1558 movddup -10 * SIZE(BO), %xmm9 1559 mulpd %xmm2, %xmm9 1560 subpd %xmm9, %xmm4 1561 movddup -10 * SIZE(BO), %xmm9 1562 mulpd %xmm3, %xmm9 1563 subpd %xmm9, %xmm5 1564 1565 movddup -9 * SIZE(BO), %xmm10 1566 mulpd %xmm2, %xmm10 1567 subpd %xmm10, %xmm6 1568 movddup -9 * 
SIZE(BO), %xmm10 1569 mulpd %xmm3, %xmm10 1570 subpd %xmm10, %xmm7 1571 1572 movddup -6 * SIZE(BO), %xmm8 1573 mulpd %xmm8, %xmm4 1574 mulpd %xmm8, %xmm5 1575 1576 movddup -5 * SIZE(BO), %xmm9 1577 mulpd %xmm4, %xmm9 1578 subpd %xmm9, %xmm6 1579 movddup -5 * SIZE(BO), %xmm9 1580 mulpd %xmm5, %xmm9 1581 subpd %xmm9, %xmm7 1582 1583 movddup -1 * SIZE(BO), %xmm8 1584 mulpd %xmm8, %xmm6 1585 mulpd %xmm8, %xmm7 1586#endif 1587 1588#ifdef RT 1589 movddup -1 * SIZE(BO), %xmm8 1590 mulpd %xmm8, %xmm6 1591 mulpd %xmm8, %xmm7 1592 1593 movddup -2 * SIZE(BO), %xmm9 1594 mulpd %xmm6, %xmm9 1595 subpd %xmm9, %xmm4 1596 movddup -2 * SIZE(BO), %xmm9 1597 mulpd %xmm7, %xmm9 1598 subpd %xmm9, %xmm5 1599 1600 movddup -3 * SIZE(BO), %xmm10 1601 mulpd %xmm6, %xmm10 1602 subpd %xmm10, %xmm2 1603 movddup -3 * SIZE(BO), %xmm10 1604 mulpd %xmm7, %xmm10 1605 subpd %xmm10, %xmm3 1606 1607 movddup -4 * SIZE(BO), %xmm11 1608 mulpd %xmm6, %xmm11 1609 subpd %xmm11, %xmm0 1610 movddup -4 * SIZE(BO), %xmm11 1611 mulpd %xmm7, %xmm11 1612 subpd %xmm11, %xmm1 1613 1614 movddup -6 * SIZE(BO), %xmm8 1615 mulpd %xmm8, %xmm4 1616 mulpd %xmm8, %xmm5 1617 1618 movddup -7 * SIZE(BO), %xmm9 1619 mulpd %xmm4, %xmm9 1620 subpd %xmm9, %xmm2 1621 movddup -7 * SIZE(BO), %xmm9 1622 mulpd %xmm5, %xmm9 1623 subpd %xmm9, %xmm3 1624 1625 movddup -8 * SIZE(BO), %xmm10 1626 mulpd %xmm4, %xmm10 1627 subpd %xmm10, %xmm0 1628 movddup -8 * SIZE(BO), %xmm10 1629 mulpd %xmm5, %xmm10 1630 subpd %xmm10, %xmm1 1631 1632 movddup -11 * SIZE(BO), %xmm8 1633 mulpd %xmm8, %xmm2 1634 mulpd %xmm8, %xmm3 1635 1636 movddup -12 * SIZE(BO), %xmm9 1637 mulpd %xmm2, %xmm9 1638 subpd %xmm9, %xmm0 1639 movddup -12 * SIZE(BO), %xmm9 1640 mulpd %xmm3, %xmm9 1641 subpd %xmm9, %xmm1 1642 1643 movddup -16 * SIZE(BO), %xmm8 1644 mulpd %xmm8, %xmm0 1645 mulpd %xmm8, %xmm1 1646#endif 1647 1648#ifdef LN 1649 subq $4 * SIZE, CO1 1650 subq $4 * SIZE, CO2 1651#endif 1652 1653#if defined(LN) || defined(LT) 1654 movlpd %xmm9, 0 * SIZE(CO1) 1655 movlpd 
%xmm13, 1 * SIZE(CO1) 1656 movlpd %xmm1, 2 * SIZE(CO1) 1657 movlpd %xmm5, 3 * SIZE(CO1) 1658 1659 movhpd %xmm9, 0 * SIZE(CO2) 1660 movhpd %xmm13, 1 * SIZE(CO2) 1661 movhpd %xmm1, 2 * SIZE(CO2) 1662 movhpd %xmm5, 3 * SIZE(CO2) 1663 1664 movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) 1665 movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) 1666 movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) 1667 movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) 1668 1669 movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) 1670 movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) 1671 movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) 1672 movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) 1673#else 1674 movlpd %xmm0, 0 * SIZE(CO1) 1675 movhpd %xmm0, 1 * SIZE(CO1) 1676 movlpd %xmm1, 2 * SIZE(CO1) 1677 movhpd %xmm1, 3 * SIZE(CO1) 1678 1679 movlpd %xmm2, 0 * SIZE(CO2) 1680 movhpd %xmm2, 1 * SIZE(CO2) 1681 movlpd %xmm3, 2 * SIZE(CO2) 1682 movhpd %xmm3, 3 * SIZE(CO2) 1683 1684 movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) 1685 movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) 1686 movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) 1687 movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) 1688 1689 movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) 1690 movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) 1691 movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) 1692 movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) 1693#endif 1694 1695#if defined(LN) || defined(LT) 1696 movaps %xmm9, -16 * SIZE(BO) 1697 movaps %xmm11, -14 * SIZE(BO) 1698 movaps %xmm13, -12 * SIZE(BO) 1699 movaps %xmm15, -10 * SIZE(BO) 1700 movaps %xmm1, -8 * SIZE(BO) 1701 movaps %xmm3, -6 * SIZE(BO) 1702 movaps %xmm5, -4 * SIZE(BO) 1703 movaps %xmm7, -2 * SIZE(BO) 1704#else 1705 movaps %xmm0, -16 * SIZE(AO) 1706 movaps %xmm1, -14 * SIZE(AO) 1707 movaps %xmm2, -12 * SIZE(AO) 1708 movaps %xmm3, -10 * SIZE(AO) 1709 movaps %xmm4, -8 * SIZE(AO) 1710 movaps %xmm5, -6 * SIZE(AO) 1711 movaps %xmm6, -4 * SIZE(AO) 1712 movaps %xmm7, -2 * SIZE(AO) 1713#endif 1714 1715#ifndef LN 1716 addq $4 * SIZE, CO1 1717 addq $4 * SIZE, CO2 1718#endif 1719 1720#if defined(LT) || defined(RN) 1721 movq K, %rax 1722 subq KK, %rax 1723 leaq (,%rax, SIZE), %rax 1724 leaq (AO, 
%rax, 4), AO 1725 leaq (BO, %rax, 4), BO 1726#endif 1727 1728#ifdef LN 1729 subq $4, KK 1730#endif 1731 1732#ifdef LT 1733 addq $4, KK 1734#endif 1735 1736#ifdef RT 1737 movq K, %rax 1738 salq $2 + BASE_SHIFT, %rax 1739 addq %rax, AORIG 1740#endif 1741 1742 decq I # i -- 1743 jg .L11 1744 ALIGN_4 1745 1746.L39: 1747#ifdef LN 1748 leaq (, K, SIZE), %rax 1749 leaq (B, %rax, 4), B 1750#endif 1751 1752#if defined(LT) || defined(RN) 1753 movq BO, B 1754#endif 1755 1756#ifdef RN 1757 addq $4, KK 1758#endif 1759 1760#ifdef RT 1761 subq $4, KK 1762#endif 1763 1764 decq J # j -- 1765 jg .L01 1766 ALIGN_4 1767 1768.L40: 1769 testq $2, N 1770 je .L80 1771 1772#if defined(LT) || defined(RN) 1773 movq A, AO 1774#else 1775 movq A, AORIG 1776#endif 1777 1778#ifdef RT 1779 movq K, %rax 1780 salq $1 + BASE_SHIFT, %rax 1781 subq %rax, B 1782 1783 leaq (, LDC, 2), %rax 1784 subq %rax, C 1785#endif 1786 1787 movq C, CO1 # coffset1 = c 1788 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 1789#ifndef RT 1790 leaq (C, LDC, 2), C 1791#endif 1792 1793#ifdef LN 1794 movq OFFSET, %rax 1795 addq M, %rax 1796 movq %rax, KK 1797#endif 1798 1799#if defined(LT) 1800 movq OFFSET, %rax 1801 movq %rax, KK 1802#endif 1803 1804 testq $1, M 1805 je .L60 1806 ALIGN_4 1807 1808.L71: 1809#ifdef LN 1810 movq K, %rax 1811 salq $0 + BASE_SHIFT, %rax 1812 subq %rax, AORIG 1813#endif 1814 1815#if defined(LN) || defined(RT) 1816 movq KK, %rax 1817 movq AORIG, AO 1818 leaq (, %rax, SIZE), %rax 1819 leaq (AO, %rax, 1), AO 1820#endif 1821 1822 movq B, BO 1823 1824#if defined(LN) || defined(RT) 1825 movq KK, %rax 1826 salq $1 + BASE_SHIFT, %rax 1827 leaq (BO, %rax, 1), BO 1828#endif 1829 1830 movddup -16 * SIZE(AO), %xmm0 1831 pxor %xmm8, %xmm8 1832 movddup -15 * SIZE(AO), %xmm1 1833 pxor %xmm9, %xmm9 1834 movddup -14 * SIZE(AO), %xmm2 1835 pxor %xmm10, %xmm10 1836 movddup -13 * SIZE(AO), %xmm3 1837 pxor %xmm11, %xmm11 1838 1839#if defined(LT) || defined(RN) 1840 movq KK, %rax 1841#else 1842 movq K, %rax 1843 subq KK, 
%rax 1844#endif 1845 andq $-4, %rax 1846 leaq (, %rax, SIZE), %rax 1847 leaq (AO, %rax, 1), AO 1848 leaq (BO, %rax, 2), BO 1849 negq %rax 1850 NOBRANCH 1851 je .L76 1852 ALIGN_4 1853 1854.L72: 1855 mulpd -16 * SIZE(BO, %rax, 2), %xmm0 1856 addpd %xmm0, %xmm8 1857 movddup -12 * SIZE(AO, %rax, 1), %xmm0 1858 1859 mulpd -14 * SIZE(BO, %rax, 2), %xmm1 1860 addpd %xmm1, %xmm9 1861 movddup -11 * SIZE(AO, %rax, 1), %xmm1 1862 1863 mulpd -12 * SIZE(BO, %rax, 2), %xmm2 1864 addpd %xmm2, %xmm10 1865 movddup -10 * SIZE(AO, %rax, 1), %xmm2 1866 1867 mulpd -10 * SIZE(BO, %rax, 2), %xmm3 1868 addpd %xmm3, %xmm11 1869 movddup -9 * SIZE(AO, %rax, 1), %xmm3 1870 1871 addq $4 * SIZE, %rax 1872 BRANCH 1873 jl .L72 1874 ALIGN_4 1875 1876.L76: 1877#if defined(LT) || defined(RN) 1878 movq KK, %rax 1879#else 1880 movq K, %rax 1881 subq KK, %rax 1882#endif 1883 andq $3, %rax # if (k & 1) 1884 je .L78 1885 1886 leaq (, %rax, SIZE), %rax 1887 leaq (AO, %rax, 1), AO 1888 leaq (BO, %rax, 2), BO 1889 negq %rax 1890 ALIGN_4 1891 1892.L77: 1893 mulpd -16 * SIZE(BO, %rax, 2), %xmm0 1894 addpd %xmm0, %xmm8 1895 movddup -15 * SIZE(AO, %rax, 1), %xmm0 1896 1897 addq $SIZE, %rax 1898 jl .L77 1899 ALIGN_4 1900 1901.L78: 1902 addpd %xmm9, %xmm8 1903 addpd %xmm11, %xmm10 1904 addpd %xmm10, %xmm8 1905 1906#if defined(LN) || defined(RT) 1907 movq KK, %rax 1908#ifdef LN 1909 subq $1, %rax 1910#else 1911 subq $2, %rax 1912#endif 1913 1914 leaq (, %rax, SIZE), %rax 1915 1916 movq AORIG, AO 1917 leaq (AO, %rax, 1), AO 1918 leaq (B, %rax, 2), BO 1919#endif 1920 1921#if defined(LN) || defined(LT) 1922 movapd -16 * SIZE(BO), %xmm2 1923#else 1924 movapd -16 * SIZE(AO), %xmm2 1925#endif 1926 1927 subpd %xmm8, %xmm2 1928 1929#if defined(LN) || defined(LT) 1930 movddup -16 * SIZE(AO), %xmm0 1931 1932 mulpd %xmm0, %xmm2 1933#endif 1934 1935#ifdef RN 1936 movapd %xmm2, %xmm0 1937 unpckhpd %xmm0, %xmm0 1938 1939 mulsd -16 * SIZE(BO), %xmm2 1940 movsd -15 * SIZE(BO), %xmm4 1941 mulsd %xmm2, %xmm4 1942 subsd %xmm4, %xmm0 
1943 1944 mulsd -13 * SIZE(BO), %xmm0 1945 unpcklpd %xmm0, %xmm2 1946#endif 1947 1948#ifdef RT 1949 movapd %xmm2, %xmm0 1950 unpckhpd %xmm0, %xmm0 1951 1952 mulsd -13 * SIZE(BO), %xmm0 1953 1954 movlpd -14 * SIZE(BO), %xmm4 1955 mulsd %xmm0, %xmm4 1956 subsd %xmm4, %xmm2 1957 1958 mulsd -16 * SIZE(BO), %xmm2 1959 unpcklpd %xmm0, %xmm2 1960#endif 1961 1962#ifdef LN 1963 subq $1 * SIZE, CO1 1964 subq $1 * SIZE, CO2 1965#endif 1966 1967 movlpd %xmm2, 0 * SIZE(CO1) 1968 movhpd %xmm2, 0 * SIZE(CO2) 1969 1970#if defined(LN) || defined(LT) 1971 movaps %xmm2, -16 * SIZE(BO) 1972#else 1973 movaps %xmm2, -16 * SIZE(AO) 1974#endif 1975 1976#ifndef LN 1977 addq $1 * SIZE, CO1 1978 addq $1 * SIZE, CO2 1979#endif 1980 1981#if defined(LT) || defined(RN) 1982 movq K, %rax 1983 subq KK, %rax 1984 leaq (,%rax, SIZE), %rax 1985 leaq (AO, %rax, 1), AO 1986 leaq (BO, %rax, 2), BO 1987#endif 1988 1989#ifdef LN 1990 subq $1, KK 1991#endif 1992 1993#ifdef LT 1994 addq $1, KK 1995#endif 1996 1997#ifdef RT 1998 movq K, %rax 1999 salq $0 + BASE_SHIFT, %rax 2000 addq %rax, AORIG 2001#endif 2002 ALIGN_4 2003 2004.L60: 2005 testq $2, M 2006 je .L70 2007 2008#ifdef LN 2009 movq K, %rax 2010 salq $1 + BASE_SHIFT, %rax 2011 subq %rax, AORIG 2012#endif 2013 2014#if defined(LN) || defined(RT) 2015 movq KK, %rax 2016 movq AORIG, AO 2017 leaq (, %rax, SIZE), %rax 2018 leaq (AO, %rax, 2), AO 2019#endif 2020 2021 movq B, BO 2022 2023#if defined(LN) || defined(RT) 2024 movq KK, %rax 2025 leaq (, %rax, SIZE), %rax 2026 leaq (BO, %rax, 2), BO 2027#endif 2028 2029 movapd -16 * SIZE(AO), %xmm0 2030 pxor %xmm8, %xmm8 2031 movapd -12 * SIZE(AO), %xmm2 2032 pxor %xmm9, %xmm9 2033 movddup -16 * SIZE(BO), %xmm1 2034 pxor %xmm10, %xmm10 2035 movddup -15 * SIZE(BO), %xmm3 2036 pxor %xmm11, %xmm11 2037 2038#if defined(LT) || defined(RN) 2039 movq KK, %rax 2040#else 2041 movq K, %rax 2042 subq KK, %rax 2043#endif 2044 andq $-4, %rax 2045 leaq (, %rax, SIZE), %rax 2046 leaq (AO, %rax, 2), AO 2047 leaq (BO, %rax, 2), 
BO 2048 negq %rax 2049 NOBRANCH 2050 je .L66 2051 ALIGN_4 2052 2053.L62: 2054 mulpd %xmm0, %xmm1 2055 addpd %xmm1, %xmm8 2056 movddup -14 * SIZE(BO, %rax, 2), %xmm1 2057 mulpd %xmm0, %xmm3 2058 movapd -14 * SIZE(AO, %rax, 2), %xmm0 2059 addpd %xmm3, %xmm9 2060 movddup -13 * SIZE(BO, %rax, 2), %xmm3 2061 mulpd %xmm0, %xmm1 2062 addpd %xmm1, %xmm10 2063 movddup -12 * SIZE(BO, %rax, 2), %xmm1 2064 mulpd %xmm0, %xmm3 2065 movapd -8 * SIZE(AO, %rax, 2), %xmm0 2066 addpd %xmm3, %xmm11 2067 movddup -11 * SIZE(BO, %rax, 2), %xmm3 2068 mulpd %xmm2, %xmm1 2069 addpd %xmm1, %xmm8 2070 movddup -10 * SIZE(BO, %rax, 2), %xmm1 2071 mulpd %xmm2, %xmm3 2072 movapd -10 * SIZE(AO, %rax, 2), %xmm2 2073 addpd %xmm3, %xmm9 2074 movddup -9 * SIZE(BO, %rax, 2), %xmm3 2075 mulpd %xmm2, %xmm1 2076 addpd %xmm1, %xmm10 2077 movddup -8 * SIZE(BO, %rax, 2), %xmm1 2078 mulpd %xmm2, %xmm3 2079 movapd -4 * SIZE(AO, %rax, 2), %xmm2 2080 addpd %xmm3, %xmm11 2081 movddup -7 * SIZE(BO, %rax, 2), %xmm3 2082 2083 addq $4 * SIZE, %rax 2084 BRANCH 2085 jl .L62 2086 ALIGN_4 2087 2088.L66: 2089#if defined(LT) || defined(RN) 2090 movq KK, %rax 2091#else 2092 movq K, %rax 2093 subq KK, %rax 2094#endif 2095 andq $3, %rax # if (k & 1) 2096 je .L69 2097 2098 leaq (, %rax, SIZE), %rax 2099 leaq (AO, %rax, 2), AO 2100 leaq (BO, %rax, 2), BO 2101 negq %rax 2102 ALIGN_4 2103 2104.L67: 2105 mulpd %xmm0, %xmm1 2106 addpd %xmm1, %xmm8 2107 movddup -14 * SIZE(BO, %rax, 2), %xmm1 2108 mulpd %xmm0, %xmm3 2109 movapd -14 * SIZE(AO, %rax, 2), %xmm0 2110 addpd %xmm3, %xmm9 2111 movddup -13 * SIZE(BO, %rax, 2), %xmm3 2112 2113 addq $SIZE, %rax 2114 jl .L67 2115 ALIGN_4 2116 2117.L69: 2118 addpd %xmm10, %xmm8 2119 addpd %xmm11, %xmm9 2120 2121#if defined(LN) || defined(RT) 2122 movq KK, %rax 2123#ifdef LN 2124 subq $2, %rax 2125#else 2126 subq $2, %rax 2127#endif 2128 2129 leaq (, %rax, SIZE), %rax 2130 2131 movq AORIG, AO 2132 leaq (AO, %rax, 2), AO 2133 leaq (B, %rax, 2), BO 2134#endif 2135 2136#if defined(LN) || defined(LT) 
2137 movapd %xmm8, %xmm0 2138 unpcklpd %xmm9, %xmm8 2139 unpckhpd %xmm9, %xmm0 2140 2141 movapd -16 * SIZE(BO), %xmm9 2142 movapd -14 * SIZE(BO), %xmm13 2143 2144 subpd %xmm8, %xmm9 2145 subpd %xmm0, %xmm13 2146#else 2147 movapd -16 * SIZE(AO), %xmm0 2148 movapd -14 * SIZE(AO), %xmm2 2149 2150 subpd %xmm8, %xmm0 2151 subpd %xmm9, %xmm2 2152#endif 2153 2154 2155#ifdef LN 2156 movddup -13 * SIZE(AO), %xmm8 2157 mulpd %xmm8, %xmm13 2158 2159 movddup -14 * SIZE(AO), %xmm10 2160 mulpd %xmm13, %xmm10 2161 subpd %xmm10, %xmm9 2162 2163 movddup -16 * SIZE(AO), %xmm8 2164 mulpd %xmm8, %xmm9 2165#endif 2166 2167#ifdef LT 2168 movddup -16 * SIZE(AO), %xmm8 2169 mulpd %xmm8, %xmm9 2170 2171 movddup -15 * SIZE(AO), %xmm10 2172 mulpd %xmm9, %xmm10 2173 subpd %xmm10, %xmm13 2174 2175 movddup -13 * SIZE(AO), %xmm8 2176 mulpd %xmm8, %xmm13 2177#endif 2178 2179#ifdef RN 2180 movddup -16 * SIZE(BO), %xmm8 2181 mulpd %xmm8, %xmm0 2182 2183 movddup -15 * SIZE(BO), %xmm9 2184 mulpd %xmm0, %xmm9 2185 subpd %xmm9, %xmm2 2186 2187 movddup -13 * SIZE(BO), %xmm8 2188 mulpd %xmm8, %xmm2 2189#endif 2190 2191#ifdef RT 2192 movddup -13 * SIZE(BO), %xmm8 2193 mulpd %xmm8, %xmm2 2194 2195 movddup -14 * SIZE(BO), %xmm9 2196 mulpd %xmm2, %xmm9 2197 subpd %xmm9, %xmm0 2198 2199 movddup -16 * SIZE(BO), %xmm8 2200 mulpd %xmm8, %xmm0 2201#endif 2202 2203#ifdef LN 2204 subq $2 * SIZE, CO1 2205 subq $2 * SIZE, CO2 2206#endif 2207 2208#if defined(LN) || defined(LT) 2209 movlpd %xmm9, 0 * SIZE(CO1) 2210 movlpd %xmm13, 1 * SIZE(CO1) 2211 2212 movhpd %xmm9, 0 * SIZE(CO2) 2213 movhpd %xmm13, 1 * SIZE(CO2) 2214#else 2215 movlpd %xmm0, 0 * SIZE(CO1) 2216 movhpd %xmm0, 1 * SIZE(CO1) 2217 2218 movlpd %xmm2, 0 * SIZE(CO2) 2219 movhpd %xmm2, 1 * SIZE(CO2) 2220#endif 2221 2222#if defined(LN) || defined(LT) 2223 movaps %xmm9, -16 * SIZE(BO) 2224 movaps %xmm13, -14 * SIZE(BO) 2225#else 2226 movaps %xmm0, -16 * SIZE(AO) 2227 movaps %xmm2, -14 * SIZE(AO) 2228#endif 2229 2230#ifndef LN 2231 addq $2 * SIZE, CO1 2232 addq 
$2 * SIZE, CO2 2233#endif 2234 2235#if defined(LT) || defined(RN) 2236 movq K, %rax 2237 subq KK, %rax 2238 leaq (,%rax, SIZE), %rax 2239 leaq (AO, %rax, 2), AO 2240 leaq (BO, %rax, 2), BO 2241#endif 2242 2243#ifdef LN 2244 subq $2, KK 2245#endif 2246 2247#ifdef LT 2248 addq $2, KK 2249#endif 2250 2251#ifdef RT 2252 movq K, %rax 2253 salq $1 + BASE_SHIFT, %rax 2254 addq %rax, AORIG 2255#endif 2256 ALIGN_4 2257 2258.L70: 2259 movq M, I 2260 sarq $2, I # i = (m >> 2) 2261 jle .L79 2262 ALIGN_4 2263 2264.L51: 2265#ifdef LN 2266 movq K, %rax 2267 salq $2 + BASE_SHIFT, %rax 2268 subq %rax, AORIG 2269#endif 2270 2271#if defined(LN) || defined(RT) 2272 movq KK, %rax 2273 movq AORIG, AO 2274 leaq (, %rax, SIZE), %rax 2275 leaq (AO, %rax, 4), AO 2276#endif 2277 2278 movq B, BO 2279 2280#if defined(LN) || defined(RT) 2281 movq KK, %rax 2282 leaq (, %rax, SIZE), %rax 2283 leaq (BO, %rax, 2), BO 2284#endif 2285 2286 movddup -16 * SIZE(BO), %xmm1 2287 movddup -15 * SIZE(BO), %xmm5 2288 pxor %xmm8, %xmm8 2289 movddup -12 * SIZE(BO), %xmm3 2290 pxor %xmm9, %xmm9 2291 movapd -16 * SIZE(AO), %xmm0 2292 pxor %xmm12, %xmm12 2293 movapd -8 * SIZE(AO), %xmm4 2294 pxor %xmm13, %xmm13 2295 2296#ifndef LN 2297 prefetchw 3 * SIZE(CO1) 2298 movapd %xmm0, %xmm2 2299 prefetchw 3 * SIZE(CO2) 2300#else 2301 prefetchw -8 * SIZE(CO1) 2302 movapd %xmm0, %xmm2 2303 prefetchw -8 * SIZE(CO2) 2304#endif 2305 2306 2307#if defined(LT) || defined(RN) 2308 movq KK, %rax 2309#else 2310 movq K, %rax 2311 subq KK, %rax 2312#endif 2313 andq $-4, %rax 2314 leaq (, %rax, SIZE), %rax 2315 leaq (AO, %rax, 4), AO 2316 leaq (BO, %rax, 2), BO 2317 negq %rax 2318 NOBRANCH 2319 je .L56 2320 ALIGN_4 2321 2322.L52: 2323 mulpd %xmm1, %xmm0 2324 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 2325 addpd %xmm0, %xmm8 2326 movapd -12 * SIZE(AO, %rax, 4), %xmm0 2327 addpd %xmm1, %xmm12 2328 movddup -14 * SIZE(BO, %rax, 2), %xmm1 2329 mulpd %xmm5, %xmm2 2330 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 2331 addpd %xmm2, %xmm9 2332 addpd %xmm5, 
%xmm13 2333 movddup -13 * SIZE(BO, %rax, 2), %xmm5 2334 movapd %xmm0, %xmm2 2335 mulpd %xmm1, %xmm0 2336 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 2337 addpd %xmm0, %xmm8 2338 movapd (AO, %rax, 4), %xmm0 2339 addpd %xmm1, %xmm12 2340 movddup -8 * SIZE(BO, %rax, 2), %xmm1 2341 mulpd %xmm5, %xmm2 2342 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 2343 addpd %xmm2, %xmm9 2344 addpd %xmm5, %xmm13 2345 movddup -11 * SIZE(BO, %rax, 2), %xmm5 2346 movapd %xmm4, %xmm2 2347 mulpd %xmm3, %xmm4 2348 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 2349 addpd %xmm4, %xmm8 2350 movapd -4 * SIZE(AO, %rax, 4), %xmm4 2351 addpd %xmm3, %xmm12 2352 movddup -10 * SIZE(BO, %rax, 2), %xmm3 2353 mulpd %xmm5, %xmm2 2354 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 2355 addpd %xmm2, %xmm9 2356 addpd %xmm5, %xmm13 2357 movddup -9 * SIZE(BO, %rax, 2), %xmm5 2358 movapd %xmm4, %xmm2 2359 mulpd %xmm3, %xmm4 2360 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 2361 addpd %xmm4, %xmm8 2362 movapd 8 * SIZE(AO, %rax, 4), %xmm4 2363 addpd %xmm3, %xmm12 2364 movddup -4 * SIZE(BO, %rax, 2), %xmm3 2365 mulpd %xmm5, %xmm2 2366 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 2367 addpd %xmm2, %xmm9 2368 addpd %xmm5, %xmm13 2369 movddup -7 * SIZE(BO, %rax, 2), %xmm5 2370 movapd %xmm0, %xmm2 2371 2372 addq $4 * SIZE, %rax 2373 BRANCH 2374 jl .L52 2375 ALIGN_4 2376 2377.L56: 2378#if defined(LT) || defined(RN) 2379 movq KK, %rax 2380#else 2381 movq K, %rax 2382 subq KK, %rax 2383#endif 2384 andq $3, %rax # if (k & 1) 2385 je .L59 2386 2387 leaq (, %rax, SIZE), %rax 2388 leaq (AO, %rax, 4), AO 2389 leaq (BO, %rax, 2), BO 2390 negq %rax 2391 ALIGN_4 2392 2393.L57: 2394 mulpd %xmm1, %xmm0 2395 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 2396 addpd %xmm0, %xmm8 2397 movapd -12 * SIZE(AO, %rax, 4), %xmm0 2398 addpd %xmm1, %xmm12 2399 movddup -14 * SIZE(BO, %rax, 2), %xmm1 2400 mulpd %xmm5, %xmm2 2401 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 2402 addpd %xmm2, %xmm9 2403 addpd %xmm5, %xmm13 2404 movddup -13 * SIZE(BO, %rax, 2), %xmm5 2405 movapd %xmm0, %xmm2 2406 2407 addq $SIZE, 
%rax 2408 jl .L57 2409 ALIGN_4 2410 2411.L59: 2412#if defined(LN) || defined(RT) 2413 movq KK, %rax 2414#ifdef LN 2415 subq $4, %rax 2416#else 2417 subq $2, %rax 2418#endif 2419 2420 leaq (, %rax, SIZE), %rax 2421 2422 movq AORIG, AO 2423 leaq (AO, %rax, 4), AO 2424 leaq (B, %rax, 2), BO 2425#endif 2426 2427#if defined(LN) || defined(LT) 2428 movapd %xmm8, %xmm0 2429 unpcklpd %xmm9, %xmm8 2430 unpckhpd %xmm9, %xmm0 2431 2432 movapd %xmm12, %xmm4 2433 unpcklpd %xmm13, %xmm12 2434 unpckhpd %xmm13, %xmm4 2435 2436 movapd -16 * SIZE(BO), %xmm9 2437 movapd -14 * SIZE(BO), %xmm13 2438 movapd -12 * SIZE(BO), %xmm1 2439 movapd -10 * SIZE(BO), %xmm5 2440 2441 subpd %xmm8, %xmm9 2442 subpd %xmm0, %xmm13 2443 subpd %xmm12, %xmm1 2444 subpd %xmm4, %xmm5 2445#else 2446 movapd -16 * SIZE(AO), %xmm0 2447 movapd -14 * SIZE(AO), %xmm1 2448 movapd -12 * SIZE(AO), %xmm2 2449 movapd -10 * SIZE(AO), %xmm3 2450 2451 subpd %xmm8, %xmm0 2452 subpd %xmm12, %xmm1 2453 subpd %xmm9, %xmm2 2454 subpd %xmm13, %xmm3 2455#endif 2456 2457#ifdef LN 2458 movddup -1 * SIZE(AO), %xmm8 2459 mulpd %xmm8, %xmm5 2460 movddup -2 * SIZE(AO), %xmm10 2461 mulpd %xmm5, %xmm10 2462 subpd %xmm10, %xmm1 2463 movddup -3 * SIZE(AO), %xmm12 2464 mulpd %xmm5, %xmm12 2465 subpd %xmm12, %xmm13 2466 movddup -4 * SIZE(AO), %xmm14 2467 mulpd %xmm5, %xmm14 2468 subpd %xmm14, %xmm9 2469 2470 movddup -6 * SIZE(AO), %xmm8 2471 mulpd %xmm8, %xmm1 2472 movddup -7 * SIZE(AO), %xmm10 2473 mulpd %xmm1, %xmm10 2474 subpd %xmm10, %xmm13 2475 movddup -8 * SIZE(AO), %xmm12 2476 mulpd %xmm1, %xmm12 2477 subpd %xmm12, %xmm9 2478 2479 movddup -11 * SIZE(AO), %xmm8 2480 mulpd %xmm8, %xmm13 2481 movddup -12 * SIZE(AO), %xmm10 2482 mulpd %xmm13, %xmm10 2483 subpd %xmm10, %xmm9 2484 2485 movddup -16 * SIZE(AO), %xmm8 2486 mulpd %xmm8, %xmm9 2487#endif 2488 2489#ifdef LT 2490 movddup -16 * SIZE(AO), %xmm8 2491 mulpd %xmm8, %xmm9 2492 movddup -15 * SIZE(AO), %xmm10 2493 mulpd %xmm9, %xmm10 2494 subpd %xmm10, %xmm13 2495 movddup -14 * SIZE(AO), 
%xmm12 2496 mulpd %xmm9, %xmm12 2497 subpd %xmm12, %xmm1 2498 movddup -13 * SIZE(AO), %xmm14 2499 mulpd %xmm9, %xmm14 2500 subpd %xmm14, %xmm5 2501 2502 2503 movddup -11 * SIZE(AO), %xmm8 2504 mulpd %xmm8, %xmm13 2505 2506 movddup -10 * SIZE(AO), %xmm10 2507 mulpd %xmm13, %xmm10 2508 subpd %xmm10, %xmm1 2509 movddup -9 * SIZE(AO), %xmm12 2510 mulpd %xmm13, %xmm12 2511 subpd %xmm12, %xmm5 2512 2513 movddup -6 * SIZE(AO), %xmm8 2514 mulpd %xmm8, %xmm1 2515 movddup -5 * SIZE(AO), %xmm10 2516 mulpd %xmm1, %xmm10 2517 subpd %xmm10, %xmm5 2518 2519 movddup -1 * SIZE(AO), %xmm8 2520 mulpd %xmm8, %xmm5 2521#endif 2522 2523#ifdef RN 2524 movddup -16 * SIZE(BO), %xmm8 2525 mulpd %xmm8, %xmm0 2526 mulpd %xmm8, %xmm1 2527 2528 movddup -15 * SIZE(BO), %xmm9 2529 mulpd %xmm0, %xmm9 2530 subpd %xmm9, %xmm2 2531 movddup -15 * SIZE(BO), %xmm9 2532 mulpd %xmm1, %xmm9 2533 subpd %xmm9, %xmm3 2534 2535 movddup -13 * SIZE(BO), %xmm8 2536 mulpd %xmm8, %xmm2 2537 mulpd %xmm8, %xmm3 2538#endif 2539 2540#ifdef RT 2541 movddup -13 * SIZE(BO), %xmm8 2542 mulpd %xmm8, %xmm2 2543 mulpd %xmm8, %xmm3 2544 2545 movddup -14 * SIZE(BO), %xmm9 2546 mulpd %xmm2, %xmm9 2547 subpd %xmm9, %xmm0 2548 movddup -14 * SIZE(BO), %xmm9 2549 mulpd %xmm3, %xmm9 2550 subpd %xmm9, %xmm1 2551 2552 movddup -16 * SIZE(BO), %xmm8 2553 mulpd %xmm8, %xmm0 2554 mulpd %xmm8, %xmm1 2555#endif 2556 2557#ifdef LN 2558 subq $4 * SIZE, CO1 2559 subq $4 * SIZE, CO2 2560#endif 2561 2562#if defined(LN) || defined(LT) 2563 movlpd %xmm9, 0 * SIZE(CO1) 2564 movlpd %xmm13, 1 * SIZE(CO1) 2565 movlpd %xmm1, 2 * SIZE(CO1) 2566 movlpd %xmm5, 3 * SIZE(CO1) 2567 2568 movhpd %xmm9, 0 * SIZE(CO2) 2569 movhpd %xmm13, 1 * SIZE(CO2) 2570 movhpd %xmm1, 2 * SIZE(CO2) 2571 movhpd %xmm5, 3 * SIZE(CO2) 2572#else 2573 movlpd %xmm0, 0 * SIZE(CO1) 2574 movhpd %xmm0, 1 * SIZE(CO1) 2575 movlpd %xmm1, 2 * SIZE(CO1) 2576 movhpd %xmm1, 3 * SIZE(CO1) 2577 2578 movlpd %xmm2, 0 * SIZE(CO2) 2579 movhpd %xmm2, 1 * SIZE(CO2) 2580 movlpd %xmm3, 2 * SIZE(CO2) 2581 
movhpd %xmm3, 3 * SIZE(CO2) 2582#endif 2583 2584#if defined(LN) || defined(LT) 2585 movaps %xmm9, -16 * SIZE(BO) 2586 movaps %xmm13,-14 * SIZE(BO) 2587 movaps %xmm1, -12 * SIZE(BO) 2588 movaps %xmm5, -10 * SIZE(BO) 2589#else 2590 movaps %xmm0, -16 * SIZE(AO) 2591 movaps %xmm1, -14 * SIZE(AO) 2592 movaps %xmm2, -12 * SIZE(AO) 2593 movaps %xmm3, -10 * SIZE(AO) 2594#endif 2595 2596#ifndef LN 2597 addq $4 * SIZE, CO1 2598 addq $4 * SIZE, CO2 2599#endif 2600 2601#if defined(LT) || defined(RN) 2602 movq K, %rax 2603 subq KK, %rax 2604 leaq (,%rax, SIZE), %rax 2605 leaq (AO, %rax, 4), AO 2606 leaq (BO, %rax, 2), BO 2607#endif 2608 2609#ifdef LN 2610 subq $4, KK 2611#endif 2612 2613#ifdef LT 2614 addq $4, KK 2615#endif 2616 2617#ifdef RT 2618 movq K, %rax 2619 salq $2 + BASE_SHIFT, %rax 2620 addq %rax, AORIG 2621#endif 2622 2623 decq I # i -- 2624 jg .L51 2625 ALIGN_4 2626 2627.L79: 2628#ifdef LN 2629 leaq (, K, SIZE), %rax 2630 leaq (B, %rax, 2), B 2631#endif 2632 2633#if defined(LT) || defined(RN) 2634 movq BO, B 2635#endif 2636 2637#ifdef RN 2638 addq $2, KK 2639#endif 2640 2641#ifdef RT 2642 subq $2, KK 2643#endif 2644 ALIGN_4 2645 2646.L80: 2647 testq $1, N 2648 je .L999 2649 2650#if defined(LT) || defined(RN) 2651 movq A, AO 2652#else 2653 movq A, AORIG 2654#endif 2655 2656#ifdef RT 2657 movq K, %rax 2658 salq $0 + BASE_SHIFT, %rax 2659 subq %rax, B 2660 2661 subq LDC, C 2662#endif 2663 2664 movq C, CO1 # coffset1 = c 2665#ifndef RT 2666 addq LDC, C 2667#endif 2668 2669#ifdef LN 2670 movq OFFSET, %rax 2671 addq M, %rax 2672 movq %rax, KK 2673#endif 2674 2675#ifdef LT 2676 movq OFFSET, %rax 2677 movq %rax, KK 2678#endif 2679 2680 testq $1, M 2681 je .L100 2682 2683#ifdef LN 2684 movq K, %rax 2685 salq $0 + BASE_SHIFT, %rax 2686 subq %rax, AORIG 2687#endif 2688 2689#if defined(LN) || defined(RT) 2690 movq KK, %rax 2691 movq AORIG, AO 2692 leaq (, %rax, SIZE), %rax 2693 leaq (AO, %rax, 1), AO 2694#endif 2695 2696 movq B, BO 2697 2698#if defined(LN) || defined(RT) 2699 
movq KK, %rax 2700 leaq (BO, %rax, SIZE), BO 2701#endif 2702 2703 movapd -16 * SIZE(AO), %xmm0 2704 pxor %xmm8, %xmm8 2705 movapd -14 * SIZE(AO), %xmm1 2706 pxor %xmm9, %xmm9 2707 2708#if defined(LT) || defined(RN) 2709 movq KK, %rax 2710#else 2711 movq K, %rax 2712 subq KK, %rax 2713#endif 2714 andq $-4, %rax 2715 leaq (, %rax, SIZE), %rax 2716 leaq (AO, %rax, 1), AO 2717 leaq (BO, %rax, 1), BO 2718 negq %rax 2719 NOBRANCH 2720 je .L116 2721 ALIGN_4 2722 2723.L112: 2724 mulpd -16 * SIZE(BO, %rax, 1), %xmm0 2725 addpd %xmm0, %xmm8 2726 movapd -12 * SIZE(AO, %rax, 1), %xmm0 2727 2728 mulpd -14 * SIZE(BO, %rax, 1), %xmm1 2729 addpd %xmm1, %xmm9 2730 movapd -10 * SIZE(AO, %rax, 1), %xmm1 2731 2732 addq $4 * SIZE, %rax 2733 BRANCH 2734 jl .L112 2735 ALIGN_4 2736 2737.L116: 2738#if defined(LT) || defined(RN) 2739 movq KK, %rax 2740#else 2741 movq K, %rax 2742 subq KK, %rax 2743#endif 2744 andq $3, %rax # if (k & 1) 2745 je .L118 2746 2747 leaq (, %rax, SIZE), %rax 2748 leaq (AO, %rax, 1), AO 2749 leaq (BO, %rax, 1), BO 2750 negq %rax 2751 ALIGN_4 2752 2753.L117: 2754 mulsd -16 * SIZE(BO, %rax, 1), %xmm0 2755 addsd %xmm0, %xmm8 2756 movsd -15 * SIZE(AO, %rax, 1), %xmm0 2757 2758 addq $SIZE, %rax 2759 jl .L117 2760 ALIGN_4 2761 2762.L118: 2763 addpd %xmm9, %xmm8 2764 haddpd %xmm8, %xmm8 2765 2766#if defined(LN) || defined(RT) 2767 movq KK, %rax 2768#ifdef LN 2769 subq $1, %rax 2770#else 2771 subq $1, %rax 2772#endif 2773 2774 leaq (, %rax, SIZE), %rax 2775 2776 movq AORIG, AO 2777 leaq (AO, %rax, 1), AO 2778 leaq (B, %rax, 1), BO 2779#endif 2780 2781#if defined(LN) || defined(LT) 2782 movsd -16 * SIZE(BO), %xmm10 2783 subsd %xmm8, %xmm10 2784#else 2785 movsd -16 * SIZE(AO), %xmm10 2786 subsd %xmm8, %xmm10 2787#endif 2788 2789#if defined(LN) || defined(LT) 2790 movsd -16 * SIZE(AO), %xmm12 2791 mulsd %xmm12, %xmm10 2792#endif 2793 2794#if defined(RN) || defined(RT) 2795 movsd -16 * SIZE(BO), %xmm8 2796 mulsd %xmm8, %xmm10 2797#endif 2798 2799#ifdef LN 2800 subq $1 * SIZE, 
CO1 2801#endif 2802 2803 movsd %xmm10, 0 * SIZE(CO1) 2804 2805#if defined(LN) || defined(LT) 2806 movlpd %xmm10, -16 * SIZE(BO) 2807#else 2808 movlpd %xmm10, -16 * SIZE(AO) 2809#endif 2810 2811#ifndef LN 2812 addq $1 * SIZE, CO1 2813#endif 2814 2815#if defined(LT) || defined(RN) 2816 movq K, %rax 2817 subq KK, %rax 2818 leaq (,%rax, SIZE), %rax 2819 addq %rax, AO 2820 addq %rax, BO 2821#endif 2822 2823#ifdef LN 2824 subq $1, KK 2825#endif 2826 2827#ifdef LT 2828 addq $1, KK 2829#endif 2830 2831#ifdef RT 2832 movq K, %rax 2833 salq $0 + BASE_SHIFT, %rax 2834 addq %rax, AORIG 2835#endif 2836 ALIGN_4 2837 2838.L100: 2839 testq $2, M 2840 je .L110 2841 2842#ifdef LN 2843 movq K, %rax 2844 salq $1 + BASE_SHIFT, %rax 2845 subq %rax, AORIG 2846#endif 2847 2848#if defined(LN) || defined(RT) 2849 movq KK, %rax 2850 movq AORIG, AO 2851 leaq (, %rax, SIZE), %rax 2852 leaq (AO, %rax, 2), AO 2853#endif 2854 2855 movq B, BO 2856 2857#if defined(LN) || defined(RT) 2858 movq KK, %rax 2859 leaq (BO, %rax, SIZE), BO 2860#endif 2861 2862 movddup -16 * SIZE(BO), %xmm0 2863 pxor %xmm8, %xmm8 2864 movddup -15 * SIZE(BO), %xmm1 2865 pxor %xmm9, %xmm9 2866 movddup -14 * SIZE(BO), %xmm2 2867 pxor %xmm10, %xmm10 2868 movddup -13 * SIZE(BO), %xmm3 2869 pxor %xmm11, %xmm11 2870 2871#if defined(LT) || defined(RN) 2872 movq KK, %rax 2873#else 2874 movq K, %rax 2875 subq KK, %rax 2876#endif 2877 andq $-4, %rax 2878 leaq (, %rax, SIZE), %rax 2879 leaq (AO, %rax, 2), AO 2880 leaq (BO, %rax, 1), BO 2881 negq %rax 2882 NOBRANCH 2883 je .L106 2884 ALIGN_4 2885 2886.L102: 2887 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 2888 addpd %xmm0, %xmm8 2889 movddup -12 * SIZE(BO, %rax, 1), %xmm0 2890 2891 mulpd -14 * SIZE(AO, %rax, 2), %xmm1 2892 addpd %xmm1, %xmm9 2893 movddup -11 * SIZE(BO, %rax, 1), %xmm1 2894 2895 mulpd -12 * SIZE(AO, %rax, 2), %xmm2 2896 addpd %xmm2, %xmm10 2897 movddup -10 * SIZE(BO, %rax, 1), %xmm2 2898 2899 mulpd -10 * SIZE(AO, %rax, 2), %xmm3 2900 addpd %xmm3, %xmm11 2901 movddup -9 * 
SIZE(BO, %rax, 1), %xmm3 2902 2903 addq $4 * SIZE, %rax 2904 BRANCH 2905 jl .L102 2906 ALIGN_4 2907 2908.L106: 2909#if defined(LT) || defined(RN) 2910 movq KK, %rax 2911#else 2912 movq K, %rax 2913 subq KK, %rax 2914#endif 2915 andq $3, %rax # if (k & 1) 2916 je .L109 2917 2918 leaq (, %rax, SIZE), %rax 2919 leaq (AO, %rax, 2), AO 2920 leaq (BO, %rax, 1), BO 2921 negq %rax 2922 ALIGN_4 2923 2924.L107: 2925 movddup -16 * SIZE(BO, %rax, 1), %xmm0 2926 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 2927 addpd %xmm0, %xmm8 2928 2929 addq $SIZE, %rax 2930 jl .L107 2931 ALIGN_4 2932 2933.L109: 2934 addpd %xmm9, %xmm8 2935 addpd %xmm11, %xmm10 2936 addpd %xmm10, %xmm8 2937 2938#if defined(LN) || defined(RT) 2939 movq KK, %rax 2940#ifdef LN 2941 subq $2, %rax 2942#else 2943 subq $1, %rax 2944#endif 2945 2946 leaq (, %rax, SIZE), %rax 2947 2948 movq AORIG, AO 2949 leaq (AO, %rax, 2), AO 2950 leaq (B, %rax, 1), BO 2951#endif 2952 2953#if defined(LN) || defined(LT) 2954 movapd -16 * SIZE(BO), %xmm10 2955 subpd %xmm8, %xmm10 2956#else 2957 movapd -16 * SIZE(AO), %xmm10 2958 subpd %xmm8, %xmm10 2959#endif 2960 2961#ifdef LN 2962 movapd %xmm10, %xmm8 2963 unpckhpd %xmm8, %xmm8 2964 2965 movsd -13 * SIZE(AO), %xmm12 2966 mulsd %xmm12, %xmm8 2967 2968 movsd -14 * SIZE(AO), %xmm13 2969 mulsd %xmm8, %xmm13 2970 subsd %xmm13, %xmm10 2971 2972 movsd -16 * SIZE(AO), %xmm12 2973 mulsd %xmm12, %xmm10 2974 2975 unpcklpd %xmm8, %xmm10 2976#endif 2977 2978#ifdef LT 2979 movapd %xmm10, %xmm8 2980 unpckhpd %xmm8, %xmm8 2981 2982 movsd -16 * SIZE(AO), %xmm12 2983 mulsd %xmm12, %xmm10 2984 2985 movsd -15 * SIZE(AO), %xmm13 2986 mulsd %xmm10, %xmm13 2987 subsd %xmm13, %xmm8 2988 2989 movsd -13 * SIZE(AO), %xmm12 2990 mulsd %xmm12, %xmm8 2991 2992 unpcklpd %xmm8, %xmm10 2993#endif 2994 2995#ifdef RN 2996 movddup -16 * SIZE(BO), %xmm8 2997 mulpd %xmm8, %xmm10 2998#endif 2999 3000#ifdef RT 3001 movddup -16 * SIZE(BO), %xmm8 3002 mulpd %xmm8, %xmm10 3003#endif 3004 3005#ifdef LN 3006 subq $2 * SIZE, CO1 
3007#endif 3008 3009#if defined(LN) || defined(LT) 3010 movlpd %xmm10, 0 * SIZE(CO1) 3011 movhpd %xmm10, 1 * SIZE(CO1) 3012#else 3013 movlpd %xmm10, 0 * SIZE(CO1) 3014 movhpd %xmm10, 1 * SIZE(CO1) 3015#endif 3016 3017#if defined(LN) || defined(LT) 3018 movaps %xmm10, -16 * SIZE(BO) 3019#else 3020 movaps %xmm10, -16 * SIZE(AO) 3021#endif 3022 3023#ifndef LN 3024 addq $2 * SIZE, CO1 3025#endif 3026 3027#if defined(LT) || defined(RN) 3028 movq K, %rax 3029 subq KK, %rax 3030 leaq (,%rax, SIZE), %rax 3031 leaq (AO, %rax, 2), AO 3032 addq %rax, BO 3033#endif 3034 3035#ifdef LN 3036 subq $2, KK 3037#endif 3038 3039#ifdef LT 3040 addq $2, KK 3041#endif 3042 3043#ifdef RT 3044 movq K, %rax 3045 salq $1 + BASE_SHIFT, %rax 3046 addq %rax, AORIG 3047#endif 3048 ALIGN_4 3049 3050.L110: 3051 movq M, I 3052 sarq $2, I # i = (m >> 2) 3053 jle .L119 3054 ALIGN_4 3055 3056.L91: 3057#ifdef LN 3058 movq K, %rax 3059 salq $2 + BASE_SHIFT, %rax 3060 subq %rax, AORIG 3061#endif 3062 3063#if defined(LN) || defined(RT) 3064 movq KK, %rax 3065 movq AORIG, AO 3066 leaq (, %rax, SIZE), %rax 3067 leaq (AO, %rax, 4), AO 3068#endif 3069 3070 movq B, BO 3071 3072#if defined(LN) || defined(RT) 3073 movq KK, %rax 3074 leaq (BO, %rax, SIZE), BO 3075#endif 3076 3077 movapd -16 * SIZE(AO), %xmm0 3078 pxor %xmm8, %xmm8 3079 movapd -8 * SIZE(AO), %xmm2 3080 pxor %xmm9, %xmm9 3081 movddup -16 * SIZE(BO), %xmm1 3082 pxor %xmm10, %xmm10 3083 movddup -15 * SIZE(BO), %xmm5 3084 pxor %xmm11, %xmm11 3085 movddup -14 * SIZE(BO), %xmm3 3086 3087#ifndef LN 3088 prefetchw 3 * SIZE(CO1) 3089#else 3090 prefetchw -8 * SIZE(CO1) 3091#endif 3092 3093#if defined(LT) || defined(RN) 3094 movq KK, %rax 3095#else 3096 movq K, %rax 3097 subq KK, %rax 3098#endif 3099 andq $-4, %rax 3100 leaq (, %rax, SIZE), %rax 3101 leaq (AO, %rax, 4), AO 3102 leaq (BO, %rax, 1), BO 3103 negq %rax 3104 NOBRANCH 3105 je .L96 3106 ALIGN_4 3107 3108.L92: 3109 mulpd %xmm1, %xmm0 3110 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 3111 addpd %xmm0, %xmm8 
3112 movapd -12 * SIZE(AO, %rax, 4), %xmm0 3113 addpd %xmm1, %xmm9 3114 movddup -12 * SIZE(BO, %rax, 1), %xmm1 3115 mulpd %xmm5, %xmm0 3116 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 3117 addpd %xmm0, %xmm10 3118 movapd (AO, %rax, 4), %xmm0 3119 addpd %xmm5, %xmm11 3120 movddup -13 * SIZE(BO, %rax, 1), %xmm5 3121 mulpd %xmm3, %xmm2 3122 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 3123 addpd %xmm2, %xmm8 3124 movapd -4 * SIZE(AO, %rax, 4), %xmm2 3125 addpd %xmm3, %xmm9 3126 movddup -10 * SIZE(BO, %rax, 1), %xmm3 3127 mulpd %xmm5, %xmm2 3128 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 3129 addpd %xmm2, %xmm10 3130 movapd 8 * SIZE(AO, %rax, 4), %xmm2 3131 addpd %xmm5, %xmm11 3132 movddup -11 * SIZE(BO, %rax, 1), %xmm5 3133 3134 addq $4 * SIZE, %rax 3135 BRANCH 3136 jl .L92 3137 ALIGN_4 3138 3139.L96: 3140#if defined(LT) || defined(RN) 3141 movq KK, %rax 3142#else 3143 movq K, %rax 3144 subq KK, %rax 3145#endif 3146 andq $3, %rax # if (k & 1) 3147 je .L99 3148 3149 leaq (, %rax, SIZE), %rax 3150 leaq (AO, %rax, 4), AO 3151 leaq (BO, %rax, 1), BO 3152 negq %rax 3153 ALIGN_4 3154 3155.L97: 3156 mulpd %xmm1, %xmm0 3157 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 3158 addpd %xmm0, %xmm8 3159 movapd -12 * SIZE(AO, %rax, 4), %xmm0 3160 addpd %xmm1, %xmm9 3161 movddup -15 * SIZE(BO, %rax, 1), %xmm1 3162 3163 addq $SIZE, %rax 3164 jl .L97 3165 ALIGN_4 3166.L99: 3167 addpd %xmm10, %xmm8 3168 addpd %xmm11, %xmm9 3169 3170#if defined(LN) || defined(RT) 3171 movq KK, %rax 3172#ifdef LN 3173 subq $4, %rax 3174#else 3175 subq $1, %rax 3176#endif 3177 3178 leaq (, %rax, SIZE), %rax 3179 3180 movq AORIG, AO 3181 leaq (AO, %rax, 4), AO 3182 leaq (B, %rax, 1), BO 3183#endif 3184 3185#if defined(LN) || defined(LT) 3186 movapd -16 * SIZE(BO), %xmm10 3187 movapd -14 * SIZE(BO), %xmm11 3188 3189 subpd %xmm8, %xmm10 3190 subpd %xmm9, %xmm11 3191#else 3192 movapd -16 * SIZE(AO), %xmm10 3193 movapd -14 * SIZE(AO), %xmm11 3194 3195 subpd %xmm8, %xmm10 3196 subpd %xmm9, %xmm11 3197#endif 3198 3199#ifdef LN 3200 movapd 
%xmm10, %xmm8 3201 unpckhpd %xmm8, %xmm8 3202 3203 movapd %xmm11, %xmm9 3204 unpckhpd %xmm9, %xmm9 3205 3206 movsd -1 * SIZE(AO), %xmm12 3207 mulsd %xmm12, %xmm9 3208 3209 movsd -2 * SIZE(AO), %xmm13 3210 mulsd %xmm9, %xmm13 3211 subsd %xmm13, %xmm11 3212 movsd -3 * SIZE(AO), %xmm14 3213 mulsd %xmm9, %xmm14 3214 subsd %xmm14, %xmm8 3215 movsd -4 * SIZE(AO), %xmm15 3216 mulsd %xmm9, %xmm15 3217 subsd %xmm15, %xmm10 3218 3219 movsd -6 * SIZE(AO), %xmm12 3220 mulsd %xmm12, %xmm11 3221 3222 movsd -7 * SIZE(AO), %xmm13 3223 mulsd %xmm11, %xmm13 3224 subsd %xmm13, %xmm8 3225 movsd -8 * SIZE(AO), %xmm14 3226 mulsd %xmm11, %xmm14 3227 subsd %xmm14, %xmm10 3228 3229 movsd -11 * SIZE(AO), %xmm12 3230 mulsd %xmm12, %xmm8 3231 3232 movsd -12 * SIZE(AO), %xmm13 3233 mulsd %xmm8, %xmm13 3234 subsd %xmm13, %xmm10 3235 3236 movsd -16 * SIZE(AO), %xmm12 3237 mulsd %xmm12, %xmm10 3238 3239 unpcklpd %xmm8, %xmm10 3240 unpcklpd %xmm9, %xmm11 3241#endif 3242 3243#ifdef LT 3244 movapd %xmm10, %xmm8 3245 unpckhpd %xmm8, %xmm8 3246 3247 movapd %xmm11, %xmm9 3248 unpckhpd %xmm9, %xmm9 3249 3250 movsd -16 * SIZE(AO), %xmm12 3251 mulsd %xmm12, %xmm10 3252 3253 movsd -15 * SIZE(AO), %xmm13 3254 mulsd %xmm10, %xmm13 3255 subsd %xmm13, %xmm8 3256 movsd -14 * SIZE(AO), %xmm14 3257 mulsd %xmm10, %xmm14 3258 subsd %xmm14, %xmm11 3259 movsd -13 * SIZE(AO), %xmm15 3260 mulsd %xmm10, %xmm15 3261 subsd %xmm15, %xmm9 3262 3263 movsd -11 * SIZE(AO), %xmm12 3264 mulsd %xmm12, %xmm8 3265 3266 movsd -10 * SIZE(AO), %xmm13 3267 mulsd %xmm8, %xmm13 3268 subsd %xmm13, %xmm11 3269 movsd -9 * SIZE(AO), %xmm14 3270 mulsd %xmm8, %xmm14 3271 subsd %xmm14, %xmm9 3272 3273 movsd -6 * SIZE(AO), %xmm12 3274 mulsd %xmm12, %xmm11 3275 3276 movsd -5 * SIZE(AO), %xmm13 3277 mulsd %xmm11, %xmm13 3278 subsd %xmm13, %xmm9 3279 3280 movsd -1 * SIZE(AO), %xmm12 3281 mulsd %xmm12, %xmm9 3282 3283 unpcklpd %xmm8, %xmm10 3284 unpcklpd %xmm9, %xmm11 3285#endif 3286 3287#ifdef RN 3288 movddup -16 * SIZE(BO), %xmm8 3289 mulpd %xmm8, 
%xmm10 3290 mulpd %xmm8, %xmm11 3291#endif 3292 3293#ifdef RT 3294 movddup -16 * SIZE(BO), %xmm8 3295 mulpd %xmm8, %xmm10 3296 mulpd %xmm8, %xmm11 3297#endif 3298 3299#ifdef LN 3300 subq $4 * SIZE, CO1 3301#endif 3302 3303 movlpd %xmm10, 0 * SIZE(CO1) 3304 movhpd %xmm10, 1 * SIZE(CO1) 3305 movlpd %xmm11, 2 * SIZE(CO1) 3306 movhpd %xmm11, 3 * SIZE(CO1) 3307 3308#if defined(LN) || defined(LT) 3309 movaps %xmm10, -16 * SIZE(BO) 3310 movaps %xmm11, -14 * SIZE(BO) 3311#else 3312 movaps %xmm10, -16 * SIZE(AO) 3313 movaps %xmm11, -14 * SIZE(AO) 3314#endif 3315 3316#ifndef LN 3317 addq $4 * SIZE, CO1 3318#endif 3319 3320#if defined(LT) || defined(RN) 3321 movq K, %rax 3322 subq KK, %rax 3323 leaq (,%rax, SIZE), %rax 3324 leaq (AO, %rax, 4), AO 3325 addq %rax, BO 3326#endif 3327 3328#ifdef LN 3329 subq $4, KK 3330#endif 3331 3332#ifdef LT 3333 addq $4, KK 3334#endif 3335 3336#ifdef RT 3337 movq K, %rax 3338 salq $2 + BASE_SHIFT, %rax 3339 addq %rax, AORIG 3340#endif 3341 3342 decq I # i -- 3343 jg .L91 3344 ALIGN_4 3345 3346.L119: 3347#ifdef LN 3348 leaq (B, K, SIZE), B 3349#endif 3350 3351#if defined(LT) || defined(RN) 3352 movq BO, B 3353#endif 3354 3355#ifdef RN 3356 addq $1, KK 3357#endif 3358 3359#ifdef RT 3360 subq $1, KK 3361#endif 3362 ALIGN_4 3363 3364.L999: 3365 movq (%rsp), %rbx 3366 movq 8(%rsp), %rbp 3367 movq 16(%rsp), %r12 3368 movq 24(%rsp), %r13 3369 movq 32(%rsp), %r14 3370 movq 40(%rsp), %r15 3371 3372#ifdef WINDOWS_ABI 3373 movq 48(%rsp), %rdi 3374 movq 56(%rsp), %rsi 3375 movups 64(%rsp), %xmm6 3376 movups 80(%rsp), %xmm7 3377 movups 96(%rsp), %xmm8 3378 movups 112(%rsp), %xmm9 3379 movups 128(%rsp), %xmm10 3380 movups 144(%rsp), %xmm11 3381 movups 160(%rsp), %xmm12 3382 movups 176(%rsp), %xmm13 3383 movups 192(%rsp), %xmm14 3384 movups 208(%rsp), %xmm15 3385#endif 3386 3387 addq $STACKSIZE, %rsp 3388 ret 3389 3390 EPILOGUE 3391