1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. */ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define OLD_M %rdi 43#define OLD_N %rsi 44#define M %r13 45#define N %r14 46#define K %rdx 47 48#define A %rcx 49#define B %r8 50#define C %r9 51#define LDC %r10 52 53#define I %r11 54#define AO %rdi 55#define BO %rsi 56#define CO1 %r15 57#define CO2 %r12 58#define BB %rbp 59#define J %rbx 60 61#ifndef WINDOWS_ABI 62 63#define STACKSIZE 96 64 65#define OFFSET 48(%rsp) 66#define AORIG 56(%rsp) 67#define KK 64(%rsp) 68#define KKK 72(%rsp) 69 70#else 71 72#define STACKSIZE 256 73 74#define OLD_A 40 + STACKSIZE(%rsp) 75#define OLD_B 48 + STACKSIZE(%rsp) 76#define OLD_C 56 + STACKSIZE(%rsp) 77#define OLD_LDC 64 + STACKSIZE(%rsp) 78#define OLD_OFFSET 72 + STACKSIZE(%rsp) 79 80#define OFFSET 224(%rsp) 81#define AORIG 232(%rsp) 82#define KK 240(%rsp) 83#define KKK 248(%rsp) 84 85#endif 86 87#define PREFETCH prefetch 88#define PREFETCHSIZE (8 * 7 + 0) 89 90#define movlpd movsd 91#define movapd movups 92#define movupd movups 93 94#define KERNEL1(xx) \ 95 mulpd %xmm1, %xmm0 ;\ 96 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ 97 addpd %xmm0, %xmm8 ;\ 98 movapd %xmm2, %xmm0 ;\ 99 addpd %xmm1, %xmm12 ;\ 100 movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ 101 mulpd %xmm3, %xmm2 ;\ 102 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ 103 addpd %xmm2, %xmm9 ;\ 104 movapd %xmm0, %xmm2 ;\ 105 addpd %xmm3, %xmm13 ;\ 106 movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ 107 mulpd %xmm1, %xmm0 ;\ 108 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ 109 addpd %xmm0, %xmm10 ;\ 110 movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ 111 addpd %xmm1, %xmm14 ;\ 112 movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ 113 mulpd %xmm3, %xmm2 ;\ 114 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ 115 addpd %xmm2, %xmm11 ;\ 116 addpd %xmm3, %xmm15 ;\ 117 movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ 118 movapd %xmm0, %xmm2 119 120 121#define KERNEL2(xx) \ 122 mulpd %xmm1, %xmm0 ;\ 123 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ 124 addpd %xmm0, %xmm8 ;\ 125 movapd %xmm2, %xmm0 ;\ 126/**/ movapd (AO, %rax, 4), %xmm6 ;\ 127 addpd %xmm1, %xmm12 ;\ 128 movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ 129 mulpd %xmm3, %xmm2 ;\ 130 mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ 131 addpd %xmm2, %xmm9 ;\ 132 movapd %xmm0, %xmm2 ;\ 133 addpd %xmm3, %xmm13 ;\ 134 movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ 135 mulpd %xmm1, %xmm0 ;\ 136 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ 137 addpd %xmm0, %xmm10 ;\ 138 addpd %xmm1, %xmm14 ;\ 139 mulpd %xmm3, %xmm2 ;\ 140/**/ movddup (BO, %rax, 4), %xmm1 ;\ 141 mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ 142 addpd %xmm2, %xmm11 ;\ 143 addpd %xmm3, %xmm15 ;\ 144 movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ 145 movapd %xmm4, %xmm2 146 147#define KERNEL3(xx) \ 148 mulpd %xmm5, %xmm4 ;\ 149 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ 150 addpd %xmm4, %xmm8 ;\ 151 movapd %xmm2, %xmm4 ;\ 152 addpd %xmm5, %xmm12 ;\ 153 movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ 154 mulpd %xmm3, %xmm2 ;\ 155 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ 156 addpd %xmm2, %xmm9 ;\ 157 movapd %xmm4, %xmm2 ;\ 158 addpd %xmm3, %xmm13 ;\ 159 movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ 160 mulpd %xmm5, %xmm4 ;\ 161 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ 162 addpd %xmm4, %xmm10 ;\ 163 movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ 164 addpd %xmm5, %xmm14 ;\ 165 movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ 166 mulpd %xmm3, %xmm2 ;\ 167 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ 168 addpd %xmm2, %xmm11 ;\ 169 addpd %xmm3, %xmm15 ;\ 170 movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ 171 movapd %xmm4, %xmm2 172 173#define KERNEL4(xx) \ 174 mulpd %xmm5, %xmm4 ;\ 175 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ 176 addpd %xmm4, %xmm8 ;\ 177 movapd %xmm2, %xmm4 ;\ 178/**/ movapd 8 * SIZE(AO, %rax, 4), %xmm7 ;\ 179 addpd %xmm5, %xmm12 ;\ 180 movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ 181 mulpd %xmm3, %xmm2 ;\ 182 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ 183 addpd %xmm2, %xmm9 ;\ 184 movapd %xmm4, %xmm2 ;\ 185 addpd %xmm3, %xmm13 ;\ 186 movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ 187 mulpd %xmm5, %xmm4 ;\ 188 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ 189 addpd %xmm4, %xmm10 ;\ 190 addpd %xmm5, %xmm14 ;\ 191/**/ movddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ 192 mulpd %xmm3, %xmm2 ;\ 193 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ 194 addpd %xmm2, %xmm11 ;\ 195 addpd %xmm3, %xmm15 ;\ 196 movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ 197 movapd %xmm6, %xmm2 198 199#define KERNEL5(xx) \ 200 mulpd %xmm1, %xmm6 ;\ 201 mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ 202 addpd %xmm6, %xmm8 ;\ 203 movapd %xmm2, %xmm6 ;\ 204 addpd %xmm1, %xmm12 ;\ 205 movddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ 206 mulpd %xmm3, %xmm2 ;\ 207 mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ 208 addpd %xmm2, %xmm9 ;\ 209 movapd %xmm6, %xmm2 ;\ 210 addpd %xmm3, %xmm13 ;\ 211 movddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ 212 mulpd %xmm1, %xmm6 ;\ 213 mulpd 2 * SIZE(AO, %rax, 4), %xmm1 ;\ 214 addpd %xmm6, %xmm10 ;\ 215 movapd 4 * SIZE(AO, %rax, 4), %xmm6 ;\ 216 addpd %xmm1, %xmm14 ;\ 217 movddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ 218 mulpd %xmm3, %xmm2 ;\ 219 mulpd 2 * SIZE(AO, %rax, 4), %xmm3 ;\ 220 addpd %xmm2, %xmm11 ;\ 221 addpd %xmm3, %xmm15 ;\ 222 movddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ 223 movapd %xmm6, %xmm2 224 225#define KERNEL6(xx) \ 226 mulpd %xmm1, %xmm6 ;\ 227 mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ 228 addpd %xmm6, %xmm8 ;\ 229 movapd %xmm2, %xmm6 ;\ 230/***/ movapd 16 * SIZE(AO, %rax, 4), %xmm0 ;\ 231 addpd %xmm1, %xmm12 ;\ 232 movddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ 233 mulpd %xmm3, %xmm2 ;\ 234 mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ 235 addpd %xmm2, %xmm9 ;\ 236 movapd %xmm6, %xmm2 ;\ 237 addpd %xmm3, %xmm13 ;\ 238 movddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ 239 mulpd %xmm1, %xmm6 ;\ 240 mulpd 6 * SIZE(AO, %rax, 4), %xmm1 ;\ 241 addpd %xmm6, %xmm10 ;\ 242 addpd %xmm1, %xmm14 ;\ 243/**/ movddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ 244 mulpd %xmm3, %xmm2 ;\ 245 mulpd 6 * SIZE(AO, %rax, 4), %xmm3 ;\ 246 addpd %xmm2, %xmm11 ;\ 247 addpd %xmm3, %xmm15 ;\ 248 movddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ 249 movapd %xmm7, %xmm2 250 251#define KERNEL7(xx) \ 252 mulpd %xmm5, %xmm7 ;\ 253 mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ 254 addpd %xmm7, %xmm8 ;\ 255 movapd %xmm2, %xmm7 ;\ 256 addpd %xmm5, %xmm12 ;\ 257 movddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ 258 mulpd %xmm3, %xmm2 ;\ 259 mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ 260 addpd %xmm2, %xmm9 ;\ 261 movapd %xmm7, %xmm2 ;\ 262 addpd %xmm3, %xmm13 ;\ 263 movddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ 264 mulpd %xmm5, %xmm7 ;\ 265 mulpd 10 * SIZE(AO, %rax, 4), %xmm5 ;\ 266 addpd %xmm7, %xmm10 ;\ 267 movapd 12 * SIZE(AO, %rax, 4), %xmm7 ;\ 268 addpd %xmm5, %xmm14 ;\ 269 movddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ 270 mulpd %xmm3, %xmm2 ;\ 271 mulpd 10 * SIZE(AO, %rax, 4), %xmm3 ;\ 272 addpd %xmm2, %xmm11 ;\ 273 addpd %xmm3, %xmm15 ;\ 274 movddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ 275 movapd %xmm7, %xmm2 276 277#define KERNEL8(xx) \ 278 mulpd %xmm5, %xmm7 ;\ 279 mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ 280 addpd %xmm7, %xmm8 ;\ 281 movapd %xmm2, %xmm7 ;\ 282/**/ movapd 24 * SIZE(AO, %rax, 4), %xmm4 ;\ 283 addpd %xmm5, %xmm12 ;\ 284 movddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ 285 mulpd %xmm3, %xmm2 ;\ 286 mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ 287 addpd %xmm2, %xmm9 ;\ 288 movapd %xmm7, %xmm2 ;\ 289 addpd %xmm3, %xmm13 ;\ 290 movddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ 291 mulpd %xmm5, %xmm7 ;\ 292 mulpd 14 * SIZE(AO, %rax, 4), %xmm5 ;\ 293 addpd %xmm7, %xmm10 ;\ 294 addpd %xmm5, %xmm14 ;\ 295/**/ movddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ 296 mulpd %xmm3, %xmm2 ;\ 297 mulpd 14 * SIZE(AO, %rax, 4), %xmm3 ;\ 298 addpd %xmm2, %xmm11 ;\ 299 addpd %xmm3, %xmm15 ;\ 300 movddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ 301 movapd %xmm0, %xmm2 ;\ 302 addq $8 * SIZE, %rax 303 304#define KERNEL_SUB1(xx) \ 305 mulpd %xmm1, %xmm0 ;\ 306 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ 307 addpd %xmm0, %xmm8 ;\ 308 movapd %xmm2, %xmm0 ;\ 309 addpd %xmm1, %xmm12 ;\ 310 movddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ 311 mulpd %xmm3, %xmm2 ;\ 312 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ 313 addpd %xmm2, %xmm9 ;\ 314 movapd %xmm0, %xmm2 ;\ 315 addpd %xmm3, %xmm13 ;\ 316 movddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ 317 mulpd %xmm1, %xmm0 ;\ 318 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 ;\ 319 addpd %xmm0, %xmm10 ;\ 320 movapd -12 * SIZE(AO, %rax, 4), %xmm0 ;\ 321 addpd %xmm1, %xmm14 ;\ 322 movddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ 323 mulpd %xmm3, %xmm2 ;\ 324 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 ;\ 325 addpd %xmm2, %xmm11 ;\ 326 addpd %xmm3, %xmm15 ;\ 327 movddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ 328 movapd %xmm0, %xmm2 329 330#define KERNEL_SUB2(xx) \ 331 mulpd %xmm1, %xmm0 ;\ 332 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ 333 addpd %xmm0, %xmm8 ;\ 334 movapd %xmm2, %xmm0 ;\ 335 addpd %xmm1, %xmm12 ;\ 336 movddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ 337 mulpd %xmm3, %xmm2 ;\ 338 mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ 339 addpd %xmm2, %xmm9 ;\ 340 movapd %xmm0, %xmm2 ;\ 341 addpd %xmm3, %xmm13 ;\ 342 movddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ 343 mulpd %xmm1, %xmm0 ;\ 344 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 ;\ 345 addpd %xmm0, %xmm10 ;\ 346 movapd (AO, %rax, 4), %xmm0 ;\ 347 addpd %xmm1, %xmm14 ;\ 348 movddup (BO, %rax, 4), %xmm1 ;\ 349 mulpd %xmm3, %xmm2 ;\ 350 mulpd -10 * SIZE(AO, %rax, 4), %xmm3 ;\ 351 addpd %xmm2, %xmm11 ;\ 352 addpd %xmm3, %xmm15 ;\ 353 movddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ 354 movapd %xmm4, %xmm2 355 356#define KERNEL_SUB3(xx) \ 357 mulpd %xmm5, %xmm4 ;\ 358 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ 359 addpd %xmm4, %xmm8 ;\ 360 movapd %xmm2, %xmm4 ;\ 361 addpd %xmm5, %xmm12 ;\ 362 movddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ 363 mulpd %xmm3, %xmm2 ;\ 364 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ 365 addpd %xmm2, %xmm9 ;\ 366 movapd %xmm4, %xmm2 ;\ 367 addpd %xmm3, %xmm13 ;\ 368 movddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ 369 mulpd %xmm5, %xmm4 ;\ 370 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 ;\ 371 addpd %xmm4, %xmm10 ;\ 372 movapd -4 * SIZE(AO, %rax, 4), %xmm4 ;\ 373 addpd %xmm5, %xmm14 ;\ 374 movddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ 375 mulpd %xmm3, %xmm2 ;\ 376 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 ;\ 377 addpd %xmm2, %xmm11 ;\ 378 addpd %xmm3, %xmm15 ;\ 379 movddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ 380 movapd %xmm4, %xmm2 381 382#define KERNEL_SUB4(xx) \ 383 mulpd %xmm5, %xmm4 ;\ 384 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ 385 addpd %xmm4, %xmm8 ;\ 386 movapd %xmm2, %xmm4 ;\ 387 addpd %xmm5, %xmm12 ;\ 388 movddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ 389 mulpd %xmm3, %xmm2 ;\ 390 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ 391 addpd %xmm2, %xmm9 ;\ 392 movapd %xmm4, %xmm2 ;\ 393 addpd %xmm3, %xmm13 ;\ 394 movddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ 395 mulpd %xmm5, %xmm4 ;\ 396 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 ;\ 397 addpd %xmm4, %xmm10 ;\ 398 addpd %xmm5, %xmm14 ;\ 399 mulpd %xmm3, %xmm2 ;\ 400 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 ;\ 401 addpd %xmm2, %xmm11 ;\ 402 addpd %xmm3, %xmm15 ;\ 403 movddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ 404 movapd %xmm0, %xmm2 405 406 PROLOGUE 407 PROFCODE 408 409 subq $STACKSIZE, %rsp 410 movq %rbx, (%rsp) 411 movq %rbp, 8(%rsp) 412 movq %r12, 16(%rsp) 413 movq %r13, 24(%rsp) 414 movq %r14, 32(%rsp) 415 movq %r15, 40(%rsp) 416 417#ifdef WINDOWS_ABI 418 movq %rdi, 48(%rsp) 419 movq %rsi, 56(%rsp) 420 movups %xmm6, 64(%rsp) 421 movups %xmm7, 80(%rsp) 422 movups %xmm8, 96(%rsp) 423 movups %xmm9, 112(%rsp) 424 movups %xmm10, 128(%rsp) 425 movups %xmm11, 144(%rsp) 426 movups %xmm12, 160(%rsp) 427 movups %xmm13, 176(%rsp) 428 movups %xmm14, 192(%rsp) 429 movups %xmm15, 208(%rsp) 430 431 movq ARG1, OLD_M 432 movq ARG2, OLD_N 433 movq ARG3, K 434 movq OLD_A, A 435 movq OLD_B, B 436 movq OLD_C, C 437 movq OLD_LDC, LDC 438 movsd OLD_OFFSET, %xmm12 439#else 440 movq STACKSIZE + 8(%rsp), LDC 441 movsd STACKSIZE + 16(%rsp), %xmm12 442#endif 443 444 movq OLD_M, M 445 movq OLD_N, N 446 447 subq $-16 * SIZE, A 448 subq $-16 * SIZE, B 449 450 movsd %xmm12, OFFSET 451 movsd %xmm12, KK 452 453 leaq (, LDC, SIZE), LDC 454 455#ifdef LN 456 leaq (, M, SIZE), %rax 457 addq %rax, C 458 imulq K, %rax 459 addq %rax, A 460#endif 461 462#ifdef RT 463 leaq (, N, SIZE), %rax 464 imulq K, %rax 465 addq %rax, B 466 movq N, %rax 467 imulq LDC, %rax 468 addq %rax, C 469#endif 470 471#ifdef RN 472 negq KK 473#endif 474 475#ifdef RT 476 movq N, %rax 477 subq OFFSET, %rax 478 movq %rax, KK 479#endif 480 481 movq N, J 482 sarq $2, J # j = (n >> 2) 483 jle .L40 484 485.L01: 486#if defined(LT) || defined(RN) 487 movq A, AO 488#else 489 movq A, AORIG 490#endif 491 492#ifdef RT 493 movq K, %rax 494 salq $2 + BASE_SHIFT, %rax 495 subq %rax, B 496 497 leaq (, LDC, 4), %rax 498 subq %rax, C 499#endif 500 501 movq C, CO1 # coffset1 = c 502 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 503#ifndef RT 504 leaq (C, LDC, 4), C 505#endif 506 507#ifdef LN 508 movq OFFSET, %rax 509 addq M, %rax 510 movq %rax, KK 511#endif 512 513 movq K, %rax 514 salq $BASE_SHIFT + 2, %rax 515 leaq (B, %rax), BB 516 517#if defined(LT) 518 movq OFFSET, %rax 519 movq %rax, KK 520#endif 521 522 movq M, I 523 sarq $2, I # i = (m >> 2) 524 jle .L20 525 ALIGN_4 526 527.L11: 528#ifdef LN 529 movq K, %rax 530 salq $2 + BASE_SHIFT, %rax 531 subq %rax, AORIG 532#endif 533 534#if defined(LN) || defined(RT) 535 movq KK, %rax 536 movq AORIG, AO 537 leaq (, %rax, SIZE), %rax 538 leaq (AO, %rax, 4), AO 539#endif 540 541 movq B, BO 542 543#if defined(LN) || defined(RT) 544 movq KK, %rax 545 leaq (, %rax, SIZE), %rax 546 leaq (BO, %rax, 4), BO 547#endif 548 549 movapd -16 * SIZE(AO), %xmm0 550 movddup -16 * SIZE(BO), %xmm1 551 pxor %xmm8, %xmm8 552 movddup -15 * SIZE(BO), %xmm3 553 pxor %xmm9, %xmm9 554 movapd -8 * SIZE(AO), %xmm4 555 pxor %xmm10, %xmm10 556 movddup -8 * SIZE(BO), %xmm5 557 pxor %xmm11, %xmm11 558 559#ifndef LN 560 prefetchw 3 * SIZE(CO1) 561 pxor %xmm12, %xmm12 562 prefetchw 7 * SIZE(CO2) 563 pxor %xmm13, %xmm13 564 prefetchw 3 * SIZE(CO1, LDC, 2) 565 pxor %xmm14, %xmm14 566 prefetchw 7 * SIZE(CO2, LDC, 2) 567 pxor %xmm15, %xmm15 568 movapd %xmm0, %xmm2 569#else 570 prefetchw -8 * SIZE(CO1) 571 pxor %xmm12, %xmm12 572 prefetchw -8 * SIZE(CO2) 573 pxor %xmm13, %xmm13 574 prefetchw -8 * SIZE(CO1, LDC, 2) 575 pxor %xmm14, %xmm14 576 prefetchw -8 * SIZE(CO2, LDC, 2) 577 pxor %xmm15, %xmm15 578 movapd %xmm0, %xmm2 579#endif 580 581 prefetch -16 * SIZE(BB) 582 583#if defined(LT) || defined(RN) 584 movq KK, %rax 585#else 586 movq K, %rax 587 subq KK, %rax 588#endif 589 590 andq $-8, %rax 591 leaq (, %rax, SIZE), %rax 592 leaq (AO, %rax, 4), AO 593 leaq (BO, %rax, 4), BO 594 negq %rax 595 NOBRANCH 596 je .L15 597 ALIGN_4 598 599.L12: 600 KERNEL1(16 * 0) 601 KERNEL2(16 * 0) 602 KERNEL3(16 * 0) 603 KERNEL4(16 * 0) 604 KERNEL5(16 * 0) 605 KERNEL6(16 * 0) 606 KERNEL7(16 * 0) 607 KERNEL8(16 * 0) 608 BRANCH 609 jl .L12 610 ALIGN_4 611 612.L15: 613 prefetch -8 * SIZE(BB) 614 subq $-16 * SIZE, BB 615 616#if defined(LT) || defined(RN) 617 movq KK, %rax 618#else 619 movq K, %rax 620 subq KK, %rax 621#endif 622 testq $4, %rax 623 je .L16 624 xorq %rax, %rax 625 ALIGN_4 626 627 KERNEL_SUB1(16 * 0) 628 KERNEL_SUB2(16 * 0) 629 KERNEL_SUB3(16 * 0) 630 KERNEL_SUB4(16 * 0) 631 632 subq $-16 * SIZE, BO 633 subq $-16 * SIZE, AO 634 ALIGN_4 635 636.L16: 637#if defined(LT) || defined(RN) 638 movq KK, %rax 639#else 640 movq K, %rax 641 subq KK, %rax 642#endif 643 andq $3, %rax # if (k & 1) 644 je .L19 645 646 leaq (, %rax, SIZE), %rax 647 leaq (AO, %rax, 4), AO 648 leaq (BO, %rax, 4), BO 649 negq %rax 650 ALIGN_4 651 652.L17: 653 mulpd %xmm1, %xmm0 654 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 655 addpd %xmm0, %xmm8 656 movapd %xmm2, %xmm0 657 addpd %xmm1, %xmm12 658 movddup -14 * SIZE(BO, %rax, 4), %xmm1 659 mulpd %xmm3, %xmm2 660 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 661 addpd %xmm2, %xmm9 662 movapd %xmm0, %xmm2 663 addpd %xmm3, %xmm13 664 movddup -13 * SIZE(BO, %rax, 4), %xmm3 665 mulpd %xmm1, %xmm0 666 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 667 addpd %xmm0, %xmm10 668 movapd -12 * SIZE(AO, %rax, 4), %xmm0 669 addpd %xmm1, %xmm14 670 movddup -12 * SIZE(BO, %rax, 4), %xmm1 671 mulpd %xmm3, %xmm2 672 mulpd -14 * SIZE(AO, %rax, 4), %xmm3 673 addpd %xmm2, %xmm11 674 addpd %xmm3, %xmm15 675 movddup -11 * SIZE(BO, %rax, 4), %xmm3 676 movapd %xmm0, %xmm2 677 678 addq $SIZE, %rax 679 jl .L17 680 ALIGN_4 681 682.L19: 683#if defined(LN) || defined(RT) 684 movq KK, %rax 685#ifdef LN 686 subq $4, %rax 687#else 688 subq $4, %rax 689#endif 690 691 leaq (, %rax, SIZE), %rax 692 693 movq AORIG, AO 694 leaq (AO, %rax, 4), AO 695 leaq (B, %rax, 4), BO 696#endif 697 698#if defined(LN) || defined(LT) 699 movapd %xmm8, %xmm0 700 unpcklpd %xmm9, %xmm8 701 unpckhpd %xmm9, %xmm0 702 703 movapd %xmm10, %xmm2 704 unpcklpd %xmm11, %xmm10 705 unpckhpd %xmm11, %xmm2 706 707 movapd %xmm12, %xmm4 708 unpcklpd %xmm13, %xmm12 709 unpckhpd %xmm13, %xmm4 710 711 movapd %xmm14, %xmm6 712 unpcklpd %xmm15, %xmm14 713 unpckhpd %xmm15, %xmm6 714 715 movapd -16 * SIZE(BO), %xmm9 716 movapd -14 * SIZE(BO), %xmm11 717 movapd -12 * SIZE(BO), %xmm13 718 movapd -10 * SIZE(BO), %xmm15 719 movapd -8 * SIZE(BO), %xmm1 720 movapd -6 * SIZE(BO), %xmm3 721 movapd -4 * SIZE(BO), %xmm5 722 movapd -2 * SIZE(BO), %xmm7 723 724 subpd %xmm8, %xmm9 725 subpd %xmm10, %xmm11 726 subpd %xmm0, %xmm13 727 subpd %xmm2, %xmm15 728 subpd %xmm12, %xmm1 729 subpd %xmm14, %xmm3 730 subpd %xmm4, %xmm5 731 subpd %xmm6, %xmm7 732#else 733 movapd -16 * SIZE(AO), %xmm0 734 movapd -14 * SIZE(AO), %xmm1 735 movapd -12 * SIZE(AO), %xmm2 736 movapd -10 * SIZE(AO), %xmm3 737 738 movapd -8 * SIZE(AO), %xmm4 739 movapd -6 * SIZE(AO), %xmm5 740 movapd -4 * SIZE(AO), %xmm6 741 movapd -2 * SIZE(AO), %xmm7 742 743 subpd %xmm8, %xmm0 744 subpd %xmm12, %xmm1 745 subpd %xmm9, %xmm2 746 subpd %xmm13, %xmm3 747 subpd %xmm10, %xmm4 748 subpd %xmm14, %xmm5 749 subpd %xmm11, %xmm6 750 subpd %xmm15, %xmm7 751#endif 752 753#ifdef LN 754 movddup -1 * SIZE(AO), %xmm8 755 mulpd %xmm8, %xmm5 756 mulpd %xmm8, %xmm7 757 758 movddup -2 * SIZE(AO), %xmm10 759 mulpd %xmm5, %xmm10 760 subpd %xmm10, %xmm1 761 movddup -2 * SIZE(AO), %xmm10 762 mulpd %xmm7, %xmm10 763 subpd %xmm10, %xmm3 764 765 movddup -3 * SIZE(AO), %xmm12 766 mulpd %xmm5, %xmm12 767 subpd %xmm12, %xmm13 768 movddup -3 * SIZE(AO), %xmm12 769 mulpd %xmm7, %xmm12 770 subpd %xmm12, %xmm15 771 772 movddup -4 * SIZE(AO), %xmm14 773 mulpd %xmm5, %xmm14 774 subpd %xmm14, %xmm9 775 movddup -4 * SIZE(AO), %xmm14 776 mulpd %xmm7, %xmm14 777 subpd %xmm14, %xmm11 778 779 movddup -6 * SIZE(AO), %xmm8 780 mulpd %xmm8, %xmm1 781 mulpd %xmm8, %xmm3 782 783 movddup -7 * SIZE(AO), %xmm10 784 mulpd %xmm1, %xmm10 785 subpd %xmm10, %xmm13 786 movddup -7 * SIZE(AO), %xmm10 787 mulpd %xmm3, %xmm10 788 subpd %xmm10, %xmm15 789 790 movddup -8 * SIZE(AO), %xmm12 791 mulpd %xmm1, %xmm12 792 subpd %xmm12, %xmm9 793 movddup -8 * SIZE(AO), %xmm12 794 mulpd %xmm3, %xmm12 795 subpd %xmm12, %xmm11 796 797 movddup -11 * SIZE(AO), %xmm8 798 mulpd %xmm8, %xmm13 799 mulpd %xmm8, %xmm15 800 801 movddup -12 * SIZE(AO), %xmm10 802 mulpd %xmm13, %xmm10 803 subpd %xmm10, %xmm9 804 movddup -12 * SIZE(AO), %xmm10 805 mulpd %xmm15, %xmm10 806 subpd %xmm10, %xmm11 807 808 movddup -16 * SIZE(AO), %xmm8 809 mulpd %xmm8, %xmm9 810 mulpd %xmm8, %xmm11 811#endif 812 813#ifdef LT 814 movddup -16 * SIZE(AO), %xmm8 815 mulpd %xmm8, %xmm9 816 mulpd %xmm8, %xmm11 817 818 movddup -15 * SIZE(AO), %xmm10 819 mulpd %xmm9, %xmm10 820 subpd %xmm10, %xmm13 821 822 movddup -15 * SIZE(AO), %xmm10 823 mulpd %xmm11, %xmm10 824 subpd %xmm10, %xmm15 825 826 movddup -14 * SIZE(AO), %xmm12 827 mulpd %xmm9, %xmm12 828 subpd %xmm12, %xmm1 829 movddup -14 * SIZE(AO), %xmm12 830 mulpd %xmm11, %xmm12 831 subpd %xmm12, %xmm3 832 833 movddup -13 * SIZE(AO), %xmm14 834 mulpd %xmm9, %xmm14 835 subpd %xmm14, %xmm5 836 movddup -13 * SIZE(AO), %xmm14 837 mulpd %xmm11, %xmm14 838 subpd %xmm14, %xmm7 839 840 movddup -11 * SIZE(AO), %xmm8 841 mulpd %xmm8, %xmm13 842 mulpd %xmm8, %xmm15 843 844 movddup -10 * SIZE(AO), %xmm10 845 mulpd %xmm13, %xmm10 846 subpd %xmm10, %xmm1 847 movddup -10 * SIZE(AO), %xmm10 848 mulpd %xmm15, %xmm10 849 subpd %xmm10, %xmm3 850 851 movddup -9 * SIZE(AO), %xmm12 852 mulpd %xmm13, %xmm12 853 subpd %xmm12, %xmm5 854 movddup -9 * SIZE(AO), %xmm12 855 mulpd %xmm15, %xmm12 856 subpd %xmm12, %xmm7 857 858 movddup -6 * SIZE(AO), %xmm8 859 mulpd %xmm8, %xmm1 860 mulpd %xmm8, %xmm3 861 862 movddup -5 * SIZE(AO), %xmm10 863 mulpd %xmm1, %xmm10 864 subpd %xmm10, %xmm5 865 movddup -5 * SIZE(AO), %xmm10 866 mulpd %xmm3, %xmm10 867 subpd %xmm10, %xmm7 868 869 movddup -1 * SIZE(AO), %xmm8 870 mulpd %xmm8, %xmm5 871 mulpd %xmm8, %xmm7 872#endif 873 874#ifdef RN 875 movddup -16 * SIZE(BO), %xmm8 876 mulpd %xmm8, %xmm0 877 mulpd %xmm8, %xmm1 878 879 movddup -15 * SIZE(BO), %xmm9 880 mulpd %xmm0, %xmm9 881 subpd %xmm9, %xmm2 882 movddup -15 * SIZE(BO), %xmm9 883 mulpd %xmm1, %xmm9 884 subpd %xmm9, %xmm3 885 886 movddup -14 * SIZE(BO), %xmm10 887 mulpd %xmm0, %xmm10 888 subpd %xmm10, %xmm4 889 movddup -14 * SIZE(BO), %xmm10 890 mulpd %xmm1, %xmm10 891 subpd %xmm10, %xmm5 892 893 movddup -13 * SIZE(BO), %xmm11 894 mulpd %xmm0, %xmm11 895 subpd %xmm11, %xmm6 896 movddup -13 * SIZE(BO), %xmm11 897 mulpd %xmm1, %xmm11 898 subpd %xmm11, %xmm7 899 900 movddup -11 * SIZE(BO), %xmm8 901 mulpd %xmm8, %xmm2 902 mulpd %xmm8, %xmm3 903 904 movddup -10 * SIZE(BO), %xmm9 905 mulpd %xmm2, %xmm9 906 subpd %xmm9, %xmm4 907 movddup -10 * SIZE(BO), %xmm9 908 mulpd %xmm3, %xmm9 909 subpd %xmm9, %xmm5 910 911 movddup -9 * SIZE(BO), %xmm10 912 mulpd %xmm2, %xmm10 913 subpd %xmm10, %xmm6 914 movddup -9 * SIZE(BO), %xmm10 915 mulpd %xmm3, %xmm10 916 subpd %xmm10, %xmm7 917 918 movddup -6 * SIZE(BO), %xmm8 919 mulpd %xmm8, %xmm4 920 mulpd %xmm8, %xmm5 921 922 movddup -5 * SIZE(BO), %xmm9 923 mulpd %xmm4, %xmm9 924 subpd %xmm9, %xmm6 925 movddup -5 * SIZE(BO), %xmm9 926 mulpd %xmm5, %xmm9 927 subpd %xmm9, %xmm7 928 929 movddup -1 * SIZE(BO), %xmm8 930 mulpd %xmm8, %xmm6 931 mulpd %xmm8, %xmm7 932#endif 933 934#ifdef RT 935 movddup -1 * SIZE(BO), %xmm8 936 mulpd %xmm8, %xmm6 937 mulpd %xmm8, %xmm7 938 939 movddup -2 * SIZE(BO), %xmm9 940 mulpd %xmm6, %xmm9 941 subpd %xmm9, %xmm4 942 movddup -2 * SIZE(BO), %xmm9 943 mulpd %xmm7, %xmm9 944 subpd %xmm9, %xmm5 945 946 movddup -3 * SIZE(BO), %xmm10 947 mulpd %xmm6, %xmm10 948 subpd %xmm10, %xmm2 949 movddup -3 * SIZE(BO), %xmm10 950 mulpd %xmm7, %xmm10 951 subpd %xmm10, %xmm3 952 953 movddup -4 * SIZE(BO), %xmm11 954 mulpd %xmm6, %xmm11 955 subpd %xmm11, %xmm0 956 movddup -4 * SIZE(BO), %xmm11 957 mulpd %xmm7, %xmm11 958 subpd %xmm11, %xmm1 959 960 movddup -6 * SIZE(BO), %xmm8 961 mulpd %xmm8, %xmm4 962 mulpd %xmm8, %xmm5 963 964 movddup -7 * SIZE(BO), %xmm9 965 mulpd %xmm4, %xmm9 966 subpd %xmm9, %xmm2 967 movddup -7 * SIZE(BO), %xmm9 968 mulpd %xmm5, %xmm9 969 subpd %xmm9, %xmm3 970 971 movddup -8 * SIZE(BO), %xmm10 972 mulpd %xmm4, %xmm10 973 subpd %xmm10, %xmm0 974 movddup -8 * SIZE(BO), %xmm10 975 mulpd %xmm5, %xmm10 976 subpd %xmm10, %xmm1 977 978 movddup -11 * SIZE(BO), %xmm8 979 mulpd %xmm8, %xmm2 980 mulpd %xmm8, %xmm3 981 982 movddup -12 * SIZE(BO), %xmm9 983 mulpd %xmm2, %xmm9 984 subpd %xmm9, %xmm0 985 movddup -12 * SIZE(BO), %xmm9 986 mulpd %xmm3, %xmm9 987 subpd %xmm9, %xmm1 988 989 movddup -16 * SIZE(BO), %xmm8 990 mulpd %xmm8, %xmm0 991 mulpd %xmm8, %xmm1 992#endif 993 994#ifdef LN 995 subq $4 * SIZE, CO1 996 subq $4 * SIZE, CO2 997#endif 998 999#if defined(LN) || defined(LT) 1000 movlpd %xmm9, 0 * SIZE(CO1) 1001 movlpd %xmm13, 1 * SIZE(CO1) 1002 movlpd %xmm1, 2 * SIZE(CO1) 1003 movlpd %xmm5, 3 * SIZE(CO1) 1004 1005 movhpd %xmm9, 0 * SIZE(CO2) 1006 movhpd %xmm13, 1 * SIZE(CO2) 1007 movhpd %xmm1, 2 * SIZE(CO2) 1008 movhpd %xmm5, 3 * SIZE(CO2) 1009 1010 movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) 1011 movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) 1012 movlpd %xmm3, 2 * SIZE(CO1, LDC, 2) 1013 movlpd %xmm7, 3 * SIZE(CO1, LDC, 2) 1014 1015 movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) 1016 movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) 1017 movhpd %xmm3, 2 * SIZE(CO2, LDC, 2) 1018 movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) 1019#else 1020 movlpd %xmm0, 0 * SIZE(CO1) 1021 movhpd %xmm0, 1 * SIZE(CO1) 1022 movlpd %xmm1, 2 * SIZE(CO1) 1023 movhpd %xmm1, 3 * SIZE(CO1) 1024 1025 movlpd %xmm2, 0 * SIZE(CO2) 1026 movhpd %xmm2, 1 * SIZE(CO2) 1027 movlpd %xmm3, 2 * SIZE(CO2) 1028 movhpd %xmm3, 3 * SIZE(CO2) 1029 1030 movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) 1031 movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) 1032 movlpd %xmm5, 2 * SIZE(CO1, LDC, 2) 1033 movhpd %xmm5, 3 * SIZE(CO1, LDC, 2) 1034 1035 movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) 1036 movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) 1037 movlpd %xmm7, 2 * SIZE(CO2, LDC, 2) 1038 movhpd %xmm7, 3 * SIZE(CO2, LDC, 2) 1039#endif 1040 1041#if defined(LN) || defined(LT) 1042 movaps %xmm9, -16 * SIZE(BO) 1043 movaps %xmm11, -14 * SIZE(BO) 1044 movaps %xmm13, -12 * SIZE(BO) 1045 movaps %xmm15, -10 * SIZE(BO) 1046 movaps %xmm1, -8 * SIZE(BO) 1047 movaps %xmm3, -6 * SIZE(BO) 1048 movaps %xmm5, -4 * SIZE(BO) 1049 movaps %xmm7, -2 * SIZE(BO) 1050#else 1051 movaps %xmm0, -16 * SIZE(AO) 1052 movaps %xmm1, -14 * SIZE(AO) 1053 movaps %xmm2, -12 * SIZE(AO) 1054 movaps %xmm3, -10 * SIZE(AO) 1055 movaps %xmm4, -8 * SIZE(AO) 1056 movaps %xmm5, -6 * SIZE(AO) 1057 movaps %xmm6, -4 * SIZE(AO) 1058 movaps %xmm7, -2 * SIZE(AO) 1059#endif 1060 1061#ifndef LN 1062 addq $4 * SIZE, CO1 1063 addq $4 * SIZE, CO2 1064#endif 1065 1066#if defined(LT) || defined(RN) 1067 movq K, %rax 1068 subq KK, %rax 1069 leaq (,%rax, SIZE), %rax 1070 leaq (AO, %rax, 4), AO 1071 leaq (BO, %rax, 4), BO 1072#endif 1073 1074#ifdef LN 1075 subq $4, KK 1076#endif 1077 1078#ifdef LT 1079 addq $4, KK 1080#endif 1081 1082#ifdef RT 1083 movq K, %rax 1084 salq $2 + BASE_SHIFT, %rax 1085 addq %rax, AORIG 1086#endif 1087 1088 decq I # i -- 1089 jg .L11 1090 ALIGN_4 1091 1092.L20: 1093 testq $3, M 1094 je .L39 1095 1096 testq $2, M 1097 je .L30 1098 ALIGN_4 1099 1100.L21: 1101#ifdef LN 1102 movq K, %rax 1103 salq $1 + BASE_SHIFT, %rax 1104 subq %rax, AORIG 1105#endif 1106 1107#if defined(LN) || defined(RT) 1108 movq KK, %rax 1109 movq AORIG, AO 1110 leaq (, %rax, SIZE), %rax 1111 leaq (AO, %rax, 2), AO 1112#endif 1113 1114 movq B, BO 1115 1116#if defined(LN) || defined(RT) 1117 movq KK, %rax 1118 leaq (, %rax, SIZE), %rax 1119 leaq (BO, %rax, 4), BO 1120#endif 1121 1122 movapd -16 * SIZE(AO), %xmm0 1123 pxor %xmm8, %xmm8 1124 movapd -12 * SIZE(AO), %xmm2 1125 pxor %xmm9, %xmm9 1126 movddup -16 * SIZE(BO), %xmm1 1127 pxor %xmm10, %xmm10 1128 movddup -15 * SIZE(BO), %xmm5 1129 pxor %xmm11, %xmm11 1130 movddup -8 * SIZE(BO), %xmm3 1131 1132#if defined(LT) || defined(RN) 1133 movq KK, %rax 1134#else 1135 movq K, %rax 1136 subq KK, %rax 1137#endif 1138 andq $-4, %rax 1139 leaq (, %rax, SIZE), %rax 1140 leaq (AO, %rax, 2), AO 1141 leaq (BO, %rax, 4), BO 1142 negq %rax 1143 NOBRANCH 1144 je .L26 1145 ALIGN_4 1146 1147.L22: 1148 mulpd %xmm0, %xmm1 1149 addpd %xmm1, %xmm8 1150 movddup -14 * SIZE(BO, %rax, 4), %xmm1 1151 mulpd %xmm0, %xmm5 1152 addpd %xmm5, %xmm9 1153 movddup -13 * SIZE(BO, %rax, 4), %xmm5 1154 mulpd %xmm0, %xmm1 1155 addpd %xmm1, %xmm10 1156 movddup -12 * SIZE(BO, %rax, 4), %xmm1 1157 mulpd %xmm0, %xmm5 1158 movapd -14 * SIZE(AO, %rax, 2), %xmm0 1159 addpd %xmm5, %xmm11 1160 movddup -11 * SIZE(BO, %rax, 4), %xmm5 1161 mulpd %xmm0, %xmm1 1162 addpd %xmm1, %xmm8 1163 movddup -10 * SIZE(BO, %rax, 4), %xmm1 1164 mulpd %xmm0, %xmm5 1165 addpd %xmm5, %xmm9 1166 movddup -9 * SIZE(BO, %rax, 4), %xmm5 1167 mulpd %xmm0, %xmm1 1168 addpd %xmm1, %xmm10 1169 movddup (BO, %rax, 4), %xmm1 1170 mulpd %xmm0, %xmm5 1171 movapd -8 * SIZE(AO, %rax, 2), %xmm0 1172 addpd %xmm5, %xmm11 1173 movddup -7 * SIZE(BO, %rax, 4), %xmm5 1174 mulpd %xmm2, %xmm3 1175 addpd %xmm3, %xmm8 1176 movddup -6 * SIZE(BO, %rax, 4), %xmm3 1177 mulpd %xmm2, %xmm5 1178 addpd %xmm5, %xmm9 1179 movddup -5 * SIZE(BO, %rax, 4), %xmm5 1180 mulpd %xmm2, %xmm3 1181 addpd %xmm3, %xmm10 1182 movddup -4 * SIZE(BO, %rax, 4), %xmm3 1183 mulpd %xmm2, %xmm5 1184 movapd -10 * SIZE(AO, %rax, 2), %xmm2 1185 addpd %xmm5, %xmm11 1186 movddup -3 * SIZE(BO, %rax, 4), %xmm5 1187 mulpd %xmm2, %xmm3 1188 addpd %xmm3, %xmm8 1189 movddup -2 * SIZE(BO, %rax, 4), %xmm3 1190 mulpd %xmm2, %xmm5 1191 addpd %xmm5, %xmm9 1192 movddup -1 * SIZE(BO, %rax, 4), %xmm5 1193 mulpd %xmm2, %xmm3 1194 addpd %xmm3, %xmm10 1195 movddup 8 * SIZE(BO, %rax, 4), %xmm3 1196 mulpd %xmm2, %xmm5 1197 movapd -4 * SIZE(AO, %rax, 2), %xmm2 1198 addpd %xmm5, %xmm11 1199 movddup 1 * SIZE(BO, %rax, 4), %xmm5 1200 1201 addq $4 * SIZE, %rax 1202 BRANCH 1203 jl .L22 1204 ALIGN_4 1205 1206.L26: 1207#if defined(LT) || defined(RN) 1208 movq KK, %rax 1209#else 1210 movq K, %rax 1211 subq KK, %rax 1212#endif 1213 andq $3, %rax # if (k & 1) 1214 je .L29 1215 1216 leaq (, %rax, SIZE), %rax 1217 leaq (AO, %rax, 2), AO 1218 leaq (BO, %rax, 4), BO 1219 negq %rax 1220 ALIGN_4 1221 1222.L27: 1223 mulpd %xmm0, %xmm1 1224 addpd %xmm1, %xmm8 1225 movddup -14 * SIZE(BO, %rax, 4), %xmm1 1226 mulpd %xmm0, %xmm5 1227 addpd %xmm5, %xmm9 1228 movddup -13 * SIZE(BO, %rax, 4), %xmm5 1229 mulpd %xmm0, %xmm1 1230 addpd %xmm1, %xmm10 1231 movddup -12 * SIZE(BO, %rax, 4), %xmm1 1232 mulpd %xmm0, %xmm5 1233 movapd -14 * SIZE(AO, %rax, 2), %xmm0 1234 addpd %xmm5, %xmm11 1235 movddup -11 * SIZE(BO, %rax, 4), %xmm5 1236 1237 addq $SIZE, %rax 1238 jl .L27 1239 ALIGN_4 1240 1241.L29: 1242#if defined(LN) || defined(RT) 1243 movq KK, %rax 1244#ifdef LN 1245 subq $2, %rax 1246#else 1247 subq $4, %rax 1248#endif 1249 1250 leaq (, %rax, SIZE), %rax 1251 1252 movq AORIG, AO 1253 leaq (AO, %rax, 2), AO 1254 leaq (B, %rax, 4), BO 1255#endif 1256 1257#if defined(LN) || defined(LT) 1258 movapd %xmm8, %xmm0 1259 unpcklpd %xmm9, %xmm8 1260 unpckhpd %xmm9, %xmm0 1261 1262 movapd %xmm10, %xmm2 1263 unpcklpd %xmm11, %xmm10 1264 unpckhpd %xmm11, %xmm2 1265 1266 movapd -16 * SIZE(BO), %xmm9 1267 movapd -14 * SIZE(BO), %xmm11 1268 movapd -12 * SIZE(BO), %xmm13 1269 movapd -10 * SIZE(BO), %xmm15 1270 1271 subpd %xmm8, %xmm9 1272 subpd %xmm10, %xmm11 1273 subpd %xmm0, %xmm13 1274 subpd %xmm2, %xmm15 1275#else 1276 movapd -16 * SIZE(AO), %xmm0 1277 movapd -14 * SIZE(AO), %xmm2 1278 movapd -12 * SIZE(AO), %xmm4 1279 movapd -10 * SIZE(AO), %xmm6 1280 1281 subpd %xmm8, %xmm0 1282 subpd %xmm9, %xmm2 1283 subpd %xmm10, %xmm4 1284 subpd %xmm11, %xmm6 1285#endif 1286 1287#ifdef LN 1288 movddup -13 * SIZE(AO), %xmm8 1289 mulpd %xmm8, %xmm13 1290 mulpd %xmm8, %xmm15 1291 1292 movddup -14 * SIZE(AO), %xmm10 1293 mulpd %xmm13, %xmm10 1294 subpd %xmm10, %xmm9 1295 movddup -14 * SIZE(AO), %xmm10 1296 mulpd %xmm15, %xmm10 1297 subpd %xmm10, %xmm11 1298 1299 movddup -16 * SIZE(AO), %xmm8 1300 mulpd %xmm8, %xmm9 1301 mulpd %xmm8, %xmm11 1302#endif 1303 1304#ifdef LT 1305 movddup -16 * SIZE(AO), %xmm8 1306 mulpd %xmm8, %xmm9 1307 mulpd %xmm8, %xmm11 1308 1309 movddup -15 * SIZE(AO), %xmm10 1310 mulpd %xmm9, %xmm10 1311 subpd %xmm10, %xmm13 1312 movddup -15 * SIZE(AO), %xmm10 1313 mulpd %xmm11, %xmm10 1314 subpd %xmm10, %xmm15 1315 1316 movddup -13 * SIZE(AO), %xmm8 1317 mulpd %xmm8, %xmm13 1318 mulpd %xmm8, %xmm15 1319#endif 1320 1321#ifdef RN 1322 movddup -16 * SIZE(BO), %xmm8 1323 mulpd %xmm8, %xmm0 1324 1325 movddup -15 * SIZE(BO), %xmm9 1326 mulpd %xmm0, %xmm9 1327 subpd %xmm9, %xmm2 1328 movddup -14 * SIZE(BO), %xmm10 1329 mulpd %xmm0, %xmm10 1330 subpd %xmm10, %xmm4 1331 movddup -13 * SIZE(BO), %xmm11 1332 mulpd %xmm0, %xmm11 1333 subpd %xmm11, %xmm6 1334 1335 movddup -11 * SIZE(BO), %xmm8 1336 mulpd %xmm8, %xmm2 1337 movddup -10 * SIZE(BO), %xmm9 1338 mulpd %xmm2, %xmm9 1339 subpd %xmm9, %xmm4 1340 movddup -9 * SIZE(BO), %xmm10 1341 mulpd %xmm2, %xmm10 1342 subpd %xmm10, %xmm6 1343 1344 movddup -6 * SIZE(BO), %xmm8 1345 mulpd %xmm8, %xmm4 1346 1347 movddup -5 * SIZE(BO), %xmm9 1348 mulpd %xmm4, %xmm9 1349 subpd %xmm9, %xmm6 1350 1351 movddup -1 * SIZE(BO), %xmm8 1352 mulpd %xmm8, %xmm6 1353#endif 1354 1355#ifdef RT 1356 movddup -1 * SIZE(BO), %xmm8 1357 mulpd %xmm8, %xmm6 1358 1359 movddup -2 * SIZE(BO), %xmm9 1360 mulpd %xmm6, %xmm9 1361 subpd %xmm9, %xmm4 1362 movddup -3 * SIZE(BO), %xmm10 1363 mulpd %xmm6, %xmm10 1364 subpd %xmm10, %xmm2 1365 movddup -4 * SIZE(BO), %xmm11 1366 mulpd %xmm6, %xmm11 1367 subpd %xmm11, %xmm0 1368 1369 movddup -6 * SIZE(BO), %xmm8 1370 mulpd %xmm8, %xmm4 1371 movddup -7 * SIZE(BO), %xmm9 1372 mulpd %xmm4, %xmm9 1373 subpd %xmm9, %xmm2 1374 movddup -8 * SIZE(BO), %xmm10 1375 mulpd %xmm4, %xmm10 1376 subpd %xmm10, %xmm0 1377 1378 movddup -11 * SIZE(BO), %xmm8 1379 mulpd %xmm8, %xmm2 1380 movddup -12 * SIZE(BO), %xmm9 1381 mulpd %xmm2, %xmm9 1382 subpd %xmm9, %xmm0 1383 1384 movddup -16 * SIZE(BO), %xmm8 1385 mulpd %xmm8, %xmm0 1386#endif 1387 1388#ifdef LN 1389 subq $2 * SIZE, CO1 1390 subq $2 * SIZE, CO2 1391#endif 1392 1393#if defined(LN) || defined(LT) 1394 movlpd %xmm9, 0 * SIZE(CO1) 1395 movlpd %xmm13, 1 * SIZE(CO1) 1396 1397 movhpd %xmm9, 0 * SIZE(CO2) 1398 movhpd %xmm13, 1 * SIZE(CO2) 1399 1400 movlpd %xmm11, 0 * SIZE(CO1, LDC, 2) 1401 movlpd %xmm15, 1 * SIZE(CO1, LDC, 2) 1402 1403 movhpd %xmm11, 0 * SIZE(CO2, LDC, 2) 1404 movhpd %xmm15, 1 * SIZE(CO2, LDC, 2) 1405#else 1406 movlpd %xmm0, 0 * SIZE(CO1) 1407 movhpd %xmm0, 1 * SIZE(CO1) 1408 1409 movlpd %xmm2, 0 * SIZE(CO2) 1410 movhpd %xmm2, 1 * SIZE(CO2) 1411 1412 movlpd %xmm4, 0 * SIZE(CO1, LDC, 2) 1413 movhpd %xmm4, 1 * SIZE(CO1, LDC, 2) 1414 1415 movlpd %xmm6, 0 * SIZE(CO2, LDC, 2) 1416 movhpd %xmm6, 1 * SIZE(CO2, LDC, 2) 1417#endif 1418 1419#if defined(LN) || defined(LT) 1420 movaps %xmm9, -16 * SIZE(BO) 1421 movaps %xmm11, -14 * SIZE(BO) 1422 movaps %xmm13, -12 * SIZE(BO) 1423 movaps %xmm15, -10 * SIZE(BO) 1424#else 1425 movaps %xmm0, -16 * SIZE(AO) 1426 movaps %xmm2, -14 * SIZE(AO) 1427 movaps %xmm4, -12 * SIZE(AO) 1428 movaps %xmm6, -10 * SIZE(AO) 1429#endif 1430 1431#ifndef LN 1432 addq $2 * SIZE, CO1 1433 addq $2 * SIZE, CO2 1434#endif 1435 1436#if defined(LT) || defined(RN) 1437 movq K, %rax 1438 subq KK, %rax 1439 leaq (,%rax, SIZE), %rax 1440 leaq (AO, %rax, 2), AO 1441 leaq (BO, %rax, 4), BO 1442#endif 1443 1444#ifdef LN 1445 subq $2, KK 1446#endif 1447 1448#ifdef LT 1449 addq $2, KK 1450#endif 1451 1452#ifdef RT 1453 movq K, %rax 1454 salq $1 + BASE_SHIFT, %rax 1455 addq %rax, AORIG 1456#endif 1457 ALIGN_4 1458 1459.L30: 1460 testq $1, M 1461 je .L39 1462 1463#ifdef LN 1464 movq K, %rax 1465 salq $0 + BASE_SHIFT, %rax 1466 subq %rax, AORIG 1467#endif 1468 1469#if defined(LN) || defined(RT) 1470 movq KK, %rax 1471 movq AORIG, AO 1472 leaq (, %rax, SIZE), %rax 1473 leaq (AO, %rax, 1), AO 1474#endif 1475 1476 movq B, BO 1477 1478#if defined(LN) || defined(RT) 1479 movq KK, %rax 1480 leaq (, %rax, SIZE), %rax 1481 leaq (BO, %rax, 4), BO 1482#endif 1483 1484 movddup -16 * SIZE(AO), %xmm0 1485 pxor %xmm8, %xmm8 1486 movddup -14 * SIZE(AO), %xmm2 1487 pxor %xmm9, %xmm9 1488 movddup -15 * SIZE(AO), %xmm4 1489 pxor %xmm10, %xmm10 1490 movapd -16 * SIZE(BO), %xmm1 1491 pxor %xmm11, %xmm11 1492 movapd -8 * SIZE(BO), %xmm3 1493 1494#if defined(LT) || defined(RN) 1495 movq KK, %rax 1496#else 1497 movq K, %rax 1498 subq KK, %rax 1499#endif 1500 andq $-4, %rax 1501 leaq (, %rax, SIZE), %rax 1502 leaq (AO, %rax, 1), AO 1503 leaq (BO, %rax, 4), BO 1504 negq %rax 1505 NOBRANCH 1506 je .L36 1507 ALIGN_4 1508 1509.L32: 1510 mulpd %xmm0, %xmm1 1511 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 1512 addpd %xmm1, %xmm8 1513 movapd -12 * SIZE(BO, %rax, 4), %xmm1 1514 addpd %xmm0, %xmm9 1515 movddup -12 * SIZE(AO, %rax, 1), %xmm0 1516 mulpd %xmm4, %xmm1 1517 mulpd -10 * SIZE(BO, %rax, 4), %xmm4 1518 addpd %xmm1, %xmm10 1519 movapd (BO, %rax, 4), %xmm1 1520 addpd %xmm4, %xmm11 1521 movddup -11 * SIZE(AO, %rax, 1), %xmm4 1522 mulpd %xmm2, %xmm3 1523 mulpd -6 * SIZE(BO, %rax, 4), %xmm2 1524 addpd %xmm3, %xmm8 1525 movapd -4 * SIZE(BO, %rax, 4), %xmm3 1526 addpd %xmm2, %xmm9 1527 movddup -13 * SIZE(AO, %rax, 1), %xmm2 1528 mulpd %xmm2, %xmm3 1529 mulpd -2 * SIZE(BO, %rax, 4), %xmm2 1530 addpd %xmm3, %xmm10 1531 movapd 8 * SIZE(BO, %rax, 4), %xmm3 1532 addpd %xmm2, %xmm11 1533 movddup -10 * SIZE(AO, %rax, 1), %xmm2 1534 1535 addq $4 * SIZE, %rax 1536 BRANCH 1537 jl .L32 1538 ALIGN_4 1539 1540.L36: 1541#if defined(LT) || defined(RN) 1542 movq KK, %rax 1543#else 1544 movq K, %rax 1545 subq KK, %rax 1546#endif 1547 andq $3, %rax # if (k & 1) 1548 je .L38 1549 1550 leaq (, %rax, SIZE), %rax 1551 leaq (AO, %rax, 1), AO 1552 leaq (BO, %rax, 4), BO 1553 negq %rax 1554 ALIGN_4 1555 1556.L37: 1557 mulpd %xmm0, %xmm1 1558 mulpd -14 * SIZE(BO, %rax, 4), %xmm0 1559 addpd %xmm1, %xmm8 1560 movapd -12 * SIZE(BO, %rax, 4), %xmm1 1561 addpd %xmm0, %xmm9 1562 movddup -15 * SIZE(AO, %rax, 1), %xmm0 1563 1564 addq $SIZE, %rax 1565 jl .L37 1566 ALIGN_4 1567 1568.L38: 1569 addpd %xmm10, %xmm8 1570 addpd %xmm11, %xmm9 1571 1572#if defined(LN) || defined(RT) 1573 movq KK, %rax 1574#ifdef LN 1575 subq $1, %rax 1576#else 1577 subq $4, %rax 1578#endif 1579 1580 leaq (, %rax, SIZE), %rax 1581 1582 movq AORIG, AO 1583 leaq (AO, %rax, 1), AO 1584 leaq (B, %rax, 4), BO 1585#endif 1586 1587#if defined(LN) || defined(LT) 1588 movapd -16 * SIZE(BO), %xmm2 1589 movapd -14 * SIZE(BO), %xmm3 1590 1591 subpd %xmm8, %xmm2 1592 subpd %xmm9, %xmm3 1593#else 1594 movapd -16 * SIZE(AO), %xmm2 1595 movapd -14 * SIZE(AO), %xmm3 1596 1597 subpd %xmm8, %xmm2 1598 subpd %xmm9, %xmm3 1599#endif 1600 1601#if defined(LN) || defined(LT) 1602 movddup -16 * SIZE(AO), %xmm0 1603 mulpd %xmm0, %xmm2 1604 mulpd %xmm0, %xmm3 1605#endif 1606 1607#ifdef RN 1608 movapd %xmm2, %xmm0 1609 unpckhpd %xmm0, %xmm0 1610 1611 movapd %xmm3, %xmm1 1612 unpckhpd %xmm1, %xmm1 1613 1614 movsd -16 * SIZE(BO), %xmm4 1615 mulsd %xmm4, %xmm2 1616 1617 movsd -15 * SIZE(BO), %xmm5 1618 mulsd %xmm2, %xmm5 1619 subsd %xmm5, %xmm0 1620 movsd -14 * SIZE(BO), %xmm6 1621 mulsd %xmm2, %xmm6 1622 subsd %xmm6, %xmm3 1623 movsd -13 * SIZE(BO), %xmm7 1624 mulsd %xmm2, %xmm7 1625 subsd %xmm7, %xmm1 1626 1627 movsd -11 * SIZE(BO), %xmm4 1628 mulsd %xmm4, %xmm0 1629 1630 movsd -10 * SIZE(BO), %xmm5 1631 mulsd %xmm0, %xmm5 1632 subsd %xmm5, %xmm3 1633 movsd -9 * SIZE(BO), %xmm6 1634 mulsd %xmm0, %xmm6 1635 subsd %xmm6, %xmm1 1636 1637 movsd -6 * SIZE(BO), %xmm4 1638 mulsd %xmm4, %xmm3 1639 1640 movsd -5 * SIZE(BO), %xmm5 1641 mulsd %xmm3, %xmm5 1642 subsd %xmm5, %xmm1 1643 1644 movsd -1 * SIZE(BO), %xmm4 1645 mulsd %xmm4, %xmm1 1646 1647 unpcklpd %xmm0, %xmm2 1648 unpcklpd %xmm1, %xmm3 1649#endif 1650 1651#ifdef RT 1652 movapd %xmm2, %xmm0 1653 unpckhpd %xmm0, %xmm0 1654 1655 movapd %xmm3, %xmm1 1656 unpckhpd %xmm1, %xmm1 1657 1658 movsd -1 * SIZE(BO), %xmm4 1659 mulsd %xmm4, %xmm1 1660 1661 movsd -2 * SIZE(BO), %xmm5 1662 mulsd %xmm1, %xmm5 1663 subsd %xmm5, %xmm3 1664 movsd -3 * SIZE(BO), %xmm6 1665 mulsd %xmm1, %xmm6 1666 subsd %xmm6, %xmm0 1667 movsd -4 * SIZE(BO), %xmm7 1668 mulsd %xmm1, %xmm7 1669 subsd %xmm7, %xmm2 1670 1671 movsd -6 * SIZE(BO), %xmm4 1672 mulsd %xmm4, %xmm3 1673 1674 movsd -7 * SIZE(BO), %xmm5 1675 mulsd %xmm3, %xmm5 1676 subsd %xmm5, %xmm0 1677 movsd -8 * SIZE(BO), %xmm6 1678 mulsd %xmm3, %xmm6 1679 subsd %xmm6, %xmm2 1680 1681 movsd -11 * SIZE(BO), %xmm4 1682 mulsd %xmm4, %xmm0 1683 1684 movsd -12 * SIZE(BO), %xmm5 1685 mulsd %xmm0, %xmm5 1686 subsd %xmm5, %xmm2 1687 1688 movsd -16 * SIZE(BO), %xmm4 1689 mulsd %xmm4, %xmm2 1690 1691 unpcklpd %xmm0, %xmm2 1692 unpcklpd %xmm1, %xmm3 1693 1694#endif 1695 1696#ifdef LN 1697 subq $1 * SIZE, CO1 1698 subq $1 * SIZE, CO2 1699#endif 1700 1701#if defined(LN) || defined(LT) 1702 movlpd %xmm2, 0 * SIZE(CO1) 1703 movhpd %xmm2, 0 * SIZE(CO2) 1704 movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) 1705 movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) 1706#else 1707 movlpd %xmm2, 0 * SIZE(CO1) 1708 movhpd %xmm2, 0 * SIZE(CO2) 1709 movlpd %xmm3, 0 * SIZE(CO1, LDC, 2) 1710 movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) 1711#endif 1712 1713#if defined(LN) || defined(LT) 1714 movaps %xmm2, -16 * SIZE(BO) 1715 movaps %xmm3, -14 * SIZE(BO) 1716#else 1717 movaps %xmm2, -16 * SIZE(AO) 1718 movaps %xmm3, -14 * SIZE(AO) 1719#endif 1720 1721#ifndef LN 1722 addq $1 * SIZE, CO1 1723 addq $1 * SIZE, CO2 1724#endif 1725 1726#if defined(LT) || defined(RN) 1727 movq K, %rax 1728 subq KK, %rax 1729 leaq (,%rax, SIZE), %rax 1730 leaq (AO, %rax, 1), AO 1731 leaq (BO, %rax, 4), BO 1732#endif 1733 1734#ifdef LN 1735 subq $1, KK 1736#endif 1737 1738#ifdef LT 1739 addq $1, KK 1740#endif 1741 1742#ifdef RT 1743 movq K, %rax 1744 salq $0 + BASE_SHIFT, %rax 1745 addq %rax, AORIG 1746#endif 1747 ALIGN_4 1748 1749.L39: 1750#ifdef LN 1751 leaq (, K, SIZE), %rax 1752 leaq (B, %rax, 4), B 1753#endif 1754 1755#if defined(LT) || defined(RN) 1756 movq BO, B 1757#endif 1758 1759#ifdef RN 1760 addq $4, KK 1761#endif 1762 1763#ifdef RT 1764 subq $4, KK 1765#endif 1766 1767 decq J # j -- 1768 jg .L01 1769 ALIGN_4 1770 1771.L40: 1772 testq $2, N 1773 je .L80 1774 1775#if defined(LT) || defined(RN) 1776 movq A, AO 1777#else 1778 movq A, AORIG 1779#endif 1780 1781#ifdef RT 1782 movq K, %rax 1783 salq $1 + BASE_SHIFT, %rax 1784 subq %rax, B 1785 1786 leaq (, LDC, 2), %rax 1787 subq %rax, C 1788#endif 1789 1790 movq C, CO1 # coffset1 = c 1791 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 1792#ifndef RT 1793 leaq (C, LDC, 2), C 1794#endif 1795 1796#ifdef LN 1797 movq OFFSET, %rax 1798 addq M, %rax 1799 movq %rax, KK 1800#endif 1801 1802#if defined(LT) 1803 movq OFFSET, %rax 1804 movq %rax, KK 1805#endif 1806 1807 movq M, I 1808 sarq $2, I # i = (m >> 2) 1809 jle .L60 1810 ALIGN_4 1811 1812.L51: 1813#ifdef LN 1814 movq K, %rax 1815 salq $2 + BASE_SHIFT, %rax 1816 subq %rax, AORIG 1817#endif 1818 1819#if defined(LN) || defined(RT) 1820 movq KK, %rax 1821 movq AORIG, AO 1822 leaq (, %rax, SIZE), %rax 1823 leaq (AO, %rax, 4), AO 1824#endif 1825 1826 movq B, BO 1827 1828#if defined(LN) || defined(RT) 1829 movq KK, %rax 1830 leaq (, %rax, SIZE), %rax 1831 leaq (BO, %rax, 2), BO 1832#endif 1833 1834 movddup -16 * SIZE(BO), %xmm1 1835 movddup -15 * SIZE(BO), %xmm5 1836 pxor %xmm8, %xmm8 1837 movddup -12 * SIZE(BO), %xmm3 1838 pxor %xmm9, %xmm9 1839 movapd -16 * SIZE(AO), %xmm0 1840 pxor %xmm12, %xmm12 1841 movapd -8 * SIZE(AO), %xmm4 1842 pxor %xmm13, %xmm13 1843 1844#ifndef LN 1845 prefetchw 3 * SIZE(CO1) 1846 movapd %xmm0, %xmm2 1847 prefetchw 3 * SIZE(CO2) 1848#else 1849 prefetchw -8 * SIZE(CO1) 1850 movapd %xmm0, %xmm2 1851 prefetchw -8 * SIZE(CO2) 1852#endif 1853 1854 1855#if defined(LT) || defined(RN) 1856 movq KK, %rax 1857#else 1858 movq K, %rax 1859 subq KK, %rax 1860#endif 1861 andq $-4, %rax 1862 leaq (, %rax, SIZE), %rax 1863 leaq (AO, %rax, 4), AO 1864 leaq (BO, %rax, 2), BO 1865 negq %rax 1866 NOBRANCH 1867 je .L56 1868 ALIGN_4 1869 1870.L52: 1871 mulpd %xmm1, %xmm0 1872 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 1873 addpd %xmm0, %xmm8 1874 movapd -12 * SIZE(AO, %rax, 4), %xmm0 1875 addpd %xmm1, %xmm12 1876 movddup -14 * SIZE(BO, %rax, 2), %xmm1 1877 mulpd %xmm5, %xmm2 1878 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 1879 addpd %xmm2, %xmm9 1880 addpd %xmm5, %xmm13 1881 movddup -13 * SIZE(BO, %rax, 2), %xmm5 1882 movapd %xmm0, %xmm2 1883 mulpd %xmm1, %xmm0 1884 mulpd -10 * SIZE(AO, %rax, 4), %xmm1 1885 addpd %xmm0, %xmm8 1886 movapd (AO, %rax, 4), %xmm0 1887 addpd %xmm1, %xmm12 1888 movddup -8 * SIZE(BO, %rax, 2), %xmm1 1889 mulpd %xmm5, %xmm2 1890 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 1891 addpd %xmm2, %xmm9 1892 addpd %xmm5, %xmm13 1893 movddup -11 * SIZE(BO, %rax, 2), %xmm5 1894 movapd %xmm4, %xmm2 1895 mulpd %xmm3, %xmm4 1896 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 1897 addpd %xmm4, %xmm8 1898 movapd -4 * SIZE(AO, %rax, 4), %xmm4 1899 addpd %xmm3, %xmm12 1900 movddup -10 * SIZE(BO, %rax, 2), %xmm3 1901 mulpd %xmm5, %xmm2 1902 mulpd -6 * SIZE(AO, %rax, 4), %xmm5 1903 addpd %xmm2, %xmm9 1904 addpd %xmm5, %xmm13 1905 movddup -9 * SIZE(BO, %rax, 2), %xmm5 1906 movapd %xmm4, %xmm2 1907 mulpd %xmm3, %xmm4 1908 mulpd -2 * SIZE(AO, %rax, 4), %xmm3 1909 addpd %xmm4, %xmm8 1910 movapd 8 * SIZE(AO, %rax, 4), %xmm4 1911 addpd %xmm3, %xmm12 1912 movddup -4 * SIZE(BO, %rax, 2), %xmm3 1913 mulpd %xmm5, %xmm2 1914 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 1915 addpd %xmm2, %xmm9 1916 addpd %xmm5, %xmm13 1917 movddup -7 * SIZE(BO, %rax, 2), %xmm5 1918 movapd %xmm0, %xmm2 1919 1920 addq $4 * SIZE, %rax 1921 BRANCH 1922 jl .L52 1923 ALIGN_4 1924 1925.L56: 1926#if defined(LT) || defined(RN) 1927 movq KK, %rax 1928#else 1929 movq K, %rax 1930 subq KK, %rax 1931#endif 1932 andq $3, %rax # if (k & 1) 1933 je .L59 1934 1935 leaq (, %rax, SIZE), %rax 1936 leaq (AO, %rax, 4), AO 1937 leaq (BO, %rax, 2), BO 1938 negq %rax 1939 ALIGN_4 1940 1941.L57: 1942 mulpd %xmm1, %xmm0 1943 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 1944 addpd %xmm0, %xmm8 1945 movapd -12 * SIZE(AO, %rax, 4), %xmm0 1946 addpd %xmm1, %xmm12 1947 movddup -14 * SIZE(BO, %rax, 2), %xmm1 1948 mulpd %xmm5, %xmm2 1949 mulpd -14 * SIZE(AO, %rax, 4), %xmm5 1950 addpd %xmm2, %xmm9 1951 addpd %xmm5, %xmm13 1952 movddup -13 * SIZE(BO, %rax, 2), %xmm5 1953 movapd %xmm0, %xmm2 1954 1955 addq $SIZE, %rax 1956 jl .L57 1957 ALIGN_4 1958 1959.L59: 1960#if defined(LN) || defined(RT) 1961 movq KK, %rax 1962#ifdef LN 1963 subq $4, %rax 1964#else 1965 subq $2, %rax 1966#endif 1967 1968 leaq (, %rax, SIZE), %rax 1969 1970 movq AORIG, AO 1971 leaq (AO, %rax, 4), AO 1972 leaq (B, %rax, 2), BO 1973#endif 1974 1975#if defined(LN) || defined(LT) 1976 movapd %xmm8, %xmm0 1977 unpcklpd %xmm9, %xmm8 1978 unpckhpd %xmm9, %xmm0 1979 1980 movapd %xmm12, %xmm4 1981 unpcklpd %xmm13, %xmm12 1982 unpckhpd %xmm13, %xmm4 1983 1984 movapd -16 * SIZE(BO), %xmm9 1985 movapd -14 * SIZE(BO), %xmm13 1986 movapd -12 * SIZE(BO), %xmm1 1987 movapd -10 * SIZE(BO), %xmm5 1988 1989 subpd %xmm8, %xmm9 1990 subpd %xmm0, %xmm13 1991 subpd %xmm12, %xmm1 1992 subpd %xmm4, %xmm5 1993#else 1994 movapd -16 * SIZE(AO), %xmm0 1995 movapd -14 * SIZE(AO), %xmm1 1996 movapd -12 * SIZE(AO), %xmm2 1997 movapd -10 * SIZE(AO), %xmm3 1998 1999 subpd %xmm8, %xmm0 2000 subpd %xmm12, %xmm1 2001 subpd %xmm9, %xmm2 2002 subpd %xmm13, %xmm3 2003#endif 2004 2005#ifdef LN 2006 movddup -1 * SIZE(AO), %xmm8 2007 mulpd %xmm8, %xmm5 2008 movddup -2 * SIZE(AO), %xmm10 2009 mulpd %xmm5, %xmm10 2010 subpd %xmm10, %xmm1 2011 movddup -3 * SIZE(AO), %xmm12 2012 mulpd %xmm5, %xmm12 2013 subpd %xmm12, %xmm13 2014 movddup -4 * SIZE(AO), %xmm14 2015 mulpd %xmm5, %xmm14 2016 subpd %xmm14, %xmm9 2017 2018 movddup -6 * SIZE(AO), %xmm8 2019 mulpd %xmm8, %xmm1 2020 movddup -7 * SIZE(AO), %xmm10 2021 mulpd %xmm1, %xmm10 2022 subpd %xmm10, %xmm13 2023 movddup -8 * SIZE(AO), %xmm12 2024 mulpd %xmm1, %xmm12 2025 subpd %xmm12, %xmm9 2026 2027 movddup -11 * SIZE(AO), %xmm8 2028 mulpd %xmm8, %xmm13 2029 movddup -12 * SIZE(AO), %xmm10 2030 mulpd %xmm13, %xmm10 2031 subpd %xmm10, %xmm9 2032 2033 movddup -16 * SIZE(AO), %xmm8 2034 mulpd %xmm8, %xmm9 2035#endif 2036 2037#ifdef LT 2038 movddup -16 * SIZE(AO), %xmm8 2039 mulpd %xmm8, %xmm9 2040 movddup -15 * SIZE(AO), %xmm10 2041 mulpd %xmm9, %xmm10 2042 subpd %xmm10, %xmm13 2043 movddup -14 * SIZE(AO), %xmm12 2044 mulpd %xmm9, %xmm12 2045 subpd %xmm12, %xmm1 2046 movddup -13 * SIZE(AO), %xmm14 2047 mulpd %xmm9, %xmm14 2048 subpd %xmm14, %xmm5 2049 2050 2051 movddup -11 * SIZE(AO), %xmm8 2052 mulpd %xmm8, %xmm13 2053 2054 movddup -10 * SIZE(AO), %xmm10 2055 mulpd %xmm13, %xmm10 2056 subpd %xmm10, %xmm1 2057 movddup -9 * SIZE(AO), %xmm12 2058 mulpd %xmm13, %xmm12 2059 subpd %xmm12, %xmm5 2060 2061 movddup -6 * SIZE(AO), %xmm8 2062 mulpd %xmm8, %xmm1 2063 movddup -5 * SIZE(AO), %xmm10 2064 mulpd %xmm1, %xmm10 2065 subpd %xmm10, %xmm5 2066 2067 movddup -1 * SIZE(AO), %xmm8 2068 mulpd %xmm8, %xmm5 2069#endif 2070 2071#ifdef RN 2072 movddup -16 * SIZE(BO), %xmm8 2073 mulpd %xmm8, %xmm0 2074 mulpd %xmm8, %xmm1 2075 2076 movddup -15 * SIZE(BO), %xmm9 2077 mulpd %xmm0, %xmm9 2078 subpd %xmm9, %xmm2 2079 movddup -15 * SIZE(BO), %xmm9 2080 mulpd %xmm1, %xmm9 2081 subpd %xmm9, %xmm3 2082 2083 movddup -13 * SIZE(BO), %xmm8 2084 mulpd %xmm8, %xmm2 2085 mulpd %xmm8, %xmm3 2086#endif 2087 2088#ifdef RT 2089 movddup -13 * SIZE(BO), %xmm8 2090 mulpd %xmm8, %xmm2 2091 mulpd %xmm8, %xmm3 2092 2093 movddup -14 * SIZE(BO), %xmm9 2094 mulpd %xmm2, %xmm9 2095 subpd %xmm9, %xmm0 2096 movddup -14 * SIZE(BO), %xmm9 2097 mulpd %xmm3, %xmm9 2098 subpd %xmm9, %xmm1 2099 2100 movddup -16 * SIZE(BO), %xmm8 2101 mulpd %xmm8, %xmm0 2102 mulpd %xmm8, %xmm1 2103#endif 2104 2105#ifdef LN 2106 subq $4 * SIZE, CO1 2107 subq $4 * SIZE, CO2 2108#endif 2109 2110#if defined(LN) || defined(LT) 2111 movlpd %xmm9, 0 * SIZE(CO1) 2112 movlpd %xmm13, 1 * SIZE(CO1) 2113 movlpd %xmm1, 2 * SIZE(CO1) 2114 movlpd %xmm5, 3 * SIZE(CO1) 2115 2116 movhpd %xmm9, 0 * SIZE(CO2) 2117 movhpd %xmm13, 1 * SIZE(CO2) 2118 movhpd %xmm1, 2 * SIZE(CO2) 2119 movhpd %xmm5, 3 * SIZE(CO2) 2120#else 2121 movlpd %xmm0, 0 * SIZE(CO1) 2122 movhpd %xmm0, 1 * SIZE(CO1) 2123 movlpd %xmm1, 2 * SIZE(CO1) 2124 movhpd %xmm1, 3 * SIZE(CO1) 2125 2126 movlpd %xmm2, 0 * SIZE(CO2) 2127 movhpd %xmm2, 1 * SIZE(CO2) 2128 movlpd %xmm3, 2 * SIZE(CO2) 2129 movhpd %xmm3, 3 * SIZE(CO2) 2130#endif 2131 2132#if defined(LN) || defined(LT) 2133 movaps %xmm9, -16 * SIZE(BO) 2134 movaps %xmm13,-14 * SIZE(BO) 2135 movaps %xmm1, -12 * SIZE(BO) 2136 movaps %xmm5, -10 * SIZE(BO) 2137#else 2138 movaps %xmm0, -16 * SIZE(AO) 2139 movaps %xmm1, -14 * SIZE(AO) 2140 movaps %xmm2, -12 * SIZE(AO) 2141 movaps %xmm3, -10 * SIZE(AO) 2142#endif 2143 2144#ifndef LN 2145 addq $4 * SIZE, CO1 2146 addq $4 * SIZE, CO2 2147#endif 2148 2149#if defined(LT) || defined(RN) 2150 movq K, %rax 2151 subq KK, %rax 2152 leaq (,%rax, SIZE), %rax 2153 leaq (AO, %rax, 4), AO 2154 leaq (BO, %rax, 2), BO 2155#endif 2156 2157#ifdef LN 2158 subq $4, KK 2159#endif 2160 2161#ifdef LT 2162 addq $4, KK 2163#endif 2164 2165#ifdef RT 2166 movq K, %rax 2167 salq $2 + BASE_SHIFT, %rax 2168 addq %rax, AORIG 2169#endif 2170 2171 decq I # i -- 2172 jg .L51 2173 ALIGN_4 2174 2175.L60: 2176 testq $2, M 2177 je .L70 2178 2179#ifdef LN 2180 movq K, %rax 2181 salq $1 + BASE_SHIFT, %rax 2182 subq %rax, AORIG 2183#endif 2184 2185#if defined(LN) || defined(RT) 2186 movq KK, %rax 2187 movq AORIG, AO 2188 leaq (, %rax, SIZE), %rax 2189 leaq (AO, %rax, 2), AO 2190#endif 2191 2192 movq B, BO 2193 2194#if defined(LN) || defined(RT) 2195 movq KK, %rax 2196 leaq (, %rax, SIZE), %rax 2197 leaq (BO, %rax, 2), BO 2198#endif 2199 2200 movapd -16 * SIZE(AO), %xmm0 2201 pxor %xmm8, %xmm8 2202 movapd -12 * SIZE(AO), %xmm2 2203 pxor %xmm9, %xmm9 2204 movddup -16 * SIZE(BO), %xmm1 2205 pxor %xmm10, %xmm10 2206 movddup -15 * SIZE(BO), %xmm3 2207 pxor %xmm11, %xmm11 2208 2209#if defined(LT) || defined(RN) 2210 movq KK, %rax 2211#else 2212 movq K, %rax 2213 subq KK, %rax 2214#endif 2215 andq $-4, %rax 2216 leaq (, %rax, SIZE), %rax 2217 leaq (AO, %rax, 2), AO 2218 leaq (BO, %rax, 2), BO 2219 negq %rax 2220 NOBRANCH 2221 je .L66 2222 ALIGN_4 2223 2224.L62: 2225 mulpd %xmm0, %xmm1 2226 addpd %xmm1, %xmm8 2227 movddup -14 * SIZE(BO, %rax, 2), %xmm1 2228 mulpd %xmm0, %xmm3 2229 movapd -14 * SIZE(AO, %rax, 2), %xmm0 2230 addpd %xmm3, %xmm9 2231 movddup -13 * SIZE(BO, %rax, 2), %xmm3 2232 mulpd %xmm0, %xmm1 2233 addpd %xmm1, %xmm10 2234 movddup -12 * SIZE(BO, %rax, 2), %xmm1 2235 mulpd %xmm0, %xmm3 2236 movapd -8 * SIZE(AO, %rax, 2), %xmm0 2237 addpd %xmm3, %xmm11 2238 movddup -11 * SIZE(BO, %rax, 2), %xmm3 2239 mulpd %xmm2, %xmm1 2240 addpd %xmm1, %xmm8 2241 movddup -10 * SIZE(BO, %rax, 2), %xmm1 2242 mulpd %xmm2, %xmm3 2243 movapd -10 * SIZE(AO, %rax, 2), %xmm2 2244 addpd %xmm3, %xmm9 2245 movddup -9 * SIZE(BO, %rax, 2), %xmm3 2246 mulpd %xmm2, %xmm1 2247 addpd %xmm1, %xmm10 2248 movddup -8 * SIZE(BO, %rax, 2), %xmm1 2249 mulpd %xmm2, %xmm3 2250 movapd -4 * SIZE(AO, %rax, 2), %xmm2 2251 addpd %xmm3, %xmm11 2252 movddup -7 * SIZE(BO, %rax, 2), %xmm3 2253 2254 addq $4 * SIZE, %rax 2255 BRANCH 2256 jl .L62 2257 ALIGN_4 2258 2259.L66: 2260#if defined(LT) || defined(RN) 2261 movq KK, %rax 2262#else 2263 movq K, %rax 2264 subq KK, %rax 2265#endif 2266 andq $3, %rax # if (k & 1) 2267 je .L69 2268 2269 leaq (, %rax, SIZE), %rax 2270 leaq (AO, %rax, 2), AO 2271 leaq (BO, %rax, 2), BO 2272 negq %rax 2273 ALIGN_4 2274 2275.L67: 2276 mulpd %xmm0, %xmm1 2277 addpd %xmm1, %xmm8 2278 movddup -14 * SIZE(BO, %rax, 2), %xmm1 2279 mulpd %xmm0, %xmm3 2280 movapd -14 * SIZE(AO, %rax, 2), %xmm0 2281 addpd %xmm3, %xmm9 2282 movddup -13 * SIZE(BO, %rax, 2), %xmm3 2283 2284 addq $SIZE, %rax 2285 jl .L67 2286 ALIGN_4 2287 2288.L69: 2289 addpd %xmm10, %xmm8 2290 addpd %xmm11, %xmm9 2291 2292#if defined(LN) || defined(RT) 2293 movq KK, %rax 2294#ifdef LN 2295 subq $2, %rax 2296#else 2297 subq $2, %rax 2298#endif 2299 2300 leaq (, %rax, SIZE), %rax 2301 2302 movq AORIG, AO 2303 leaq (AO, %rax, 2), AO 2304 leaq (B, %rax, 2), BO 2305#endif 2306 2307#if defined(LN) || defined(LT) 2308 movapd %xmm8, %xmm0 2309 unpcklpd %xmm9, %xmm8 2310 unpckhpd %xmm9, %xmm0 2311 2312 movapd -16 * SIZE(BO), %xmm9 2313 movapd -14 * SIZE(BO), %xmm13 2314 2315 subpd %xmm8, %xmm9 2316 subpd %xmm0, %xmm13 2317#else 2318 movapd -16 * SIZE(AO), %xmm0 2319 movapd -14 * SIZE(AO), %xmm2 2320 2321 subpd %xmm8, %xmm0 2322 subpd %xmm9, %xmm2 2323#endif 2324 2325 2326#ifdef LN 2327 movddup -13 * SIZE(AO), %xmm8 2328 mulpd %xmm8, %xmm13 2329 2330 movddup -14 * SIZE(AO), %xmm10 2331 mulpd %xmm13, %xmm10 2332 subpd %xmm10, %xmm9 2333 2334 movddup -16 * SIZE(AO), %xmm8 2335 mulpd %xmm8, %xmm9 2336#endif 2337 2338#ifdef LT 2339 movddup -16 * SIZE(AO), %xmm8 2340 mulpd %xmm8, %xmm9 2341 2342 movddup -15 * SIZE(AO), %xmm10 2343 mulpd %xmm9, %xmm10 2344 subpd %xmm10, %xmm13 2345 2346 movddup -13 * SIZE(AO), %xmm8 2347 mulpd %xmm8, %xmm13 2348#endif 2349 2350#ifdef RN 2351 movddup -16 * SIZE(BO), %xmm8 2352 mulpd %xmm8, %xmm0 2353 2354 movddup -15 * SIZE(BO), %xmm9 2355 mulpd %xmm0, %xmm9 2356 subpd %xmm9, %xmm2 2357 2358 movddup -13 * SIZE(BO), %xmm8 2359 mulpd %xmm8, %xmm2 2360#endif 2361 2362#ifdef RT 2363 movddup -13 * SIZE(BO), %xmm8 2364 mulpd %xmm8, %xmm2 2365 2366 movddup -14 * SIZE(BO), %xmm9 2367 mulpd %xmm2, %xmm9 2368 subpd %xmm9, %xmm0 2369 2370 movddup -16 * SIZE(BO), %xmm8 2371 mulpd %xmm8, %xmm0 2372#endif 2373 2374#ifdef LN 2375 subq $2 * SIZE, CO1 2376 subq $2 * SIZE, CO2 2377#endif 2378 2379#if defined(LN) || defined(LT) 2380 movlpd %xmm9, 0 * SIZE(CO1) 2381 movlpd %xmm13, 1 * SIZE(CO1) 2382 2383 movhpd %xmm9, 0 * SIZE(CO2) 2384 movhpd %xmm13, 1 * SIZE(CO2) 2385#else 2386 movlpd %xmm0, 0 * SIZE(CO1) 2387 movhpd %xmm0, 1 * SIZE(CO1) 2388 2389 movlpd %xmm2, 0 * SIZE(CO2) 2390 movhpd %xmm2, 1 * SIZE(CO2) 2391#endif 2392 2393#if defined(LN) || defined(LT) 2394 movaps %xmm9, -16 * SIZE(BO) 2395 movaps %xmm13, -14 * SIZE(BO) 2396#else 2397 movaps %xmm0, -16 * SIZE(AO) 2398 movaps %xmm2, -14 * SIZE(AO) 2399#endif 2400 2401#ifndef LN 2402 addq $2 * SIZE, CO1 2403 addq $2 * SIZE, CO2 2404#endif 2405 2406#if defined(LT) || defined(RN) 2407 movq K, %rax 2408 subq KK, %rax 2409 leaq (,%rax, SIZE), %rax 2410 leaq (AO, %rax, 2), AO 2411 leaq (BO, %rax, 2), BO 2412#endif 2413 2414#ifdef LN 2415 subq $2, KK 2416#endif 2417 2418#ifdef LT 2419 addq $2, KK 2420#endif 2421 2422#ifdef RT 2423 movq K, %rax 2424 salq $1 + BASE_SHIFT, %rax 2425 addq %rax, AORIG 2426#endif 2427 ALIGN_4 2428 2429.L70: 2430 testq $1, M 2431 je .L79 2432 ALIGN_4 2433 2434.L71: 2435#ifdef LN 2436 movq K, %rax 2437 salq $0 + BASE_SHIFT, %rax 2438 subq %rax, AORIG 2439#endif 2440 2441#if defined(LN) || defined(RT) 2442 movq KK, %rax 2443 movq AORIG, AO 2444 leaq (, %rax, SIZE), %rax 2445 leaq (AO, %rax, 1), AO 2446#endif 2447 2448 movq B, BO 2449 2450#if defined(LN) || defined(RT) 2451 movq KK, %rax 2452 salq $1 + BASE_SHIFT, %rax 2453 leaq (BO, %rax, 1), BO 2454#endif 2455 2456 movddup -16 * SIZE(AO), %xmm0 2457 pxor %xmm8, %xmm8 2458 movddup -15 * SIZE(AO), %xmm1 2459 pxor %xmm9, %xmm9 2460 movddup -14 * SIZE(AO), %xmm2 2461 pxor %xmm10, %xmm10 2462 movddup -13 * SIZE(AO), %xmm3 2463 pxor %xmm11, %xmm11 2464 2465#if defined(LT) || defined(RN) 2466 movq KK, %rax 2467#else 2468 movq K, %rax 2469 subq KK, %rax 2470#endif 2471 andq $-4, %rax 2472 leaq (, %rax, SIZE), %rax 2473 leaq (AO, %rax, 1), AO 2474 leaq (BO, %rax, 2), BO 2475 negq %rax 2476 NOBRANCH 2477 je .L76 2478 ALIGN_4 2479 2480.L72: 2481 mulpd -16 * SIZE(BO, %rax, 2), %xmm0 2482 addpd %xmm0, %xmm8 2483 movddup -12 * SIZE(AO, %rax, 1), %xmm0 2484 2485 mulpd -14 * SIZE(BO, %rax, 2), %xmm1 2486 addpd %xmm1, %xmm9 2487 movddup -11 * SIZE(AO, %rax, 1), %xmm1 2488 2489 mulpd -12 * SIZE(BO, %rax, 2), %xmm2 2490 addpd %xmm2, %xmm10 2491 movddup -10 * SIZE(AO, %rax, 1), %xmm2 2492 2493 mulpd -10 * SIZE(BO, %rax, 2), %xmm3 2494 addpd %xmm3, %xmm11 2495 movddup -9 * SIZE(AO, %rax, 1), %xmm3 2496 2497 addq $4 * SIZE, %rax 2498 BRANCH 2499 jl .L72 2500 ALIGN_4 2501 2502.L76: 2503#if defined(LT) || defined(RN) 2504 movq KK, %rax 2505#else 2506 movq K, %rax 2507 subq KK, %rax 2508#endif 2509 andq $3, %rax # if (k & 1) 2510 je .L78 2511 2512 leaq (, %rax, SIZE), %rax 2513 leaq (AO, %rax, 1), AO 2514 leaq (BO, %rax, 2), BO 2515 negq %rax 2516 ALIGN_4 2517 2518.L77: 2519 mulpd -16 * SIZE(BO, %rax, 2), %xmm0 2520 addpd %xmm0, %xmm8 2521 movddup -15 * SIZE(AO, %rax, 1), %xmm0 2522 2523 addq $SIZE, %rax 2524 jl .L77 2525 ALIGN_4 2526 2527.L78: 2528 addpd %xmm9, %xmm8 2529 addpd %xmm11, %xmm10 2530 addpd %xmm10, %xmm8 2531 2532#if defined(LN) || defined(RT) 2533 movq KK, %rax 2534#ifdef LN 2535 subq $1, %rax 2536#else 2537 subq $2, %rax 2538#endif 2539 2540 leaq (, %rax, SIZE), %rax 2541 2542 movq AORIG, AO 2543 leaq (AO, %rax, 1), AO 2544 leaq (B, %rax, 2), BO 2545#endif 2546 2547#if defined(LN) || defined(LT) 2548 movapd -16 * SIZE(BO), %xmm2 2549#else 2550 movapd -16 * SIZE(AO), %xmm2 2551#endif 2552 2553 subpd %xmm8, %xmm2 2554 2555#if defined(LN) || defined(LT) 2556 movddup -16 * SIZE(AO), %xmm0 2557 2558 mulpd %xmm0, %xmm2 2559#endif 2560 2561#ifdef RN 2562 movapd %xmm2, %xmm0 2563 unpckhpd %xmm0, %xmm0 2564 2565 mulsd -16 * SIZE(BO), %xmm2 2566 movsd -15 * SIZE(BO), %xmm4 2567 mulsd %xmm2, %xmm4 2568 subsd %xmm4, %xmm0 2569 2570 mulsd -13 * SIZE(BO), %xmm0 2571 unpcklpd %xmm0, %xmm2 2572#endif 2573 2574#ifdef RT 2575 movapd %xmm2, %xmm0 2576 unpckhpd %xmm0, %xmm0 2577 2578 mulsd -13 * SIZE(BO), %xmm0 2579 2580 movlpd -14 * SIZE(BO), %xmm4 2581 mulsd %xmm0, %xmm4 2582 subsd %xmm4, %xmm2 2583 2584 mulsd -16 * SIZE(BO), %xmm2 2585 unpcklpd %xmm0, %xmm2 2586#endif 2587 2588#ifdef LN 2589 subq $1 * SIZE, CO1 2590 subq $1 * SIZE, CO2 2591#endif 2592 2593 movlpd %xmm2, 0 * SIZE(CO1) 2594 movhpd %xmm2, 0 * SIZE(CO2) 2595 2596#if defined(LN) || defined(LT) 2597 movaps %xmm2, -16 * SIZE(BO) 2598#else 2599 movaps %xmm2, -16 * SIZE(AO) 2600#endif 2601 2602#ifndef LN 2603 addq $1 * SIZE, CO1 2604 addq $1 * SIZE, CO2 2605#endif 2606 2607#if defined(LT) || defined(RN) 2608 movq K, %rax 2609 subq KK, %rax 2610 leaq (,%rax, SIZE), %rax 2611 leaq (AO, %rax, 1), AO 2612 leaq (BO, %rax, 2), BO 2613#endif 2614 2615#ifdef LN 2616 subq $1, KK 2617#endif 2618 2619#ifdef LT 2620 addq $1, KK 2621#endif 2622 2623#ifdef RT 2624 movq K, %rax 2625 salq $0 + BASE_SHIFT, %rax 2626 addq %rax, AORIG 2627#endif 2628 ALIGN_4 2629 2630.L79: 2631#ifdef LN 2632 leaq (, K, SIZE), %rax 2633 leaq (B, %rax, 2), B 2634#endif 2635 2636#if defined(LT) || defined(RN) 2637 movq BO, B 2638#endif 2639 2640#ifdef RN 2641 addq $2, KK 2642#endif 2643 2644#ifdef RT 2645 subq $2, KK 2646#endif 2647 ALIGN_4 2648 2649.L80: 2650 testq $1, N 2651 je .L999 2652 2653#if defined(LT) || defined(RN) 2654 movq A, AO 2655#else 2656 movq A, AORIG 2657#endif 2658 2659#ifdef RT 2660 movq K, %rax 2661 salq $0 + BASE_SHIFT, %rax 2662 subq %rax, B 2663 2664 subq LDC, C 2665#endif 2666 2667 movq C, CO1 # coffset1 = c 2668#ifndef RT 2669 addq LDC, C 2670#endif 2671 2672#ifdef LN 2673 movq OFFSET, %rax 2674 addq M, %rax 2675 movq %rax, KK 2676#endif 2677 2678#ifdef LT 2679 movq OFFSET, %rax 2680 movq %rax, KK 2681#endif 2682 2683 movq M, I 2684 sarq $2, I # i = (m >> 2) 2685 jle .L100 2686 ALIGN_4 2687 2688.L91: 2689#ifdef LN 2690 movq K, %rax 2691 salq $2 + BASE_SHIFT, %rax 2692 subq %rax, AORIG 2693#endif 2694 2695#if defined(LN) || defined(RT) 2696 movq KK, %rax 2697 movq AORIG, AO 2698 leaq (, %rax, SIZE), %rax 2699 leaq (AO, %rax, 4), AO 2700#endif 2701 2702 movq B, BO 2703 2704#if defined(LN) || defined(RT) 2705 movq KK, %rax 2706 leaq (BO, %rax, SIZE), BO 2707#endif 2708 2709 movapd -16 * SIZE(AO), %xmm0 2710 pxor %xmm8, %xmm8 2711 movapd -8 * SIZE(AO), %xmm2 2712 pxor %xmm9, %xmm9 2713 movddup -16 * SIZE(BO), %xmm1 2714 pxor %xmm10, %xmm10 2715 movddup -15 * SIZE(BO), %xmm5 2716 pxor %xmm11, %xmm11 2717 movddup -14 * SIZE(BO), %xmm3 2718 2719#ifndef LN 2720 prefetchw 3 * SIZE(CO1) 2721#else 2722 prefetchw -8 * SIZE(CO1) 2723#endif 2724 2725#if defined(LT) || defined(RN) 2726 movq KK, %rax 2727#else 2728 movq K, %rax 2729 subq KK, %rax 2730#endif 2731 andq $-4, %rax 2732 leaq (, %rax, SIZE), %rax 2733 leaq (AO, %rax, 4), AO 2734 leaq (BO, %rax, 1), BO 2735 negq %rax 2736 NOBRANCH 2737 je .L96 2738 ALIGN_4 2739 2740.L92: 2741 mulpd %xmm1, %xmm0 2742 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 2743 addpd %xmm0, %xmm8 2744 movapd -12 * SIZE(AO, %rax, 4), %xmm0 2745 addpd %xmm1, %xmm9 2746 movddup -12 * SIZE(BO, %rax, 1), %xmm1 2747 mulpd %xmm5, %xmm0 2748 mulpd -10 * SIZE(AO, %rax, 4), %xmm5 2749 addpd %xmm0, %xmm10 2750 movapd (AO, %rax, 4), %xmm0 2751 addpd %xmm5, %xmm11 2752 movddup -13 * SIZE(BO, %rax, 1), %xmm5 2753 mulpd %xmm3, %xmm2 2754 mulpd -6 * SIZE(AO, %rax, 4), %xmm3 2755 addpd %xmm2, %xmm8 2756 movapd -4 * SIZE(AO, %rax, 4), %xmm2 2757 addpd %xmm3, %xmm9 2758 movddup -10 * SIZE(BO, %rax, 1), %xmm3 2759 mulpd %xmm5, %xmm2 2760 mulpd -2 * SIZE(AO, %rax, 4), %xmm5 2761 addpd %xmm2, %xmm10 2762 movapd 8 * SIZE(AO, %rax, 4), %xmm2 2763 addpd %xmm5, %xmm11 2764 movddup -11 * SIZE(BO, %rax, 1), %xmm5 2765 2766 addq $4 * SIZE, %rax 2767 BRANCH 2768 jl .L92 2769 ALIGN_4 2770 2771.L96: 2772#if defined(LT) || defined(RN) 2773 movq KK, %rax 2774#else 2775 movq K, %rax 2776 subq KK, %rax 2777#endif 2778 andq $3, %rax # if (k & 1) 2779 je .L99 2780 2781 leaq (, %rax, SIZE), %rax 2782 leaq (AO, %rax, 4), AO 2783 leaq (BO, %rax, 1), BO 2784 negq %rax 2785 ALIGN_4 2786 2787.L97: 2788 mulpd %xmm1, %xmm0 2789 mulpd -14 * SIZE(AO, %rax, 4), %xmm1 2790 addpd %xmm0, %xmm8 2791 movapd -12 * SIZE(AO, %rax, 4), %xmm0 2792 addpd %xmm1, %xmm9 2793 movddup -15 * SIZE(BO, %rax, 1), %xmm1 2794 2795 addq $SIZE, %rax 2796 jl .L97 2797 ALIGN_4 2798.L99: 2799 addpd %xmm10, %xmm8 2800 addpd %xmm11, %xmm9 2801 2802#if defined(LN) || defined(RT) 2803 movq KK, %rax 2804#ifdef LN 2805 subq $4, %rax 2806#else 2807 subq $1, %rax 2808#endif 2809 2810 leaq (, %rax, SIZE), %rax 2811 2812 movq AORIG, AO 2813 leaq (AO, %rax, 4), AO 2814 leaq (B, %rax, 1), BO 2815#endif 2816 2817#if defined(LN) || defined(LT) 2818 movapd -16 * SIZE(BO), %xmm10 2819 movapd -14 * SIZE(BO), %xmm11 2820 2821 subpd %xmm8, %xmm10 2822 subpd %xmm9, %xmm11 2823#else 2824 movapd -16 * SIZE(AO), %xmm10 2825 movapd -14 * SIZE(AO), %xmm11 2826 2827 subpd %xmm8, %xmm10 2828 subpd %xmm9, %xmm11 2829#endif 2830 2831#ifdef LN 2832 movapd %xmm10, %xmm8 2833 unpckhpd %xmm8, %xmm8 2834 2835 movapd %xmm11, %xmm9 2836 unpckhpd %xmm9, %xmm9 2837 2838 movsd -1 * SIZE(AO), %xmm12 2839 mulsd %xmm12, %xmm9 2840 2841 movsd -2 * SIZE(AO), %xmm13 2842 mulsd %xmm9, %xmm13 2843 subsd %xmm13, %xmm11 2844 movsd -3 * SIZE(AO), %xmm14 2845 mulsd %xmm9, %xmm14 2846 subsd %xmm14, %xmm8 2847 movsd -4 * SIZE(AO), %xmm15 2848 mulsd %xmm9, %xmm15 2849 subsd %xmm15, %xmm10 2850 2851 movsd -6 * SIZE(AO), %xmm12 2852 mulsd %xmm12, %xmm11 2853 2854 movsd -7 * SIZE(AO), %xmm13 2855 mulsd %xmm11, %xmm13 2856 subsd %xmm13, %xmm8 2857 movsd -8 * SIZE(AO), %xmm14 2858 mulsd %xmm11, %xmm14 2859 subsd %xmm14, %xmm10 2860 2861 movsd -11 * SIZE(AO), %xmm12 2862 mulsd %xmm12, %xmm8 2863 2864 movsd -12 * SIZE(AO), %xmm13 2865 mulsd %xmm8, %xmm13 2866 subsd %xmm13, %xmm10 2867 2868 movsd -16 * SIZE(AO), %xmm12 2869 mulsd %xmm12, %xmm10 2870 2871 unpcklpd %xmm8, %xmm10 2872 unpcklpd %xmm9, %xmm11 2873#endif 2874 2875#ifdef LT 2876 movapd %xmm10, %xmm8 2877 unpckhpd %xmm8, %xmm8 2878 2879 movapd %xmm11, %xmm9 2880 unpckhpd %xmm9, %xmm9 2881 2882 movsd -16 * SIZE(AO), %xmm12 2883 mulsd %xmm12, %xmm10 2884 2885 movsd -15 * SIZE(AO), %xmm13 2886 mulsd %xmm10, %xmm13 2887 subsd %xmm13, %xmm8 2888 movsd -14 * SIZE(AO), %xmm14 2889 mulsd %xmm10, %xmm14 2890 subsd %xmm14, %xmm11 2891 movsd -13 * SIZE(AO), %xmm15 2892 mulsd %xmm10, %xmm15 2893 subsd %xmm15, %xmm9 2894 2895 movsd -11 * SIZE(AO), %xmm12 2896 mulsd %xmm12, %xmm8 2897 2898 movsd -10 * SIZE(AO), %xmm13 2899 mulsd %xmm8, %xmm13 2900 subsd %xmm13, %xmm11 2901 movsd -9 * SIZE(AO), %xmm14 2902 mulsd %xmm8, %xmm14 2903 subsd %xmm14, %xmm9 2904 2905 movsd -6 * SIZE(AO), %xmm12 2906 mulsd %xmm12, %xmm11 2907 2908 movsd -5 * SIZE(AO), %xmm13 2909 mulsd %xmm11, %xmm13 2910 subsd %xmm13, %xmm9 2911 2912 movsd -1 * SIZE(AO), %xmm12 2913 mulsd %xmm12, %xmm9 2914 2915 unpcklpd %xmm8, %xmm10 2916 unpcklpd %xmm9, %xmm11 2917#endif 2918 2919#ifdef RN 2920 movddup -16 * SIZE(BO), %xmm8 2921 mulpd %xmm8, %xmm10 2922 mulpd %xmm8, %xmm11 2923#endif 2924 2925#ifdef RT 2926 movddup -16 * SIZE(BO), %xmm8 2927 mulpd %xmm8, %xmm10 2928 mulpd %xmm8, %xmm11 2929#endif 2930 2931#ifdef LN 2932 subq $4 * SIZE, CO1 2933#endif 2934 2935 movlpd %xmm10, 0 * SIZE(CO1) 2936 movhpd %xmm10, 1 * SIZE(CO1) 2937 movlpd %xmm11, 2 * SIZE(CO1) 2938 movhpd %xmm11, 3 * SIZE(CO1) 2939 2940#if defined(LN) || defined(LT) 2941 movaps %xmm10, -16 * SIZE(BO) 2942 movaps %xmm11, -14 * SIZE(BO) 2943#else 2944 movaps %xmm10, -16 * SIZE(AO) 2945 movaps %xmm11, -14 * SIZE(AO) 2946#endif 2947 2948#ifndef LN 2949 addq $4 * SIZE, CO1 2950#endif 2951 2952#if defined(LT) || defined(RN) 2953 movq K, %rax 2954 subq KK, %rax 2955 leaq (,%rax, SIZE), %rax 2956 leaq (AO, %rax, 4), AO 2957 addq %rax, BO 2958#endif 2959 2960#ifdef LN 2961 subq $4, KK 2962#endif 2963 2964#ifdef LT 2965 addq $4, KK 2966#endif 2967 2968#ifdef RT 2969 movq K, %rax 2970 salq $2 + BASE_SHIFT, %rax 2971 addq %rax, AORIG 2972#endif 2973 2974 decq I # i -- 2975 jg .L91 2976 ALIGN_4 2977 2978.L100: 2979 testq $2, M 2980 je .L110 2981 2982#ifdef LN 2983 movq K, %rax 2984 salq $1 + BASE_SHIFT, %rax 2985 subq %rax, AORIG 2986#endif 2987 2988#if defined(LN) || defined(RT) 2989 movq KK, %rax 2990 movq AORIG, AO 2991 leaq (, %rax, SIZE), %rax 2992 leaq (AO, %rax, 2), AO 2993#endif 2994 2995 movq B, BO 2996 2997#if defined(LN) || defined(RT) 2998 movq KK, %rax 2999 leaq (BO, %rax, SIZE), BO 3000#endif 3001 3002 movddup -16 * SIZE(BO), %xmm0 3003 pxor %xmm8, %xmm8 3004 movddup -15 * SIZE(BO), %xmm1 3005 pxor %xmm9, %xmm9 3006 movddup -14 * SIZE(BO), %xmm2 3007 pxor %xmm10, %xmm10 3008 movddup -13 * SIZE(BO), %xmm3 3009 pxor %xmm11, %xmm11 3010 3011#if defined(LT) || defined(RN) 3012 movq KK, %rax 3013#else 3014 movq K, %rax 3015 subq KK, %rax 3016#endif 3017 andq $-4, %rax 3018 leaq (, %rax, SIZE), %rax 3019 leaq (AO, %rax, 2), AO 3020 leaq (BO, %rax, 1), BO 3021 negq %rax 3022 NOBRANCH 3023 je .L106 3024 ALIGN_4 3025 3026.L102: 3027 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 3028 addpd %xmm0, %xmm8 3029 movddup -12 * SIZE(BO, %rax, 1), %xmm0 3030 3031 mulpd -14 * SIZE(AO, %rax, 2), %xmm1 3032 addpd %xmm1, %xmm9 3033 movddup -11 * SIZE(BO, %rax, 1), %xmm1 3034 3035 mulpd -12 * SIZE(AO, %rax, 2), %xmm2 3036 addpd %xmm2, %xmm10 3037 movddup -10 * SIZE(BO, %rax, 1), %xmm2 3038 3039 mulpd -10 * SIZE(AO, %rax, 2), %xmm3 3040 addpd %xmm3, %xmm11 3041 movddup -9 * SIZE(BO, %rax, 1), %xmm3 3042 3043 addq $4 * SIZE, %rax 3044 BRANCH 3045 jl .L102 3046 ALIGN_4 3047 3048.L106: 3049#if defined(LT) || defined(RN) 3050 movq KK, %rax 3051#else 3052 movq K, %rax 3053 subq KK, %rax 3054#endif 3055 andq $3, %rax # if (k & 1) 3056 je .L109 3057 3058 leaq (, %rax, SIZE), %rax 3059 leaq (AO, %rax, 2), AO 3060 leaq (BO, %rax, 1), BO 3061 negq %rax 3062 ALIGN_4 3063 3064.L107: 3065 movddup -16 * SIZE(BO, %rax, 1), %xmm0 3066 mulpd -16 * SIZE(AO, %rax, 2), %xmm0 3067 addpd %xmm0, %xmm8 3068 3069 addq $SIZE, %rax 3070 jl .L107 3071 ALIGN_4 3072 3073.L109: 3074 addpd %xmm9, %xmm8 3075 addpd %xmm11, %xmm10 3076 addpd %xmm10, %xmm8 3077 3078#if defined(LN) || defined(RT) 3079 movq KK, %rax 3080#ifdef LN 3081 subq $2, %rax 3082#else 3083 subq $1, %rax 3084#endif 3085 3086 leaq (, %rax, SIZE), %rax 3087 3088 movq AORIG, AO 3089 leaq (AO, %rax, 2), AO 3090 leaq (B, %rax, 1), BO 3091#endif 3092 3093#if defined(LN) || defined(LT) 3094 movapd -16 * SIZE(BO), %xmm10 3095 subpd %xmm8, %xmm10 3096#else 3097 movapd -16 * SIZE(AO), %xmm10 3098 subpd %xmm8, %xmm10 3099#endif 3100 3101#ifdef LN 3102 movapd %xmm10, %xmm8 3103 unpckhpd %xmm8, %xmm8 3104 3105 movsd -13 * SIZE(AO), %xmm12 3106 mulsd %xmm12, %xmm8 3107 3108 movsd -14 * SIZE(AO), %xmm13 3109 mulsd %xmm8, %xmm13 3110 subsd %xmm13, %xmm10 3111 3112 movsd -16 * SIZE(AO), %xmm12 3113 mulsd %xmm12, %xmm10 3114 3115 unpcklpd %xmm8, %xmm10 3116#endif 3117 3118#ifdef LT 3119 movapd %xmm10, %xmm8 3120 unpckhpd %xmm8, %xmm8 3121 3122 movsd -16 * SIZE(AO), %xmm12 3123 mulsd %xmm12, %xmm10 3124 3125 movsd -15 * SIZE(AO), %xmm13 3126 mulsd %xmm10, %xmm13 3127 subsd %xmm13, %xmm8 3128 3129 movsd -13 * SIZE(AO), %xmm12 3130 mulsd %xmm12, %xmm8 3131 3132 unpcklpd %xmm8, %xmm10 3133#endif 3134 3135#ifdef RN 3136 movddup -16 * SIZE(BO), %xmm8 3137 mulpd %xmm8, %xmm10 3138#endif 3139 3140#ifdef RT 3141 movddup -16 * SIZE(BO), %xmm8 3142 mulpd %xmm8, %xmm10 3143#endif 3144 3145#ifdef LN 3146 subq $2 * SIZE, CO1 3147#endif 3148 3149#if defined(LN) || defined(LT) 3150 movlpd %xmm10, 0 * SIZE(CO1) 3151 movhpd %xmm10, 1 * SIZE(CO1) 3152#else 3153 movlpd %xmm10, 0 * SIZE(CO1) 3154 movhpd %xmm10, 1 * SIZE(CO1) 3155#endif 3156 3157#if defined(LN) || defined(LT) 3158 movaps %xmm10, -16 * SIZE(BO) 3159#else 3160 movaps %xmm10, -16 * SIZE(AO) 3161#endif 3162 3163#ifndef LN 3164 addq $2 * SIZE, CO1 3165#endif 3166 3167#if defined(LT) || defined(RN) 3168 movq K, %rax 3169 subq KK, %rax 3170 leaq (,%rax, SIZE), %rax 3171 leaq (AO, %rax, 2), AO 3172 addq %rax, BO 3173#endif 3174 3175#ifdef LN 3176 subq $2, KK 3177#endif 3178 3179#ifdef LT 3180 addq $2, KK 3181#endif 3182 3183#ifdef RT 3184 movq K, %rax 3185 salq $1 + BASE_SHIFT, %rax 3186 addq %rax, AORIG 3187#endif 3188 ALIGN_4 3189 3190.L110: 3191 testq $1, M 3192 je .L119 3193 ALIGN_4 3194 3195.L111: 3196#ifdef LN 3197 movq K, %rax 3198 salq $0 + BASE_SHIFT, %rax 3199 subq %rax, AORIG 3200#endif 3201 3202#if defined(LN) || defined(RT) 3203 movq KK, %rax 3204 movq AORIG, AO 3205 leaq (, %rax, SIZE), %rax 3206 leaq (AO, %rax, 1), AO 3207#endif 3208 3209 movq B, BO 3210 3211#if defined(LN) || defined(RT) 3212 movq KK, %rax 3213 leaq (BO, %rax, SIZE), BO 3214#endif 3215 3216 movapd -16 * SIZE(AO), %xmm0 3217 pxor %xmm8, %xmm8 3218 movapd -14 * SIZE(AO), %xmm1 3219 pxor %xmm9, %xmm9 3220 3221#if defined(LT) || defined(RN) 3222 movq KK, %rax 3223#else 3224 movq K, %rax 3225 subq KK, %rax 3226#endif 3227 andq $-4, %rax 3228 leaq (, %rax, SIZE), %rax 3229 leaq (AO, %rax, 1), AO 3230 leaq (BO, %rax, 1), BO 3231 negq %rax 3232 NOBRANCH 3233 je .L116 3234 ALIGN_4 3235 3236.L112: 3237 mulpd -16 * SIZE(BO, %rax, 1), %xmm0 3238 addpd %xmm0, %xmm8 3239 movapd -12 * SIZE(AO, %rax, 1), %xmm0 3240 3241 mulpd -14 * SIZE(BO, %rax, 1), %xmm1 3242 addpd %xmm1, %xmm9 3243 movapd -10 * SIZE(AO, %rax, 1), %xmm1 3244 3245 addq $4 * SIZE, %rax 3246 BRANCH 3247 jl .L112 3248 ALIGN_4 3249 3250.L116: 3251#if defined(LT) || defined(RN) 3252 movq KK, %rax 3253#else 3254 movq K, %rax 3255 subq KK, %rax 3256#endif 3257 andq $3, %rax # if (k & 1) 3258 je .L118 3259 3260 leaq (, %rax, SIZE), %rax 3261 leaq (AO, %rax, 1), AO 3262 leaq (BO, %rax, 1), BO 3263 negq %rax 3264 ALIGN_4 3265 3266.L117: 3267 mulsd -16 * SIZE(BO, %rax, 1), %xmm0 3268 addsd %xmm0, %xmm8 3269 movsd -15 * SIZE(AO, %rax, 1), %xmm0 3270 3271 addq $SIZE, %rax 3272 jl .L117 3273 ALIGN_4 3274 3275.L118: 3276 addpd %xmm9, %xmm8 3277 haddpd %xmm8, %xmm8 3278 3279#if defined(LN) || defined(RT) 3280 movq KK, %rax 3281#ifdef LN 3282 subq $1, %rax 3283#else 3284 subq $1, %rax 3285#endif 3286 3287 leaq (, %rax, SIZE), %rax 3288 3289 movq AORIG, AO 3290 leaq (AO, %rax, 1), AO 3291 leaq (B, %rax, 1), BO 3292#endif 3293 3294#if defined(LN) || defined(LT) 3295 movsd -16 * SIZE(BO), %xmm10 3296 subsd %xmm8, %xmm10 3297#else 3298 movsd -16 * SIZE(AO), %xmm10 3299 subsd %xmm8, %xmm10 3300#endif 3301 3302#if defined(LN) || defined(LT) 3303 movsd -16 * SIZE(AO), %xmm12 3304 mulsd %xmm12, %xmm10 3305#endif 3306 3307#if defined(RN) || defined(RT) 3308 movsd -16 * SIZE(BO), %xmm8 3309 mulsd %xmm8, %xmm10 3310#endif 3311 3312#ifdef LN 3313 subq $1 * SIZE, CO1 3314#endif 3315 3316 movsd %xmm10, 0 * SIZE(CO1) 3317 3318#if defined(LN) || defined(LT) 3319 movlpd %xmm10, -16 * SIZE(BO) 3320#else 3321 movlpd %xmm10, -16 * SIZE(AO) 3322#endif 3323 3324#ifndef LN 3325 addq $1 * SIZE, CO1 3326#endif 3327 3328#if defined(LT) || defined(RN) 3329 movq K, %rax 3330 subq KK, %rax 3331 leaq (,%rax, SIZE), %rax 3332 addq %rax, AO 3333 addq %rax, BO 3334#endif 3335 3336#ifdef LN 3337 subq $1, KK 3338#endif 3339 3340#ifdef LT 3341 addq $1, KK 3342#endif 3343 3344#ifdef RT 3345 movq K, %rax 3346 salq $0 + BASE_SHIFT, %rax 3347 addq %rax, AORIG 3348#endif 3349 ALIGN_4 3350 3351.L119: 3352#ifdef LN 3353 leaq (B, K, SIZE), B 3354#endif 3355 3356#if defined(LT) || defined(RN) 3357 movq BO, B 3358#endif 3359 3360#ifdef RN 3361 addq $1, KK 3362#endif 3363 3364#ifdef RT 3365 subq $1, KK 3366#endif 3367 ALIGN_4 3368 3369 3370.L999: 3371 movq (%rsp), %rbx 3372 movq 8(%rsp), %rbp 3373 movq 16(%rsp), %r12 3374 movq 24(%rsp), %r13 3375 movq 32(%rsp), %r14 3376 movq 40(%rsp), %r15 3377 3378#ifdef WINDOWS_ABI 3379 movq 48(%rsp), %rdi 3380 movq 56(%rsp), %rsi 3381 movups 64(%rsp), %xmm6 3382 movups 80(%rsp), %xmm7 3383 movups 96(%rsp), %xmm8 3384 movups 112(%rsp), %xmm9 3385 movups 128(%rsp), %xmm10 3386 movups 144(%rsp), %xmm11 3387 movups 160(%rsp), %xmm12 3388 movups 176(%rsp), %xmm13 3389 movups 192(%rsp), %xmm14 3390 movups 208(%rsp), %xmm15 3391#endif 3392 3393 addq $STACKSIZE, %rsp 3394 ret 3395 3396 EPILOGUE 3397