/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/*********************************************************************
*
* 2014/06/28 Saar
*        BLASTEST               : OK
*        CTEST                  : OK
*        TEST                   : OK
*
*
* 2013/10/30 Saar
*
* Parameter:
*        UNROLL_M               2
*        UNROLL_N               2
*        ZGEMM_P                384
*        ZGEMM_Q                168
*        A_PR1                  512
*        B_PR1                  256
*
* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
*
*        3456x3456       82.4 GFLOPS with 8 threads on 4 modules (ACML:  76.3 ) (BULLDOZER:  81.0 )
*        3456x3456       79.9 GFLOPS with 4 threads on 4 modules (ACML:  69.9 ) (BULLDOZER:  74.6 )
*        3456x3456       40.4 GFLOPS with 2 threads on 2 modules (ACML:  35.8 ) (BULLDOZER:  37.9 )
*        3456x3456       20.3 GFLOPS with 1 thread  on 1 module  (ACML:  18.1 ) (BULLDOZER:  19.2 )
*
* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
*
*        6912x6912      227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 )
*        6912x6912      211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 )
*        6912x6912      123.5 GFLOPS with  8 threads on  8 modules (ACML:  92.7 ) (BULLDOZER: 117.0 )
*        3456x3456       64.1 GFLOPS with  4 threads on  4 modules (ACML:  49.1 ) (BULLDOZER:  61.7 )
*        3456x3456       33.4 GFLOPS with  2 threads on  2 modules (ACML:  28.1 ) (BULLDOZER:  30.9 )
*        3456x3456       17.0 GFLOPS with  1 thread  on  1 module  (ACML:  15.2 ) (BULLDOZER:  15.7 )
*
*********************************************************************/


#define ASSEMBLER
#include "common.h"

#define OLD_M     %rdi
#define OLD_N     %rsi
#define M         %r13
#define J         %r14
#define OLD_K     %rdx

#define A         %rcx
#define B         %r8
#define C         %r9
#define LDC       %r10

#define I         %r11
#define AO        %rdi
#define BO        %rsi
#define CO1       %r15
#define K         %r12
#define BI        %rbp
#define SP        %rbx

#define BO1       %rdi
#define BO2       %r15

#ifndef WINDOWS_ABI

#define STACKSIZE 96

#else

#define STACKSIZE 320

#define OLD_ALPHA_I     40 + STACKSIZE(%rsp)
#define OLD_A           48 + STACKSIZE(%rsp)
#define OLD_B           56 + STACKSIZE(%rsp)
#define OLD_C           64 + STACKSIZE(%rsp)
#define OLD_LDC         72 + STACKSIZE(%rsp)
#define OLD_OFFSET      80 + STACKSIZE(%rsp)

#endif
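
/* The prologue below re-aligns %rsp and reserves 128 + L_BUFFER_SIZE bytes of
   local workspace.  The named slots that follow (Ndiv6 .. KKK) live in the
   first 128 bytes; BUFFER1 holds the packed copy of the current B panel.
   L_BUFFER_SIZE (256*8*4 = 8192 bytes) appears to be sized for the 2-column
   packing loop, which stores 4 doubles (32 bytes) per k iteration, i.e. room
   for 256 k values, comfortably above ZGEMM_Q = 168. */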

#define L_BUFFER_SIZE 256*8*4

#define Ndiv6      24(%rsp)
#define Nmod6      32(%rsp)
#define N          40(%rsp)
#define ALPHA_R    48(%rsp)
#define ALPHA_I    56(%rsp)
#define OFFSET     64(%rsp)
#define KK         72(%rsp)
#define KKK        80(%rsp)
#define BUFFER1   128(%rsp)

#if defined(OS_WINDOWS)
#if   L_BUFFER_SIZE > 16384
#define STACK_TOUCH \
        movl    $0, 4096 * 4(%rsp);\
        movl    $0, 4096 * 3(%rsp);\
        movl    $0, 4096 * 2(%rsp);\
        movl    $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 12288
#define STACK_TOUCH \
        movl    $0, 4096 * 3(%rsp);\
        movl    $0, 4096 * 2(%rsp);\
        movl    $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 8192
#define STACK_TOUCH \
        movl    $0, 4096 * 2(%rsp);\
        movl    $0, 4096 * 1(%rsp);
#elif L_BUFFER_SIZE > 4096
#define STACK_TOUCH \
        movl    $0, 4096 * 1(%rsp);
#else
#define STACK_TOUCH
#endif
#else
#define STACK_TOUCH
#endif


#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define VFMADD_R    vfmaddpd
#define VFMADD_I    vfmaddpd
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define VFMADD_R    vfnmaddpd
#define VFMADD_I    vfmaddpd
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define VFMADD_R    vfmaddpd
#define VFMADD_I    vfnmaddpd
#else
#define VFMADD_R    vfnmaddpd
#define VFMADD_I    vfnmaddpd
#endif


#define A_PR1 512
#define B_PR1 256

#define KERNEL2x2_1(xx) \
        prefetcht0      A_PR1(AO,%rax,SIZE) ;\
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup         -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup         -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\

#define KERNEL2x2_2(xx) \
        vmovups          -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup         -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup         -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\

#define KERNEL2x2_3(xx) \
        prefetcht0      A_PR1+64(AO,%rax,SIZE) ;\
        vmovups           0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          0 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups           2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup          1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup          2 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup          3 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\

#define KERNEL2x2_4(xx) \
        vmovups           4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups           6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup          5 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup          6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup          7 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\
        addq            $16, BI ;\
        addq            $16, %rax ;\


#define KERNEL2x2_SUB(xx) \
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        vmovddup         -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        VFMADD_R        %xmm14,%xmm6,%xmm1,%xmm14 ;\
        vmovddup         -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        VFMADD_I        %xmm15,%xmm7,%xmm1,%xmm15 ;\
        addq            $4, BI ;\
        addq            $4, %rax ;\

/************************************************************************************************/

#define KERNEL1x2_1(xx) \
        prefetcht0      A_PR1(AO,%rax,SIZE) ;\
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup         -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup         -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL1x2_2(xx) \
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup         -2 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup         -1 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL1x2_3(xx) \
        vmovups          -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          0 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup          1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup          2 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup          3 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\

#define KERNEL1x2_4(xx) \
        vmovups          -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup          5 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup          6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup          7 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        addq            $16, BI ;\
        addq            $8 , %rax ;\


#define KERNEL1x2_SUB(xx) \
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -8 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -7 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        vmovddup         -6 * SIZE(BO, BI, SIZE), %xmm6 ;\
        VFMADD_R        %xmm10,%xmm6,%xmm0,%xmm10 ;\
        vmovddup         -5 * SIZE(BO, BI, SIZE), %xmm7 ;\
        VFMADD_I        %xmm11,%xmm7,%xmm0,%xmm11 ;\
        addq            $4, BI ;\
        addq            $2, %rax ;\

/************************************************************************************************/

#define KERNEL2x1_1(xx) \
        prefetcht0      A_PR1(AO,%rax,SIZE) ;\
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\

#define KERNEL2x1_2(xx) \
        vmovups          -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\

#define KERNEL2x1_3(xx) \
        prefetcht0      A_PR1+64(AO,%rax,SIZE) ;\
        vmovups           0 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          0 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups           2 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup          1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\

#define KERNEL2x1_4(xx) \
        vmovups           4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          2 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups           6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup          3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        addq            $8, BI ;\
        addq            $16, %rax ;\


#define KERNEL2x1_SUB(xx) \
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\
        VFMADD_R        %xmm12,%xmm4,%xmm1,%xmm12 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        VFMADD_I        %xmm13,%xmm5,%xmm1,%xmm13 ;\
        addq            $2, BI ;\
        addq            $4, %rax ;\


/************************************************************************************************/

#define KERNEL1x1_1(xx) \
        prefetcht0      A_PR1(AO,%rax,SIZE) ;\
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL1x1_2(xx) \
        vmovups          -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -2 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL1x1_3(xx) \
        vmovups          -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          0 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup          1 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\

#define KERNEL1x1_4(xx) \
        vmovups          -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup          2 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup          3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        addq            $8, BI ;\
        addq            $8, %rax ;\


#define KERNEL1x1_SUB(xx) \
        vmovups          -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\
        vmovddup         -4 * SIZE(BO, BI, SIZE), %xmm4 ;\
        VFMADD_R        %xmm8,%xmm4,%xmm0,%xmm8 ;\
        vmovddup         -3 * SIZE(BO, BI, SIZE), %xmm5 ;\
        VFMADD_I        %xmm9,%xmm5,%xmm0,%xmm9 ;\
        addq            $2, BI ;\
        addq            $2, %rax ;\


/************************************************************************************************/


        PROLOGUE
        PROFCODE

        subq    $STACKSIZE, %rsp
        movq    %rbx,   (%rsp)
        movq    %rbp,  8(%rsp)
        movq    %r12, 16(%rsp)
        movq    %r13, 24(%rsp)
        movq    %r14, 32(%rsp)
        movq    %r15, 40(%rsp)

        vzeroupper

#ifdef WINDOWS_ABI
        movq    %rdi,    48(%rsp)
        movq    %rsi,    56(%rsp)
        vmovups %xmm6,   64(%rsp)
        vmovups %xmm7,   80(%rsp)
        vmovups %xmm8,   96(%rsp)
        vmovups %xmm9,  112(%rsp)
        vmovups %xmm10, 128(%rsp)
        vmovups %xmm11, 144(%rsp)
        vmovups %xmm12, 160(%rsp)
        vmovups %xmm13, 176(%rsp)
        vmovups %xmm14, 192(%rsp)
        vmovups %xmm15, 208(%rsp)

        movq    ARG1, OLD_M
        movq    ARG2, OLD_N
        movq    ARG3, OLD_K
        movq    OLD_A, A
        movq    OLD_B, B
        movq    OLD_C, C
        movq    OLD_LDC, LDC
#ifdef TRMMKERNEL
        vmovsd  OLD_OFFSET, %xmm12
#endif
        vmovaps %xmm3, %xmm0
        vmovsd  OLD_ALPHA_I, %xmm1

#else
        movq    STACKSIZE +  8(%rsp), LDC
#ifdef TRMMKERNEL
        vmovsd  STACKSIZE + 16(%rsp), %xmm12
#endif

#endif

        movq    %rsp, SP                # save old stack
        subq    $128 + L_BUFFER_SIZE, %rsp
        andq    $-4096, %rsp            # align stack

        STACK_TOUCH

        cmpq    $0, OLD_M
        je      .L999

        cmpq    $0, OLD_N
        je      .L999

        cmpq    $0, OLD_K
        je      .L999

        movq    OLD_M, M
        movq    OLD_N, N
        movq    OLD_K, K

        vmovsd  %xmm0, ALPHA_R
        vmovsd  %xmm1, ALPHA_I

        salq    $ZBASE_SHIFT, LDC

        movq    N, %rax
        xorq    %rdx, %rdx
        movq    $2, %rdi
        divq    %rdi                    //    N / 2
        movq    %rax, Ndiv6             //    N / 2
        movq    %rdx, Nmod6             //    N % 2


#ifdef TRMMKERNEL
        vmovsd  %xmm12, OFFSET
        vmovsd  %xmm12, KK
#ifndef LEFT
        negq    KK
#endif
#endif

.L2_0:

        movq    Ndiv6,  J
        cmpq    $0, J
        je      .L1_0
        ALIGN_4



.L2_01:
        // copy to sub buffer
        movq    B, BO1
        leaq    BUFFER1, BO             // first buffer to BO
        movq    K, %rax
        ALIGN_4

.L2_02b:

        vmovups          (BO1), %xmm0
        vmovups 2 * SIZE(BO1), %xmm1
        vmovups %xmm0,          (BO)
        vmovups %xmm1, 2 * SIZE(BO)
        addq    $4*SIZE, BO1
        addq    $4*SIZE, BO
        decq    %rax
        jnz     .L2_02b

.L2_02c:

        movq    BO1, B                  // next offset of B

.L2_10:
        movq    C, CO1
        leaq    (C, LDC, 2), C          // c += 2 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK
#endif

        movq    A, AO                   // aoffset = a
        addq    $8 * SIZE, AO

        movq    M, I
        sarq    $1, I                   // i = (m >> 1)
        je      .L2_40

        ALIGN_4

.L2_11:

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $8 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $8 * SIZE, BO
        movq    %rax, BI                // Index for BO
        leaq    (,BI,4), BI             // BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $2, %rax                // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif

        vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax                // number of values in AO
#else
        addq    $2, %rax                // number of values in BO
#endif
        movq    %rax, KKK
#endif


        andq    $-8, %rax               //  K = K - ( K % 8 )
        je      .L2_16
        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,4), BI            //  BI = BI * 4 ; number of values

        salq    $2, %rax                // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
        leaq    (BO, BI, SIZE), BO
        negq    BI
        negq    %rax
        ALIGN_4

.L2_12:

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL2x2_1(xxx)
        KERNEL2x2_2(xxx)
        prefetcht0      B_PR1+64(BO,BI,SIZE)
        KERNEL2x2_3(xxx)
        KERNEL2x2_4(xxx)

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL2x2_1(xxx)
        KERNEL2x2_2(xxx)
        prefetcht0      B_PR1+64(BO,BI,SIZE)
        KERNEL2x2_3(xxx)
        KERNEL2x2_4(xxx)

        je      .L2_16

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL2x2_1(xxx)
        KERNEL2x2_2(xxx)
        prefetcht0      B_PR1+64(BO,BI,SIZE)
        KERNEL2x2_3(xxx)
        KERNEL2x2_4(xxx)

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL2x2_1(xxx)
        KERNEL2x2_2(xxx)
        prefetcht0      B_PR1+64(BO,BI,SIZE)
        KERNEL2x2_3(xxx)
        KERNEL2x2_4(xxx)

        je      .L2_16

        jmp     .L2_12
        ALIGN_4

.L2_16:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

        andq    $7, %rax                # if (k & 7)
        je      .L2_19

        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,4), BI            //  BI = BI * 4 ; number of values

        salq    $2, %rax                // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
        leaq    (BO, BI, SIZE), BO
        negq    BI
        negq    %rax
        ALIGN_4

.L2_17:

        KERNEL2x2_SUB(xxx)
        jl      .L2_17
        ALIGN_4


.L2_19:

        vmovddup        ALPHA_R, %xmm0
        vmovddup        ALPHA_I, %xmm1

        // swap high and low 64 bits
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm11, %xmm11, %xmm11
        vshufpd $0x01, %xmm13, %xmm13, %xmm13
        vshufpd $0x01, %xmm15, %xmm15, %xmm15

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

        vaddsubpd       %xmm9,  %xmm8 , %xmm8
        vaddsubpd       %xmm11, %xmm10, %xmm10
        vaddsubpd       %xmm13, %xmm12, %xmm12
        vaddsubpd       %xmm15, %xmm14, %xmm14

        vshufpd $0x01, %xmm8 , %xmm8, %xmm9
        vshufpd $0x01, %xmm10, %xmm10, %xmm11
        vshufpd $0x01, %xmm12, %xmm12, %xmm13
        vshufpd $0x01, %xmm14, %xmm14, %xmm15

#else
        vaddsubpd       %xmm8,  %xmm9 , %xmm9
        vaddsubpd       %xmm10, %xmm11, %xmm11
        vaddsubpd       %xmm12, %xmm13, %xmm13
        vaddsubpd       %xmm14, %xmm15, %xmm15

        vmovapd %xmm9,  %xmm8
        vmovapd %xmm11, %xmm10
        vmovapd %xmm13, %xmm12
        vmovapd %xmm15, %xmm14

        // swap high and low 64 bits
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm11, %xmm11, %xmm11
        vshufpd $0x01, %xmm13, %xmm13, %xmm13
        vshufpd $0x01, %xmm15, %xmm15, %xmm15

#endif

        // multiply with ALPHA_R
        vmulpd  %xmm8 , %xmm0, %xmm8
        vmulpd  %xmm10, %xmm0, %xmm10
        vmulpd  %xmm12, %xmm0, %xmm12
        vmulpd  %xmm14, %xmm0, %xmm14

        // multiply with ALPHA_I
        vmulpd  %xmm9 , %xmm1, %xmm9
        vmulpd  %xmm11, %xmm1, %xmm11
        vmulpd  %xmm13, %xmm1, %xmm13
        vmulpd  %xmm15, %xmm1, %xmm15

        vaddsubpd       %xmm9,  %xmm8 , %xmm8
        vaddsubpd       %xmm11, %xmm10, %xmm10
        vaddsubpd       %xmm13, %xmm12, %xmm12
        vaddsubpd       %xmm15, %xmm14, %xmm14
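
        // At this point xmm8 and xmm12 hold the two alpha-scaled complex
        // results of the first column of the 2x2 block, xmm10 and xmm14
        // those of the second column.  Below, C is loaded and added
        // (unless this is the TRMM kernel) and the block is stored back.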


#ifndef TRMMKERNEL

        vaddpd          (CO1), %xmm8 , %xmm8
        vaddpd  2 * SIZE(CO1), %xmm12, %xmm12

        vaddpd          (CO1, LDC), %xmm10, %xmm10
        vaddpd  2 * SIZE(CO1, LDC), %xmm14, %xmm14

#endif

        vmovups %xmm8 ,         (CO1)
        vmovups %xmm12, 2 * SIZE(CO1)

        vmovups %xmm10,         (CO1, LDC)
        vmovups %xmm14, 2 * SIZE(CO1, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,4), BI            //  BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $2, %rax                //  rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

        addq    $4 * SIZE, CO1          # coffset += 4
        decq    I                       # i --
        jg      .L2_11
        ALIGN_4


/**************************************************************************
* Rest of M
***************************************************************************/
.L2_40:
        testq   $1, M
        jz      .L2_60                  // to next 2 lines of N

        ALIGN_4

.L2_41:

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $8 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $8 * SIZE, BO
        movq    %rax, BI                // Index for BO
        leaq    (,BI,4), BI             // BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $1, %rax                // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif

        vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax                // number of values in AO
#else
        addq    $2, %rax                // number of values in BO
#endif
        movq    %rax, KKK
#endif


        andq    $-8, %rax               //  K = K - ( K % 8 )
        je      .L2_46
        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,4), BI            //  BI = BI * 4 ; number of values

        salq    $1, %rax                // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
        leaq    (BO, BI, SIZE), BO
        negq    BI
        negq    %rax
        ALIGN_4

.L2_42:

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL1x2_1(xxx)
        KERNEL1x2_2(xxx)
        prefetcht0      B_PR1+64(BO,BI,SIZE)
        KERNEL1x2_3(xxx)
        KERNEL1x2_4(xxx)

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL1x2_1(xxx)
        KERNEL1x2_2(xxx)
        prefetcht0      B_PR1+64(BO,BI,SIZE)
        KERNEL1x2_3(xxx)
        KERNEL1x2_4(xxx)

        je      .L2_46

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL1x2_1(xxx)
        KERNEL1x2_2(xxx)
        prefetcht0      B_PR1+64(BO,BI,SIZE)
        KERNEL1x2_3(xxx)
        KERNEL1x2_4(xxx)

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL1x2_1(xxx)
        KERNEL1x2_2(xxx)
        prefetcht0      B_PR1+64(BO,BI,SIZE)
        KERNEL1x2_3(xxx)
        KERNEL1x2_4(xxx)

        je      .L2_46

        jmp     .L2_42
        ALIGN_4

.L2_46:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

        andq    $7, %rax                # if (k & 7)
        je      .L2_49

        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,4), BI            //  BI = BI * 4 ; number of values

        salq    $1, %rax                // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
        leaq    (BO, BI, SIZE), BO
        negq    BI
        negq    %rax
        ALIGN_4
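
        // Tail loop for the 1x2 block: one KERNEL1x2_SUB per remaining
        // k iteration (K % 8); each step advances BI by 4 and %rax by 2
        // packed values.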

.L2_47:

        KERNEL1x2_SUB(xxx)
        jl      .L2_47
        ALIGN_4


.L2_49:

        vmovddup        ALPHA_R, %xmm0
        vmovddup        ALPHA_I, %xmm1

        // swap high and low 64 bits
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm11, %xmm11, %xmm11

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

        vaddsubpd       %xmm9,  %xmm8 , %xmm8
        vaddsubpd       %xmm11, %xmm10, %xmm10

        vshufpd $0x01, %xmm8 , %xmm8, %xmm9
        vshufpd $0x01, %xmm10, %xmm10, %xmm11

#else
        vaddsubpd       %xmm8,  %xmm9,  %xmm9
        vaddsubpd       %xmm10, %xmm11, %xmm11

        vmovapd %xmm9,  %xmm8
        vmovapd %xmm11, %xmm10

        // swap high and low 64 bits
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm11, %xmm11, %xmm11

#endif

        // multiply with ALPHA_R
        vmulpd  %xmm8 , %xmm0, %xmm8
        vmulpd  %xmm10, %xmm0, %xmm10

        // multiply with ALPHA_I
        vmulpd  %xmm9 , %xmm1, %xmm9
        vmulpd  %xmm11, %xmm1, %xmm11

        vaddsubpd       %xmm9,  %xmm8 , %xmm8
        vaddsubpd       %xmm11, %xmm10, %xmm10



#ifndef TRMMKERNEL

        vaddpd  (CO1),      %xmm8 , %xmm8
        vaddpd  (CO1, LDC), %xmm10, %xmm10

#endif

        vmovups %xmm8 , (CO1)
        vmovups %xmm10, (CO1, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,4), BI            //  BI = BI * 4 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $1, %rax                //  rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif

        addq    $2 * SIZE, CO1          # coffset += 2
        ALIGN_4




.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
        addq    $2, KK
#endif

        decq    J                       // j --
        jg      .L2_01                  // next 2 lines of N



.L1_0:

/************************************************************************************************
* Loop for Nmod6 % 2 > 0
*************************************************************************************************/

        movq    Nmod6, J
        andq    $1, J                   // j % 2
        je      .L999
        ALIGN_4

.L1_01:
        // copy to sub buffer
        movq    B, BO1
        leaq    BUFFER1, BO             // first buffer to BO
        movq    K, %rax
        ALIGN_4

.L1_02b:

        vmovups (BO1), %xmm0
        vmovups %xmm0, (BO)
        addq    $2*SIZE, BO1
        addq    $2*SIZE, BO
        decq    %rax
        jnz     .L1_02b

.L1_02c:

        movq    BO1, B                  // next offset of B

.L1_10:
        movq    C, CO1
        leaq    (C, LDC, 1), C          // c += 1 * ldc

#if defined(TRMMKERNEL) && defined(LEFT)
        movq    OFFSET, %rax
        movq    %rax, KK
#endif

        movq    A, AO                   // aoffset = a
        addq    $8 * SIZE, AO

        movq    M, I
        sarq    $1, I                   // i = (m >> 1)
        je      .L1_40

        ALIGN_4

.L1_11:

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                // Index for BO
        leaq    (,BI,2), BI             // BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $2, %rax                // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif

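        // Accumulators for the 2x1 micro kernel: xmm8/xmm9 collect the
        // b_real- and b_imag-weighted products for the first complex element
        // of the A column, xmm12/xmm13 those for the second; they are folded
        // into real/imaginary parts after the k loop.  vzeroall clears them.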
        vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $2, %rax                // number of values in AO
#else
        addq    $1, %rax                // number of values in BO
#endif
        movq    %rax, KKK
#endif


        andq    $-8, %rax               //  K = K - ( K % 8 )
        je      .L1_16
        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,2), BI            //  BI = BI * 2 ; number of values

        salq    $2, %rax                // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
        leaq    (BO, BI, SIZE), BO
        negq    BI
        negq    %rax
        ALIGN_4

.L1_12:

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL2x1_1(xxx)
        KERNEL2x1_2(xxx)
        KERNEL2x1_3(xxx)
        KERNEL2x1_4(xxx)

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL2x1_1(xxx)
        KERNEL2x1_2(xxx)
        KERNEL2x1_3(xxx)
        KERNEL2x1_4(xxx)

        je      .L1_16

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL2x1_1(xxx)
        KERNEL2x1_2(xxx)
        KERNEL2x1_3(xxx)
        KERNEL2x1_4(xxx)

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL2x1_1(xxx)
        KERNEL2x1_2(xxx)
        KERNEL2x1_3(xxx)
        KERNEL2x1_4(xxx)

        je      .L1_16

        jmp     .L1_12
        ALIGN_4

.L1_16:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

        andq    $7, %rax                # if (k & 7)
        je      .L1_19

        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,2), BI            //  BI = BI * 2 ; number of values

        salq    $2, %rax                // rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
        leaq    (BO, BI, SIZE), BO
        negq    BI
        negq    %rax
        ALIGN_4

.L1_17:

        KERNEL2x1_SUB(xxx)
        jl      .L1_17
        ALIGN_4


.L1_19:

        vmovddup        ALPHA_R, %xmm0
        vmovddup        ALPHA_I, %xmm1

        // swap high and low 64 bits
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm13, %xmm13, %xmm13

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

        vaddsubpd       %xmm9,  %xmm8 , %xmm8
        vaddsubpd       %xmm13, %xmm12, %xmm12

        vshufpd $0x01, %xmm8 , %xmm8, %xmm9
        vshufpd $0x01, %xmm12, %xmm12, %xmm13

#else
        vaddsubpd       %xmm8,  %xmm9 , %xmm9
        vaddsubpd       %xmm12, %xmm13, %xmm13

        vmovapd %xmm9,  %xmm8
        vmovapd %xmm13, %xmm12

        // swap high and low 64 bits
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9
        vshufpd $0x01, %xmm13, %xmm13, %xmm13

#endif

        // multiply with ALPHA_R
        vmulpd  %xmm8 , %xmm0, %xmm8
        vmulpd  %xmm12, %xmm0, %xmm12

        // multiply with ALPHA_I
        vmulpd  %xmm9 , %xmm1, %xmm9
        vmulpd  %xmm13, %xmm1, %xmm13

        vaddsubpd       %xmm9,  %xmm8 , %xmm8
        vaddsubpd       %xmm13, %xmm12, %xmm12



#ifndef TRMMKERNEL

        vaddpd          (CO1), %xmm8 , %xmm8
        vaddpd  2 * SIZE(CO1), %xmm12, %xmm12

#endif

        vmovups %xmm8 ,         (CO1)
        vmovups %xmm12, 2 * SIZE(CO1)


#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,2), BI            //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $2, %rax                //  rax = rax * 4 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $2, KK
#endif

        addq    $4 * SIZE, CO1          # coffset += 4
        decq    I                       # i --
        jg      .L1_11
        ALIGN_4


/**************************************************************************
* Rest of M
***************************************************************************/
.L1_40:
        testq   $1, M
        jz      .L999

        ALIGN_4

.L1_41:

#if !defined(TRMMKERNEL) || \
        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
#else
        movq    KK, %rax
        leaq    BUFFER1, BO             // first buffer to BO
        addq    $4 * SIZE, BO
        movq    %rax, BI                // Index for BO
        leaq    (,BI,2), BI             // BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $1, %rax                // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif

        vzeroall

#ifndef TRMMKERNEL
        movq    K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
        movq    K, %rax
        subq    KK, %rax
        movq    %rax, KKK
#else
        movq    KK, %rax
#ifdef LEFT
        addq    $1, %rax                // number of values in AO
#else
        addq    $1, %rax                // number of values in BO
#endif
        movq    %rax, KKK
#endif


        andq    $-8, %rax               //  K = K - ( K % 8 )
        je      .L1_46
        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,2), BI            //  BI = BI * 2 ; number of values

        salq    $1, %rax                // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
        leaq    (BO, BI, SIZE), BO
        negq    BI
        negq    %rax
        ALIGN_4

.L1_42:

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL1x1_1(xxx)
        KERNEL1x1_2(xxx)
        KERNEL1x1_3(xxx)
        KERNEL1x1_4(xxx)

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL1x1_1(xxx)
        KERNEL1x1_2(xxx)
        KERNEL1x1_3(xxx)
        KERNEL1x1_4(xxx)

        je      .L1_46

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL1x1_1(xxx)
        KERNEL1x1_2(xxx)
        KERNEL1x1_3(xxx)
        KERNEL1x1_4(xxx)

        prefetcht0      B_PR1(BO,BI,SIZE)
        KERNEL1x1_1(xxx)
        KERNEL1x1_2(xxx)
        KERNEL1x1_3(xxx)
        KERNEL1x1_4(xxx)

        je      .L1_46

        jmp     .L1_42
        ALIGN_4

.L1_46:
#ifndef TRMMKERNEL
        movq    K, %rax
#else
        movq    KKK, %rax
#endif

        andq    $7, %rax                # if (k & 7)
        je      .L1_49

        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,2), BI            //  BI = BI * 2 ; number of values

        salq    $1, %rax                // rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
        leaq    (BO, BI, SIZE), BO
        negq    BI
        negq    %rax
        ALIGN_4

.L1_47:

        KERNEL1x1_SUB(xxx)
        jl      .L1_47
        ALIGN_4


.L1_49:

        vmovddup        ALPHA_R, %xmm0
        vmovddup        ALPHA_I, %xmm1

        // swap high and low 64 bits
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

        vaddsubpd       %xmm9, %xmm8, %xmm8

        vshufpd $0x01, %xmm8 , %xmm8, %xmm9

#else
        vaddsubpd       %xmm8, %xmm9, %xmm9

        vmovapd %xmm9, %xmm8

        // swap high and low 64 bits
        vshufpd $0x01, %xmm9 , %xmm9, %xmm9

#endif

        // multiply with ALPHA_R
        vmulpd  %xmm8 , %xmm0, %xmm8

        // multiply with ALPHA_I
        vmulpd  %xmm9 , %xmm1, %xmm9

        vaddsubpd       %xmm9, %xmm8, %xmm8



#ifndef TRMMKERNEL

        vaddpd  (CO1), %xmm8 , %xmm8

#endif
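
        // Write the final complex element (real part, imaginary part) of
        // this 1x1 block back to C.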
        vmovups %xmm8 , (CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
        movq    K, %rax
        subq    KKK, %rax
        movq    %rax, BI                //  Index for BO
        leaq    ( ,BI,2), BI            //  BI = BI * 2 ; number of values
        leaq    (BO, BI, SIZE), BO
        salq    $1, %rax                //  rax = rax * 2 ; number of values
        leaq    (AO, %rax, SIZE), AO
#endif


#if defined(TRMMKERNEL) && defined(LEFT)
        addq    $1, KK
#endif

        addq    $2 * SIZE, CO1          # coffset += 2
        ALIGN_4



.L999:
        vzeroupper

        movq    SP, %rsp
        movq      (%rsp), %rbx
        movq     8(%rsp), %rbp
        movq    16(%rsp), %r12
        movq    24(%rsp), %r13
        movq    32(%rsp), %r14
        movq    40(%rsp), %r15

#ifdef WINDOWS_ABI
        movq     48(%rsp), %rdi
        movq     56(%rsp), %rsi
        vmovups  64(%rsp), %xmm6
        vmovups  80(%rsp), %xmm7
        vmovups  96(%rsp), %xmm8
        vmovups 112(%rsp), %xmm9
        vmovups 128(%rsp), %xmm10
        vmovups 144(%rsp), %xmm11
        vmovups 160(%rsp), %xmm12
        vmovups 176(%rsp), %xmm13
        vmovups 192(%rsp), %xmm14
        vmovups 208(%rsp), %xmm15
#endif

        addq    $STACKSIZE, %rsp
        ret

        EPILOGUE