1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name of The University of Texas at Austin nor the names
18       of its contributors may be used to endorse or promote products
19       derived from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 /* NOTE: The micro-kernels in this file were partially inspired by portions
36    of code found in OpenBLAS 0.2.8 (http://www.openblas.net/). -FGVZ */
37 
38 #include "blis.h"
39 
bli_sgemm_asm_8x8(dim_t k,float * restrict alpha,float * restrict a,float * restrict b,float * restrict beta,float * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)40 void bli_sgemm_asm_8x8
41      (
42        dim_t               k,
43        float*     restrict alpha,
44        float*     restrict a,
45        float*     restrict b,
46        float*     restrict beta,
47        float*     restrict c, inc_t rs_c, inc_t cs_c,
48        auxinfo_t* restrict data,
49        cntx_t*    restrict cntx
50      )
51 {
52     //void*   a_next = bli_auxinfo_next_a( data );
53     //void*   b_next = bli_auxinfo_next_b( data );
54 
55     uint64_t   k_iter = k / 4;
56     uint64_t   k_left = k % 4;
57 
58     __asm__ volatile
59     (
60     "                                            \n\t"
61     "                                            \n\t"
62     "movq                %2, %%rax               \n\t" // load address of a.
63     "movq                %3, %%rbx               \n\t" // load address of b.
64     //"movq                %9, %%r15               \n\t" // load address of b_next.
65     "                                            \n\t"
66     "vmovaps    0 * 32(%%rax), %%ymm0            \n\t" // initialize loop by pre-loading
67     "vmovsldup  0 * 32(%%rbx), %%ymm2            \n\t" // elements of a and b.
68     "vpermilps   $0x4e, %%ymm2, %%ymm3           \n\t"
69     "                                            \n\t"
70     "movq                %6, %%rcx               \n\t" // load address of c
71     "movq                %8, %%rdi               \n\t" // load cs_c
72     "leaq        (,%%rdi,4), %%rdi               \n\t" // cs_c *= sizeof(float)
73     "leaq   (%%rcx,%%rdi,4), %%r10               \n\t" // load address of c + 4*cs_c;
74     "                                            \n\t"
75     "leaq   (%%rdi,%%rdi,2), %%r14               \n\t" // r14 = 3*cs_c;
76     "prefetcht0   7 * 8(%%rcx)                   \n\t" // prefetch c + 0*cs_c
77     "prefetcht0   7 * 8(%%rcx,%%rdi)             \n\t" // prefetch c + 1*cs_c
78     "prefetcht0   7 * 8(%%rcx,%%rdi,2)           \n\t" // prefetch c + 2*cs_c
79     "prefetcht0   7 * 8(%%rcx,%%r14)             \n\t" // prefetch c + 3*cs_c
80     "prefetcht0   7 * 8(%%r10)                   \n\t" // prefetch c + 4*cs_c
81     "prefetcht0   7 * 8(%%r10,%%rdi)             \n\t" // prefetch c + 5*cs_c
82     "prefetcht0   7 * 8(%%r10,%%rdi,2)           \n\t" // prefetch c + 6*cs_c
83     "prefetcht0   7 * 8(%%r10,%%r14)             \n\t" // prefetch c + 7*cs_c
84     "                                            \n\t"
85     "vxorps    %%ymm8,  %%ymm8,  %%ymm8          \n\t"
86     "vxorps    %%ymm9,  %%ymm9,  %%ymm9          \n\t"
87     "vxorps    %%ymm10, %%ymm10, %%ymm10         \n\t"
88     "vxorps    %%ymm11, %%ymm11, %%ymm11         \n\t"
89     "vxorps    %%ymm12, %%ymm12, %%ymm12         \n\t"
90     "vxorps    %%ymm13, %%ymm13, %%ymm13         \n\t"
91     "vxorps    %%ymm14, %%ymm14, %%ymm14         \n\t"
92     "vxorps    %%ymm15, %%ymm15, %%ymm15         \n\t"
93     "                                            \n\t"
94     "                                            \n\t"
95     "                                            \n\t"
96     "movq      %0, %%rsi                         \n\t" // i = k_iter;
97     "testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
98     "je     .SCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
99     "                                            \n\t" // contains the k_left loop.
100     "                                            \n\t"
101     "                                            \n\t"
102     ".SLOOPKITER:                                \n\t" // MAIN LOOP
103     "                                            \n\t"
104     "                                            \n\t"
105     "                                            \n\t" // iteration 0
106     "prefetcht0  16 * 32(%%rax)                  \n\t"
107     "vmulps            %%ymm0,  %%ymm2, %%ymm6   \n\t"
108     "vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
109     "vmovshdup  0 * 32(%%rbx),  %%ymm2           \n\t"
110     "vmulps            %%ymm0,  %%ymm3, %%ymm7   \n\t"
111     "vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
112     "vaddps            %%ymm15, %%ymm6, %%ymm15  \n\t"
113     "vaddps            %%ymm13, %%ymm7, %%ymm13  \n\t"
114     "                                            \n\t"
115     "vmovaps    1 * 32(%%rax),  %%ymm1           \n\t"
116     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
117     "vmulps            %%ymm0,  %%ymm4, %%ymm6   \n\t"
118     "vmulps            %%ymm0,  %%ymm5, %%ymm7   \n\t"
119     "vaddps            %%ymm11, %%ymm6, %%ymm11  \n\t"
120     "vaddps            %%ymm9,  %%ymm7, %%ymm9   \n\t"
121     "                                            \n\t"
122     "vmulps            %%ymm0,  %%ymm2, %%ymm6   \n\t"
123     "vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
124     "vmovsldup  1 * 32(%%rbx),  %%ymm2           \n\t"
125     "vmulps            %%ymm0,  %%ymm3, %%ymm7   \n\t"
126     "vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
127     "vaddps            %%ymm14, %%ymm6, %%ymm14  \n\t"
128     "vaddps            %%ymm12, %%ymm7, %%ymm12  \n\t"
129     "                                            \n\t"
130     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
131     "vmulps            %%ymm0,  %%ymm4, %%ymm6   \n\t"
132     "vmulps            %%ymm0,  %%ymm5, %%ymm7   \n\t"
133     "vaddps            %%ymm10, %%ymm6, %%ymm10  \n\t"
134     "vaddps            %%ymm8,  %%ymm7, %%ymm8   \n\t"
135     "                                            \n\t"
136     "                                            \n\t" // iteration 1
137     "vmulps            %%ymm1,  %%ymm2, %%ymm6   \n\t"
138     "vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
139     "vmovshdup  1 * 32(%%rbx), %%ymm2            \n\t"
140     "vmulps            %%ymm1,  %%ymm3, %%ymm7   \n\t"
141     "vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
142     "vaddps            %%ymm15, %%ymm6, %%ymm15  \n\t"
143     "vaddps            %%ymm13, %%ymm7, %%ymm13  \n\t"
144     "                                            \n\t"
145     "vmovaps    2 * 32(%%rax),  %%ymm0           \n\t"
146     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
147     "vmulps            %%ymm1,  %%ymm4, %%ymm6   \n\t"
148     "vmulps            %%ymm1,  %%ymm5, %%ymm7   \n\t"
149     "vaddps            %%ymm11, %%ymm6, %%ymm11  \n\t"
150     "vaddps            %%ymm9,  %%ymm7, %%ymm9   \n\t"
151     "                                            \n\t"
152     "vmulps            %%ymm1,  %%ymm2, %%ymm6   \n\t"
153     "vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
154     "vmovsldup  2 * 32(%%rbx),  %%ymm2           \n\t"
155     "vmulps            %%ymm1,  %%ymm3, %%ymm7   \n\t"
156     "vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
157     "vaddps            %%ymm14, %%ymm6, %%ymm14  \n\t"
158     "vaddps            %%ymm12, %%ymm7, %%ymm12  \n\t"
159     "                                            \n\t"
160     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
161     "vmulps            %%ymm1,  %%ymm4, %%ymm6   \n\t"
162     "vmulps            %%ymm1,  %%ymm5, %%ymm7   \n\t"
163     "vaddps            %%ymm10, %%ymm6, %%ymm10  \n\t"
164     "vaddps            %%ymm8,  %%ymm7, %%ymm8   \n\t"
165     "                                            \n\t"
166     "                                            \n\t"
167     "                                            \n\t" // iteration 2
168     "prefetcht0  18 * 32(%%rax)                  \n\t"
169     "vmulps            %%ymm0,  %%ymm2, %%ymm6   \n\t"
170     "vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
171     "vmovshdup  2 * 32(%%rbx),  %%ymm2           \n\t"
172     "vmulps            %%ymm0,  %%ymm3, %%ymm7   \n\t"
173     "vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
174     "vaddps            %%ymm15, %%ymm6, %%ymm15  \n\t"
175     "vaddps            %%ymm13, %%ymm7, %%ymm13  \n\t"
176     "                                            \n\t"
177     "vmovaps    3 * 32(%%rax),  %%ymm1           \n\t"
178     "addq           $4 * 8 * 4, %%rax            \n\t" // a += 4*8 (unroll x mr)
179     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
180     "vmulps            %%ymm0,  %%ymm4, %%ymm6   \n\t"
181     "vmulps            %%ymm0,  %%ymm5, %%ymm7   \n\t"
182     "vaddps            %%ymm11, %%ymm6, %%ymm11  \n\t"
183     "vaddps            %%ymm9,  %%ymm7, %%ymm9   \n\t"
184     "                                            \n\t"
185     "vmulps            %%ymm0,  %%ymm2, %%ymm6   \n\t"
186     "vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
187     "vmovsldup  3 * 32(%%rbx),  %%ymm2           \n\t"
188     "vmulps            %%ymm0,  %%ymm3, %%ymm7   \n\t"
189     "vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
190     "vaddps            %%ymm14, %%ymm6, %%ymm14  \n\t"
191     "vaddps            %%ymm12, %%ymm7, %%ymm12  \n\t"
192     "                                            \n\t"
193     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
194     "vmulps            %%ymm0,  %%ymm4, %%ymm6   \n\t"
195     "vmulps            %%ymm0,  %%ymm5, %%ymm7   \n\t"
196     "vaddps            %%ymm10, %%ymm6, %%ymm10  \n\t"
197     "vaddps            %%ymm8,  %%ymm7, %%ymm8   \n\t"
198     "                                            \n\t"
199     "                                            \n\t"
200     "                                            \n\t" // iteration 3
201     "vmulps            %%ymm1,  %%ymm2, %%ymm6   \n\t"
202     "vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
203     "vmovshdup  3 * 32(%%rbx), %%ymm2            \n\t"
204     "addq           $4 * 8 * 4, %%rbx            \n\t" // b += 4*8 (unroll x nr)
205     "vmulps            %%ymm1,  %%ymm3, %%ymm7   \n\t"
206     "vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
207     "vaddps            %%ymm15, %%ymm6, %%ymm15  \n\t"
208     "vaddps            %%ymm13, %%ymm7, %%ymm13  \n\t"
209     "                                            \n\t"
210     "vmovaps    0 * 32(%%rax),  %%ymm0           \n\t"
211     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
212     "vmulps            %%ymm1,  %%ymm4, %%ymm6   \n\t"
213     "vmulps            %%ymm1,  %%ymm5, %%ymm7   \n\t"
214     "vaddps            %%ymm11, %%ymm6, %%ymm11  \n\t"
215     "vaddps            %%ymm9,  %%ymm7, %%ymm9   \n\t"
216     "                                            \n\t"
217     "vmulps            %%ymm1,  %%ymm2, %%ymm6   \n\t"
218     "vperm2f128 $0x03, %%ymm2,  %%ymm2, %%ymm4   \n\t"
219     "vmovsldup  0 * 32(%%rbx),  %%ymm2           \n\t"
220     "vmulps            %%ymm1,  %%ymm3, %%ymm7   \n\t"
221     "vperm2f128 $0x03, %%ymm3,  %%ymm3, %%ymm5   \n\t"
222     "vaddps            %%ymm14, %%ymm6, %%ymm14  \n\t"
223     "vaddps            %%ymm12, %%ymm7, %%ymm12  \n\t"
224     "                                            \n\t"
225     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
226     "vmulps            %%ymm1,  %%ymm4, %%ymm6   \n\t"
227     "vmulps            %%ymm1,  %%ymm5, %%ymm7   \n\t"
228     "vaddps            %%ymm10, %%ymm6, %%ymm10  \n\t"
229     "vaddps            %%ymm8,  %%ymm7, %%ymm8   \n\t"
230     "                                            \n\t"
231     "                                            \n\t"
232     "                                            \n\t"
233     "                                            \n\t"
234     "decq   %%rsi                                \n\t" // i -= 1;
235     "jne    .SLOOPKITER                          \n\t" // iterate again if i != 0.
236     "                                            \n\t"
237     "                                            \n\t"
238     "                                            \n\t"
239     "                                            \n\t"
240     "                                            \n\t"
241     "                                            \n\t"
242     ".SCONSIDKLEFT:                              \n\t"
243     "                                            \n\t"
244     "movq      %1, %%rsi                         \n\t" // i = k_left;
245     "testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
246     "je     .SPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
247     "                                            \n\t" // else, we prepare to enter k_left loop.
248     "                                            \n\t"
249     "                                            \n\t"
250     ".SLOOPKLEFT:                                \n\t" // EDGE LOOP
251     "                                            \n\t"
252     "                                            \n\t"
253     "prefetcht0  16 * 32(%%rax)                  \n\t"
254     "vmulps            %%ymm0,  %%ymm2, %%ymm6   \n\t"
255     "vperm2f128  $0x3, %%ymm2,  %%ymm2, %%ymm4   \n\t"
256     "vmovshdup  0 * 32(%%rbx),  %%ymm2           \n\t"
257     "vmulps            %%ymm0,  %%ymm3, %%ymm7   \n\t"
258     "vperm2f128  $0x3, %%ymm3,  %%ymm3, %%ymm5   \n\t"
259     "vaddps            %%ymm15, %%ymm6, %%ymm15  \n\t"
260     "vaddps            %%ymm13, %%ymm7, %%ymm13  \n\t"
261     "                                            \n\t"
262     "vmovaps    1 * 32(%%rax),  %%ymm1           \n\t"
263     "addq           $8 * 1 * 4, %%rax            \n\t" // a += 8 (1 x mr)
264     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
265     "vmulps            %%ymm0,  %%ymm4, %%ymm6   \n\t"
266     "vmulps            %%ymm0,  %%ymm5, %%ymm7   \n\t"
267     "vaddps            %%ymm11, %%ymm6, %%ymm11  \n\t"
268     "vaddps            %%ymm9,  %%ymm7, %%ymm9   \n\t"
269     "                                            \n\t"
270     "vmulps            %%ymm0,  %%ymm2, %%ymm6   \n\t"
271     "vperm2f128  $0x3, %%ymm2,  %%ymm2, %%ymm4   \n\t"
272     "vmovsldup  1 * 32(%%rbx),  %%ymm2           \n\t"
273     "addq           $8 * 1 * 4, %%rbx            \n\t" // b += 8 (1 x nr)
274     "vmulps            %%ymm0,  %%ymm3, %%ymm7   \n\t"
275     "vperm2f128  $0x3, %%ymm3,  %%ymm3, %%ymm5   \n\t"
276     "vaddps            %%ymm14, %%ymm6, %%ymm14  \n\t"
277     "vaddps            %%ymm12, %%ymm7, %%ymm12  \n\t"
278     "                                            \n\t"
279     "vpermilps  $0x4e, %%ymm2,  %%ymm3           \n\t"
280     "vmulps            %%ymm0,  %%ymm4, %%ymm6   \n\t"
281     "vmulps            %%ymm0,  %%ymm5, %%ymm7   \n\t"
282     "vmovaps           %%ymm1,  %%ymm0           \n\t"
283     "vaddps            %%ymm10, %%ymm6, %%ymm10  \n\t"
284     "vaddps            %%ymm8,  %%ymm7, %%ymm8   \n\t"
285     "                                            \n\t"
286     "                                            \n\t"
287     "                                            \n\t"
288     "decq   %%rsi                                \n\t" // i -= 1;
289     "jne    .SLOOPKLEFT                          \n\t" // iterate again if i != 0.
290     "                                            \n\t"
291     "                                            \n\t"
292     "                                            \n\t"
293     ".SPOSTACCUM:                                \n\t"
294     "                                            \n\t"
295     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
296     "                                            \n\t" // ( ab00  ( ab02  ( ab04  ( ab06
297     "                                            \n\t" //   ab10    ab12    ab14    ab16
298     "                                            \n\t" //   ab22    ab20    ab26    ab24
299     "                                            \n\t" //   ab32    ab30    ab36    ab34
300     "                                            \n\t" //   ab44    ab46    ab40    ab42
301     "                                            \n\t" //   ab54    ab56    ab50    ab52
302     "                                            \n\t" //   ab66    ab64    ab62    ab60
303     "                                            \n\t" //   ab76 )  ab74 )  ab72 )  ab70 )
304     "                                            \n\t"
305     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
306     "                                            \n\t" // ( ab01  ( ab03  ( ab05  ( ab07
307     "                                            \n\t" //   ab11    ab13    ab15    ab17
308     "                                            \n\t" //   ab23    ab21    ab27    ab25
309     "                                            \n\t" //   ab33    ab31    ab37    ab35
310     "                                            \n\t" //   ab45    ab47    ab41    ab43
311     "                                            \n\t" //   ab55    ab57    ab51    ab53
312     "                                            \n\t" //   ab67    ab65    ab63    ab61
313     "                                            \n\t" //   ab77 )  ab75 )  ab73 )  ab71 )
314     "                                            \n\t"
315     "vmovaps          %%ymm15, %%ymm7            \n\t"
316     "vshufps   $0xe4, %%ymm13, %%ymm15, %%ymm15  \n\t"
317     "vshufps   $0xe4, %%ymm7,  %%ymm13, %%ymm13  \n\t"
318     "                                            \n\t"
319     "vmovaps          %%ymm11, %%ymm7            \n\t"
320     "vshufps   $0xe4, %%ymm9,  %%ymm11, %%ymm11  \n\t"
321     "vshufps   $0xe4, %%ymm7,  %%ymm9,  %%ymm9   \n\t"
322     "                                            \n\t"
323     "vmovaps          %%ymm14, %%ymm7            \n\t"
324     "vshufps   $0xe4, %%ymm12, %%ymm14, %%ymm14  \n\t"
325     "vshufps   $0xe4, %%ymm7,  %%ymm12, %%ymm12  \n\t"
326     "                                            \n\t"
327     "vmovaps          %%ymm10, %%ymm7            \n\t"
328     "vshufps   $0xe4, %%ymm8,  %%ymm10, %%ymm10  \n\t"
329     "vshufps   $0xe4, %%ymm7,  %%ymm8,  %%ymm8   \n\t"
330     "                                            \n\t"
331     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
332     "                                            \n\t" // ( ab00  ( ab02  ( ab04  ( ab06
333     "                                            \n\t" //   ab10    ab12    ab14    ab16
334     "                                            \n\t" //   ab20    ab22    ab24    ab26
335     "                                            \n\t" //   ab30    ab32    ab34    ab36
336     "                                            \n\t" //   ab44    ab46    ab40    ab42
337     "                                            \n\t" //   ab54    ab56    ab50    ab52
338     "                                            \n\t" //   ab64    ab66    ab60    ab62
339     "                                            \n\t" //   ab74 )  ab76 )  ab70 )  ab72 )
340     "                                            \n\t"
341     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
342     "                                            \n\t" // ( ab01  ( ab03  ( ab05  ( ab07
343     "                                            \n\t" //   ab11    ab13    ab15    ab17
344     "                                            \n\t" //   ab21    ab23    ab25    ab27
345     "                                            \n\t" //   ab31    ab33    ab35    ab37
346     "                                            \n\t" //   ab45    ab47    ab41    ab43
347     "                                            \n\t" //   ab55    ab57    ab51    ab53
348     "                                            \n\t" //   ab65    ab67    ab61    ab63
349     "                                            \n\t" //   ab75 )  ab77 )  ab71 )  ab73 )
350     "                                            \n\t"
351     "vmovaps           %%ymm15, %%ymm7           \n\t"
352     "vperm2f128 $0x30, %%ymm11, %%ymm15, %%ymm15 \n\t"
353     "vperm2f128 $0x12, %%ymm11, %%ymm7,  %%ymm11 \n\t"
354     "                                            \n\t"
355     "vmovaps           %%ymm13, %%ymm7           \n\t"
356     "vperm2f128 $0x30, %%ymm9,  %%ymm13, %%ymm13 \n\t"
357     "vperm2f128 $0x12, %%ymm9,  %%ymm7,  %%ymm9  \n\t"
358     "                                            \n\t"
359     "vmovaps           %%ymm14, %%ymm7           \n\t"
360     "vperm2f128 $0x30, %%ymm10, %%ymm14, %%ymm14 \n\t"
361     "vperm2f128 $0x12, %%ymm10, %%ymm7,  %%ymm10 \n\t"
362     "                                            \n\t"
363     "vmovaps           %%ymm12, %%ymm7           \n\t"
364     "vperm2f128 $0x30, %%ymm8,  %%ymm12, %%ymm12 \n\t"
365     "vperm2f128 $0x12, %%ymm8,  %%ymm7,  %%ymm8  \n\t"
366     "                                            \n\t"
367     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
368     "                                            \n\t" // ( ab00  ( ab02  ( ab04  ( ab06
369     "                                            \n\t" //   ab10    ab12    ab14    ab16
370     "                                            \n\t" //   ab20    ab22    ab24    ab26
371     "                                            \n\t" //   ab30    ab32    ab34    ab36
372     "                                            \n\t" //   ab40    ab42    ab44    ab46
373     "                                            \n\t" //   ab50    ab52    ab54    ab56
374     "                                            \n\t" //   ab60    ab62    ab64    ab66
375     "                                            \n\t" //   ab70 )  ab72 )  ab74 )  ab76 )
376     "                                            \n\t"
377     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
378     "                                            \n\t" // ( ab01  ( ab03  ( ab05  ( ab07
379     "                                            \n\t" //   ab11    ab13    ab15    ab17
380     "                                            \n\t" //   ab21    ab23    ab25    ab27
381     "                                            \n\t" //   ab31    ab33    ab35    ab37
382     "                                            \n\t" //   ab41    ab43    ab45    ab47
383     "                                            \n\t" //   ab51    ab53    ab55    ab57
384     "                                            \n\t" //   ab61    ab63    ab65    ab67
385     "                                            \n\t" //   ab71 )  ab73 )  ab75 )  ab77 )
386     "                                            \n\t"
387     "                                            \n\t"
388     "                                            \n\t"
389     "movq         %4, %%rax                      \n\t" // load address of alpha
390     "movq         %5, %%rbx                      \n\t" // load address of beta
391     "vbroadcastss    (%%rax), %%ymm0             \n\t" // load alpha and duplicate
392     "vbroadcastss    (%%rbx), %%ymm4             \n\t" // load beta and duplicate
393     "                                            \n\t"
394     "vmulps           %%ymm0,  %%ymm8,  %%ymm8   \n\t" // scale by alpha
395     "vmulps           %%ymm0,  %%ymm9,  %%ymm9   \n\t"
396     "vmulps           %%ymm0,  %%ymm10, %%ymm10  \n\t"
397     "vmulps           %%ymm0,  %%ymm11, %%ymm11  \n\t"
398     "vmulps           %%ymm0,  %%ymm12, %%ymm12  \n\t"
399     "vmulps           %%ymm0,  %%ymm13, %%ymm13  \n\t"
400     "vmulps           %%ymm0,  %%ymm14, %%ymm14  \n\t"
401     "vmulps           %%ymm0,  %%ymm15, %%ymm15  \n\t"
402     "                                            \n\t"
403     "                                            \n\t"
404     "                                            \n\t"
405     "                                            \n\t"
406     "                                            \n\t"
407     "                                            \n\t"
408     "movq                %7, %%rsi               \n\t" // load rs_c
409     "leaq        (,%%rsi,4), %%rsi               \n\t" // rsi = rs_c * sizeof(float)
410     "                                            \n\t"
411     "leaq   (%%rcx,%%rsi,4), %%rdx               \n\t" // load address of c + 4*rs_c;
412     "                                            \n\t"
413     "leaq        (,%%rsi,2), %%r12               \n\t" // r12 = 2*rs_c;
414     "leaq   (%%r12,%%rsi,1), %%r13               \n\t" // r13 = 3*rs_c;
415     "                                            \n\t"
416     "                                            \n\t"
417     "                                            \n\t"
418     "                                            \n\t" // determine if
419     "                                            \n\t" //    c    % 32 == 0, AND
420     "                                            \n\t" //  4*cs_c % 32 == 0, AND
421     "                                            \n\t" //    rs_c      == 1
422     "                                            \n\t" // ie: aligned, ldim aligned, and
423     "                                            \n\t" // column-stored
424     "                                            \n\t"
425     "cmpq       $4, %%rsi                        \n\t" // set ZF if (4*rs_c) == 4.
426     "sete           %%bl                         \n\t" // bl = ( ZF == 1 ? 1 : 0 );
427     "testq     $31, %%rcx                        \n\t" // set ZF if c & 32 is zero.
428     "setz           %%bh                         \n\t" // bh = ( ZF == 0 ? 1 : 0 );
429     "testq     $31, %%rdi                        \n\t" // set ZF if (4*cs_c) & 32 is zero.
430     "setz           %%al                         \n\t" // al = ( ZF == 0 ? 1 : 0 );
431     "                                            \n\t" // and(bl,bh) followed by
432     "                                            \n\t" // and(bh,al) will reveal result
433     "                                            \n\t"
434     "                                            \n\t" // now avoid loading C if beta == 0
435     "                                            \n\t"
436     "vxorps    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
437     "vucomiss  %%xmm0,  %%xmm4                   \n\t" // set ZF if beta == 0.
438     "je      .SBETAZERO                          \n\t" // if ZF = 1, jump to beta == 0 case
439     "                                            \n\t"
440     "                                            \n\t"
441     "                                            \n\t" // check if aligned/column-stored
442     "andb     %%bl, %%bh                         \n\t" // set ZF if bl & bh == 1.
443     "andb     %%bh, %%al                         \n\t" // set ZF if bh & al == 1.
444     "jne     .SCOLSTORED                         \n\t" // jump to column storage case
445     "                                            \n\t"
446     "                                            \n\t"
447     "                                            \n\t"
448     ".SGENSTORED:                                \n\t"
449     "                                            \n\t"
450     "                                            \n\t" // update c00:c70
451     "vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
452     "vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
453     "vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
454     "vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
455     "vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
456     "vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
457     "vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
458     "vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
459     "vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
460     "vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
461     "vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
462     "                                            \n\t"
463     "vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
464     "vaddps            %%ymm15, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
465     "                                            \n\t"
466     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
467     "vmovss            %%xmm0, (%%rcx)           \n\t"
468     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
469     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
470     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
471     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
472     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
473     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
474     "vmovss            %%xmm2, (%%rdx)           \n\t"
475     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
476     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
477     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
478     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
479     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
480     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
481     "                                            \n\t"
482     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
483     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
484     "                                            \n\t"
485     "                                            \n\t"
486     "                                            \n\t" // update c01:c71
487     "vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
488     "vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
489     "vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
490     "vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
491     "vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
492     "vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
493     "vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
494     "vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
495     "vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
496     "vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
497     "vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
498     "                                            \n\t"
499     "vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
500     "vaddps            %%ymm14, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
501     "                                            \n\t"
502     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
503     "vmovss            %%xmm0, (%%rcx)           \n\t"
504     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
505     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
506     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
507     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
508     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
509     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
510     "vmovss            %%xmm2, (%%rdx)           \n\t"
511     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
512     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
513     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
514     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
515     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
516     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
517     "                                            \n\t"
518     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
519     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
520     "                                            \n\t"
521     "                                            \n\t"
522     "                                            \n\t" // update c02:c72
523     "vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
524     "vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
525     "vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
526     "vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
527     "vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
528     "vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
529     "vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
530     "vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
531     "vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
532     "vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
533     "vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
534     "                                            \n\t"
535     "vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
536     "vaddps            %%ymm13, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
537     "                                            \n\t"
538     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
539     "vmovss            %%xmm0, (%%rcx)           \n\t"
540     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
541     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
542     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
543     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
544     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
545     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
546     "vmovss            %%xmm2, (%%rdx)           \n\t"
547     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
548     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
549     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
550     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
551     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
552     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
553     "                                            \n\t"
554     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
555     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
556     "                                            \n\t"
557     "                                            \n\t"
558     "                                            \n\t" // update c03:c73
559     "vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
560     "vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
561     "vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
562     "vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
563     "vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
564     "vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
565     "vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
566     "vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
567     "vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
568     "vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
569     "vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
570     "                                            \n\t"
571     "vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
572     "vaddps            %%ymm12, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
573     "                                            \n\t"
574     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
575     "vmovss            %%xmm0, (%%rcx)           \n\t"
576     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
577     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
578     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
579     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
580     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
581     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
582     "vmovss            %%xmm2, (%%rdx)           \n\t"
583     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
584     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
585     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
586     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
587     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
588     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
589     "                                            \n\t"
590     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
591     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
592     "                                            \n\t"
593     "                                            \n\t"
594     "                                            \n\t" // update c04:c74
595     "vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
596     "vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
597     "vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
598     "vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
599     "vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
600     "vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
601     "vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
602     "vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
603     "vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
604     "vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
605     "vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
606     "                                            \n\t"
607     "vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
608     "vaddps            %%ymm11, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
609     "                                            \n\t"
610     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
611     "vmovss            %%xmm0, (%%rcx)           \n\t"
612     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
613     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
614     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
615     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
616     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
617     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
618     "vmovss            %%xmm2, (%%rdx)           \n\t"
619     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
620     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
621     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
622     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
623     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
624     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
625     "                                            \n\t"
626     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
627     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
628     "                                            \n\t"
629     "                                            \n\t"
630     "                                            \n\t" // update c05:c75
631     "vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
632     "vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
633     "vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
634     "vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
635     "vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
636     "vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
637     "vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
638     "vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
639     "vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
640     "vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
641     "vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
642     "                                            \n\t"
643     "vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
644     "vaddps            %%ymm10, %%ymm0,  %%ymm0  \n\t" // add the gemm result,
645     "                                            \n\t"
646     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
647     "vmovss            %%xmm0, (%%rcx)           \n\t"
648     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
649     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
650     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
651     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
652     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
653     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
654     "vmovss            %%xmm2, (%%rdx)           \n\t"
655     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
656     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
657     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
658     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
659     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
660     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
661     "                                            \n\t"
662     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
663     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
664     "                                            \n\t"
665     "                                            \n\t"
666     "                                            \n\t" // update c06:c76
667     "vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
668     "vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
669     "vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
670     "vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
671     "vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
672     "vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
673     "vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
674     "vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
675     "vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
676     "vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
677     "vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
678     "                                            \n\t"
679     "vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
680     "vaddps            %%ymm9,  %%ymm0,  %%ymm0  \n\t" // add the gemm result,
681     "                                            \n\t"
682     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
683     "vmovss            %%xmm0, (%%rcx)           \n\t"
684     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
685     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
686     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
687     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
688     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
689     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
690     "vmovss            %%xmm2, (%%rdx)           \n\t"
691     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
692     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
693     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
694     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
695     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
696     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
697     "                                            \n\t"
698     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
699     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
700     "                                            \n\t"
701     "                                            \n\t"
702     "                                            \n\t" // update c07:c77
703     "vmovlps    (%%rcx),        %%xmm0,  %%xmm0  \n\t"
704     "vmovhps    (%%rcx,%%rsi),  %%xmm0,  %%xmm0  \n\t"
705     "vmovlps    (%%rcx,%%r12),  %%xmm1,  %%xmm1  \n\t"
706     "vmovhps    (%%rcx,%%r13),  %%xmm1,  %%xmm1  \n\t"
707     "vshufps    $0x88, %%xmm1,  %%xmm0,  %%xmm0  \n\t"
708     "vmovlps    (%%rdx),        %%xmm2,  %%xmm2  \n\t"
709     "vmovhps    (%%rdx,%%rsi),  %%xmm2,  %%xmm2  \n\t"
710     "vmovlps    (%%rdx,%%r12),  %%xmm3,  %%xmm3  \n\t"
711     "vmovhps    (%%rdx,%%r13),  %%xmm3,  %%xmm3  \n\t"
712     "vshufps    $0x88, %%xmm3,  %%xmm2,  %%xmm2  \n\t"
713     "vperm2f128 $0x20, %%ymm2,  %%ymm0,  %%ymm0  \n\t"
714     "                                            \n\t"
715     "vmulps            %%ymm4,  %%ymm0,  %%ymm0  \n\t" // scale by beta,
716     "vaddps            %%ymm8,  %%ymm0,  %%ymm0  \n\t" // add the gemm result,
717     "                                            \n\t"
718     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
719     "vmovss            %%xmm0, (%%rcx)           \n\t"
720     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
721     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
722     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
723     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
724     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
725     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
726     "vmovss            %%xmm2, (%%rdx)           \n\t"
727     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
728     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
729     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
730     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
731     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
732     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
733     "                                            \n\t"
734     "                                            \n\t"
735     "                                            \n\t"
736     "jmp    .SDONE                               \n\t" // jump to end.
737     "                                            \n\t"
738     "                                            \n\t"
739     "                                            \n\t"
740     ".SCOLSTORED:                                \n\t"
741     "                                            \n\t"
742     "                                            \n\t"
743     "vmovaps    (%%rcx),       %%ymm0            \n\t" // load c00:c70,
744     "vmulps           %%ymm4,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
745     "vaddps           %%ymm15, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
746     "vmovaps          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
747     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
748     "                                            \n\t"
749     "vmovaps    (%%rcx),       %%ymm1            \n\t" // load c01:c71,
750     "vmulps           %%ymm4,  %%ymm1,  %%ymm1   \n\t" // scale by beta,
751     "vaddps           %%ymm14, %%ymm1,  %%ymm1   \n\t" // add the gemm result,
752     "vmovaps          %%ymm1,  (%%rcx)           \n\t" // and store back to memory.
753     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
754     "                                            \n\t"
755     "vmovaps    (%%rcx),       %%ymm0            \n\t" // load c02:c72,
756     "vmulps           %%ymm4,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
757     "vaddps           %%ymm13, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
758     "vmovaps          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
759     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
760     "                                            \n\t"
761     "vmovaps    (%%rcx),       %%ymm1            \n\t" // load c03:c73,
762     "vmulps           %%ymm4,  %%ymm1,  %%ymm1   \n\t" // scale by beta,
763     "vaddps           %%ymm12, %%ymm1,  %%ymm1   \n\t" // add the gemm result,
764     "vmovaps          %%ymm1,  (%%rcx)           \n\t" // and store back to memory.
765     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
766     "                                            \n\t"
767     "vmovaps    (%%rcx),       %%ymm0            \n\t" // load c04:c74,
768     "vmulps           %%ymm4,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
769     "vaddps           %%ymm11, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
770     "vmovaps          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
771     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
772     "                                            \n\t"
773     "vmovaps    (%%rcx),       %%ymm1            \n\t" // load c05:c75,
774     "vmulps           %%ymm4,  %%ymm1,  %%ymm1   \n\t" // scale by beta,
775     "vaddps           %%ymm10, %%ymm1,  %%ymm1   \n\t" // add the gemm result,
776     "vmovaps          %%ymm1,  (%%rcx)           \n\t" // and store back to memory.
777     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
778     "                                            \n\t"
779     "vmovaps    (%%rcx),       %%ymm0            \n\t" // load c06:c76,
780     "vmulps           %%ymm4,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
781     "vaddps           %%ymm9,  %%ymm0,  %%ymm0   \n\t" // add the gemm result,
782     "vmovaps          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
783     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
784     "                                            \n\t"
785     "vmovaps    (%%rcx),       %%ymm1            \n\t" // load c07:c77,
786     "vmulps           %%ymm4,  %%ymm1,  %%ymm1   \n\t" // scale by beta,
787     "vaddps           %%ymm8,  %%ymm1,  %%ymm1   \n\t" // add the gemm result,
788     "vmovaps          %%ymm1,  (%%rcx)           \n\t" // and store back to memory.
789     "                                            \n\t"
790     "                                            \n\t"
791     "jmp    .SDONE                               \n\t" // jump to end.
792     "                                            \n\t"
793     "                                            \n\t"
794     "                                            \n\t"
795     "                                            \n\t"
796     ".SBETAZERO:                                 \n\t"
797     "                                            \n\t" // check if aligned/column-stored
798     "andb     %%bl, %%bh                         \n\t" // set ZF if bl & bh == 1.
799     "andb     %%bh, %%al                         \n\t" // set ZF if bh & al == 1.
800     "jne     .SCOLSTORBZ                         \n\t" // jump to column storage case
801     "                                            \n\t"
802     "                                            \n\t"
803     "                                            \n\t"
804     ".SGENSTORBZ:                                \n\t"
805     "                                            \n\t"
806     "                                            \n\t" // update c00:c70
807     "vmovapd           %%ymm15, %%ymm0           \n\t"
808     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
809     "vmovss            %%xmm0, (%%rcx)           \n\t"
810     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
811     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
812     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
813     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
814     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
815     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
816     "vmovss            %%xmm2, (%%rdx)           \n\t"
817     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
818     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
819     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
820     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
821     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
822     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
823     "                                            \n\t"
824     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
825     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
826     "                                            \n\t"
827     "                                            \n\t"
828     "                                            \n\t" // update c01:c71
829     "vmovapd           %%ymm14, %%ymm0           \n\t"
830     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
831     "vmovss            %%xmm0, (%%rcx)           \n\t"
832     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
833     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
834     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
835     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
836     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
837     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
838     "vmovss            %%xmm2, (%%rdx)           \n\t"
839     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
840     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
841     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
842     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
843     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
844     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
845     "                                            \n\t"
846     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
847     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
848     "                                            \n\t"
849     "                                            \n\t"
850     "                                            \n\t" // update c02:c72
851     "vmovapd           %%ymm13, %%ymm0           \n\t"
852     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
853     "vmovss            %%xmm0, (%%rcx)           \n\t"
854     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
855     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
856     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
857     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
858     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
859     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
860     "vmovss            %%xmm2, (%%rdx)           \n\t"
861     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
862     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
863     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
864     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
865     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
866     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
867     "                                            \n\t"
868     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
869     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
870     "                                            \n\t"
871     "                                            \n\t"
872     "                                            \n\t" // update c03:c73
873     "vmovapd           %%ymm12, %%ymm0           \n\t"
874     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
875     "vmovss            %%xmm0, (%%rcx)           \n\t"
876     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
877     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
878     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
879     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
880     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
881     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
882     "vmovss            %%xmm2, (%%rdx)           \n\t"
883     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
884     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
885     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
886     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
887     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
888     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
889     "                                            \n\t"
890     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
891     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
892     "                                            \n\t"
893     "                                            \n\t"
894     "                                            \n\t" // update c04:c74
895     "vmovapd           %%ymm11, %%ymm0           \n\t"
896     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
897     "vmovss            %%xmm0, (%%rcx)           \n\t"
898     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
899     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
900     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
901     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
902     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
903     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
904     "vmovss            %%xmm2, (%%rdx)           \n\t"
905     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
906     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
907     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
908     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
909     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
910     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
911     "                                            \n\t"
912     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
913     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
914     "                                            \n\t"
915     "                                            \n\t"
916     "                                            \n\t" // update c05:c75
917     "vmovapd           %%ymm10, %%ymm0           \n\t"
918     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
919     "vmovss            %%xmm0, (%%rcx)           \n\t"
920     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
921     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
922     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
923     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
924     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
925     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
926     "vmovss            %%xmm2, (%%rdx)           \n\t"
927     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
928     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
929     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
930     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
931     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
932     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
933     "                                            \n\t"
934     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
935     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
936     "                                            \n\t"
937     "                                            \n\t"
938     "                                            \n\t" // update c06:c76
939     "vmovapd           %%ymm9,  %%ymm0           \n\t"
940     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
941     "vmovss            %%xmm0, (%%rcx)           \n\t"
942     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
943     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
944     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
945     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
946     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
947     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
948     "vmovss            %%xmm2, (%%rdx)           \n\t"
949     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
950     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
951     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
952     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
953     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
954     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
955     "                                            \n\t"
956     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
957     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
958     "                                            \n\t"
959     "                                            \n\t"
960     "                                            \n\t" // update c07:c77
961     "vmovapd           %%ymm8,  %%ymm0           \n\t"
962     "vextractf128  $1, %%ymm0,  %%xmm2           \n\t"
963     "vmovss            %%xmm0, (%%rcx)           \n\t"
964     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
965     "vmovss            %%xmm1, (%%rcx,%%rsi)     \n\t"
966     "vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t"
967     "vmovss            %%xmm0, (%%rcx,%%r12)     \n\t"
968     "vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t"
969     "vmovss            %%xmm1, (%%rcx,%%r13)     \n\t"
970     "vmovss            %%xmm2, (%%rdx)           \n\t"
971     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
972     "vmovss            %%xmm3, (%%rdx,%%rsi)     \n\t"
973     "vpermilps  $0x39, %%xmm3,  %%xmm2           \n\t"
974     "vmovss            %%xmm2, (%%rdx,%%r12)     \n\t"
975     "vpermilps  $0x39, %%xmm2,  %%xmm3           \n\t"
976     "vmovss            %%xmm3, (%%rdx,%%r13)     \n\t"
977     "                                            \n\t"
978     "                                            \n\t"
979     "jmp    .SDONE                               \n\t" // jump to end.
980     "                                            \n\t"
981     "                                            \n\t"
982     "                                            \n\t"
983     ".SCOLSTORBZ:                                \n\t"
984     "                                            \n\t"
985     "                                            \n\t"
986     "vmovaps          %%ymm15, (%%rcx)           \n\t" // and store back to memory.
987     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
988     "                                            \n\t"
989     "vmovaps          %%ymm14, (%%rcx)           \n\t" // and store back to memory.
990     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
991     "                                            \n\t"
992     "vmovaps          %%ymm13, (%%rcx)           \n\t" // and store back to memory.
993     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
994     "                                            \n\t"
995     "vmovaps          %%ymm12, (%%rcx)           \n\t" // and store back to memory.
996     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
997     "                                            \n\t"
998     "vmovaps          %%ymm11, (%%rcx)           \n\t" // and store back to memory.
999     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1000     "                                            \n\t"
1001     "vmovaps          %%ymm10, (%%rcx)           \n\t" // and store back to memory.
1002     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1003     "                                            \n\t"
1004     "vmovaps          %%ymm9,  (%%rcx)           \n\t" // and store back to memory.
1005     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1006     "                                            \n\t"
1007     "vmovaps          %%ymm8,  (%%rcx)           \n\t" // and store back to memory.
1008     "                                            \n\t"
1009     "                                            \n\t"
1010     "                                            \n\t"
1011     "                                            \n\t"
1012     "                                            \n\t"
1013     ".SDONE:                                     \n\t"
1014     "                                            \n\t"
1015 
1016     : // output operands (none)
1017     : // input operands
1018       "m" (k_iter), // 0
1019       "m" (k_left), // 1
1020       "m" (a),      // 2
1021       "m" (b),      // 3
1022       "m" (alpha),  // 4
1023       "m" (beta),   // 5
1024       "m" (c),      // 6
1025       "m" (rs_c),   // 7
1026       "m" (cs_c)/*,   // 8
1027       "m" (b_next), // 9
1028       "m" (a_next)*/  // 10
1029     : // register clobber list
1030       "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
1031       "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
1032       "xmm0", "xmm1", "xmm2", "xmm3",
1033       "xmm4", "xmm5", "xmm6", "xmm7",
1034       "xmm8", "xmm9", "xmm10", "xmm11",
1035       "xmm12", "xmm13", "xmm14", "xmm15",
1036       "memory"
1037     );
1038 }
1039 
bli_dgemm_asm_8x4(dim_t k,double * restrict alpha,double * restrict a,double * restrict b,double * restrict beta,double * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)1040 void bli_dgemm_asm_8x4
1041      (
1042        dim_t               k,
1043        double*    restrict alpha,
1044        double*    restrict a,
1045        double*    restrict b,
1046        double*    restrict beta,
1047        double*    restrict c, inc_t rs_c, inc_t cs_c,
1048        auxinfo_t* restrict data,
1049        cntx_t*    restrict cntx
1050      )
1051 {
1052     //void*   a_next = bli_auxinfo_next_a( data );
1053     void*   b_next = bli_auxinfo_next_b( data );
1054 
1055     uint64_t   k_iter = k / 4;
1056     uint64_t   k_left = k % 4;
1057 
1058     __asm__ volatile
1059     (
1060     "                                            \n\t"
1061     "                                            \n\t"
1062     "movq                %2, %%rax               \n\t" // load address of a.
1063     "movq                %3, %%rbx               \n\t" // load address of b.
1064     "movq                %9, %%r15               \n\t" // load address of b_next.
1065     //"movq               %10, %%r14               \n\t" // load address of a_next.
1066     "addq          $-4 * 64, %%r15               \n\t"
1067     "                                            \n\t"
1068     "vmovapd   0 * 32(%%rax), %%ymm0             \n\t" // initialize loop by pre-loading
1069     "vmovapd   0 * 32(%%rbx), %%ymm2             \n\t" // elements of a and b.
1070     "vpermilpd  $0x5, %%ymm2, %%ymm3             \n\t"
1071     "                                            \n\t"
1072     "movq                %6, %%rcx               \n\t" // load address of c
1073     "movq                %8, %%rdi               \n\t" // load cs_c
1074     "leaq        (,%%rdi,8), %%rdi               \n\t" // cs_c *= sizeof(double)
1075     "leaq   (%%rcx,%%rdi,2), %%r10               \n\t" // load address of c + 2*cs_c;
1076     "                                            \n\t"
1077     "prefetcht0   3 * 8(%%rcx)                   \n\t" // prefetch c + 0*cs_c
1078     "prefetcht0   3 * 8(%%rcx,%%rdi)             \n\t" // prefetch c + 1*cs_c
1079     "prefetcht0   3 * 8(%%r10)                   \n\t" // prefetch c + 2*cs_c
1080     "prefetcht0   3 * 8(%%r10,%%rdi)             \n\t" // prefetch c + 3*cs_c
1081     "                                            \n\t"
1082     "vxorpd    %%ymm8,  %%ymm8,  %%ymm8          \n\t"
1083     "vxorpd    %%ymm9,  %%ymm9,  %%ymm9          \n\t"
1084     "vxorpd    %%ymm10, %%ymm10, %%ymm10         \n\t"
1085     "vxorpd    %%ymm11, %%ymm11, %%ymm11         \n\t"
1086     "vxorpd    %%ymm12, %%ymm12, %%ymm12         \n\t"
1087     "vxorpd    %%ymm13, %%ymm13, %%ymm13         \n\t"
1088     "vxorpd    %%ymm14, %%ymm14, %%ymm14         \n\t"
1089     "vxorpd    %%ymm15, %%ymm15, %%ymm15         \n\t"
1090     "                                            \n\t"
1091     "                                            \n\t"
1092     "                                            \n\t"
1093     "movq      %0, %%rsi                         \n\t" // i = k_iter;
1094     "testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
1095     "je     .DCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
1096     "                                            \n\t" // contains the k_left loop.
1097     "                                            \n\t"
1098     "                                            \n\t"
1099     ".DLOOPKITER:                                \n\t" // MAIN LOOP
1100     "                                            \n\t"
1101     "addq         $4 * 4 * 8,  %%r15             \n\t" // b_next += 4*4 (unroll x nr)
1102     "                                            \n\t"
1103     "                                            \n\t" // iteration 0
1104     "vmovapd   1 * 32(%%rax),  %%ymm1            \n\t"
1105     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1106     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1107     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1108     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1109     "vaddpd           %%ymm15, %%ymm6,  %%ymm15  \n\t"
1110     "vaddpd           %%ymm13, %%ymm7,  %%ymm13  \n\t"
1111     "                                            \n\t"
1112     "prefetcht0  16 * 32(%%rax)                  \n\t"
1113     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1114     "vmovapd   1 * 32(%%rbx),  %%ymm2            \n\t"
1115     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1116     "vpermilpd  $0x5, %%ymm2,  %%ymm3            \n\t"
1117     "vaddpd           %%ymm14, %%ymm6,  %%ymm14  \n\t"
1118     "vaddpd           %%ymm12, %%ymm7,  %%ymm12  \n\t"
1119     "                                            \n\t"
1120     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1121     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1122     "vmovapd   2 * 32(%%rax),  %%ymm0            \n\t"
1123     "vaddpd           %%ymm11, %%ymm6,  %%ymm11  \n\t"
1124     "vaddpd           %%ymm9,  %%ymm7,  %%ymm9   \n\t"
1125     "prefetcht0   0 * 32(%%r15)                  \n\t" // prefetch b_next[0*4]
1126     "                                            \n\t"
1127     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1128     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1129     "vaddpd           %%ymm10, %%ymm6,  %%ymm10  \n\t"
1130     "vaddpd           %%ymm8,  %%ymm7,  %%ymm8   \n\t"
1131     "                                            \n\t"
1132     "                                            \n\t"
1133     "                                            \n\t" // iteration 1
1134     "vmovapd   3 * 32(%%rax),  %%ymm1            \n\t"
1135     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1136     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1137     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1138     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1139     "vaddpd           %%ymm15, %%ymm6,  %%ymm15  \n\t"
1140     "vaddpd           %%ymm13, %%ymm7,  %%ymm13  \n\t"
1141     "                                            \n\t"
1142     "prefetcht0  18 * 32(%%rax)                  \n\t"
1143     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1144     "vmovapd   2 * 32(%%rbx),  %%ymm2            \n\t"
1145     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1146     "vpermilpd  $0x5, %%ymm2,  %%ymm3            \n\t"
1147     "vaddpd           %%ymm14, %%ymm6,  %%ymm14  \n\t"
1148     "vaddpd           %%ymm12, %%ymm7,  %%ymm12  \n\t"
1149     "                                            \n\t"
1150     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1151     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1152     "vmovapd   4 * 32(%%rax),  %%ymm0            \n\t"
1153     "vaddpd           %%ymm11, %%ymm6,  %%ymm11  \n\t"
1154     "vaddpd           %%ymm9,  %%ymm7,  %%ymm9   \n\t"
1155     "                                            \n\t"
1156     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1157     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1158     "vaddpd           %%ymm10, %%ymm6,  %%ymm10  \n\t"
1159     "vaddpd           %%ymm8,  %%ymm7,  %%ymm8   \n\t"
1160     "                                            \n\t"
1161     "                                            \n\t"
1162     "                                            \n\t" // iteration 2
1163     "vmovapd   5 * 32(%%rax),  %%ymm1            \n\t"
1164     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1165     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1166     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1167     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1168     "vaddpd           %%ymm15, %%ymm6,  %%ymm15  \n\t"
1169     "vaddpd           %%ymm13, %%ymm7,  %%ymm13  \n\t"
1170     "                                            \n\t"
1171     "prefetcht0  20 * 32(%%rax)                  \n\t"
1172     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1173     "vmovapd   3 * 32(%%rbx),  %%ymm2            \n\t"
1174     "addq         $4 * 4 * 8,  %%rbx             \n\t" // b += 4*4 (unroll x nr)
1175     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1176     "vpermilpd  $0x5, %%ymm2,  %%ymm3            \n\t"
1177     "vaddpd           %%ymm14, %%ymm6,  %%ymm14  \n\t"
1178     "vaddpd           %%ymm12, %%ymm7,  %%ymm12  \n\t"
1179     "                                            \n\t"
1180     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1181     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1182     "vmovapd   6 * 32(%%rax),  %%ymm0            \n\t"
1183     "vaddpd           %%ymm11, %%ymm6,  %%ymm11  \n\t"
1184     "vaddpd           %%ymm9,  %%ymm7,  %%ymm9   \n\t"
1185     "prefetcht0   2 * 32(%%r15)                  \n\t" // prefetch b_next[2*4]
1186     "                                            \n\t"
1187     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1188     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1189     "vaddpd           %%ymm10, %%ymm6,  %%ymm10  \n\t"
1190     "vaddpd           %%ymm8,  %%ymm7,  %%ymm8   \n\t"
1191     "                                            \n\t"
1192     "                                            \n\t"
1193     "                                            \n\t" // iteration 3
1194     "vmovapd   7 * 32(%%rax),  %%ymm1            \n\t"
1195     "addq         $4 * 8 * 8,  %%rax             \n\t" // a += 4*8 (unroll x mr)
1196     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1197     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1198     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1199     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1200     "vaddpd           %%ymm15, %%ymm6,  %%ymm15  \n\t"
1201     "vaddpd           %%ymm13, %%ymm7,  %%ymm13  \n\t"
1202     "                                            \n\t"
1203     //"prefetcht0  22 * 32(%%rax)                  \n\t"
1204     "prefetcht0  14 * 32(%%rax)                  \n\t"
1205     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1206     "vmovapd   0 * 32(%%rbx),  %%ymm2            \n\t"
1207     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1208     "vpermilpd  $0x5, %%ymm2,  %%ymm3            \n\t"
1209     "vaddpd           %%ymm14, %%ymm6,  %%ymm14  \n\t"
1210     "vaddpd           %%ymm12, %%ymm7,  %%ymm12  \n\t"
1211     "                                            \n\t"
1212     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1213     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1214     "vmovapd   0 * 32(%%rax),  %%ymm0            \n\t"
1215     "vaddpd           %%ymm11, %%ymm6,  %%ymm11  \n\t"
1216     "vaddpd           %%ymm9,  %%ymm7,  %%ymm9   \n\t"
1217     "                                            \n\t"
1218     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1219     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1220     "vaddpd           %%ymm10, %%ymm6,  %%ymm10  \n\t"
1221     "vaddpd           %%ymm8,  %%ymm7,  %%ymm8   \n\t"
1222     "                                            \n\t"
1223     "                                            \n\t"
1224     "                                            \n\t"
1225     //"addq   $4 * 8 * 8, %%rax                    \n\t" // a      += 4*8 (unroll x mr)
1226     //"addq   $4 * 4 * 8, %%rbx                    \n\t" // b      += 4*4 (unroll x nr)
1227     "                                            \n\t"
1228     "decq   %%rsi                                \n\t" // i -= 1;
1229     "jne    .DLOOPKITER                          \n\t" // iterate again if i != 0.
1230     "                                            \n\t"
1231     "                                            \n\t"
1232     "                                            \n\t"
1233     "                                            \n\t"
1234     "                                            \n\t"
1235     "                                            \n\t"
1236     ".DCONSIDKLEFT:                              \n\t"
1237     "                                            \n\t"
1238     "movq      %1, %%rsi                         \n\t" // i = k_left;
1239     "testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
1240     "je     .DPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
1241     "                                            \n\t" // else, we prepare to enter k_left loop.
1242     "                                            \n\t"
1243     "                                            \n\t"
1244     ".DLOOPKLEFT:                                \n\t" // EDGE LOOP
1245     "                                            \n\t"
1246     "vmovapd   1 * 32(%%rax),  %%ymm1            \n\t"
1247     "addq         $8 * 1 * 8,  %%rax             \n\t" // a += 8 (1 x mr)
1248     "vmulpd           %%ymm0,  %%ymm2, %%ymm6    \n\t"
1249     "vperm2f128 $0x3, %%ymm2,  %%ymm2, %%ymm4    \n\t"
1250     "vmulpd           %%ymm0,  %%ymm3, %%ymm7    \n\t"
1251     "vperm2f128 $0x3, %%ymm3,  %%ymm3, %%ymm5    \n\t"
1252     "vaddpd           %%ymm15, %%ymm6, %%ymm15   \n\t"
1253     "vaddpd           %%ymm13, %%ymm7, %%ymm13   \n\t"
1254     "                                            \n\t"
1255     "prefetcht0  14 * 32(%%rax)                  \n\t"
1256     "vmulpd           %%ymm1,  %%ymm2, %%ymm6    \n\t"
1257     "vmovapd   1 * 32(%%rbx),  %%ymm2            \n\t"
1258     "addq         $4 * 1 * 8,  %%rbx             \n\t" // b += 4 (1 x nr)
1259     "vmulpd           %%ymm1,  %%ymm3, %%ymm7    \n\t"
1260     "vpermilpd  $0x5, %%ymm2,  %%ymm3            \n\t"
1261     "vaddpd           %%ymm14, %%ymm6, %%ymm14   \n\t"
1262     "vaddpd           %%ymm12, %%ymm7, %%ymm12   \n\t"
1263     "                                            \n\t"
1264     "vmulpd           %%ymm0,  %%ymm4, %%ymm6    \n\t"
1265     "vmulpd           %%ymm0,  %%ymm5, %%ymm7    \n\t"
1266     "vmovapd   0 * 32(%%rax),  %%ymm0            \n\t"
1267     "vaddpd           %%ymm11, %%ymm6, %%ymm11   \n\t"
1268     "vaddpd           %%ymm9,  %%ymm7, %%ymm9    \n\t"
1269     "                                            \n\t"
1270     "vmulpd           %%ymm1,  %%ymm4, %%ymm6    \n\t"
1271     "vmulpd           %%ymm1,  %%ymm5, %%ymm7    \n\t"
1272     "vaddpd           %%ymm10, %%ymm6, %%ymm10   \n\t"
1273     "vaddpd           %%ymm8,  %%ymm7, %%ymm8    \n\t"
1274     "                                            \n\t"
1275     "                                            \n\t"
1276     "decq   %%rsi                                \n\t" // i -= 1;
1277     "jne    .DLOOPKLEFT                          \n\t" // iterate again if i != 0.
1278     "                                            \n\t"
1279     "                                            \n\t"
1280     "                                            \n\t"
1281     ".DPOSTACCUM:                                \n\t"
1282     "                                            \n\t"
1283     "                                            \n\t"
1284     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
1285     "                                            \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
1286     "                                            \n\t" //   ab11    ab10    ab13    ab12
1287     "                                            \n\t" //   ab22    ab23    ab20    ab21
1288     "                                            \n\t" //   ab33 )  ab32 )  ab31 )  ab30 )
1289     "                                            \n\t"
1290     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
1291     "                                            \n\t" // ( ab40  ( ab41  ( ab42  ( ab43
1292     "                                            \n\t" //   ab51    ab50    ab53    ab52
1293     "                                            \n\t" //   ab62    ab63    ab60    ab61
1294     "                                            \n\t" //   ab73 )  ab72 )  ab71 )  ab70 )
1295     "                                            \n\t"
1296     "vmovapd          %%ymm15, %%ymm7            \n\t"
1297     "vshufpd    $0xa, %%ymm15, %%ymm13, %%ymm15  \n\t"
1298     "vshufpd    $0xa, %%ymm13, %%ymm7,  %%ymm13  \n\t"
1299     "                                            \n\t"
1300     "vmovapd          %%ymm11, %%ymm7            \n\t"
1301     "vshufpd    $0xa, %%ymm11, %%ymm9,  %%ymm11  \n\t"
1302     "vshufpd    $0xa, %%ymm9,  %%ymm7,  %%ymm9   \n\t"
1303     "                                            \n\t"
1304     "vmovapd          %%ymm14, %%ymm7            \n\t"
1305     "vshufpd    $0xa, %%ymm14, %%ymm12, %%ymm14  \n\t"
1306     "vshufpd    $0xa, %%ymm12, %%ymm7,  %%ymm12  \n\t"
1307     "                                            \n\t"
1308     "vmovapd          %%ymm10, %%ymm7            \n\t"
1309     "vshufpd    $0xa, %%ymm10, %%ymm8,  %%ymm10  \n\t"
1310     "vshufpd    $0xa, %%ymm8,  %%ymm7,  %%ymm8   \n\t"
1311     "                                            \n\t"
1312     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
1313     "                                            \n\t" // ( ab01  ( ab00  ( ab03  ( ab02
1314     "                                            \n\t" //   ab11    ab10    ab13    ab12
1315     "                                            \n\t" //   ab23    ab22    ab21    ab20
1316     "                                            \n\t" //   ab33 )  ab32 )  ab31 )  ab30 )
1317     "                                            \n\t"
1318     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
1319     "                                            \n\t" // ( ab41  ( ab40  ( ab43  ( ab42
1320     "                                            \n\t" //   ab51    ab50    ab53    ab52
1321     "                                            \n\t" //   ab63    ab62    ab61    ab60
1322     "                                            \n\t" //   ab73 )  ab72 )  ab71 )  ab70 )
1323     "                                            \n\t"
1324     "vmovapd           %%ymm15, %%ymm7           \n\t"
1325     "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t"
1326     "vperm2f128 $0x12, %%ymm7,  %%ymm11, %%ymm11 \n\t"
1327     "                                            \n\t"
1328     "vmovapd           %%ymm13, %%ymm7           \n\t"
1329     "vperm2f128 $0x30, %%ymm13, %%ymm9,  %%ymm13 \n\t"
1330     "vperm2f128 $0x12, %%ymm7,  %%ymm9,  %%ymm9  \n\t"
1331     "                                            \n\t"
1332     "vmovapd           %%ymm14, %%ymm7           \n\t"
1333     "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t"
1334     "vperm2f128 $0x12, %%ymm7,  %%ymm10, %%ymm10 \n\t"
1335     "                                            \n\t"
1336     "vmovapd           %%ymm12, %%ymm7           \n\t"
1337     "vperm2f128 $0x30, %%ymm12, %%ymm8,  %%ymm12 \n\t"
1338     "vperm2f128 $0x12, %%ymm7,  %%ymm8,  %%ymm8  \n\t"
1339     "                                            \n\t"
1340     "                                            \n\t" // ymm9:   ymm11:  ymm13:  ymm15:
1341     "                                            \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
1342     "                                            \n\t" //   ab10    ab11    ab12    ab13
1343     "                                            \n\t" //   ab20    ab21    ab22    ab23
1344     "                                            \n\t" //   ab30 )  ab31 )  ab32 )  ab33 )
1345     "                                            \n\t"
1346     "                                            \n\t" // ymm8:   ymm10:  ymm12:  ymm14:
1347     "                                            \n\t" // ( ab40  ( ab41  ( ab42  ( ab43
1348     "                                            \n\t" //   ab50    ab51    ab52    ab53
1349     "                                            \n\t" //   ab60    ab61    ab62    ab63
1350     "                                            \n\t" //   ab70 )  ab71 )  ab72 )  ab73 )
1351     "                                            \n\t"
1352     "                                            \n\t"
1353     "movq         %4, %%rax                      \n\t" // load address of alpha
1354     "movq         %5, %%rbx                      \n\t" // load address of beta
1355     "vbroadcastsd    (%%rax), %%ymm0             \n\t" // load alpha and duplicate
1356     "vbroadcastsd    (%%rbx), %%ymm2             \n\t" // load beta and duplicate
1357     "                                            \n\t"
1358     "vmulpd           %%ymm0,  %%ymm8,  %%ymm8   \n\t" // scale by alpha
1359     "vmulpd           %%ymm0,  %%ymm9,  %%ymm9   \n\t"
1360     "vmulpd           %%ymm0,  %%ymm10, %%ymm10  \n\t"
1361     "vmulpd           %%ymm0,  %%ymm11, %%ymm11  \n\t"
1362     "vmulpd           %%ymm0,  %%ymm12, %%ymm12  \n\t"
1363     "vmulpd           %%ymm0,  %%ymm13, %%ymm13  \n\t"
1364     "vmulpd           %%ymm0,  %%ymm14, %%ymm14  \n\t"
1365     "vmulpd           %%ymm0,  %%ymm15, %%ymm15  \n\t"
1366     "                                            \n\t"
1367     "                                            \n\t"
1368     "                                            \n\t"
1369     "                                            \n\t"
1370     "                                            \n\t"
1371     "                                            \n\t"
1372     "movq                %7, %%rsi               \n\t" // load rs_c
1373     "leaq        (,%%rsi,8), %%rsi               \n\t" // rsi = rs_c * sizeof(double)
1374     "                                            \n\t"
1375     "leaq   (%%rcx,%%rsi,4), %%rdx               \n\t" // load address of c + 4*rs_c;
1376     "                                            \n\t"
1377     "leaq        (,%%rsi,2), %%r12               \n\t" // r12 = 2*rs_c;
1378     "leaq   (%%r12,%%rsi,1), %%r13               \n\t" // r13 = 3*rs_c;
1379     "                                            \n\t"
1380     "                                            \n\t"
1381     "                                            \n\t"
1382     "                                            \n\t" // determine if
1383     "                                            \n\t" //    c    % 32 == 0, AND
1384     "                                            \n\t" //  8*cs_c % 32 == 0, AND
1385     "                                            \n\t" //    rs_c      == 1
1386     "                                            \n\t" // ie: aligned, ldim aligned, and
1387     "                                            \n\t" // column-stored
1388     "                                            \n\t"
1389     "cmpq       $8, %%rsi                        \n\t" // set ZF if (8*rs_c) == 8.
1390     "sete           %%bl                         \n\t" // bl = ( ZF == 1 ? 1 : 0 );
1391     "testq     $31, %%rcx                        \n\t" // set ZF if (c & 31) == 0, i.e. c is 32-byte aligned.
1392     "setz           %%bh                         \n\t" // bh = ( ZF == 1 ? 1 : 0 );
1393     "testq     $31, %%rdi                        \n\t" // set ZF if ((8*cs_c) & 31) == 0, i.e. ldim is 32-byte aligned.
1394     "setz           %%al                         \n\t" // al = ( ZF == 1 ? 1 : 0 );
1395     "                                            \n\t" // and(bl,bh) followed by
1396     "                                            \n\t" // and(bh,al) will reveal result
1397     "                                            \n\t"
1398     "                                            \n\t" // now avoid loading C if beta == 0
1399     "                                            \n\t"
1400     "vxorpd    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
1401     "vucomisd  %%xmm0,  %%xmm2                   \n\t" // set ZF if beta == 0.
1402     "je      .DBETAZERO                          \n\t" // if ZF = 1, jump to beta == 0 case
1403     "                                            \n\t"
1404     "                                            \n\t"
1405     "                                            \n\t" // check if aligned/column-stored
1406     "andb     %%bl, %%bh                         \n\t" // bh &= bl; ZF is cleared if bl & bh == 1.
1407     "andb     %%bh, %%al                         \n\t" // al &= bh; ZF is cleared only if bl, bh, and al were all 1.
1408     "jne     .DCOLSTORED                         \n\t" // jump to column storage case
1409     "                                            \n\t"
1410     "                                            \n\t"
1411     "                                            \n\t"
1412     ".DGENSTORED:                                \n\t"
1413     "                                            \n\t" // update c00:c33
1414     "                                            \n\t"
1415     "vextractf128 $1, %%ymm9,  %%xmm1            \n\t"
1416     "vmovlpd    (%%rcx),       %%xmm0,  %%xmm0   \n\t" // load c00 and c10,
1417     "vmovhpd    (%%rcx,%%rsi), %%xmm0,  %%xmm0   \n\t"
1418     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1419     "vaddpd           %%xmm9,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1420     "vmovlpd          %%xmm0,  (%%rcx)           \n\t" // and store back to memory.
1421     "vmovhpd          %%xmm0,  (%%rcx,%%rsi)     \n\t"
1422     "vmovlpd    (%%rcx,%%r12), %%xmm0,  %%xmm0   \n\t" // load c20 and c30,
1423     "vmovhpd    (%%rcx,%%r13), %%xmm0,  %%xmm0   \n\t"
1424     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1425     "vaddpd           %%xmm1,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1426     "vmovlpd          %%xmm0,  (%%rcx,%%r12)     \n\t" // and store back to memory.
1427     "vmovhpd          %%xmm0,  (%%rcx,%%r13)     \n\t"
1428     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1429     "                                            \n\t"
1430     "vextractf128 $1, %%ymm11, %%xmm1            \n\t"
1431     "vmovlpd    (%%rcx),       %%xmm0,  %%xmm0   \n\t" // load c01 and c11,
1432     "vmovhpd    (%%rcx,%%rsi), %%xmm0,  %%xmm0   \n\t"
1433     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1434     "vaddpd           %%xmm11, %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1435     "vmovlpd          %%xmm0,  (%%rcx)           \n\t" // and store back to memory.
1436     "vmovhpd          %%xmm0,  (%%rcx,%%rsi)     \n\t"
1437     "vmovlpd    (%%rcx,%%r12), %%xmm0,  %%xmm0   \n\t" // load c21 and c31,
1438     "vmovhpd    (%%rcx,%%r13), %%xmm0,  %%xmm0   \n\t"
1439     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1440     "vaddpd           %%xmm1,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1441     "vmovlpd          %%xmm0,  (%%rcx,%%r12)     \n\t" // and store back to memory.
1442     "vmovhpd          %%xmm0,  (%%rcx,%%r13)     \n\t"
1443     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1444     "                                            \n\t"
1445     "vextractf128 $1, %%ymm13, %%xmm1            \n\t"
1446     "vmovlpd    (%%rcx),       %%xmm0,  %%xmm0   \n\t" // load c02 and c12,
1447     "vmovhpd    (%%rcx,%%rsi), %%xmm0,  %%xmm0   \n\t"
1448     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1449     "vaddpd           %%xmm13, %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1450     "vmovlpd          %%xmm0,  (%%rcx)           \n\t" // and store back to memory.
1451     "vmovhpd          %%xmm0,  (%%rcx,%%rsi)     \n\t"
1452     "vmovlpd    (%%rcx,%%r12), %%xmm0,  %%xmm0   \n\t" // load c22 and c32,
1453     "vmovhpd    (%%rcx,%%r13), %%xmm0,  %%xmm0   \n\t"
1454     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1455     "vaddpd           %%xmm1,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1456     "vmovlpd          %%xmm0,  (%%rcx,%%r12)     \n\t" // and store back to memory.
1457     "vmovhpd          %%xmm0,  (%%rcx,%%r13)     \n\t"
1458     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1459     "                                            \n\t"
1460     "vextractf128 $1, %%ymm15, %%xmm1            \n\t"
1461     "vmovlpd    (%%rcx),       %%xmm0,  %%xmm0   \n\t" // load c03 and c13,
1462     "vmovhpd    (%%rcx,%%rsi), %%xmm0,  %%xmm0   \n\t"
1463     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1464     "vaddpd           %%xmm15, %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1465     "vmovlpd          %%xmm0,  (%%rcx)           \n\t" // and store back to memory.
1466     "vmovhpd          %%xmm0,  (%%rcx,%%rsi)     \n\t"
1467     "vmovlpd    (%%rcx,%%r12), %%xmm0,  %%xmm0   \n\t" // load c23 and c33,
1468     "vmovhpd    (%%rcx,%%r13), %%xmm0,  %%xmm0   \n\t"
1469     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1470     "vaddpd           %%xmm1,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1471     "vmovlpd          %%xmm0,  (%%rcx,%%r12)     \n\t" // and store back to memory.
1472     "vmovhpd          %%xmm0,  (%%rcx,%%r13)     \n\t"
1473     "                                            \n\t"
1474     "                                            \n\t" // update c40:c73
1475     "                                            \n\t"
1476     "vextractf128 $1, %%ymm8,  %%xmm1            \n\t"
1477     "vmovlpd    (%%rdx),       %%xmm0,  %%xmm0   \n\t" // load c40 and c50,
1478     "vmovhpd    (%%rdx,%%rsi), %%xmm0,  %%xmm0   \n\t"
1479     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1480     "vaddpd           %%xmm8,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1481     "vmovlpd          %%xmm0,  (%%rdx)           \n\t" // and store back to memory.
1482     "vmovhpd          %%xmm0,  (%%rdx,%%rsi)     \n\t"
1483     "vmovlpd    (%%rdx,%%r12), %%xmm0,  %%xmm0   \n\t" // load c60 and c70,
1484     "vmovhpd    (%%rdx,%%r13), %%xmm0,  %%xmm0   \n\t"
1485     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1486     "vaddpd           %%xmm1,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1487     "vmovlpd          %%xmm0,  (%%rdx,%%r12)     \n\t" // and store back to memory.
1488     "vmovhpd          %%xmm0,  (%%rdx,%%r13)     \n\t"
1489     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1490     "                                            \n\t"
1491     "vextractf128 $1, %%ymm10, %%xmm1            \n\t"
1492     "vmovlpd    (%%rdx),       %%xmm0,  %%xmm0   \n\t" // load c41 and c51,
1493     "vmovhpd    (%%rdx,%%rsi), %%xmm0,  %%xmm0   \n\t"
1494     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1495     "vaddpd           %%xmm10, %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1496     "vmovlpd          %%xmm0,  (%%rdx)           \n\t" // and store back to memory.
1497     "vmovhpd          %%xmm0,  (%%rdx,%%rsi)     \n\t"
1498     "vmovlpd    (%%rdx,%%r12), %%xmm0,  %%xmm0   \n\t" // load c61 and c71,
1499     "vmovhpd    (%%rdx,%%r13), %%xmm0,  %%xmm0   \n\t"
1500     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1501     "vaddpd           %%xmm1,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1502     "vmovlpd          %%xmm0,  (%%rdx,%%r12)     \n\t" // and store back to memory.
1503     "vmovhpd          %%xmm0,  (%%rdx,%%r13)     \n\t"
1504     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1505     "                                            \n\t"
1506     "vextractf128 $1, %%ymm12, %%xmm1            \n\t"
1507     "vmovlpd    (%%rdx),       %%xmm0,  %%xmm0   \n\t" // load c42 and c52,
1508     "vmovhpd    (%%rdx,%%rsi), %%xmm0,  %%xmm0   \n\t"
1509     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1510     "vaddpd           %%xmm12, %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1511     "vmovlpd          %%xmm0,  (%%rdx)           \n\t" // and store back to memory.
1512     "vmovhpd          %%xmm0,  (%%rdx,%%rsi)     \n\t"
1513     "vmovlpd    (%%rdx,%%r12), %%xmm0,  %%xmm0   \n\t" // load c62 and c72,
1514     "vmovhpd    (%%rdx,%%r13), %%xmm0,  %%xmm0   \n\t"
1515     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1516     "vaddpd           %%xmm1,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1517     "vmovlpd          %%xmm0,  (%%rdx,%%r12)     \n\t" // and store back to memory.
1518     "vmovhpd          %%xmm0,  (%%rdx,%%r13)     \n\t"
1519     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1520     "                                            \n\t"
1521     "vextractf128 $1, %%ymm14, %%xmm1            \n\t"
1522     "vmovlpd    (%%rdx),       %%xmm0,  %%xmm0   \n\t" // load c43 and c53,
1523     "vmovhpd    (%%rdx,%%rsi), %%xmm0,  %%xmm0   \n\t"
1524     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1525     "vaddpd           %%xmm14, %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1526     "vmovlpd          %%xmm0,  (%%rdx)           \n\t" // and store back to memory.
1527     "vmovhpd          %%xmm0,  (%%rdx,%%rsi)     \n\t"
1528     "vmovlpd    (%%rdx,%%r12), %%xmm0,  %%xmm0   \n\t" // load c63 and c73,
1529     "vmovhpd    (%%rdx,%%r13), %%xmm0,  %%xmm0   \n\t"
1530     "vmulpd           %%xmm2,  %%xmm0,  %%xmm0   \n\t" // scale by beta,
1531     "vaddpd           %%xmm1,  %%xmm0,  %%xmm0   \n\t" // add the gemm result,
1532     "vmovlpd          %%xmm0,  (%%rdx,%%r12)     \n\t" // and store back to memory.
1533     "vmovhpd          %%xmm0,  (%%rdx,%%r13)     \n\t"
1534     "                                            \n\t"
1535     "                                            \n\t"
1536     "jmp    .DDONE                               \n\t" // jump to end.
1537     "                                            \n\t"
1538     "                                            \n\t"
1539     "                                            \n\t"
1540     ".DCOLSTORED:                                \n\t"
1541     "                                            \n\t" // update c00:c33
1542     "                                            \n\t"
1543     "vmovapd    (%%rcx),       %%ymm0            \n\t" // load c00:c30,
1544     "vmulpd           %%ymm2,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
1545     "vaddpd           %%ymm9,  %%ymm0,  %%ymm0   \n\t" // add the gemm result,
1546     "vmovapd          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
1547     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1548     "                                            \n\t"
1549     "vmovapd    (%%rcx),       %%ymm0            \n\t" // load c01:c31,
1550     "vmulpd           %%ymm2,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
1551     "vaddpd           %%ymm11, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
1552     "vmovapd          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
1553     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1554     "                                            \n\t"
1555     "vmovapd    (%%rcx),       %%ymm0            \n\t" // load c02:c32,
1556     "vmulpd           %%ymm2,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
1557     "vaddpd           %%ymm13, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
1558     "vmovapd          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
1559     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1560     "                                            \n\t"
1561     "vmovapd    (%%rcx),       %%ymm0            \n\t" // load c03:c33,
1562     "vmulpd           %%ymm2,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
1563     "vaddpd           %%ymm15, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
1564     "vmovapd          %%ymm0,  (%%rcx)           \n\t" // and store back to memory.
1565     "                                            \n\t"
1566     "                                            \n\t" // update c40:c73
1567     "                                            \n\t"
1568     "vmovapd    (%%rdx),       %%ymm0            \n\t" // load c40:c70,
1569     "vmulpd           %%ymm2,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
1570     "vaddpd           %%ymm8,  %%ymm0,  %%ymm0   \n\t" // add the gemm result,
1571     "vmovapd          %%ymm0,  (%%rdx)           \n\t" // and store back to memory.
1572     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1573     "                                            \n\t"
1574     "vmovapd    (%%rdx),       %%ymm0            \n\t" // load c41:c71,
1575     "vmulpd           %%ymm2,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
1576     "vaddpd           %%ymm10, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
1577     "vmovapd          %%ymm0,  (%%rdx)           \n\t" // and store back to memory.
1578     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1579     "                                            \n\t"
1580     "vmovapd    (%%rdx),       %%ymm0            \n\t" // load c42:c72,
1581     "vmulpd           %%ymm2,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
1582     "vaddpd           %%ymm12, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
1583     "vmovapd          %%ymm0,  (%%rdx)           \n\t" // and store back to memory.
1584     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1585     "                                            \n\t"
1586     "vmovapd    (%%rdx),       %%ymm0            \n\t" // load c43:c73,
1587     "vmulpd           %%ymm2,  %%ymm0,  %%ymm0   \n\t" // scale by beta,
1588     "vaddpd           %%ymm14, %%ymm0,  %%ymm0   \n\t" // add the gemm result,
1589     "vmovapd          %%ymm0,  (%%rdx)           \n\t" // and store back to memory.
1590     "                                            \n\t"
1591     "                                            \n\t"
1592     "jmp    .DDONE                               \n\t" // jump to end.
1593     "                                            \n\t"
1594     "                                            \n\t"
1595     "                                            \n\t"
1596     "                                            \n\t"
1597     ".DBETAZERO:                                 \n\t"
1598     "                                            \n\t" // check if aligned/column-stored
1599     "andb     %%bl, %%bh                         \n\t" // set ZF if bl & bh == 1.
1600     "andb     %%bh, %%al                         \n\t" // set ZF if bh & al == 1.
1601     "jne     .DCOLSTORBZ                         \n\t" // jump to column storage case
1602     "                                            \n\t"
1603     "                                            \n\t"
1604     "                                            \n\t"
1605     ".DGENSTORBZ:                                \n\t"
1606     "                                            \n\t" // update c00:c33
1607     "                                            \n\t"
1608     "vextractf128 $1, %%ymm9,  %%xmm1            \n\t"
1609     "vmovlpd          %%xmm9,  (%%rcx)           \n\t" // store to c00:c30
1610     "vmovhpd          %%xmm9,  (%%rcx,%%rsi)     \n\t"
1611     "vmovlpd          %%xmm1,  (%%rcx,%%r12)     \n\t"
1612     "vmovhpd          %%xmm1,  (%%rcx,%%r13)     \n\t"
1613     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1614     "                                            \n\t"
1615     "vextractf128 $1, %%ymm11, %%xmm1            \n\t"
1616     "vmovlpd          %%xmm11, (%%rcx)           \n\t" // store to c01:c31
1617     "vmovhpd          %%xmm11, (%%rcx,%%rsi)     \n\t"
1618     "vmovlpd          %%xmm1,  (%%rcx,%%r12)     \n\t"
1619     "vmovhpd          %%xmm1,  (%%rcx,%%r13)     \n\t"
1620     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1621     "                                            \n\t"
1622     "vextractf128 $1, %%ymm13, %%xmm1            \n\t"
1623     "vmovlpd          %%xmm13, (%%rcx)           \n\t" // store to c02:c32
1624     "vmovhpd          %%xmm13, (%%rcx,%%rsi)     \n\t"
1625     "vmovlpd          %%xmm1,  (%%rcx,%%r12)     \n\t"
1626     "vmovhpd          %%xmm1,  (%%rcx,%%r13)     \n\t"
1627     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1628     "                                            \n\t"
1629     "vextractf128 $1, %%ymm15, %%xmm1            \n\t"
1630     "vmovlpd          %%xmm15, (%%rcx)           \n\t" // store to c03:c33
1631     "vmovhpd          %%xmm15, (%%rcx,%%rsi)     \n\t"
1632     "vmovlpd          %%xmm1,  (%%rcx,%%r12)     \n\t"
1633     "vmovhpd          %%xmm1,  (%%rcx,%%r13)     \n\t"
1634     "                                            \n\t"
1635     "                                            \n\t" // update c40:c73
1636     "                                            \n\t"
1637     "vextractf128 $1, %%ymm8,  %%xmm1            \n\t"
1638     "vmovlpd          %%xmm8,  (%%rdx)           \n\t" // store to c40:c70
1639     "vmovhpd          %%xmm8,  (%%rdx,%%rsi)     \n\t"
1640     "vmovlpd          %%xmm1,  (%%rdx,%%r12)     \n\t"
1641     "vmovhpd          %%xmm1,  (%%rdx,%%r13)     \n\t"
1642     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1643     "                                            \n\t"
1644     "vextractf128 $1, %%ymm10, %%xmm1            \n\t"
1645     "vmovlpd          %%xmm10, (%%rdx)           \n\t" // store to c41:c71
1646     "vmovhpd          %%xmm10, (%%rdx,%%rsi)     \n\t"
1647     "vmovlpd          %%xmm1,  (%%rdx,%%r12)     \n\t"
1648     "vmovhpd          %%xmm1,  (%%rdx,%%r13)     \n\t"
1649     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1650     "                                            \n\t"
1651     "vextractf128 $1, %%ymm12, %%xmm1            \n\t"
1652     "vmovlpd          %%xmm12, (%%rdx)           \n\t" // store to c42:c72
1653     "vmovhpd          %%xmm12, (%%rdx,%%rsi)     \n\t"
1654     "vmovlpd          %%xmm1,  (%%rdx,%%r12)     \n\t"
1655     "vmovhpd          %%xmm1,  (%%rdx,%%r13)     \n\t"
1656     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1657     "                                            \n\t"
1658     "vextractf128 $1, %%ymm14, %%xmm1            \n\t"
1659     "vmovlpd          %%xmm14, (%%rdx)           \n\t" // store to c43:c73
1660     "vmovhpd          %%xmm14, (%%rdx,%%rsi)     \n\t"
1661     "vmovlpd          %%xmm1,  (%%rdx,%%r12)     \n\t"
1662     "vmovhpd          %%xmm1,  (%%rdx,%%r13)     \n\t"
1663     "                                            \n\t"
1664     "                                            \n\t"
1665     "jmp    .DDONE                               \n\t" // jump to end.
1666     "                                            \n\t"
1667     "                                            \n\t"
1668     "                                            \n\t"
1669     ".DCOLSTORBZ:                                \n\t"
1670     "                                            \n\t" // update c00:c33
1671     "                                            \n\t"
1672     "vmovapd          %%ymm9,  (%%rcx)           \n\t" // store c00:c30
1673     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1674     "                                            \n\t"
1675     "vmovapd          %%ymm11, (%%rcx)           \n\t" // store c01:c31
1676     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1677     "                                            \n\t"
1678     "vmovapd          %%ymm13, (%%rcx)           \n\t" // store c02:c32
1679     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
1680     "                                            \n\t"
1681     "vmovapd          %%ymm15, (%%rcx)           \n\t" // store c03:c33
1682     "                                            \n\t"
1683     "                                            \n\t" // update c40:c73
1684     "                                            \n\t"
1685     "vmovapd          %%ymm8,  (%%rdx)           \n\t" // store c40:c70
1686     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1687     "                                            \n\t"
1688     "vmovapd          %%ymm10, (%%rdx)           \n\t" // store c41:c71
1689     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1690     "                                            \n\t"
1691     "vmovapd          %%ymm12, (%%rdx)           \n\t" // store c42:c72
1692     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
1693     "                                            \n\t"
1694     "vmovapd          %%ymm14, (%%rdx)           \n\t" // store c43:c73
1695     "                                            \n\t"
1696     "                                            \n\t"
1697     "                                            \n\t"
1698     "                                            \n\t"
1699     "                                            \n\t"
1700     ".DDONE:                                     \n\t"
1701     "                                            \n\t"
1702    // "vzeroupper                                  \n\t"
1703     "                                            \n\t"
1704 
1705     : // output operands (none)
1706     : // input operands
1707       "m" (k_iter), // 0
1708       "m" (k_left), // 1
1709       "m" (a),      // 2
1710       "m" (b),      // 3
1711       "m" (alpha),  // 4
1712       "m" (beta),   // 5
1713       "m" (c),      // 6
1714       "m" (rs_c),   // 7
1715       "m" (cs_c),   // 8
1716       "m" (b_next)/*, // 9
1717       "m" (a_next)*/  // 10
1718     : // register clobber list
1719       "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
1720       "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
1721       "xmm0", "xmm1", "xmm2", "xmm3",
1722       "xmm4", "xmm5", "xmm6", "xmm7",
1723       "xmm8", "xmm9", "xmm10", "xmm11",
1724       "xmm12", "xmm13", "xmm14", "xmm15",
1725       "memory"
1726     );
1727 }
1728 
bli_cgemm_asm_8x4(dim_t k,scomplex * restrict alpha,scomplex * restrict a,scomplex * restrict b,scomplex * restrict beta,scomplex * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)1729 void bli_cgemm_asm_8x4
1730      (
1731        dim_t               k,
1732        scomplex*  restrict alpha,
1733        scomplex*  restrict a,
1734        scomplex*  restrict b,
1735        scomplex*  restrict beta,
1736        scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
1737        auxinfo_t* restrict data,
1738        cntx_t*    restrict cntx
1739      )
1740 {
1741     //void*   a_next = bli_auxinfo_next_a( data );
1742     void*   b_next = bli_auxinfo_next_b( data );
1743 
1744     uint64_t   k_iter = k / 4;
1745     uint64_t   k_left = k % 4;
1746 
1747     __asm__ volatile
1748     (
1749     "                                            \n\t"
1750     "                                            \n\t"
1751     "movq                %2, %%rax               \n\t" // load address of a.
1752     "movq                %3, %%rbx               \n\t" // load address of b.
1753     "movq                %9, %%r15               \n\t" // load address of b_next.
1754     //"movq               %10, %%r14               \n\t" // load address of a_next.
1755     "addq          $-4 * 64, %%r15               \n\t"
1756     "                                            \n\t"
1757     "vmovaps        0 * 32(%%rax), %%ymm0        \n\t" // initialize loop by pre-loading
1758     "vmovsldup      0 * 32(%%rbx), %%ymm2        \n\t"
1759     "vpermilps     $0x4e, %%ymm2,  %%ymm3        \n\t"
1760     "                                            \n\t"
1761     "movq                %6, %%rcx               \n\t" // load address of c
1762     "movq                %8, %%rdi               \n\t" // load cs_c
1763     "leaq        (,%%rdi,8), %%rdi               \n\t" // cs_c *= sizeof(scomplex)
1764     "leaq   (%%rcx,%%rdi,2), %%r10               \n\t" // load address of c + 2*cs_c;
1765     "                                            \n\t"
1766     "prefetcht0   3 * 8(%%rcx)                   \n\t" // prefetch c + 0*cs_c
1767     "prefetcht0   3 * 8(%%rcx,%%rdi)             \n\t" // prefetch c + 1*cs_c
1768     "prefetcht0   3 * 8(%%r10)                   \n\t" // prefetch c + 2*cs_c
1769     "prefetcht0   3 * 8(%%r10,%%rdi)             \n\t" // prefetch c + 3*cs_c
1770     "                                            \n\t"
1771     "vxorps    %%ymm8,  %%ymm8,  %%ymm8          \n\t"
1772     "vxorps    %%ymm9,  %%ymm9,  %%ymm9          \n\t"
1773     "vxorps    %%ymm10, %%ymm10, %%ymm10         \n\t"
1774     "vxorps    %%ymm11, %%ymm11, %%ymm11         \n\t"
1775     "vxorps    %%ymm12, %%ymm12, %%ymm12         \n\t"
1776     "vxorps    %%ymm13, %%ymm13, %%ymm13         \n\t"
1777     "vxorps    %%ymm14, %%ymm14, %%ymm14         \n\t"
1778     "vxorps    %%ymm15, %%ymm15, %%ymm15         \n\t"
1779     "                                            \n\t"
1780     "                                            \n\t"
1781     "                                            \n\t"
1782     "movq      %0, %%rsi                         \n\t" // i = k_iter;
1783     "testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
1784     "je     .CCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
1785     "                                            \n\t" // contains the k_left loop.
1786     "                                            \n\t"
1787     "                                            \n\t"
1788     ".CLOOPKITER:                                \n\t" // MAIN LOOP
1789     "                                            \n\t"
1790     "addq         $4 * 4 * 8,  %%r15             \n\t" // b_next += 4*4 (unroll x nr)
1791     "                                            \n\t"
1792     "                                            \n\t" // iteration 0
1793     "prefetcht0     8 * 32(%%rax)                \n\t"
1794     "vmovaps        1 * 32(%%rax),      %%ymm1   \n\t"
1795     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1796     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1797     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1798     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1799     "vaddps           %%ymm6,  %%ymm15, %%ymm15  \n\t"
1800     "vaddps           %%ymm7,  %%ymm13, %%ymm13  \n\t"
1801     "                                            \n\t"
1802     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1803     "vmovshdup      0 * 32(%%rbx),      %%ymm2   \n\t"
1804     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1805     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
1806     "vaddps           %%ymm6,  %%ymm14, %%ymm14  \n\t"
1807     "vaddps           %%ymm7,  %%ymm12, %%ymm12  \n\t"
1808     "                                            \n\t"
1809     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1810     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1811     "vpermilps $0xb1, %%ymm0,  %%ymm0            \n\t"
1812     "vaddps           %%ymm6,  %%ymm11, %%ymm11  \n\t"
1813     "vaddps           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
1814     "                                            \n\t"
1815     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1816     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1817     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1818     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1819     "vaddps           %%ymm6,  %%ymm10, %%ymm10  \n\t"
1820     "vaddps           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
1821     "prefetcht0   0 * 32(%%r15)                  \n\t" // prefetch b_next[0*4]
1822     "                                            \n\t"
1823     "vpermilps $0xb1, %%ymm1,  %%ymm1            \n\t"
1824     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1825     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1826     "vaddsubps        %%ymm6,  %%ymm15, %%ymm15  \n\t"
1827     "vaddsubps        %%ymm7,  %%ymm13, %%ymm13  \n\t"
1828     "                                            \n\t"
1829     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1830     "vmovsldup      1 * 32(%%rbx),      %%ymm2   \n\t"
1831     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1832     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
1833     "vaddsubps        %%ymm6,  %%ymm14, %%ymm14  \n\t"
1834     "vaddsubps        %%ymm7,  %%ymm12, %%ymm12  \n\t"
1835     "                                            \n\t"
1836     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1837     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1838     "vmovaps        2 * 32(%%rax),      %%ymm0   \n\t"
1839     "vaddsubps        %%ymm6,  %%ymm11, %%ymm11  \n\t"
1840     "vaddsubps        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
1841     "                                            \n\t"
1842     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1843     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1844     "vaddsubps        %%ymm6,  %%ymm10, %%ymm10  \n\t"
1845     "vaddsubps        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
1846     "                                            \n\t"
1847     "                                            \n\t"
1848     "                                            \n\t" // iteration 1
1849     "prefetcht0    10 * 32(%%rax)                \n\t"
1850     "vmovaps        3 * 32(%%rax),      %%ymm1   \n\t"
1851     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1852     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1853     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1854     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1855     "vaddps           %%ymm6,  %%ymm15, %%ymm15  \n\t"
1856     "vaddps           %%ymm7,  %%ymm13, %%ymm13  \n\t"
1857     "                                            \n\t"
1858     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1859     "vmovshdup      1 * 32(%%rbx),      %%ymm2   \n\t"
1860     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1861     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
1862     "vaddps           %%ymm6,  %%ymm14, %%ymm14  \n\t"
1863     "vaddps           %%ymm7,  %%ymm12, %%ymm12  \n\t"
1864     "                                            \n\t"
1865     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1866     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1867     "vpermilps $0xb1, %%ymm0,  %%ymm0            \n\t"
1868     "vaddps           %%ymm6,  %%ymm11, %%ymm11  \n\t"
1869     "vaddps           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
1870     "                                            \n\t"
1871     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1872     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1873     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1874     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1875     "vaddps           %%ymm6,  %%ymm10, %%ymm10  \n\t"
1876     "vaddps           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
1877     "                                            \n\t"
1878     "vpermilps $0xb1, %%ymm1,  %%ymm1            \n\t"
1879     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1880     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1881     "vaddsubps        %%ymm6,  %%ymm15, %%ymm15  \n\t"
1882     "vaddsubps        %%ymm7,  %%ymm13, %%ymm13  \n\t"
1883     "                                            \n\t"
1884     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1885     "vmovsldup      2 * 32(%%rbx),      %%ymm2   \n\t"
1886     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1887     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
1888     "vaddsubps        %%ymm6,  %%ymm14, %%ymm14  \n\t"
1889     "vaddsubps        %%ymm7,  %%ymm12, %%ymm12  \n\t"
1890     "                                            \n\t"
1891     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1892     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1893     "vmovaps        4 * 32(%%rax),      %%ymm0   \n\t"
1894     "vaddsubps        %%ymm6,  %%ymm11, %%ymm11  \n\t"
1895     "vaddsubps        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
1896     "                                            \n\t"
1897     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1898     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1899     "vaddsubps        %%ymm6,  %%ymm10, %%ymm10  \n\t"
1900     "vaddsubps        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
1901     "                                            \n\t"
1902     "                                            \n\t"
1903     "                                            \n\t" // iteration 2
1904     "prefetcht0    12 * 32(%%rax)                \n\t"
1905     "vmovaps        5 * 32(%%rax),      %%ymm1   \n\t"
1906     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1907     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1908     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1909     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1910     "vaddps           %%ymm6,  %%ymm15, %%ymm15  \n\t"
1911     "vaddps           %%ymm7,  %%ymm13, %%ymm13  \n\t"
1912     "                                            \n\t"
1913     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1914     "vmovshdup      2 * 32(%%rbx),      %%ymm2   \n\t"
1915     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1916     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
1917     "vaddps           %%ymm6,  %%ymm14, %%ymm14  \n\t"
1918     "vaddps           %%ymm7,  %%ymm12, %%ymm12  \n\t"
1919     "                                            \n\t"
1920     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1921     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1922     "vpermilps $0xb1, %%ymm0,  %%ymm0            \n\t"
1923     "vaddps           %%ymm6,  %%ymm11, %%ymm11  \n\t"
1924     "vaddps           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
1925     "                                            \n\t"
1926     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1927     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1928     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1929     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1930     "vaddps           %%ymm6,  %%ymm10, %%ymm10  \n\t"
1931     "vaddps           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
1932     "prefetcht0   2 * 32(%%r15)                  \n\t" // prefetch b_next[2*4]
1933     "                                            \n\t"
1934     "vpermilps $0xb1, %%ymm1,  %%ymm1            \n\t"
1935     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1936     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1937     "vaddsubps        %%ymm6,  %%ymm15, %%ymm15  \n\t"
1938     "vaddsubps        %%ymm7,  %%ymm13, %%ymm13  \n\t"
1939     "                                            \n\t"
1940     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1941     "vmovsldup      3 * 32(%%rbx),      %%ymm2   \n\t"
1942     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1943     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
1944     "vaddsubps        %%ymm6,  %%ymm14, %%ymm14  \n\t"
1945     "vaddsubps        %%ymm7,  %%ymm12, %%ymm12  \n\t"
1946     "                                            \n\t"
1947     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1948     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1949     "vmovaps        6 * 32(%%rax),      %%ymm0   \n\t"
1950     "vaddsubps        %%ymm6,  %%ymm11, %%ymm11  \n\t"
1951     "vaddsubps        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
1952     "                                            \n\t"
1953     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1954     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1955     "vaddsubps        %%ymm6,  %%ymm10, %%ymm10  \n\t"
1956     "vaddsubps        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
1957     "                                            \n\t"
1958     "                                            \n\t"
1959     "                                            \n\t" // iteration 3
1960     "prefetcht0    14 * 32(%%rax)                \n\t"
1961     "vmovaps        7 * 32(%%rax),      %%ymm1   \n\t"
1962     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1963     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1964     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1965     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1966     "vaddps           %%ymm6,  %%ymm15, %%ymm15  \n\t"
1967     "vaddps           %%ymm7,  %%ymm13, %%ymm13  \n\t"
1968     "                                            \n\t"
1969     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1970     "vmovshdup      3 * 32(%%rbx),      %%ymm2   \n\t"
1971     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1972     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
1973     "vaddps           %%ymm6,  %%ymm14, %%ymm14  \n\t"
1974     "vaddps           %%ymm7,  %%ymm12, %%ymm12  \n\t"
1975     "                                            \n\t"
1976     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
1977     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
1978     "vpermilps $0xb1, %%ymm0,  %%ymm0            \n\t"
1979     "vaddps           %%ymm6,  %%ymm11, %%ymm11  \n\t"
1980     "vaddps           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
1981     "                                            \n\t"
1982     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
1983     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
1984     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
1985     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
1986     "vaddps           %%ymm6,  %%ymm10, %%ymm10  \n\t"
1987     "vaddps           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
1988     "                                            \n\t"
1989     "vpermilps $0xb1, %%ymm1,  %%ymm1            \n\t"
1990     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
1991     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
1992     "vaddsubps        %%ymm6,  %%ymm15, %%ymm15  \n\t"
1993     "vaddsubps        %%ymm7,  %%ymm13, %%ymm13  \n\t"
1994     "                                            \n\t"
1995     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
1996     "vmovsldup      4 * 32(%%rbx),      %%ymm2   \n\t"
1997     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
1998     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
1999     "vaddsubps        %%ymm6,  %%ymm14, %%ymm14  \n\t"
2000     "vaddsubps        %%ymm7,  %%ymm12, %%ymm12  \n\t"
2001     "                                            \n\t"
2002     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2003     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2004     "vmovaps        8 * 32(%%rax),      %%ymm0   \n\t"
2005     "vaddsubps        %%ymm6,  %%ymm11, %%ymm11  \n\t"
2006     "vaddsubps        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2007     "                                            \n\t"
2008     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2009     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2010     "vaddsubps        %%ymm6,  %%ymm10, %%ymm10  \n\t"
2011     "vaddsubps        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2012     "                                            \n\t"
2013     "                                            \n\t"
2014     "addq          $8 * 4 * 8, %%rax             \n\t" // a += 8*4 (unroll x mr)
2015     "addq          $4 * 4 * 8, %%rbx             \n\t" // b += 4*4 (unroll x nr)
2016     "                                            \n\t"
2017     "                                            \n\t"
2018     "decq   %%rsi                                \n\t" // i -= 1;
2019     "jne    .CLOOPKITER                          \n\t" // iterate again if i != 0.
2020     "                                            \n\t"
2021     "                                            \n\t"
2022     "                                            \n\t"
2023     "                                            \n\t"
2024     "                                            \n\t"
2025     "                                            \n\t"
2026     ".CCONSIDKLEFT:                              \n\t"
2027     "                                            \n\t"
2028     "movq      %1, %%rsi                         \n\t" // i = k_left;
2029     "testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
2030     "je     .CPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
2031     "                                            \n\t" // else, we prepare to enter k_left loop.
2032     "                                            \n\t"
2033     "                                            \n\t"
2034     ".CLOOPKLEFT:                                \n\t" // EDGE LOOP
2035     "                                            \n\t"
2036     "                                            \n\t" // iteration 0
2037     "prefetcht0     8 * 32(%%rax)                \n\t"
2038     "vmovaps        1 * 32(%%rax),      %%ymm1   \n\t"
2039     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2040     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2041     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2042     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2043     "vaddps           %%ymm6,  %%ymm15, %%ymm15  \n\t"
2044     "vaddps           %%ymm7,  %%ymm13, %%ymm13  \n\t"
2045     "                                            \n\t"
2046     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2047     "vmovshdup      0 * 32(%%rbx),      %%ymm2   \n\t"
2048     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2049     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
2050     "vaddps           %%ymm6,  %%ymm14, %%ymm14  \n\t"
2051     "vaddps           %%ymm7,  %%ymm12, %%ymm12  \n\t"
2052     "                                            \n\t"
2053     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2054     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2055     "vpermilps $0xb1, %%ymm0,  %%ymm0            \n\t"
2056     "vaddps           %%ymm6,  %%ymm11, %%ymm11  \n\t"
2057     "vaddps           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2058     "                                            \n\t"
2059     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2060     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2061     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2062     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2063     "vaddps           %%ymm6,  %%ymm10, %%ymm10  \n\t"
2064     "vaddps           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2065     "                                            \n\t"
2066     "vpermilps $0xb1, %%ymm1,  %%ymm1            \n\t"
2067     "vmulps           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2068     "vmulps           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2069     "vaddsubps        %%ymm6,  %%ymm15, %%ymm15  \n\t"
2070     "vaddsubps        %%ymm7,  %%ymm13, %%ymm13  \n\t"
2071     "                                            \n\t"
2072     "vmulps           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2073     "vmovsldup      1 * 32(%%rbx),      %%ymm2   \n\t"
2074     "vmulps           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2075     "vpermilps $0x4e, %%ymm2,  %%ymm3            \n\t"
2076     "vaddsubps        %%ymm6,  %%ymm14, %%ymm14  \n\t"
2077     "vaddsubps        %%ymm7,  %%ymm12, %%ymm12  \n\t"
2078     "                                            \n\t"
2079     "vmulps           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2080     "vmulps           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2081     "vmovaps        2 * 32(%%rax),      %%ymm0   \n\t"
2082     "vaddsubps        %%ymm6,  %%ymm11, %%ymm11  \n\t"
2083     "vaddsubps        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2084     "                                            \n\t"
2085     "vmulps           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2086     "vmulps           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2087     "vaddsubps        %%ymm6,  %%ymm10, %%ymm10  \n\t"
2088     "vaddsubps        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2089     "                                            \n\t"
2090     "                                            \n\t"
2091     "addq          $8 * 1 * 8, %%rax             \n\t" // a += 8 (1 x mr)
2092     "addq          $4 * 1 * 8, %%rbx             \n\t" // b += 4 (1 x nr)
2093     "                                            \n\t"
2094     "                                            \n\t"
2095     "decq   %%rsi                                \n\t" // i -= 1;
2096     "jne    .CLOOPKLEFT                          \n\t" // iterate again if i != 0.
2097     "                                            \n\t"
2098     "                                            \n\t"
2099     "                                            \n\t"
2100     ".CPOSTACCUM:                                \n\t"
2101     "                                            \n\t"
2102     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
2103     "                                            \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
2104     "                                            \n\t" //   ab10    ab11    ab12    ab13
2105     "                                            \n\t" //   ab21    ab20    ab23    ab22
2106     "                                            \n\t" //   ab31    ab30    ab33    ab32
2107     "                                            \n\t" //   ab42    ab43    ab40    ab41
2108     "                                            \n\t" //   ab52    ab53    ab50    ab51
2109     "                                            \n\t" //   ab63    ab62    ab61    ab60
2110     "                                            \n\t" //   ab73 )  ab72 )  ab71 )  ab70 )
2111     "                                            \n\t"
2112     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
2113     "                                            \n\t" // ( ab80  ( ab81  ( ab82  ( ab83
2114     "                                            \n\t" //   ab90    ab91    ab92    ab93
2115     "                                            \n\t" //   aba1    aba0    aba3    aba2
2116     "                                            \n\t" //   abb1    abb0    abb3    abb2
2117     "                                            \n\t" //   abc2    abc3    abc0    abc1
2118     "                                            \n\t" //   abd2    abd3    abd0    abd1
2119     "                                            \n\t" //   abe3    abe2    abe1    abe0
2120     "                                            \n\t" //   abf3    abf2    abf1    abf0 )
2121     "                                            \n\t"
2122     "vmovaps          %%ymm15, %%ymm7            \n\t"
2123     "vshufps   $0xe4, %%ymm13, %%ymm15, %%ymm15  \n\t"
2124     "vshufps   $0xe4, %%ymm7,  %%ymm13, %%ymm13  \n\t"
2125     "                                            \n\t"
2126     "vmovaps          %%ymm11, %%ymm7            \n\t"
2127     "vshufps   $0xe4, %%ymm9,  %%ymm11, %%ymm11  \n\t"
2128     "vshufps   $0xe4, %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2129     "                                            \n\t"
2130     "vmovaps          %%ymm14, %%ymm7            \n\t"
2131     "vshufps   $0xe4, %%ymm12, %%ymm14, %%ymm14  \n\t"
2132     "vshufps   $0xe4, %%ymm7,  %%ymm12, %%ymm12  \n\t"
2133     "                                            \n\t"
2134     "vmovaps          %%ymm10, %%ymm7            \n\t"
2135     "vshufps   $0xe4, %%ymm8,  %%ymm10, %%ymm10  \n\t"
2136     "vshufps   $0xe4, %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2137     "                                            \n\t"
2138     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
2139     "                                            \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
2140     "                                            \n\t" //   ab10    ab11    ab12    ab13
2141     "                                            \n\t" //   ab20    ab21    ab22    ab23
2142     "                                            \n\t" //   ab30    ab31    ab32    ab33
2143     "                                            \n\t" //   ab42    ab43    ab40    ab41
2144     "                                            \n\t" //   ab52    ab53    ab50    ab51
2145     "                                            \n\t" //   ab62    ab63    ab60    ab61
2146     "                                            \n\t" //   ab72 )  ab73 )  ab70 )  ab71 )
2147     "                                            \n\t"
2148     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
2149     "                                            \n\t" // ( ab80  ( ab81  ( ab82  ( ab83
2150     "                                            \n\t" //   ab90    ab91    ab92    ab93
2151     "                                            \n\t" //   aba0    aba1    aba2    aba3
2152     "                                            \n\t" //   abb0    abb1    abb2    abb3
2153     "                                            \n\t" //   abc2    abc3    abc0    abc1
2154     "                                            \n\t" //   abd2    abd3    abd0    abd1
2155     "                                            \n\t" //   abe2    abe3    abe0    abe1
2156     "                                            \n\t" //   abf2 )  abf3 )  abf0 )  abf1 )
2157     "                                            \n\t"
2158     "vmovaps           %%ymm15, %%ymm7           \n\t"
2159     "vperm2f128 $0x12, %%ymm15, %%ymm11, %%ymm15 \n\t"
2160     "vperm2f128 $0x30, %%ymm7,  %%ymm11, %%ymm11 \n\t"
2161     "                                            \n\t"
2162     "vmovaps           %%ymm13, %%ymm7           \n\t"
2163     "vperm2f128 $0x12, %%ymm13, %%ymm9,  %%ymm13 \n\t"
2164     "vperm2f128 $0x30, %%ymm7,  %%ymm9,  %%ymm9  \n\t"
2165     "                                            \n\t"
2166     "vmovaps           %%ymm14, %%ymm7           \n\t"
2167     "vperm2f128 $0x12, %%ymm14, %%ymm10, %%ymm14 \n\t"
2168     "vperm2f128 $0x30, %%ymm7,  %%ymm10, %%ymm10 \n\t"
2169     "                                            \n\t"
2170     "vmovaps           %%ymm12, %%ymm7           \n\t"
2171     "vperm2f128 $0x12, %%ymm12, %%ymm8,  %%ymm12 \n\t"
2172     "vperm2f128 $0x30, %%ymm7,  %%ymm8,  %%ymm8  \n\t"
2173     "                                            \n\t"
2174     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
2175     "                                            \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
2176     "                                            \n\t" //   ab10    ab11    ab12    ab13
2177     "                                            \n\t" //   ab20    ab21    ab22    ab23
2178     "                                            \n\t" //   ab30    ab31    ab32    ab33
2179     "                                            \n\t" //   ab40    ab41    ab42    ab43
2180     "                                            \n\t" //   ab50    ab51    ab52    ab53
2181     "                                            \n\t" //   ab60    ab61    ab62    ab63
2182     "                                            \n\t" //   ab70 )  ab71 )  ab72 )  ab73 )
2183     "                                            \n\t"
2184     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
2185     "                                            \n\t" // ( ab80  ( ab81  ( ab82  ( ab83
2186     "                                            \n\t" //   ab90    ab91    ab92    ab93
2187     "                                            \n\t" //   aba0    aba1    aba2    aba3
2188     "                                            \n\t" //   abb0    abb1    abb2    abb3
2189     "                                            \n\t" //   abc0    abc1    abc2    abc3
2190     "                                            \n\t" //   abd0    abd1    abd2    abd3
2191     "                                            \n\t" //   abe0    abe1    abe2    abe3
2192     "                                            \n\t" //   abf0 )  abf1 )  abf2 )  abf3 )
2193     "                                            \n\t"
2194     "                                            \n\t"
2195     "                                            \n\t"
2196     "                                            \n\t"
2197     "                                            \n\t" // scale by alpha
2198     "                                            \n\t"
2199     "movq         %4, %%rax                      \n\t" // load address of alpha
2200     "vbroadcastss    (%%rax), %%ymm7             \n\t" // load alpha_r and duplicate
2201     "vbroadcastss   4(%%rax), %%ymm6             \n\t" // load alpha_i and duplicate
2202     "                                            \n\t"
2203     "vpermilps $0xb1, %%ymm15, %%ymm3            \n\t"
2204     "vmulps           %%ymm7,  %%ymm15, %%ymm15  \n\t"
2205     "vmulps           %%ymm6,  %%ymm3,  %%ymm3   \n\t"
2206     "vaddsubps        %%ymm3,  %%ymm15, %%ymm15  \n\t"
2207     "                                            \n\t"
2208     "vpermilps $0xb1, %%ymm14, %%ymm2            \n\t"
2209     "vmulps           %%ymm7,  %%ymm14, %%ymm14  \n\t"
2210     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2211     "vaddsubps        %%ymm2,  %%ymm14, %%ymm14  \n\t"
2212     "                                            \n\t"
2213     "vpermilps $0xb1, %%ymm13, %%ymm1            \n\t"
2214     "vmulps           %%ymm7,  %%ymm13, %%ymm13  \n\t"
2215     "vmulps           %%ymm6,  %%ymm1,  %%ymm1   \n\t"
2216     "vaddsubps        %%ymm1,  %%ymm13, %%ymm13  \n\t"
2217     "                                            \n\t"
2218     "vpermilps $0xb1, %%ymm12, %%ymm0            \n\t"
2219     "vmulps           %%ymm7,  %%ymm12, %%ymm12  \n\t"
2220     "vmulps           %%ymm6,  %%ymm0,  %%ymm0   \n\t"
2221     "vaddsubps        %%ymm0,  %%ymm12, %%ymm12  \n\t"
2222     "                                            \n\t"
2223     "vpermilps $0xb1, %%ymm11, %%ymm3            \n\t"
2224     "vmulps           %%ymm7,  %%ymm11, %%ymm11  \n\t"
2225     "vmulps           %%ymm6,  %%ymm3,  %%ymm3   \n\t"
2226     "vaddsubps        %%ymm3,  %%ymm11, %%ymm11  \n\t"
2227     "                                            \n\t"
2228     "vpermilps $0xb1, %%ymm10, %%ymm2            \n\t"
2229     "vmulps           %%ymm7,  %%ymm10, %%ymm10  \n\t"
2230     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2231     "vaddsubps        %%ymm2,  %%ymm10, %%ymm10  \n\t"
2232     "                                            \n\t"
2233     "vpermilps $0xb1, %%ymm9,  %%ymm1            \n\t"
2234     "vmulps           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2235     "vmulps           %%ymm6,  %%ymm1,  %%ymm1   \n\t"
2236     "vaddsubps        %%ymm1,  %%ymm9,  %%ymm9   \n\t"
2237     "                                            \n\t"
2238     "vpermilps $0xb1, %%ymm8,  %%ymm0            \n\t"
2239     "vmulps           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2240     "vmulps           %%ymm6,  %%ymm0,  %%ymm0   \n\t"
2241     "vaddsubps        %%ymm0,  %%ymm8,  %%ymm8   \n\t"
2242     "                                            \n\t"
2243     "                                            \n\t"
2244     "                                            \n\t"
2245     "                                            \n\t"
2246     "movq         %5, %%rbx                      \n\t" // load address of beta
2247     "vbroadcastss    (%%rbx), %%ymm7             \n\t" // load beta_r and duplicate
2248     "vbroadcastss   4(%%rbx), %%ymm6             \n\t" // load beta_i and duplicate
2249     "                                            \n\t"
2250     "                                            \n\t"
2251     "                                            \n\t"
2252     "                                            \n\t"
2253     "                                            \n\t"
2254     "                                            \n\t"
2255     "                                            \n\t"
2256     "movq                %7, %%rsi               \n\t" // load rs_c
2257     "leaq        (,%%rsi,8), %%rsi               \n\t" // rsi = rs_c * sizeof(scomplex)
2258     "                                            \n\t"
2259     "leaq   (%%rcx,%%rsi,4), %%rdx               \n\t" // load address of c + 4*rs_c;
2260     "                                            \n\t"
2261     "leaq        (,%%rsi,2), %%r12               \n\t" // r12 = 2*rs_c;
2262     "leaq   (%%r12,%%rsi,1), %%r13               \n\t" // r13 = 3*rs_c;
2263     "                                            \n\t"
2264     "                                            \n\t"
2265     "                                            \n\t"
2266     "                                            \n\t" // determine if
2267     "                                            \n\t" //    c    % 32 == 0, AND
2268     "                                            \n\t" //  8*cs_c % 32 == 0, AND
2269     "                                            \n\t" //    rs_c      == 1
2270     "                                            \n\t" // ie: aligned, ldim aligned, and
2271     "                                            \n\t" // column-stored
2272     "                                            \n\t"
2273     "cmpq       $8, %%rsi                        \n\t" // set ZF if (8*rs_c) == 8.
2274     "sete           %%bl                         \n\t" // bl = ( ZF == 1 ? 1 : 0 );
2275     "testq     $31, %%rcx                        \n\t" // set ZF if c & 32 is zero.
2276     "setz           %%bh                         \n\t" // bh = ( ZF == 0 ? 1 : 0 );
2277     "testq     $31, %%rdi                        \n\t" // set ZF if (8*cs_c) & 32 is zero.
2278     "setz           %%al                         \n\t" // al = ( ZF == 0 ? 1 : 0 );
2279     "                                            \n\t" // and(bl,bh) followed by
2280     "                                            \n\t" // and(bh,al) will reveal result
2281     "                                            \n\t"
2282     "                                            \n\t" // now avoid loading C if beta == 0
2283     "                                            \n\t"
2284     "vxorps    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
2285     "vucomiss  %%xmm0,  %%xmm7                   \n\t" // set ZF if beta_r == 0.
2286     "sete       %%r8b                            \n\t" // r8b = ( ZF == 1 ? 1 : 0 );
2287     "vucomiss  %%xmm0,  %%xmm6                   \n\t" // set ZF if beta_i == 0.
2288     "sete       %%r9b                            \n\t" // r9b = ( ZF == 1 ? 1 : 0 );
2289     "andb       %%r8b, %%r9b                     \n\t" // set ZF if r8b & r9b == 1.
2290     "jne      .CBETAZERO                         \n\t" // if ZF = 0, jump to beta == 0 case
2291     "                                            \n\t"
2292     "                                            \n\t"
2293     "                                            \n\t" // check if aligned/column-stored
2294     "andb     %%bl, %%bh                         \n\t" // set ZF if bl & bh == 1.
2295     "andb     %%bh, %%al                         \n\t" // set ZF if bh & al == 1.
2296     "jne     .CCOLSTORED                         \n\t" // jump to column storage case
2297     "                                            \n\t"
2298     "                                            \n\t"
2299     "                                            \n\t"
2300     ".CGENSTORED:                                \n\t"
2301     "                                            \n\t"
2302     "                                            \n\t" // update c00:c70
2303     "                                            \n\t"
2304     "vmovlpd    (%%rcx),       %%xmm0,  %%xmm0   \n\t" // load (c00,10) into xmm0[0:1]
2305     "vmovhpd    (%%rcx,%%rsi), %%xmm0,  %%xmm0   \n\t" // load (c20,30) into xmm0[2:3]
2306     "vmovlpd    (%%rcx,%%r12), %%xmm2,  %%xmm2   \n\t" // load (c40,50) into xmm2[0:1]
2307     "vmovhpd    (%%rcx,%%r13), %%xmm2,  %%xmm2   \n\t" // load (c60,70) into xmm2[2:3]
2308     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:3],xmm2)
2309     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2310     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2311     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2312     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2313     "vaddps           %%ymm15, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2314     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[4:7]
2315     "vmovlpd          %%xmm0,  (%%rcx)           \n\t" // store (c00,c10)
2316     "vmovhpd          %%xmm0,  (%%rcx,%%rsi)     \n\t" // store (c20,c30)
2317     "vmovlpd          %%xmm2,  (%%rcx,%%r12)     \n\t" // store (c40,c50)
2318     "vmovhpd          %%xmm2,  (%%rcx,%%r13)     \n\t" // store (c60,c70)
2319     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2320     "                                            \n\t"
2321     "                                            \n\t" // update c80:cf0
2322     "                                            \n\t"
2323     "vmovlpd    (%%rdx),       %%xmm0,  %%xmm0   \n\t" // load (c80,90) into xmm0[0:1]
2324     "vmovhpd    (%%rdx,%%rsi), %%xmm0,  %%xmm0   \n\t" // load (ca0,b0) into xmm0[2:3]
2325     "vmovlpd    (%%rdx,%%r12), %%xmm2,  %%xmm2   \n\t" // load (cc0,d0) into xmm2[0:1]
2326     "vmovhpd    (%%rdx,%%r13), %%xmm2,  %%xmm2   \n\t" // load (ce0,f0) into xmm2[2:3]
2327     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:3],xmm2)
2328     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2329     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2330     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2331     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2332     "vaddps           %%ymm14, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2333     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[4:7]
2334     "vmovlpd          %%xmm0,  (%%rdx)           \n\t" // store (c80,c90)
2335     "vmovhpd          %%xmm0,  (%%rdx,%%rsi)     \n\t" // store (ca0,cb0)
2336     "vmovlpd          %%xmm2,  (%%rdx,%%r12)     \n\t" // store (cc0,cd0)
2337     "vmovhpd          %%xmm2,  (%%rdx,%%r13)     \n\t" // store (ce0,cf0)
2338     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2339     "                                            \n\t"
2340     "                                            \n\t" // update c01:c71
2341     "                                            \n\t"
2342     "vmovlpd    (%%rcx),       %%xmm0,  %%xmm0   \n\t" // load (c01,11) into xmm0[0:1]
2343     "vmovhpd    (%%rcx,%%rsi), %%xmm0,  %%xmm0   \n\t" // load (c21,31) into xmm0[2:3]
2344     "vmovlpd    (%%rcx,%%r12), %%xmm2,  %%xmm2   \n\t" // load (c41,51) into xmm2[0:1]
2345     "vmovhpd    (%%rcx,%%r13), %%xmm2,  %%xmm2   \n\t" // load (c61,71) into xmm2[2:3]
2346     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:3],xmm2)
2347     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2348     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2349     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2350     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2351     "vaddps           %%ymm13, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2352     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[4:7]
2353     "vmovlpd          %%xmm0,  (%%rcx)           \n\t" // store (c01,c11)
2354     "vmovhpd          %%xmm0,  (%%rcx,%%rsi)     \n\t" // store (c21,c31)
2355     "vmovlpd          %%xmm2,  (%%rcx,%%r12)     \n\t" // store (c41,c51)
2356     "vmovhpd          %%xmm2,  (%%rcx,%%r13)     \n\t" // store (c61,c71)
2357     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2358     "                                            \n\t"
2359     "                                            \n\t" // update c81:cf1
2360     "                                            \n\t"
2361     "vmovlpd    (%%rdx),       %%xmm0,  %%xmm0   \n\t" // load (c81,91) into xmm0[0:1]
2362     "vmovhpd    (%%rdx,%%rsi), %%xmm0,  %%xmm0   \n\t" // load (ca1,b1) into xmm0[2:3]
2363     "vmovlpd    (%%rdx,%%r12), %%xmm2,  %%xmm2   \n\t" // load (cc1,d1) into xmm2[0:1]
2364     "vmovhpd    (%%rdx,%%r13), %%xmm2,  %%xmm2   \n\t" // load (ce1,f1) into xmm2[2:3]
2365     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:3],xmm2)
2366     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2367     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2368     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2369     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2370     "vaddps           %%ymm12, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2371     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[4:7]
2372     "vmovlpd          %%xmm0,  (%%rdx)           \n\t" // store (c81,c91)
2373     "vmovhpd          %%xmm0,  (%%rdx,%%rsi)     \n\t" // store (ca1,cb1)
2374     "vmovlpd          %%xmm2,  (%%rdx,%%r12)     \n\t" // store (cc1,cd1)
2375     "vmovhpd          %%xmm2,  (%%rdx,%%r13)     \n\t" // store (ce1,cf1)
2376     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2377     "                                            \n\t"
2378     "                                            \n\t" // update c02:c72
2379     "                                            \n\t"
2380     "vmovlpd    (%%rcx),       %%xmm0,  %%xmm0   \n\t" // load (c02,12) into xmm0[0:1]
2381     "vmovhpd    (%%rcx,%%rsi), %%xmm0,  %%xmm0   \n\t" // load (c22,32) into xmm0[2:3]
2382     "vmovlpd    (%%rcx,%%r12), %%xmm2,  %%xmm2   \n\t" // load (c42,52) into xmm2[0:1]
2383     "vmovhpd    (%%rcx,%%r13), %%xmm2,  %%xmm2   \n\t" // load (c62,72) into xmm2[2:3]
2384     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:3],xmm2)
2385     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2386     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2387     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2388     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2389     "vaddps           %%ymm11, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2390     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[4:7]
2391     "vmovlpd          %%xmm0,  (%%rcx)           \n\t" // store (c02,c12)
2392     "vmovhpd          %%xmm0,  (%%rcx,%%rsi)     \n\t" // store (c22,c32)
2393     "vmovlpd          %%xmm2,  (%%rcx,%%r12)     \n\t" // store (c42,c52)
2394     "vmovhpd          %%xmm2,  (%%rcx,%%r13)     \n\t" // store (c62,c72)
2395     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2396     "                                            \n\t"
2397     "                                            \n\t" // update c82:cf2
2398     "                                            \n\t"
2399     "vmovlpd    (%%rdx),       %%xmm0,  %%xmm0   \n\t" // load (c82,92) into xmm0[0:1]
2400     "vmovhpd    (%%rdx,%%rsi), %%xmm0,  %%xmm0   \n\t" // load (ca2,b2) into xmm0[2:3]
2401     "vmovlpd    (%%rdx,%%r12), %%xmm2,  %%xmm2   \n\t" // load (cc2,d2) into xmm2[0:1]
2402     "vmovhpd    (%%rdx,%%r13), %%xmm2,  %%xmm2   \n\t" // load (ce2,f2) into xmm2[2:3]
2403     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:3],xmm2)
2404     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2405     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2406     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2407     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2408     "vaddps           %%ymm10, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2409     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[4:7]
2410     "vmovlpd          %%xmm0,  (%%rdx)           \n\t" // store (c82,c92)
2411     "vmovhpd          %%xmm0,  (%%rdx,%%rsi)     \n\t" // store (ca2,cb2)
2412     "vmovlpd          %%xmm2,  (%%rdx,%%r12)     \n\t" // store (cc2,cd2)
2413     "vmovhpd          %%xmm2,  (%%rdx,%%r13)     \n\t" // store (ce2,cf2)
2414     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2415     "                                            \n\t"
2416     "                                            \n\t" // update c03:c73
2417     "                                            \n\t"
2418     "vmovlpd    (%%rcx),       %%xmm0,  %%xmm0   \n\t" // load (c03,13) into xmm0[0:1]
2419     "vmovhpd    (%%rcx,%%rsi), %%xmm0,  %%xmm0   \n\t" // load (c23,33) into xmm0[2:3]
2420     "vmovlpd    (%%rcx,%%r12), %%xmm2,  %%xmm2   \n\t" // load (c43,53) into xmm2[0:1]
2421     "vmovhpd    (%%rcx,%%r13), %%xmm2,  %%xmm2   \n\t" // load (c63,73) into xmm2[2:3]
2422     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:3],xmm2)
2423     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2424     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2425     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2426     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2427     "vaddps           %%ymm9,  %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2428     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[4:7]
2429     "vmovlpd          %%xmm0,  (%%rcx)           \n\t" // store (c03,c13)
2430     "vmovhpd          %%xmm0,  (%%rcx,%%rsi)     \n\t" // store (c23,c33)
2431     "vmovlpd          %%xmm2,  (%%rcx,%%r12)     \n\t" // store (c43,c53)
2432     "vmovhpd          %%xmm2,  (%%rcx,%%r13)     \n\t" // store (c63,c73)
2433     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2434     "                                            \n\t"
2435     "                                            \n\t" // update c83:cf3
2436     "                                            \n\t"
2437     "vmovlpd    (%%rdx),       %%xmm0,  %%xmm0   \n\t" // load (c83,93) into xmm0[0:1]
2438     "vmovhpd    (%%rdx,%%rsi), %%xmm0,  %%xmm0   \n\t" // load (ca3,b3) into xmm0[2:3]
2439     "vmovlpd    (%%rdx,%%r12), %%xmm2,  %%xmm2   \n\t" // load (cc3,d3) into xmm2[0:1]
2440     "vmovhpd    (%%rdx,%%r13), %%xmm2,  %%xmm2   \n\t" // load (ce3,f3) into xmm2[2:3]
2441     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:3],xmm2)
2442     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2443     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2444     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2445     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2446     "vaddps           %%ymm8,  %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2447     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[4:7]
2448     "vmovlpd          %%xmm0,  (%%rdx)           \n\t" // store (c83,c93)
2449     "vmovhpd          %%xmm0,  (%%rdx,%%rsi)     \n\t" // store (ca3,cb3)
2450     "vmovlpd          %%xmm2,  (%%rdx,%%r12)     \n\t" // store (cc3,cd3)
2451     "vmovhpd          %%xmm2,  (%%rdx,%%r13)     \n\t" // store (ce3,cf3)
2452     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2453     "                                            \n\t"
2454     "                                            \n\t"
2455     "                                            \n\t"
2456     "jmp    .CDONE                               \n\t" // jump to end.
2457     "                                            \n\t"
2458     "                                            \n\t"
2459     "                                            \n\t"
2460     ".CCOLSTORED:                                \n\t"
2461     "                                            \n\t"
2462     "                                            \n\t" // update c00:c70
2463     "                                            \n\t"
2464     "vmovaps    (%%rcx),       %%ymm0            \n\t" // load c00:c70 into ymm0
2465     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2466     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2467     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2468     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2469     "vaddps           %%ymm15, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2470     "vmovaps          %%ymm0,  (%%rcx)           \n\t" // store c00:c70
2471     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2472     "                                            \n\t"
2473     "                                            \n\t" // update c80:cf0
2474     "                                            \n\t"
2475     "vmovaps    (%%rdx),       %%ymm0            \n\t" // load c80:f0 into ymm0
2476     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2477     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2478     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2479     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2480     "vaddps           %%ymm14, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2481     "vmovaps          %%ymm0,  (%%rdx)           \n\t" // store c80:cf0
2482     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2483     "                                            \n\t"
2484     "                                            \n\t" // update c01:c71
2485     "                                            \n\t"
2486     "vmovaps    (%%rcx),       %%ymm0            \n\t" // load c01:c71 into ymm0
2487     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2488     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2489     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2490     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2491     "vaddps           %%ymm13, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2492     "vmovaps          %%ymm0,  (%%rcx)           \n\t" // store c01:c71
2493     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2494     "                                            \n\t"
2495     "                                            \n\t" // update c81:cf1
2496     "                                            \n\t"
2497     "vmovaps    (%%rdx),       %%ymm0            \n\t" // load c81:f1 into ymm0
2498     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2499     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2500     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2501     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2502     "vaddps           %%ymm12, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2503     "vmovaps          %%ymm0,  (%%rdx)           \n\t" // store c81:cf1
2504     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2505     "                                            \n\t"
2506     "                                            \n\t" // update c02:c72
2507     "                                            \n\t"
2508     "vmovaps    (%%rcx),       %%ymm0            \n\t" // load c02:c72 into ymm0
2509     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2510     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2511     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2512     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2513     "vaddps           %%ymm11, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2514     "vmovaps          %%ymm0,  (%%rcx)           \n\t" // store c02:c72
2515     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2516     "                                            \n\t"
2517     "                                            \n\t" // update c82:cf2
2518     "                                            \n\t"
2519     "vmovaps    (%%rdx),       %%ymm0            \n\t" // load c82:f2 into ymm0
2520     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2521     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2522     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2523     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2524     "vaddps           %%ymm10, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2525     "vmovaps          %%ymm0,  (%%rdx)           \n\t" // store c82:cf2
2526     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2527     "                                            \n\t"
2528     "                                            \n\t" // update c03:c73
2529     "                                            \n\t"
2530     "vmovaps    (%%rcx),       %%ymm0            \n\t" // load c03:c73 into ymm0
2531     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2532     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2533     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2534     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2535     "vaddps           %%ymm9,  %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2536     "vmovaps          %%ymm0,  (%%rcx)           \n\t" // store c03:c73
2537     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2538     "                                            \n\t"
2539     "                                            \n\t" // update c83:cf3
2540     "                                            \n\t"
2541     "vmovaps    (%%rdx),       %%ymm0            \n\t" // load c83:f3 into ymm0
2542     "vpermilps $0xb1, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
2543     "vmulps           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
2544     "vmulps           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
2545     "vaddsubps        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
2546     "vaddps           %%ymm8,  %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
2547     "vmovaps          %%ymm0,  (%%rdx)           \n\t" // store c83:cf3
2548     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2549     "                                            \n\t"
2550     "                                            \n\t"
2551     "                                            \n\t"
2552     "jmp    .CDONE                               \n\t" // jump to end.
2553     "                                            \n\t"
2554     "                                            \n\t"
2555     "                                            \n\t"
2556     ".CBETAZERO:                                 \n\t"
2557     "                                            \n\t" // check if aligned/column-stored
2558     "                                            \n\t"
2559     "andb     %%bl, %%bh                         \n\t" // bh &= bl; ZF is set iff the result is 0.
2560     "andb     %%bh, %%al                         \n\t" // al &= bh; ZF is set iff the result is 0.
2561     "jne     .CCOLSTORBZ                         \n\t" // jump to column storage case
2562     "                                            \n\t"
2563     "                                            \n\t"
2564     "                                            \n\t"
2565     ".CGENSTORBZ:                                \n\t"
2566     "                                            \n\t"
2567     "                                            \n\t" // update c00:c70
2568     "                                            \n\t"
2569     "vextractf128 $1, %%ymm15, %%xmm2            \n\t" // xmm2 := ymm15[4:7]
2570     "vmovlpd          %%xmm15, (%%rcx)           \n\t" // store (c00,c10)
2571     "vmovhpd          %%xmm15, (%%rcx,%%rsi)     \n\t" // store (c20,c30)
2572     "vmovlpd          %%xmm2,  (%%rcx,%%r12)     \n\t" // store (c40,c50)
2573     "vmovhpd          %%xmm2,  (%%rcx,%%r13)     \n\t" // store (c60,c70)
2574     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2575     "                                            \n\t"
2576     "                                            \n\t" // update c80:cf0
2577     "                                            \n\t"
2578     "vextractf128 $1, %%ymm14, %%xmm2            \n\t" // xmm2 := ymm14[4:7]
2579     "vmovlpd          %%xmm14, (%%rdx)           \n\t" // store (c80,c90)
2580     "vmovhpd          %%xmm14, (%%rdx,%%rsi)     \n\t" // store (ca0,cb0)
2581     "vmovlpd          %%xmm2,  (%%rdx,%%r12)     \n\t" // store (cc0,cd0)
2582     "vmovhpd          %%xmm2,  (%%rdx,%%r13)     \n\t" // store (ce0,cf0)
2583     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2584     "                                            \n\t"
2585     "                                            \n\t" // update c01:c71
2586     "                                            \n\t"
2587     "vextractf128 $1, %%ymm13, %%xmm2            \n\t" // xmm2 := ymm13[4:7]
2588     "vmovlpd          %%xmm13, (%%rcx)           \n\t" // store (c01,c11)
2589     "vmovhpd          %%xmm13, (%%rcx,%%rsi)     \n\t" // store (c21,c31)
2590     "vmovlpd          %%xmm2,  (%%rcx,%%r12)     \n\t" // store (c41,c51)
2591     "vmovhpd          %%xmm2,  (%%rcx,%%r13)     \n\t" // store (c61,c71)
2592     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2593     "                                            \n\t"
2594     "                                            \n\t" // update c81:cf1
2595     "                                            \n\t"
2596     "vextractf128 $1, %%ymm12, %%xmm2            \n\t" // xmm2 := ymm12[4:7]
2597     "vmovlpd          %%xmm12, (%%rdx)           \n\t" // store (c81,c91)
2598     "vmovhpd          %%xmm12, (%%rdx,%%rsi)     \n\t" // store (ca1,cb1)
2599     "vmovlpd          %%xmm2,  (%%rdx,%%r12)     \n\t" // store (cc1,cd1)
2600     "vmovhpd          %%xmm2,  (%%rdx,%%r13)     \n\t" // store (ce1,cf1)
2601     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2602     "                                            \n\t"
2603     "                                            \n\t" // update c02:c72
2604     "                                            \n\t"
2605     "vextractf128 $1, %%ymm11, %%xmm2            \n\t" // xmm2 := ymm11[4:7]
2606     "vmovlpd          %%xmm11, (%%rcx)           \n\t" // store (c02,c12)
2607     "vmovhpd          %%xmm11, (%%rcx,%%rsi)     \n\t" // store (c22,c32)
2608     "vmovlpd          %%xmm2,  (%%rcx,%%r12)     \n\t" // store (c42,c52)
2609     "vmovhpd          %%xmm2,  (%%rcx,%%r13)     \n\t" // store (c62,c72)
2610     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2611     "                                            \n\t"
2612     "                                            \n\t" // update c82:cf2
2613     "                                            \n\t"
2614     "vextractf128 $1, %%ymm10, %%xmm2            \n\t" // xmm2 := ymm10[4:7]
2615     "vmovlpd          %%xmm10, (%%rdx)           \n\t" // store (c82,c92)
2616     "vmovhpd          %%xmm10, (%%rdx,%%rsi)     \n\t" // store (ca2,cb2)
2617     "vmovlpd          %%xmm2,  (%%rdx,%%r12)     \n\t" // store (cc2,cd2)
2618     "vmovhpd          %%xmm2,  (%%rdx,%%r13)     \n\t" // store (ce2,cf2)
2619     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2620     "                                            \n\t"
2621     "                                            \n\t" // update c03:c73
2622     "                                            \n\t"
2623     "vextractf128 $1, %%ymm9,  %%xmm2            \n\t" // xmm2 := ymm9[4:7]
2624     "vmovlpd          %%xmm9,  (%%rcx)           \n\t" // store (c03,c13)
2625     "vmovhpd          %%xmm9,  (%%rcx,%%rsi)     \n\t" // store (c23,c33)
2626     "vmovlpd          %%xmm2,  (%%rcx,%%r12)     \n\t" // store (c43,c53)
2627     "vmovhpd          %%xmm2,  (%%rcx,%%r13)     \n\t" // store (c63,c73)
2628     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2629     "                                            \n\t"
2630     "                                            \n\t" // update c83:cf3
2631     "                                            \n\t"
2632     "vextractf128 $1, %%ymm8,  %%xmm2            \n\t" // xmm2 := ymm8[4:7]
2633     "vmovlpd          %%xmm8,  (%%rdx)           \n\t" // store (c83,c93)
2634     "vmovhpd          %%xmm8,  (%%rdx,%%rsi)     \n\t" // store (ca3,cb3)
2635     "vmovlpd          %%xmm2,  (%%rdx,%%r12)     \n\t" // store (cc3,cd3)
2636     "vmovhpd          %%xmm2,  (%%rdx,%%r13)     \n\t" // store (ce3,cf3)
2637     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2638     "                                            \n\t"
2639     "                                            \n\t"
2640     "                                            \n\t"
2641     "jmp    .CDONE                               \n\t" // jump to end.
2642     "                                            \n\t"
2643     "                                            \n\t"
2644     "                                            \n\t"
2645     ".CCOLSTORBZ:                                \n\t"
2646     "                                            \n\t"
2647     "                                            \n\t"
2648     "vmovaps          %%ymm15, (%%rcx)           \n\t" // store c00:c70
2649     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2650     "                                            \n\t"
2651     "vmovaps          %%ymm14, (%%rdx)           \n\t" // store c80:cf0
2652     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2653     "                                            \n\t"
2654     "vmovaps          %%ymm13, (%%rcx)           \n\t" // store c01:c71
2655     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2656     "                                            \n\t"
2657     "vmovaps          %%ymm12, (%%rdx)           \n\t" // store c81:cf1
2658     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2659     "                                            \n\t"
2660     "vmovaps          %%ymm11, (%%rcx)           \n\t" // store c02:c72
2661     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2662     "                                            \n\t"
2663     "vmovaps          %%ymm10, (%%rdx)           \n\t" // store c82:cf2
2664     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2665     "                                            \n\t"
2666     "vmovaps          %%ymm9,  (%%rcx)           \n\t" // store c03:c73
2667     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
2668     "                                            \n\t"
2669     "vmovaps          %%ymm8,  (%%rdx)           \n\t" // store c83:cf3
2670     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
2671     "                                            \n\t"
2672     "                                            \n\t"
2673     "                                            \n\t"
2674     "                                            \n\t"
2675     "                                            \n\t"
2676     ".CDONE:                                     \n\t"
2677     "                                            \n\t"
2678 
2679     : // output operands (none)
2680     : // input operands
2681       "m" (k_iter), // 0
2682       "m" (k_left), // 1
2683       "m" (a),      // 2
2684       "m" (b),      // 3
2685       "m" (alpha),  // 4
2686       "m" (beta),   // 5
2687       "m" (c),      // 6
2688       "m" (rs_c),   // 7
2689       "m" (cs_c),   // 8
2690       "m" (b_next)/*, // 9
2691       "m" (a_next)*/  // 10
2692     : // register clobber list
2693       "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
2694       "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
2695       "xmm0", "xmm1", "xmm2", "xmm3",
2696       "xmm4", "xmm5", "xmm6", "xmm7",
2697       "xmm8", "xmm9", "xmm10", "xmm11",
2698       "xmm12", "xmm13", "xmm14", "xmm15",
2699       "memory"
2700     );
2701 }
2702 
2703 
2704 
bli_zgemm_asm_4x4(dim_t k,dcomplex * restrict alpha,dcomplex * restrict a,dcomplex * restrict b,dcomplex * restrict beta,dcomplex * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)2705 void bli_zgemm_asm_4x4
2706      (
2707        dim_t               k,
2708        dcomplex*  restrict alpha,
2709        dcomplex*  restrict a,
2710        dcomplex*  restrict b,
2711        dcomplex*  restrict beta,
2712        dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
2713        auxinfo_t* restrict data,
2714        cntx_t*    restrict cntx
2715      )
2716 {
2717     //void*   a_next = bli_auxinfo_next_a( data );
2718     //void*   b_next = bli_auxinfo_next_b( data );
2719 
2720     uint64_t   k_iter = k / 4;
2721     uint64_t   k_left = k % 4;
2722 
2723     __asm__ volatile
2724     (
2725     "                                            \n\t"
2726     "                                            \n\t"
2727     "movq                %2, %%rax               \n\t" // load address of a.
2728     "movq                %3, %%rbx               \n\t" // load address of b.
2729     //"movq                %9, %%r15               \n\t" // load address of b_next.
2730     //"movq               %10, %%r14               \n\t" // load address of a_next.
2731     "                                            \n\t"
2732     "vmovapd        0 * 32(%%rax), %%ymm0        \n\t" // initialize loop by pre-loading
2733     "vmovddup   0 + 0 * 32(%%rbx), %%ymm2        \n\t"
2734     "vmovddup   0 + 1 * 32(%%rbx), %%ymm3        \n\t"
2735     "                                            \n\t"
2736     "movq                %6, %%rcx               \n\t" // load address of c
2737     "movq                %8, %%rdi               \n\t" // load cs_c
2738     "leaq        (,%%rdi,8), %%rdi               \n\t" // cs_c *= sizeof(dcomplex)
2739     "leaq        (,%%rdi,2), %%rdi               \n\t"
2740     "leaq   (%%rcx,%%rdi,2), %%r10               \n\t" // load address of c + 2*cs_c;
2741     "                                            \n\t"
2742     "prefetcht0   3 * 8(%%rcx)                   \n\t" // prefetch c + 0*cs_c
2743     "prefetcht0   3 * 8(%%rcx,%%rdi)             \n\t" // prefetch c + 1*cs_c
2744     "prefetcht0   3 * 8(%%r10)                   \n\t" // prefetch c + 2*cs_c
2745     "prefetcht0   3 * 8(%%r10,%%rdi)             \n\t" // prefetch c + 3*cs_c
2746     "                                            \n\t"
2747     "vxorpd    %%ymm8,  %%ymm8,  %%ymm8          \n\t"
2748     "vxorpd    %%ymm9,  %%ymm9,  %%ymm9          \n\t"
2749     "vxorpd    %%ymm10, %%ymm10, %%ymm10         \n\t"
2750     "vxorpd    %%ymm11, %%ymm11, %%ymm11         \n\t"
2751     "vxorpd    %%ymm12, %%ymm12, %%ymm12         \n\t"
2752     "vxorpd    %%ymm13, %%ymm13, %%ymm13         \n\t"
2753     "vxorpd    %%ymm14, %%ymm14, %%ymm14         \n\t"
2754     "vxorpd    %%ymm15, %%ymm15, %%ymm15         \n\t"
2755     "                                            \n\t"
2756     "                                            \n\t"
2757     "                                            \n\t"
2758     "movq      %0, %%rsi                         \n\t" // i = k_iter;
2759     "testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
2760     "je     .ZCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
2761     "                                            \n\t" // contains the k_left loop.
2762     "                                            \n\t"
2763     "                                            \n\t"
2764     ".ZLOOPKITER:                                \n\t" // MAIN LOOP
2765     "                                            \n\t"
2766     "                                            \n\t"
2767     "                                            \n\t" // iteration 0
2768     "vmovapd        1 * 32(%%rax),      %%ymm1   \n\t"
2769     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2770     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2771     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2772     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2773     "vaddpd           %%ymm6,  %%ymm15, %%ymm15  \n\t"
2774     "vaddpd           %%ymm7,  %%ymm11, %%ymm11  \n\t"
2775     "                                            \n\t"
2776     "prefetcht0    16 * 32(%%rax)                \n\t"
2777     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2778     "vmovddup   8 + 0 * 32(%%rbx),      %%ymm2   \n\t"
2779     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2780     "vmovddup   8 + 1 * 32(%%rbx),      %%ymm3   \n\t"
2781     "vaddpd           %%ymm6,  %%ymm14, %%ymm14  \n\t"
2782     "vaddpd           %%ymm7,  %%ymm10, %%ymm10  \n\t"
2783     "                                            \n\t"
2784     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2785     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2786     "vpermilpd  $0x5, %%ymm0,  %%ymm0            \n\t"
2787     "vaddpd           %%ymm6,  %%ymm13, %%ymm13  \n\t"
2788     "vaddpd           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2789     "                                            \n\t"
2790     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2791     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2792     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2793     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2794     "vaddpd           %%ymm6,  %%ymm12, %%ymm12  \n\t"
2795     "vaddpd           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2796     "                                            \n\t"
2797     "vpermilpd  $0x5, %%ymm1,  %%ymm1            \n\t"
2798     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2799     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2800     "vaddsubpd        %%ymm6,  %%ymm15, %%ymm15  \n\t"
2801     "vaddsubpd        %%ymm7,  %%ymm11, %%ymm11  \n\t"
2802     "                                            \n\t"
2803     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2804     "vmovddup   0 + 2 * 32(%%rbx),      %%ymm2   \n\t"
2805     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2806     "vmovddup   0 + 3 * 32(%%rbx),      %%ymm3   \n\t"
2807     "vaddsubpd        %%ymm6,  %%ymm14, %%ymm14  \n\t"
2808     "vaddsubpd        %%ymm7,  %%ymm10, %%ymm10  \n\t"
2809     "                                            \n\t"
2810     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2811     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2812     "vmovapd        2 * 32(%%rax),      %%ymm0   \n\t"
2813     "vaddsubpd        %%ymm6,  %%ymm13, %%ymm13  \n\t"
2814     "vaddsubpd        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2815     "                                            \n\t"
2816     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2817     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2818     "vaddsubpd        %%ymm6,  %%ymm12, %%ymm12  \n\t"
2819     "vaddsubpd        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2820     "                                            \n\t"
2821     "                                            \n\t"
2822     "                                            \n\t" // iteration 1
2823     "vmovapd        3 * 32(%%rax),      %%ymm1   \n\t"
2824     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2825     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2826     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2827     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2828     "vaddpd           %%ymm6,  %%ymm15, %%ymm15  \n\t"
2829     "vaddpd           %%ymm7,  %%ymm11, %%ymm11  \n\t"
2830     "                                            \n\t"
2831     "prefetcht0    18 * 32(%%rax)                \n\t"
2832     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2833     "vmovddup   8 + 2 * 32(%%rbx),      %%ymm2   \n\t"
2834     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2835     "vmovddup   8 + 3 * 32(%%rbx),      %%ymm3   \n\t"
2836     "vaddpd           %%ymm6,  %%ymm14, %%ymm14  \n\t"
2837     "vaddpd           %%ymm7,  %%ymm10, %%ymm10  \n\t"
2838     "                                            \n\t"
2839     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2840     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2841     "vpermilpd  $0x5, %%ymm0,  %%ymm0            \n\t"
2842     "vaddpd           %%ymm6,  %%ymm13, %%ymm13  \n\t"
2843     "vaddpd           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2844     "                                            \n\t"
2845     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2846     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2847     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2848     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2849     "vaddpd           %%ymm6,  %%ymm12, %%ymm12  \n\t"
2850     "vaddpd           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2851     "                                            \n\t"
2852     "vpermilpd  $0x5, %%ymm1,  %%ymm1            \n\t"
2853     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2854     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2855     "vaddsubpd        %%ymm6,  %%ymm15, %%ymm15  \n\t"
2856     "vaddsubpd        %%ymm7,  %%ymm11, %%ymm11  \n\t"
2857     "                                            \n\t"
2858     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2859     "vmovddup   0 + 4 * 32(%%rbx),      %%ymm2   \n\t"
2860     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2861     "vmovddup   0 + 5 * 32(%%rbx),      %%ymm3   \n\t"
2862     "vaddsubpd        %%ymm6,  %%ymm14, %%ymm14  \n\t"
2863     "vaddsubpd        %%ymm7,  %%ymm10, %%ymm10  \n\t"
2864     "                                            \n\t"
2865     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2866     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2867     "vmovapd        4 * 32(%%rax),      %%ymm0   \n\t"
2868     "vaddsubpd        %%ymm6,  %%ymm13, %%ymm13  \n\t"
2869     "vaddsubpd        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2870     "                                            \n\t"
2871     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2872     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2873     "vaddsubpd        %%ymm6,  %%ymm12, %%ymm12  \n\t"
2874     "vaddsubpd        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2875     "                                            \n\t"
2876     "                                            \n\t"
2877     "                                            \n\t" // iteration 2
2878     "vmovapd        5 * 32(%%rax),      %%ymm1   \n\t"
2879     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2880     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2881     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2882     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2883     "vaddpd           %%ymm6,  %%ymm15, %%ymm15  \n\t"
2884     "vaddpd           %%ymm7,  %%ymm11, %%ymm11  \n\t"
2885     "                                            \n\t"
2886     "prefetcht0    20 * 32(%%rax)                \n\t"
2887     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2888     "vmovddup   8 + 4 * 32(%%rbx),      %%ymm2   \n\t"
2889     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2890     "vmovddup   8 + 5 * 32(%%rbx),      %%ymm3   \n\t"
2891     "vaddpd           %%ymm6,  %%ymm14, %%ymm14  \n\t"
2892     "vaddpd           %%ymm7,  %%ymm10, %%ymm10  \n\t"
2893     "                                            \n\t"
2894     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2895     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2896     "vpermilpd  $0x5, %%ymm0,  %%ymm0            \n\t"
2897     "vaddpd           %%ymm6,  %%ymm13, %%ymm13  \n\t"
2898     "vaddpd           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2899     "                                            \n\t"
2900     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2901     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2902     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2903     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2904     "vaddpd           %%ymm6,  %%ymm12, %%ymm12  \n\t"
2905     "vaddpd           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2906     "                                            \n\t"
2907     "vpermilpd  $0x5, %%ymm1,  %%ymm1            \n\t"
2908     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2909     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2910     "vaddsubpd        %%ymm6,  %%ymm15, %%ymm15  \n\t"
2911     "vaddsubpd        %%ymm7,  %%ymm11, %%ymm11  \n\t"
2912     "                                            \n\t"
2913     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2914     "vmovddup   0 + 6 * 32(%%rbx),      %%ymm2   \n\t"
2915     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2916     "vmovddup   0 + 7 * 32(%%rbx),      %%ymm3   \n\t"
2917     "vaddsubpd        %%ymm6,  %%ymm14, %%ymm14  \n\t"
2918     "vaddsubpd        %%ymm7,  %%ymm10, %%ymm10  \n\t"
2919     "                                            \n\t"
2920     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2921     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2922     "vmovapd        6 * 32(%%rax),      %%ymm0   \n\t"
2923     "vaddsubpd        %%ymm6,  %%ymm13, %%ymm13  \n\t"
2924     "vaddsubpd        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2925     "                                            \n\t"
2926     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2927     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2928     "vaddsubpd        %%ymm6,  %%ymm12, %%ymm12  \n\t"
2929     "vaddsubpd        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2930     "                                            \n\t"
2931     "                                            \n\t"
2932     "                                            \n\t" // iteration 3
2933     "vmovapd        7 * 32(%%rax),      %%ymm1   \n\t"
2934     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2935     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2936     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2937     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2938     "vaddpd           %%ymm6,  %%ymm15, %%ymm15  \n\t"
2939     "vaddpd           %%ymm7,  %%ymm11, %%ymm11  \n\t"
2940     "                                            \n\t"
2941     "prefetcht0    22 * 32(%%rax)                \n\t"
2942     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2943     "vmovddup   8 + 6 * 32(%%rbx),      %%ymm2   \n\t"
2944     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2945     "vmovddup   8 + 7 * 32(%%rbx),      %%ymm3   \n\t"
2946     "vaddpd           %%ymm6,  %%ymm14, %%ymm14  \n\t"
2947     "vaddpd           %%ymm7,  %%ymm10, %%ymm10  \n\t"
2948     "                                            \n\t"
2949     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2950     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2951     "vpermilpd  $0x5, %%ymm0,  %%ymm0            \n\t"
2952     "vaddpd           %%ymm6,  %%ymm13, %%ymm13  \n\t"
2953     "vaddpd           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2954     "                                            \n\t"
2955     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2956     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
2957     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2958     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
2959     "vaddpd           %%ymm6,  %%ymm12, %%ymm12  \n\t"
2960     "vaddpd           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2961     "                                            \n\t"
2962     "vpermilpd  $0x5, %%ymm1,  %%ymm1            \n\t"
2963     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
2964     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
2965     "vaddsubpd        %%ymm6,  %%ymm15, %%ymm15  \n\t"
2966     "vaddsubpd        %%ymm7,  %%ymm11, %%ymm11  \n\t"
2967     "                                            \n\t"
2968     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
2969     "vmovddup   0 + 8 * 32(%%rbx),      %%ymm2   \n\t"
2970     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
2971     "vmovddup   0 + 9 * 32(%%rbx),      %%ymm3   \n\t"
2972     "vaddsubpd        %%ymm6,  %%ymm14, %%ymm14  \n\t"
2973     "vaddsubpd        %%ymm7,  %%ymm10, %%ymm10  \n\t"
2974     "                                            \n\t"
2975     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
2976     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
2977     "vmovapd        8 * 32(%%rax),      %%ymm0   \n\t"
2978     "vaddsubpd        %%ymm6,  %%ymm13, %%ymm13  \n\t"
2979     "vaddsubpd        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
2980     "                                            \n\t"
2981     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
2982     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
2983     "vaddsubpd        %%ymm6,  %%ymm12, %%ymm12  \n\t"
2984     "vaddsubpd        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
2985     "                                            \n\t"
2986     "                                            \n\t"
2987     "addq         $4 * 4 * 16, %%rbx             \n\t" // b += 4*4 (unroll x nr)
2988     "addq         $4 * 4 * 16, %%rax             \n\t" // a += 4*4 (unroll x mr)
2989     "                                            \n\t"
2990     "                                            \n\t"
2991     "decq   %%rsi                                \n\t" // i -= 1;
2992     "jne    .ZLOOPKITER                          \n\t" // iterate again if i != 0.
2993     "                                            \n\t"
2994     "                                            \n\t"
2995     "                                            \n\t"
2996     "                                            \n\t"
2997     "                                            \n\t"
2998     "                                            \n\t"
2999     ".ZCONSIDKLEFT:                              \n\t"
3000     "                                            \n\t"
3001     "movq      %1, %%rsi                         \n\t" // i = k_left;
3002     "testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
3003     "je     .ZPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
3004     "                                            \n\t" // else, we prepare to enter k_left loop.
3005     "                                            \n\t"
3006     "                                            \n\t"
3007     ".ZLOOPKLEFT:                                \n\t" // EDGE LOOP
3008     "                                            \n\t"
3009     "                                            \n\t" // iteration 0
3010     "vmovapd        1 * 32(%%rax),      %%ymm1   \n\t"
3011     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
3012     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
3013     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
3014     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
3015     "vaddpd           %%ymm6,  %%ymm15, %%ymm15  \n\t"
3016     "vaddpd           %%ymm7,  %%ymm11, %%ymm11  \n\t"
3017     "                                            \n\t"
3018     "prefetcht0    16 * 32(%%rax)                \n\t"
3019     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
3020     "vmovddup   8 + 0 * 32(%%rbx),      %%ymm2   \n\t"
3021     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
3022     "vmovddup   8 + 1 * 32(%%rbx),      %%ymm3   \n\t"
3023     "vaddpd           %%ymm6,  %%ymm14, %%ymm14  \n\t"
3024     "vaddpd           %%ymm7,  %%ymm10, %%ymm10  \n\t"
3025     "                                            \n\t"
3026     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
3027     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
3028     "vpermilpd  $0x5, %%ymm0,  %%ymm0            \n\t"
3029     "vaddpd           %%ymm6,  %%ymm13, %%ymm13  \n\t"
3030     "vaddpd           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
3031     "                                            \n\t"
3032     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
3033     "vperm2f128 $0x3, %%ymm2,  %%ymm2,  %%ymm4   \n\t"
3034     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
3035     "vperm2f128 $0x3, %%ymm3,  %%ymm3,  %%ymm5   \n\t"
3036     "vaddpd           %%ymm6,  %%ymm12, %%ymm12  \n\t"
3037     "vaddpd           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
3038     "                                            \n\t"
3039     "vpermilpd  $0x5, %%ymm1,  %%ymm1            \n\t"
3040     "vmulpd           %%ymm0,  %%ymm2,  %%ymm6   \n\t"
3041     "vmulpd           %%ymm0,  %%ymm3,  %%ymm7   \n\t"
3042     "vaddsubpd        %%ymm6,  %%ymm15, %%ymm15  \n\t"
3043     "vaddsubpd        %%ymm7,  %%ymm11, %%ymm11  \n\t"
3044     "                                            \n\t"
3045     "vmulpd           %%ymm1,  %%ymm2,  %%ymm6   \n\t"
3046     "vmovddup   0 + 2 * 32(%%rbx),      %%ymm2   \n\t"
3047     "vmulpd           %%ymm1,  %%ymm3,  %%ymm7   \n\t"
3048     "vmovddup   0 + 3 * 32(%%rbx),      %%ymm3   \n\t"
3049     "vaddsubpd        %%ymm6,  %%ymm14, %%ymm14  \n\t"
3050     "vaddsubpd        %%ymm7,  %%ymm10, %%ymm10  \n\t"
3051     "                                            \n\t"
3052     "vmulpd           %%ymm0,  %%ymm4,  %%ymm6   \n\t"
3053     "vmulpd           %%ymm0,  %%ymm5,  %%ymm7   \n\t"
3054     "vmovapd        2 * 32(%%rax),      %%ymm0   \n\t"
3055     "vaddsubpd        %%ymm6,  %%ymm13, %%ymm13  \n\t"
3056     "vaddsubpd        %%ymm7,  %%ymm9,  %%ymm9   \n\t"
3057     "                                            \n\t"
3058     "vmulpd           %%ymm1,  %%ymm4,  %%ymm6   \n\t"
3059     "vmulpd           %%ymm1,  %%ymm5,  %%ymm7   \n\t"
3060     "vaddsubpd        %%ymm6,  %%ymm12, %%ymm12  \n\t"
3061     "vaddsubpd        %%ymm7,  %%ymm8,  %%ymm8   \n\t"
3062     "                                            \n\t"
3063     "                                            \n\t"
3064     "addq         $4 * 1 * 16, %%rax             \n\t" // a += 4 (1 x mr)
3065     "addq         $4 * 1 * 16, %%rbx             \n\t" // b += 4 (1 x nr)
3066     "                                            \n\t"
3067     "                                            \n\t"
3068     "decq   %%rsi                                \n\t" // i -= 1;
3069     "jne    .ZLOOPKLEFT                          \n\t" // iterate again if i != 0.
3070     "                                            \n\t"
3071     "                                            \n\t"
3072     "                                            \n\t"
3073     ".ZPOSTACCUM:                                \n\t"
3074     "                                            \n\t"
3075     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
3076     "                                            \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
3077     "                                            \n\t" //   ab10    ab11    ab12    ab13
3078     "                                            \n\t" //   ab21    ab20    ab23    ab22
3079     "                                            \n\t" //   ab31 )  ab30 )  ab33 )  ab32 )
3080     "                                            \n\t"
3081     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
3082     "                                            \n\t" // ( ab40  ( ab41  ( ab42  ( ab43
3083     "                                            \n\t" //   ab50    ab51    ab52    ab53
3084     "                                            \n\t" //   ab61    ab60    ab63    ab62
3085     "                                            \n\t" //   ab71 )  ab70 )  ab73 )  ab72 )
3086     "                                            \n\t"
3087     "                                            \n\t"
3088     "vmovapd           %%ymm15, %%ymm7           \n\t"
3089     "vperm2f128 $0x12, %%ymm15, %%ymm13, %%ymm15 \n\t"
3090     "vperm2f128 $0x30, %%ymm7,  %%ymm13, %%ymm13 \n\t"
3091     "                                            \n\t"
3092     "vmovapd           %%ymm11, %%ymm7           \n\t"
3093     "vperm2f128 $0x12, %%ymm11, %%ymm9,  %%ymm11 \n\t"
3094     "vperm2f128 $0x30, %%ymm7,  %%ymm9,  %%ymm9  \n\t"
3095     "                                            \n\t"
3096     "vmovapd           %%ymm14, %%ymm7           \n\t"
3097     "vperm2f128 $0x12, %%ymm14, %%ymm12, %%ymm14 \n\t"
3098     "vperm2f128 $0x30, %%ymm7,  %%ymm12, %%ymm12 \n\t"
3099     "                                            \n\t"
3100     "vmovapd           %%ymm10, %%ymm7           \n\t"
3101     "vperm2f128 $0x12, %%ymm10, %%ymm8,  %%ymm10 \n\t"
3102     "vperm2f128 $0x30, %%ymm7,  %%ymm8,  %%ymm8  \n\t"
3103     "                                            \n\t"
3104     "                                            \n\t"
3105     "                                            \n\t" // ymm15:  ymm13:  ymm11:  ymm9:
3106     "                                            \n\t" // ( ab00  ( ab01  ( ab02  ( ab03
3107     "                                            \n\t" //   ab10    ab11    ab12    ab13
3108     "                                            \n\t" //   ab20    ab21    ab22    ab23
3109     "                                            \n\t" //   ab30 )  ab31 )  ab32 )  ab33 )
3110     "                                            \n\t"
3111     "                                            \n\t" // ymm14:  ymm12:  ymm10:  ymm8:
3112     "                                            \n\t" // ( ab40  ( ab41  ( ab42  ( ab43
3113     "                                            \n\t" //   ab50    ab51    ab52    ab53
3114     "                                            \n\t" //   ab60    ab61    ab62    ab63
3115     "                                            \n\t" //   ab70 )  ab71 )  ab72 )  ab73 )
3116     "                                            \n\t"
3117     "                                            \n\t"
3118     "                                            \n\t" // scale by alpha
3119     "                                            \n\t"
3120     "movq         %4, %%rax                      \n\t" // load address of alpha
3121     "vbroadcastsd    (%%rax), %%ymm7             \n\t" // load alpha_r and duplicate
3122     "vbroadcastsd   8(%%rax), %%ymm6             \n\t" // load alpha_i and duplicate
3123     "                                            \n\t"
3124     "vpermilpd  $0x5, %%ymm15, %%ymm3            \n\t"
3125     "vmulpd           %%ymm7,  %%ymm15, %%ymm15  \n\t"
3126     "vmulpd           %%ymm6,  %%ymm3,  %%ymm3   \n\t"
3127     "vaddsubpd        %%ymm3,  %%ymm15, %%ymm15  \n\t"
3128     "                                            \n\t"
3129     "vpermilpd  $0x5, %%ymm14, %%ymm2            \n\t"
3130     "vmulpd           %%ymm7,  %%ymm14, %%ymm14  \n\t"
3131     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3132     "vaddsubpd        %%ymm2,  %%ymm14, %%ymm14  \n\t"
3133     "                                            \n\t"
3134     "vpermilpd  $0x5, %%ymm13, %%ymm1            \n\t"
3135     "vmulpd           %%ymm7,  %%ymm13, %%ymm13  \n\t"
3136     "vmulpd           %%ymm6,  %%ymm1,  %%ymm1   \n\t"
3137     "vaddsubpd        %%ymm1,  %%ymm13, %%ymm13  \n\t"
3138     "                                            \n\t"
3139     "vpermilpd  $0x5, %%ymm12, %%ymm0            \n\t"
3140     "vmulpd           %%ymm7,  %%ymm12, %%ymm12  \n\t"
3141     "vmulpd           %%ymm6,  %%ymm0,  %%ymm0   \n\t"
3142     "vaddsubpd        %%ymm0,  %%ymm12, %%ymm12  \n\t"
3143     "                                            \n\t"
3144     "vpermilpd  $0x5, %%ymm11, %%ymm3            \n\t"
3145     "vmulpd           %%ymm7,  %%ymm11, %%ymm11  \n\t"
3146     "vmulpd           %%ymm6,  %%ymm3,  %%ymm3   \n\t"
3147     "vaddsubpd        %%ymm3,  %%ymm11, %%ymm11  \n\t"
3148     "                                            \n\t"
3149     "vpermilpd  $0x5, %%ymm10, %%ymm2            \n\t"
3150     "vmulpd           %%ymm7,  %%ymm10, %%ymm10  \n\t"
3151     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3152     "vaddsubpd        %%ymm2,  %%ymm10, %%ymm10  \n\t"
3153     "                                            \n\t"
3154     "vpermilpd  $0x5, %%ymm9,  %%ymm1            \n\t"
3155     "vmulpd           %%ymm7,  %%ymm9,  %%ymm9   \n\t"
3156     "vmulpd           %%ymm6,  %%ymm1,  %%ymm1   \n\t"
3157     "vaddsubpd        %%ymm1,  %%ymm9,  %%ymm9   \n\t"
3158     "                                            \n\t"
3159     "vpermilpd  $0x5, %%ymm8,  %%ymm0            \n\t"
3160     "vmulpd           %%ymm7,  %%ymm8,  %%ymm8   \n\t"
3161     "vmulpd           %%ymm6,  %%ymm0,  %%ymm0   \n\t"
3162     "vaddsubpd        %%ymm0,  %%ymm8,  %%ymm8   \n\t"
3163     "                                            \n\t"
3164     "                                            \n\t"
3165     "                                            \n\t"
3166     "                                            \n\t"
3167     "movq         %5, %%rbx                      \n\t" // load address of beta
3168     "vbroadcastsd    (%%rbx), %%ymm7             \n\t" // load beta_r and duplicate
3169     "vbroadcastsd   8(%%rbx), %%ymm6             \n\t" // load beta_i and duplicate
3170     "                                            \n\t"
3171     "                                            \n\t"
3172     "                                            \n\t"
3173     "                                            \n\t"
3174     "                                            \n\t"
3175     "                                            \n\t"
3176     "                                            \n\t"
3177     "movq                %7, %%rsi               \n\t" // load rs_c
3178     "leaq        (,%%rsi,8), %%rsi               \n\t" // rsi = rs_c * sizeof(dcomplex)
3179     "leaq        (,%%rsi,2), %%rsi               \n\t"
3180     "leaq   (%%rcx,%%rsi,2), %%rdx               \n\t" // load address of c + 2*rs_c;
3181     "                                            \n\t"
3182     "                                            \n\t"
3183     "                                            \n\t"
3184     "                                            \n\t"
3185     "                                            \n\t"
3186     "                                            \n\t"
3187     "                                            \n\t" // determine if
3188     "                                            \n\t" //    c    % 32 == 0, AND
3189     "                                            \n\t" // 16*cs_c % 32 == 0, AND
3190     "                                            \n\t" //    rs_c      == 1
3191     "                                            \n\t" // ie: aligned, ldim aligned, and
3192     "                                            \n\t" // column-stored
3193     "                                            \n\t"
3194     "cmpq      $16, %%rsi                        \n\t" // set ZF if (16*rs_c) == 16.
3195     "sete           %%bl                         \n\t" // bl = ( ZF == 1 ? 1 : 0 );
3196     "testq     $31, %%rcx                        \n\t" // set ZF if c & 32 is zero.
3197     "setz           %%bh                         \n\t" // bh = ( ZF == 0 ? 1 : 0 );
3198     "testq     $31, %%rdi                        \n\t" // set ZF if (16*cs_c) & 32 is zero.
3199     "setz           %%al                         \n\t" // al = ( ZF == 0 ? 1 : 0 );
3200     "                                            \n\t" // and(bl,bh) followed by
3201     "                                            \n\t" // and(bh,al) will reveal result
3202     "                                            \n\t"
3203     "                                            \n\t" // now avoid loading C if beta == 0
3204     "                                            \n\t"
3205     "vxorpd    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
3206     "vucomisd  %%xmm0,  %%xmm7                   \n\t" // set ZF if beta_r == 0.
3207     "sete       %%r8b                            \n\t" // r8b = ( ZF == 1 ? 1 : 0 );
3208     "vucomisd  %%xmm0,  %%xmm6                   \n\t" // set ZF if beta_i == 0.
3209     "sete       %%r9b                            \n\t" // r9b = ( ZF == 1 ? 1 : 0 );
3210     "andb       %%r8b, %%r9b                     \n\t" // set ZF if r8b & r9b == 1.
3211     "jne      .ZBETAZERO                         \n\t" // if ZF = 0, jump to beta == 0 case
3212     "                                            \n\t"
3213     "                                            \n\t"
3214     "                                            \n\t" // check if aligned/column-stored
3215     "andb     %%bl, %%bh                         \n\t" // set ZF if bl & bh == 1.
3216     "andb     %%bh, %%al                         \n\t" // set ZF if bh & al == 1.
3217     "jne     .ZCOLSTORED                         \n\t" // jump to column storage case
3218     "                                            \n\t"
3219     "                                            \n\t"
3220     "                                            \n\t"
3221     ".ZGENSTORED:                                \n\t"
3222     "                                            \n\t" // update c00:c30
3223     "                                            \n\t"
3224     "vmovupd    (%%rcx),       %%xmm0            \n\t" // load (c00,c10) into xmm0
3225     "vmovupd    (%%rcx,%%rsi), %%xmm2            \n\t" // load (c20,c30) into xmm2
3226     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:1],xmm2)
3227     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3228     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3229     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3230     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3231     "vaddpd           %%ymm15, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3232     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[2:3]
3233     "vmovupd          %%xmm0,  (%%rcx)           \n\t" // store (c00,c10)
3234     "vmovupd          %%xmm2,  (%%rcx,%%rsi)     \n\t" // store (c20,c30)
3235     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3236     "                                            \n\t"
3237     "                                            \n\t" // update c40:c70
3238     "                                            \n\t"
3239     "vmovupd    (%%rdx),       %%xmm0            \n\t" // load (c40,c50) into xmm0
3240     "vmovupd    (%%rdx,%%rsi), %%xmm2            \n\t" // load (c60,c70) into xmm2
3241     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:1],xmm2)
3242     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3243     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3244     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3245     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3246     "vaddpd           %%ymm14, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3247     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[2:3]
3248     "vmovupd          %%xmm0,  (%%rdx)           \n\t" // store (c40,c50)
3249     "vmovupd          %%xmm2,  (%%rdx,%%rsi)     \n\t" // store (c60,c70)
3250     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3251     "                                            \n\t"
3252     "                                            \n\t" // update c01:c31
3253     "                                            \n\t"
3254     "vmovupd    (%%rcx),       %%xmm0            \n\t" // load (c01,c11) into xmm0
3255     "vmovupd    (%%rcx,%%rsi), %%xmm2            \n\t" // load (c21,c31) into xmm2
3256     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:1],xmm2)
3257     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3258     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3259     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3260     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3261     "vaddpd           %%ymm13, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3262     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[2:3]
3263     "vmovupd          %%xmm0,  (%%rcx)           \n\t" // store (c01,c11)
3264     "vmovupd          %%xmm2,  (%%rcx,%%rsi)     \n\t" // store (c21,c31)
3265     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3266     "                                            \n\t"
3267     "                                            \n\t" // update c41:c71
3268     "                                            \n\t"
3269     "vmovupd    (%%rdx),       %%xmm0            \n\t" // load (c41,c51) into xmm0
3270     "vmovupd    (%%rdx,%%rsi), %%xmm2            \n\t" // load (c61,c71) into xmm2
3271     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:1],xmm2)
3272     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3273     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3274     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3275     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3276     "vaddpd           %%ymm12, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3277     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[2:3]
3278     "vmovupd          %%xmm0,  (%%rdx)           \n\t" // store (c41,c51)
3279     "vmovupd          %%xmm2,  (%%rdx,%%rsi)     \n\t" // store (c61,c71)
3280     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3281     "                                            \n\t"
3282     "                                            \n\t" // update c02:c32
3283     "                                            \n\t"
3284     "vmovupd    (%%rcx),       %%xmm0            \n\t" // load (c02,c12) into xmm0
3285     "vmovupd    (%%rcx,%%rsi), %%xmm2            \n\t" // load (c22,c32) into xmm2
3286     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:1],xmm2)
3287     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3288     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3289     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3290     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3291     "vaddpd           %%ymm11, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3292     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[2:3]
3293     "vmovupd          %%xmm0,  (%%rcx)           \n\t" // store (c02,c12)
3294     "vmovupd          %%xmm2,  (%%rcx,%%rsi)     \n\t" // store (c22,c32)
3295     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3296     "                                            \n\t"
3297     "                                            \n\t" // update c42:c72
3298     "                                            \n\t"
3299     "vmovupd    (%%rdx),       %%xmm0            \n\t" // load (c42,c52) into xmm0
3300     "vmovupd    (%%rdx,%%rsi), %%xmm2            \n\t" // load (c62,c72) into xmm2
3301     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:1],xmm2)
3302     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3303     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3304     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3305     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3306     "vaddpd           %%ymm10, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3307     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[2:3]
3308     "vmovupd          %%xmm0,  (%%rdx)           \n\t" // store (c42,c52)
3309     "vmovupd          %%xmm2,  (%%rdx,%%rsi)     \n\t" // store (c62,c72)
3310     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3311     "                                            \n\t"
3312     "                                            \n\t" // update c03:c33
3313     "                                            \n\t"
3314     "vmovupd    (%%rcx),       %%xmm0            \n\t" // load (c03,c13) into xmm0
3315     "vmovupd    (%%rcx,%%rsi), %%xmm2            \n\t" // load (c23,c33) into xmm2
3316     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:1],xmm2)
3317     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3318     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3319     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3320     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3321     "vaddpd           %%ymm9,  %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3322     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[2:3]
3323     "vmovupd          %%xmm0,  (%%rcx)           \n\t" // store (c03,c13)
3324     "vmovupd          %%xmm2,  (%%rcx,%%rsi)     \n\t" // store (c23,c33)
3325     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3326     "                                            \n\t"
3327     "                                            \n\t" // update c43:c73
3328     "                                            \n\t"
3329     "vmovupd    (%%rdx),       %%xmm0            \n\t" // load (c43,c53) into xmm0
3330     "vmovupd    (%%rdx,%%rsi), %%xmm2            \n\t" // load (c63,c73) into xmm2
3331     "vinsertf128  $1, %%xmm2,  %%ymm0,  %%ymm0   \n\t" // ymm0 := (ymm0[0:1],xmm2)
3332     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3333     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3334     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3335     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3336     "vaddpd           %%ymm8,  %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3337     "vextractf128 $1, %%ymm0,  %%xmm2            \n\t" // xmm2 := ymm0[2:3]
3338     "vmovupd          %%xmm0,  (%%rdx)           \n\t" // store (c43,c53)
3339     "vmovupd          %%xmm2,  (%%rdx,%%rsi)     \n\t" // store (c63,c73)
3340     "                                            \n\t"
3341     "                                            \n\t"
3342     "                                            \n\t"
3343     "jmp    .ZDONE                               \n\t" // jump to end.
3344     "                                            \n\t"
3345     "                                            \n\t"
3346     "                                            \n\t"
3347     ".ZCOLSTORED:                                \n\t"
3348     "                                            \n\t" // update c00:c30
3349     "                                            \n\t"
3350     "vmovapd    (%%rcx),       %%ymm0            \n\t" // load c00:c30 into ymm0
3351     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3352     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3353     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3354     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3355     "vaddpd           %%ymm15, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3356     "vmovapd          %%ymm0,  (%%rcx)           \n\t" // store c00:c30
3357     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3358     "                                            \n\t"
3359     "                                            \n\t" // update c40:c70
3360     "                                            \n\t"
3361     "vmovapd    (%%rdx),       %%ymm0            \n\t" // load c40:c70 into ymm0
3362     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3363     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3364     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3365     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3366     "vaddpd           %%ymm14, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3367     "vmovapd          %%ymm0,  (%%rdx)           \n\t" // store c40:c70
3368     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3369     "                                            \n\t"
3370     "                                            \n\t" // update c01:c31
3371     "                                            \n\t"
3372     "vmovapd    (%%rcx),       %%ymm0            \n\t" // load c01:c31 into ymm0
3373     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3374     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3375     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3376     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3377     "vaddpd           %%ymm13, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3378     "vmovapd          %%ymm0,  (%%rcx)           \n\t" // store c01:c31
3379     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3380     "                                            \n\t"
3381     "                                            \n\t" // update c41:c71
3382     "                                            \n\t"
3383     "vmovapd    (%%rdx),       %%ymm0            \n\t" // load c41:c71 into ymm0
3384     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3385     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3386     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3387     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3388     "vaddpd           %%ymm12, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3389     "vmovapd          %%ymm0,  (%%rdx)           \n\t" // store c41:c71
3390     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3391     "                                            \n\t"
3392     "                                            \n\t" // update c02:c32
3393     "                                            \n\t"
3394     "vmovapd    (%%rcx),       %%ymm0            \n\t" // load c02:c32 into ymm0
3395     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3396     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3397     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3398     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3399     "vaddpd           %%ymm11, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3400     "vmovapd          %%ymm0,  (%%rcx)           \n\t" // store c02:c32
3401     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3402     "                                            \n\t"
3403     "                                            \n\t" // update c42:c72
3404     "                                            \n\t"
3405     "vmovapd    (%%rdx),       %%ymm0            \n\t" // load c42:c72 into ymm0
3406     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3407     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3408     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3409     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3410     "vaddpd           %%ymm10, %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3411     "vmovapd          %%ymm0,  (%%rdx)           \n\t" // store c42:c72
3412     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3413     "                                            \n\t"
3414     "                                            \n\t" // update c03:c33
3415     "                                            \n\t"
3416     "vmovapd    (%%rcx),       %%ymm0            \n\t" // load c03:c33 into ymm0
3417     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3418     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3419     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3420     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3421     "vaddpd           %%ymm9,  %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3422     "vmovapd          %%ymm0,  (%%rcx)           \n\t" // store c03:c33
3423     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3424     "                                            \n\t"
3425     "                                            \n\t" // update c43:c73
3426     "                                            \n\t"
3427     "vmovapd    (%%rdx),       %%ymm0            \n\t" // load c43:c73 into ymm0
3428     "vpermilpd  $0x5, %%ymm0,  %%ymm2            \n\t" // scale ymm0 by beta
3429     "vmulpd           %%ymm7,  %%ymm0,  %%ymm0   \n\t"
3430     "vmulpd           %%ymm6,  %%ymm2,  %%ymm2   \n\t"
3431     "vaddsubpd        %%ymm2,  %%ymm0,  %%ymm0   \n\t"
3432     "vaddpd           %%ymm8,  %%ymm0,  %%ymm0   \n\t" // add the gemm result to ymm0
3433     "vmovapd          %%ymm0,  (%%rdx)           \n\t" // store c43:c73
3434     "                                            \n\t"
3435     "                                            \n\t"
3436     "                                            \n\t"
3437     "jmp    .ZDONE                               \n\t" // jump to end.
3438     "                                            \n\t"
3439     "                                            \n\t"
3440     "                                            \n\t"
3441     ".ZBETAZERO:                                 \n\t"
3442     "                                            \n\t" // check if aligned/column-stored
3443     "                                            \n\t" // check if aligned/column-stored
3444     "andb     %%bl, %%bh                         \n\t" // set ZF if bl & bh == 1.
3445     "andb     %%bh, %%al                         \n\t" // set ZF if bh & al == 1.
3446     "jne     .ZCOLSTORBZ                         \n\t" // jump to column storage case
3447     "                                            \n\t"
3448     "                                            \n\t"
3449     "                                            \n\t"
3450     ".ZGENSTORBZ:                                \n\t"
3451     "                                            \n\t" // update c00:c30
3452     "                                            \n\t"
3453     "vextractf128 $1, %%ymm15, %%xmm2            \n\t"
3454     "vmovupd          %%xmm15, (%%rcx)           \n\t" // store (c00,c10)
3455     "vmovupd          %%xmm2,  (%%rcx,%%rsi)     \n\t" // store (c20,c30)
3456     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3457     "                                            \n\t"
3458     "                                            \n\t" // update c40:c70
3459     "                                            \n\t"
3460     "vextractf128 $1, %%ymm14, %%xmm2            \n\t"
3461     "vmovupd          %%xmm14, (%%rdx)           \n\t" // store (c40,c50)
3462     "vmovupd          %%xmm2,  (%%rdx,%%rsi)     \n\t" // store (c60,c70)
3463     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3464     "                                            \n\t"
3465     "                                            \n\t" // update c01:c31
3466     "                                            \n\t"
3467     "vextractf128 $1, %%ymm13, %%xmm2            \n\t"
3468     "vmovupd          %%xmm13, (%%rcx)           \n\t" // store (c01,c11)
3469     "vmovupd          %%xmm2,  (%%rcx,%%rsi)     \n\t" // store (c21,c31)
3470     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3471     "                                            \n\t"
3472     "                                            \n\t" // update c41:c71
3473     "                                            \n\t"
3474     "vextractf128 $1, %%ymm12, %%xmm2            \n\t"
3475     "vmovupd          %%xmm12, (%%rdx)           \n\t" // store (c41,c51)
3476     "vmovupd          %%xmm2,  (%%rdx,%%rsi)     \n\t" // store (c61,c71)
3477     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3478     "                                            \n\t"
3479     "                                            \n\t" // update c02:c32
3480     "                                            \n\t"
3481     "vextractf128 $1, %%ymm11, %%xmm2            \n\t"
3482     "vmovupd          %%xmm11, (%%rcx)           \n\t" // store (c02,c12)
3483     "vmovupd          %%xmm2,  (%%rcx,%%rsi)     \n\t" // store (c22,c32)
3484     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3485     "                                            \n\t"
3486     "                                            \n\t" // update c42:c72
3487     "                                            \n\t"
3488     "vextractf128 $1, %%ymm10, %%xmm2            \n\t"
3489     "vmovupd          %%xmm10, (%%rdx)           \n\t" // store (c42,c52)
3490     "vmovupd          %%xmm2,  (%%rdx,%%rsi)     \n\t" // store (c62,c72)
3491     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3492     "                                            \n\t"
3493     "                                            \n\t" // update c03:c33
3494     "                                            \n\t"
3495     "vextractf128 $1, %%ymm9,  %%xmm2            \n\t"
3496     "vmovupd          %%xmm9,  (%%rcx)           \n\t" // store (c03,c13)
3497     "vmovupd          %%xmm2,  (%%rcx,%%rsi)     \n\t" // store (c23,c33)
3498     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3499     "                                            \n\t"
3500     "                                            \n\t" // update c43:c73
3501     "                                            \n\t"
3502     "vextractf128 $1, %%ymm8,  %%xmm2            \n\t"
3503     "vmovupd          %%xmm8,  (%%rdx)           \n\t" // store (c43,c53)
3504     "vmovupd          %%xmm2,  (%%rdx,%%rsi)     \n\t" // store (c63,c73)
3505     "                                            \n\t"
3506     "                                            \n\t"
3507     "                                            \n\t"
3508     "jmp    .ZDONE                               \n\t" // jump to end.
3509     "                                            \n\t"
3510     "                                            \n\t"
3511     "                                            \n\t"
3512     ".ZCOLSTORBZ:                                \n\t"
3513     "                                            \n\t"
3514     "                                            \n\t"
3515     "vmovapd          %%ymm15, (%%rcx)           \n\t" // store c00:c30
3516     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3517     "                                            \n\t"
3518     "vmovapd          %%ymm14, (%%rdx)           \n\t" // store c40:c70
3519     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3520     "                                            \n\t"
3521     "vmovapd          %%ymm13, (%%rcx)           \n\t" // store c01:c31
3522     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3523     "                                            \n\t"
3524     "vmovapd          %%ymm12, (%%rdx)           \n\t" // store c41:c71
3525     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3526     "                                            \n\t"
3527     "vmovapd          %%ymm11, (%%rcx)           \n\t" // store c02:c32
3528     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3529     "                                            \n\t"
3530     "vmovapd          %%ymm10, (%%rdx)           \n\t" // store c42:c72
3531     "addq      %%rdi, %%rdx                      \n\t" // c += cs_c;
3532     "                                            \n\t"
3533     "vmovapd          %%ymm9,  (%%rcx)           \n\t" // store c03:c33
3534     "addq      %%rdi, %%rcx                      \n\t" // c += cs_c;
3535     "                                            \n\t"
3536     "vmovapd          %%ymm8,  (%%rdx)           \n\t" // store c43:c73
3537     "                                            \n\t"
3538     "                                            \n\t"
3539     "                                            \n\t"
3540     "                                            \n\t"
3541     "                                            \n\t"
3542     ".ZDONE:                                     \n\t"
3543     "                                            \n\t"
3544 
3545     : // output operands (none)
3546     : // input operands
3547       "m" (k_iter), // 0
3548       "m" (k_left), // 1
3549       "m" (a),      // 2
3550       "m" (b),      // 3
3551       "m" (alpha),  // 4
3552       "m" (beta),   // 5
3553       "m" (c),      // 6
3554       "m" (rs_c),   // 7
3555       "m" (cs_c)/*,   // 8
3556       "m" (b_next), // 9
3557       "m" (a_next)*/  // 10
3558     : // register clobber list
3559       "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
3560       "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
3561       "xmm0", "xmm1", "xmm2", "xmm3",
3562       "xmm4", "xmm5", "xmm6", "xmm7",
3563       "xmm8", "xmm9", "xmm10", "xmm11",
3564       "xmm12", "xmm13", "xmm14", "xmm15",
3565       "memory"
3566     );
3567 }
3568 
3569