1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name of The University of Texas at Austin nor the names
18       of its contributors may be used to endorse or promote products
19       derived from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "blis.h"
36 
37 
/*
 * SGEMM_INPUT_GS_BETA_NZ
 *
 * Assembly-template fragment (the "%%" register escapes mean it must be
 * pasted into an extended-asm template that has an operand list) that
 * gathers eight strided single-precision values into ymm0.
 *
 * Addresses read, in element order 0..7:
 *   (rcx), (rcx+rsi), (rcx+2*rsi), (rcx+r13),
 *   (rcx+4*rsi), (rcx+r15), (rcx+2*r13), (rcx+r10)
 * At the use sites in bli_sgemm_asm_6x16 the registers are prepared as
 * rsi = cs_c*sizeof(float), r13 = 3*rsi, r15 = 5*rsi, r10 = 7*rsi, so the
 * eight addresses are consecutive column-stride steps from rcx.
 *
 * How it works:
 *   - each vmovlps/vmovhps pair loads two 64-bit chunks into one xmm
 *     register, so every chunk holds the wanted float in its low 32 bits
 *     plus the float that follows it in memory;
 *   - vshufps $0x88 keeps lanes 0 and 2 of each source, i.e. only the
 *     wanted floats, packing four elements per xmm register;
 *   - vperm2f128 $0x20 joins the low halves of ymm0 (elements 0-3) and
 *     ymm2 (elements 4-7) into ymm0.
 *
 * NOTE(review): because the loads are 64 bits wide, 4 bytes beyond each
 * gathered element are also read (and then discarded) -- confirm this is
 * acceptable for the last element of the caller's buffer.
 *
 * Result:   ymm0 = { e0, e1, ..., e7 }.
 * Clobbers: xmm0/ymm0, xmm1, xmm2. rcx and the stride registers are only
 *           read.
 */
38 #define SGEMM_INPUT_GS_BETA_NZ \
39 	"vmovlps    (%%rcx        ),  %%xmm0,  %%xmm0  \n\t" \
40 	"vmovhps    (%%rcx,%%rsi,1),  %%xmm0,  %%xmm0  \n\t" \
41 	"vmovlps    (%%rcx,%%rsi,2),  %%xmm1,  %%xmm1  \n\t" \
42 	"vmovhps    (%%rcx,%%r13  ),  %%xmm1,  %%xmm1  \n\t" \
43 	"vshufps    $0x88,   %%xmm1,  %%xmm0,  %%xmm0  \n\t" \
44 	"vmovlps    (%%rcx,%%rsi,4),  %%xmm2,  %%xmm2  \n\t" \
45 	"vmovhps    (%%rcx,%%r15  ),  %%xmm2,  %%xmm2  \n\t" \
46 	"vmovlps    (%%rcx,%%r13,2),  %%xmm1,  %%xmm1  \n\t" \
47 	"vmovhps    (%%rcx,%%r10  ),  %%xmm1,  %%xmm1  \n\t" \
48 	"vshufps    $0x88,   %%xmm1,  %%xmm2,  %%xmm2  \n\t" \
49 	"vperm2f128 $0x20,   %%ymm2,  %%ymm0,  %%ymm0  \n\t"
50 
/*
 * SGEMM_OUTPUT_GS_BETA_NZ
 *
 * Assembly-template fragment (for use inside an extended-asm template,
 * hence the "%%" register escapes) that scatters the eight floats held in
 * ymm0 to strided memory locations, one 32-bit store per element.
 *
 * Addresses written, in element order 0..7:
 *   (rcx), (rcx+rsi), (rcx+2*rsi), (rcx+r13),
 *   (rcx+4*rsi), (rcx+r15), (rcx+2*r13), (rcx+r10)
 * i.e. the same addressing pattern as SGEMM_INPUT_GS_BETA_NZ.
 *
 * How it works:
 *   - vextractf128 $1 copies the upper four floats of ymm0 into xmm2
 *     before xmm0 is modified;
 *   - vmovss stores lane 0 of a register;
 *   - vpermilps $0x39 rotates the four lanes down by one position
 *     (lane1->lane0, lane2->lane1, lane3->lane2, lane0->lane3), so the
 *     next element to store lands in lane 0. The rotation ping-pongs
 *     between xmm0/xmm1 for elements 0-3 and xmm2/xmm1 for elements 4-7.
 *
 * Despite the "_BETA_NZ" suffix, this file also uses the macro on the
 * beta == 0 general-stride path (.SGENSTORBZ), where ymm0 is filled with
 * a plain register copy instead of a gather.
 *
 * Clobbers: xmm0, xmm1, xmm2 (the contents of ymm0 are not preserved).
 *           rcx and the stride registers are only read.
 */
51 #define SGEMM_OUTPUT_GS_BETA_NZ \
52 	"vextractf128  $1, %%ymm0,  %%xmm2           \n\t" \
53 	"vmovss            %%xmm0, (%%rcx        )   \n\t" \
54 	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t" \
55 	"vmovss            %%xmm1, (%%rcx,%%rsi,1)   \n\t" \
56 	"vpermilps  $0x39, %%xmm1,  %%xmm0           \n\t" \
57 	"vmovss            %%xmm0, (%%rcx,%%rsi,2)   \n\t" \
58 	"vpermilps  $0x39, %%xmm0,  %%xmm1           \n\t" \
59 	"vmovss            %%xmm1, (%%rcx,%%r13  )   \n\t" \
60 	"vmovss            %%xmm2, (%%rcx,%%rsi,4)   \n\t" \
61 	"vpermilps  $0x39, %%xmm2,  %%xmm1           \n\t" \
62 	"vmovss            %%xmm1, (%%rcx,%%r15  )   \n\t" \
63 	"vpermilps  $0x39, %%xmm1,  %%xmm2           \n\t" \
64 	"vmovss            %%xmm2, (%%rcx,%%r13,2)   \n\t" \
65 	"vpermilps  $0x39, %%xmm2,  %%xmm1           \n\t" \
66 	"vmovss            %%xmm1, (%%rcx,%%r10  )   \n\t"
67 
bli_sgemm_asm_6x16(dim_t k,float * restrict alpha,float * restrict a,float * restrict b,float * restrict beta,float * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)68 void bli_sgemm_asm_6x16
69      (
70        dim_t               k,
71        float*     restrict alpha,
72        float*     restrict a,
73        float*     restrict b,
74        float*     restrict beta,
75        float*     restrict c, inc_t rs_c, inc_t cs_c,
76        auxinfo_t* restrict data,
77        cntx_t*    restrict cntx
78      )
79 {
80 	//void*   a_next = bli_auxinfo_next_a( data );
81 	//void*   b_next = bli_auxinfo_next_b( data );
82 
83 	uint64_t   k_iter = k / 4;
84 	uint64_t   k_left = k % 4;
85 
86 	__asm__ volatile
87 	(
88 	"                                            \n\t"
89 	"vzeroall                                    \n\t" // zero all xmm/ymm registers.
90 	"                                            \n\t"
91 	"                                            \n\t"
92 	"movq                %2, %%rax               \n\t" // load address of a.
93 	"movq                %3, %%rbx               \n\t" // load address of b.
94 	//"movq                %9, %%r15               \n\t" // load address of b_next.
95 	"                                            \n\t"
96 	"addq           $32 * 4, %%rbx               \n\t"
97 	"                                            \n\t" // initialize loop by pre-loading
98 	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
99 	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
100 	"                                            \n\t"
101 	"movq                %6, %%rcx               \n\t" // load address of c
102 	"movq                %7, %%rdi               \n\t" // load rs_c
103 	"leaq        (,%%rdi,4), %%rdi               \n\t" // rs_c *= sizeof(float)
104 	"                                            \n\t"
105 	"leaq   (%%rdi,%%rdi,2), %%r13               \n\t" // r13 = 3*rs_c;
106 	"leaq   (%%rcx,%%r13,1), %%rdx               \n\t" // rdx = c + 3*rs_c;
107 	"prefetcht0   7 * 8(%%rcx)                   \n\t" // prefetch c + 0*rs_c
108 	"prefetcht0   7 * 8(%%rcx,%%rdi)             \n\t" // prefetch c + 1*rs_c
109 	"prefetcht0   7 * 8(%%rcx,%%rdi,2)           \n\t" // prefetch c + 2*rs_c
110 	"prefetcht0   7 * 8(%%rdx)                   \n\t" // prefetch c + 3*rs_c
111 	"prefetcht0   7 * 8(%%rdx,%%rdi)             \n\t" // prefetch c + 4*rs_c
112 	"prefetcht0   7 * 8(%%rdx,%%rdi,2)           \n\t" // prefetch c + 5*rs_c
113 	"                                            \n\t"
114 	"                                            \n\t"
115 	"                                            \n\t"
116 	"                                            \n\t"
117 	"movq      %0, %%rsi                         \n\t" // i = k_iter;
118 	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
119 	"je     .SCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
120 	"                                            \n\t" // contains the k_left loop.
121 	"                                            \n\t"
122 	"                                            \n\t"
123 	".SLOOPKITER:                                \n\t" // MAIN LOOP
124 	"                                            \n\t"
125 	"                                            \n\t"
126 	"                                            \n\t" // iteration 0
127 	"prefetcht0   64 * 4(%%rax)                  \n\t"
128 	"                                            \n\t"
129 	"vbroadcastss       0 *  4(%%rax), %%ymm2    \n\t"
130 	"vbroadcastss       1 *  4(%%rax), %%ymm3    \n\t"
131 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
132 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
133 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
134 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
135 	"                                            \n\t"
136 	"vbroadcastss       2 *  4(%%rax), %%ymm2    \n\t"
137 	"vbroadcastss       3 *  4(%%rax), %%ymm3    \n\t"
138 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
139 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
140 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
141 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
142 	"                                            \n\t"
143 	"vbroadcastss       4 *  4(%%rax), %%ymm2    \n\t"
144 	"vbroadcastss       5 *  4(%%rax), %%ymm3    \n\t"
145 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
146 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
147 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
148 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
149 	"                                            \n\t"
150 	"vmovaps           -2 * 32(%%rbx), %%ymm0    \n\t"
151 	"vmovaps           -1 * 32(%%rbx), %%ymm1    \n\t"
152 	"                                            \n\t"
153 	"                                            \n\t" // iteration 1
154 	"vbroadcastss       6 *  4(%%rax), %%ymm2    \n\t"
155 	"vbroadcastss       7 *  4(%%rax), %%ymm3    \n\t"
156 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
157 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
158 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
159 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
160 	"                                            \n\t"
161 	"vbroadcastss       8 *  4(%%rax), %%ymm2    \n\t"
162 	"vbroadcastss       9 *  4(%%rax), %%ymm3    \n\t"
163 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
164 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
165 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
166 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
167 	"                                            \n\t"
168 	"vbroadcastss      10 *  4(%%rax), %%ymm2    \n\t"
169 	"vbroadcastss      11 *  4(%%rax), %%ymm3    \n\t"
170 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
171 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
172 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
173 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
174 	"                                            \n\t"
175 	"vmovaps            0 * 32(%%rbx), %%ymm0    \n\t"
176 	"vmovaps            1 * 32(%%rbx), %%ymm1    \n\t"
177 	"                                            \n\t"
178 	"                                            \n\t" // iteration 2
179 	"prefetcht0   76 * 4(%%rax)                  \n\t"
180 	"                                            \n\t"
181 	"vbroadcastss      12 *  4(%%rax), %%ymm2    \n\t"
182 	"vbroadcastss      13 *  4(%%rax), %%ymm3    \n\t"
183 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
184 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
185 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
186 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
187 	"                                            \n\t"
188 	"vbroadcastss      14 *  4(%%rax), %%ymm2    \n\t"
189 	"vbroadcastss      15 *  4(%%rax), %%ymm3    \n\t"
190 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
191 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
192 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
193 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
194 	"                                            \n\t"
195 	"vbroadcastss      16 *  4(%%rax), %%ymm2    \n\t"
196 	"vbroadcastss      17 *  4(%%rax), %%ymm3    \n\t"
197 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
198 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
199 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
200 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
201 	"                                            \n\t"
202 	"vmovaps            2 * 32(%%rbx), %%ymm0    \n\t"
203 	"vmovaps            3 * 32(%%rbx), %%ymm1    \n\t"
204 	"                                            \n\t"
205 	"                                            \n\t" // iteration 3
206 	"vbroadcastss      18 *  4(%%rax), %%ymm2    \n\t"
207 	"vbroadcastss      19 *  4(%%rax), %%ymm3    \n\t"
208 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
209 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
210 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
211 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
212 	"                                            \n\t"
213 	"vbroadcastss      20 *  4(%%rax), %%ymm2    \n\t"
214 	"vbroadcastss      21 *  4(%%rax), %%ymm3    \n\t"
215 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
216 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
217 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
218 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
219 	"                                            \n\t"
220 	"vbroadcastss      22 *  4(%%rax), %%ymm2    \n\t"
221 	"vbroadcastss      23 *  4(%%rax), %%ymm3    \n\t"
222 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
223 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
224 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
225 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
226 	"                                            \n\t"
227 	"addq          $4 *  6 * 4, %%rax            \n\t" // a += 4*6  (unroll x mr)
228 	"addq          $4 * 16 * 4, %%rbx            \n\t" // b += 4*16 (unroll x nr)
229 	"                                            \n\t"
230 	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
231 	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
232 	"                                            \n\t"
233 	"                                            \n\t"
234 	"decq   %%rsi                                \n\t" // i -= 1;
235 	"jne    .SLOOPKITER                          \n\t" // iterate again if i != 0.
236 	"                                            \n\t"
237 	"                                            \n\t"
238 	"                                            \n\t"
239 	"                                            \n\t"
240 	"                                            \n\t"
241 	"                                            \n\t"
242 	".SCONSIDKLEFT:                              \n\t"
243 	"                                            \n\t"
244 	"movq      %1, %%rsi                         \n\t" // i = k_left;
245 	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
246 	"je     .SPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
247 	"                                            \n\t" // else, we prepare to enter k_left loop.
248 	"                                            \n\t"
249 	"                                            \n\t"
250 	".SLOOPKLEFT:                                \n\t" // EDGE LOOP
251 	"                                            \n\t"
252 	"prefetcht0  16 * 32(%%rax)                  \n\t"
253 	"                                            \n\t"
254 	"vbroadcastss       0 *  4(%%rax), %%ymm2    \n\t"
255 	"vbroadcastss       1 *  4(%%rax), %%ymm3    \n\t"
256 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm4    \n\t"
257 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm5    \n\t"
258 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm6    \n\t"
259 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm7    \n\t"
260 	"                                            \n\t"
261 	"vbroadcastss       2 *  4(%%rax), %%ymm2    \n\t"
262 	"vbroadcastss       3 *  4(%%rax), %%ymm3    \n\t"
263 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm8    \n\t"
264 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm9    \n\t"
265 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm10   \n\t"
266 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm11   \n\t"
267 	"                                            \n\t"
268 	"vbroadcastss       4 *  4(%%rax), %%ymm2    \n\t"
269 	"vbroadcastss       5 *  4(%%rax), %%ymm3    \n\t"
270 	"vfmadd231ps       %%ymm0, %%ymm2, %%ymm12   \n\t"
271 	"vfmadd231ps       %%ymm1, %%ymm2, %%ymm13   \n\t"
272 	"vfmadd231ps       %%ymm0, %%ymm3, %%ymm14   \n\t"
273 	"vfmadd231ps       %%ymm1, %%ymm3, %%ymm15   \n\t"
274 	"                                            \n\t"
275 	"addq          $1 *  6 * 4, %%rax            \n\t" // a += 1*6  (unroll x mr)
276 	"addq          $1 * 16 * 4, %%rbx            \n\t" // b += 1*16 (unroll x nr)
277 	"                                            \n\t"
278 	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
279 	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
280 	"                                            \n\t"
281 	"                                            \n\t"
282 	"decq   %%rsi                                \n\t" // i -= 1;
283 	"jne    .SLOOPKLEFT                          \n\t" // iterate again if i != 0.
284 	"                                            \n\t"
285 	"                                            \n\t"
286 	"                                            \n\t"
287 	".SPOSTACCUM:                                \n\t"
288 	"                                            \n\t"
289 	"                                            \n\t"
290 	"                                            \n\t"
291 	"                                            \n\t"
292 	"movq         %4, %%rax                      \n\t" // load address of alpha
293 	"movq         %5, %%rbx                      \n\t" // load address of beta
294 	"vbroadcastss    (%%rax), %%ymm0             \n\t" // load alpha and duplicate
295 	"vbroadcastss    (%%rbx), %%ymm3             \n\t" // load beta and duplicate
296 	"                                            \n\t"
297 	"vmulps           %%ymm0,  %%ymm4,  %%ymm4   \n\t" // scale by alpha
298 	"vmulps           %%ymm0,  %%ymm5,  %%ymm5   \n\t"
299 	"vmulps           %%ymm0,  %%ymm6,  %%ymm6   \n\t"
300 	"vmulps           %%ymm0,  %%ymm7,  %%ymm7   \n\t"
301 	"vmulps           %%ymm0,  %%ymm8,  %%ymm8   \n\t"
302 	"vmulps           %%ymm0,  %%ymm9,  %%ymm9   \n\t"
303 	"vmulps           %%ymm0,  %%ymm10, %%ymm10  \n\t"
304 	"vmulps           %%ymm0,  %%ymm11, %%ymm11  \n\t"
305 	"vmulps           %%ymm0,  %%ymm12, %%ymm12  \n\t"
306 	"vmulps           %%ymm0,  %%ymm13, %%ymm13  \n\t"
307 	"vmulps           %%ymm0,  %%ymm14, %%ymm14  \n\t"
308 	"vmulps           %%ymm0,  %%ymm15, %%ymm15  \n\t"
309 	"                                            \n\t"
310 	"                                            \n\t"
311 	"                                            \n\t"
312 	"                                            \n\t"
313 	"                                            \n\t"
314 	"                                            \n\t"
315 	"movq                %8, %%rsi               \n\t" // load cs_c
316 	"leaq        (,%%rsi,4), %%rsi               \n\t" // rsi = cs_c * sizeof(float)
317 	"                                            \n\t"
318 	"leaq   (%%rcx,%%rsi,8), %%rdx               \n\t" // load address of c +  8*cs_c;
319 	"                                            \n\t"
320 	"leaq   (%%rsi,%%rsi,2), %%r13               \n\t" // r13 = 3*cs_c;
321 	"leaq   (%%rsi,%%rsi,4), %%r15               \n\t" // r15 = 5*cs_c;
322 	"leaq   (%%r13,%%rsi,4), %%r10               \n\t" // r10 = 7*cs_c;
323 	"                                            \n\t"
324 	"                                            \n\t"
325 	"                                            \n\t" // now avoid loading C if beta == 0
326 	"                                            \n\t"
327 	"vxorps    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
328 	"vucomiss  %%xmm0,  %%xmm3                   \n\t" // set ZF if beta == 0.
329 	"je      .SBETAZERO                          \n\t" // if ZF = 1, jump to beta == 0 case
330 	"                                            \n\t"
331 	"                                            \n\t"
332     "cmpq       $4, %%rsi                        \n\t" // set ZF if (4*cs_c) == 4.
333 	"jz      .SROWSTORED                         \n\t" // jump to row storage case
334 	"                                            \n\t"
335 	"                                            \n\t"
336 	"                                            \n\t"
337 	".SGENSTORED:                                \n\t"
338 	"                                            \n\t"
339 	"                                            \n\t"
340 	SGEMM_INPUT_GS_BETA_NZ
341 	"vfmadd213ps      %%ymm4,  %%ymm3,  %%ymm0   \n\t"
342 	SGEMM_OUTPUT_GS_BETA_NZ
343 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
344 	"                                            \n\t"
345 	"                                            \n\t"
346 	SGEMM_INPUT_GS_BETA_NZ
347 	"vfmadd213ps      %%ymm6,  %%ymm3,  %%ymm0   \n\t"
348 	SGEMM_OUTPUT_GS_BETA_NZ
349 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
350 	"                                            \n\t"
351 	"                                            \n\t"
352 	SGEMM_INPUT_GS_BETA_NZ
353 	"vfmadd213ps      %%ymm8,  %%ymm3,  %%ymm0   \n\t"
354 	SGEMM_OUTPUT_GS_BETA_NZ
355 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
356 	"                                            \n\t"
357 	"                                            \n\t"
358 	SGEMM_INPUT_GS_BETA_NZ
359 	"vfmadd213ps      %%ymm10, %%ymm3,  %%ymm0   \n\t"
360 	SGEMM_OUTPUT_GS_BETA_NZ
361 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
362 	"                                            \n\t"
363 	"                                            \n\t"
364 	SGEMM_INPUT_GS_BETA_NZ
365 	"vfmadd213ps      %%ymm12, %%ymm3,  %%ymm0   \n\t"
366 	SGEMM_OUTPUT_GS_BETA_NZ
367 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
368 	"                                            \n\t"
369 	"                                            \n\t"
370 	SGEMM_INPUT_GS_BETA_NZ
371 	"vfmadd213ps      %%ymm14, %%ymm3,  %%ymm0   \n\t"
372 	SGEMM_OUTPUT_GS_BETA_NZ
373 	//"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
374 	"                                            \n\t"
375 	"                                            \n\t"
376 	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 8*cs_c
377 	"                                            \n\t"
378 	"                                            \n\t"
379 	SGEMM_INPUT_GS_BETA_NZ
380 	"vfmadd213ps      %%ymm5,  %%ymm3,  %%ymm0   \n\t"
381 	SGEMM_OUTPUT_GS_BETA_NZ
382 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
383 	"                                            \n\t"
384 	"                                            \n\t"
385 	SGEMM_INPUT_GS_BETA_NZ
386 	"vfmadd213ps      %%ymm7,  %%ymm3,  %%ymm0   \n\t"
387 	SGEMM_OUTPUT_GS_BETA_NZ
388 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
389 	"                                            \n\t"
390 	"                                            \n\t"
391 	SGEMM_INPUT_GS_BETA_NZ
392 	"vfmadd213ps      %%ymm9,  %%ymm3,  %%ymm0   \n\t"
393 	SGEMM_OUTPUT_GS_BETA_NZ
394 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
395 	"                                            \n\t"
396 	"                                            \n\t"
397 	SGEMM_INPUT_GS_BETA_NZ
398 	"vfmadd213ps      %%ymm11, %%ymm3,  %%ymm0   \n\t"
399 	SGEMM_OUTPUT_GS_BETA_NZ
400 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
401 	"                                            \n\t"
402 	"                                            \n\t"
403 	SGEMM_INPUT_GS_BETA_NZ
404 	"vfmadd213ps      %%ymm13, %%ymm3,  %%ymm0   \n\t"
405 	SGEMM_OUTPUT_GS_BETA_NZ
406 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
407 	"                                            \n\t"
408 	"                                            \n\t"
409 	SGEMM_INPUT_GS_BETA_NZ
410 	"vfmadd213ps      %%ymm15, %%ymm3,  %%ymm0   \n\t"
411 	SGEMM_OUTPUT_GS_BETA_NZ
412 	//"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
413 	"                                            \n\t"
414 	"                                            \n\t"
415 	"                                            \n\t"
416 	"jmp    .SDONE                               \n\t" // jump to end.
417 	"                                            \n\t"
418 	"                                            \n\t"
419 	"                                            \n\t"
420 	".SROWSTORED:                                \n\t"
421 	"                                            \n\t"
422 	"                                            \n\t"
423 	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm4   \n\t"
424 	"vmovups          %%ymm4,  (%%rcx)           \n\t"
425 	"addq      %%rdi, %%rcx                      \n\t"
426 	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm5   \n\t"
427 	"vmovups          %%ymm5,  (%%rdx)           \n\t"
428 	"addq      %%rdi, %%rdx                      \n\t"
429 	"                                            \n\t"
430 	"                                            \n\t"
431 	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm6   \n\t"
432 	"vmovups          %%ymm6,  (%%rcx)           \n\t"
433 	"addq      %%rdi, %%rcx                      \n\t"
434 	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm7   \n\t"
435 	"vmovups          %%ymm7,  (%%rdx)           \n\t"
436 	"addq      %%rdi, %%rdx                      \n\t"
437 	"                                            \n\t"
438 	"                                            \n\t"
439 	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm8   \n\t"
440 	"vmovups          %%ymm8,  (%%rcx)           \n\t"
441 	"addq      %%rdi, %%rcx                      \n\t"
442 	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm9   \n\t"
443 	"vmovups          %%ymm9,  (%%rdx)           \n\t"
444 	"addq      %%rdi, %%rdx                      \n\t"
445 	"                                            \n\t"
446 	"                                            \n\t"
447 	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm10  \n\t"
448 	"vmovups          %%ymm10, (%%rcx)           \n\t"
449 	"addq      %%rdi, %%rcx                      \n\t"
450 	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm11  \n\t"
451 	"vmovups          %%ymm11, (%%rdx)           \n\t"
452 	"addq      %%rdi, %%rdx                      \n\t"
453 	"                                            \n\t"
454 	"                                            \n\t"
455 	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm12  \n\t"
456 	"vmovups          %%ymm12, (%%rcx)           \n\t"
457 	"addq      %%rdi, %%rcx                      \n\t"
458 	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm13  \n\t"
459 	"vmovups          %%ymm13, (%%rdx)           \n\t"
460 	"addq      %%rdi, %%rdx                      \n\t"
461 	"                                            \n\t"
462 	"                                            \n\t"
463 	"vfmadd231ps      (%%rcx), %%ymm3,  %%ymm14  \n\t"
464 	"vmovups          %%ymm14, (%%rcx)           \n\t"
465 	//"addq      %%rdi, %%rcx                      \n\t"
466 	"vfmadd231ps      (%%rdx), %%ymm3,  %%ymm15  \n\t"
467 	"vmovups          %%ymm15, (%%rdx)           \n\t"
468 	//"addq      %%rdi, %%rdx                      \n\t"
469 	"                                            \n\t"
470 	"                                            \n\t"
471 	"                                            \n\t"
472 	"jmp    .SDONE                               \n\t" // jump to end.
473 	"                                            \n\t"
474 	"                                            \n\t"
475 	"                                            \n\t"
476 	".SBETAZERO:                                 \n\t"
477     "                                            \n\t"
478     "cmpq       $4, %%rsi                        \n\t" // set ZF if (4*cs_c) == 4.
479 	"jz      .SROWSTORBZ                         \n\t" // jump to row storage case
480 	"                                            \n\t"
481 	"                                            \n\t"
482 	"                                            \n\t"
483 	".SGENSTORBZ:                                \n\t"
484 	"                                            \n\t"
485 	"                                            \n\t"
486 	"vmovaps           %%ymm4,  %%ymm0           \n\t"
487 	SGEMM_OUTPUT_GS_BETA_NZ
488 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
489 	"                                            \n\t"
490 	"                                            \n\t"
491 	"vmovaps           %%ymm6,  %%ymm0           \n\t"
492 	SGEMM_OUTPUT_GS_BETA_NZ
493 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
494 	"                                            \n\t"
495 	"                                            \n\t"
496 	"vmovaps           %%ymm8,  %%ymm0           \n\t"
497 	SGEMM_OUTPUT_GS_BETA_NZ
498 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
499 	"                                            \n\t"
500 	"                                            \n\t"
501 	"vmovaps           %%ymm10, %%ymm0           \n\t"
502 	SGEMM_OUTPUT_GS_BETA_NZ
503 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
504 	"                                            \n\t"
505 	"                                            \n\t"
506 	"vmovaps           %%ymm12, %%ymm0           \n\t"
507 	SGEMM_OUTPUT_GS_BETA_NZ
508 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
509 	"                                            \n\t"
510 	"                                            \n\t"
511 	"vmovaps           %%ymm14, %%ymm0           \n\t"
512 	SGEMM_OUTPUT_GS_BETA_NZ
513 	//"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
514 	"                                            \n\t"
515 	"                                            \n\t"
516 	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 8*cs_c
517 	"                                            \n\t"
518 	"                                            \n\t"
519 	"vmovaps           %%ymm5,  %%ymm0           \n\t"
520 	SGEMM_OUTPUT_GS_BETA_NZ
521 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
522 	"                                            \n\t"
523 	"                                            \n\t"
524 	"vmovaps           %%ymm7,  %%ymm0           \n\t"
525 	SGEMM_OUTPUT_GS_BETA_NZ
526 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
527 	"                                            \n\t"
528 	"                                            \n\t"
529 	"vmovaps           %%ymm9,  %%ymm0           \n\t"
530 	SGEMM_OUTPUT_GS_BETA_NZ
531 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
532 	"                                            \n\t"
533 	"                                            \n\t"
534 	"vmovaps           %%ymm11, %%ymm0           \n\t"
535 	SGEMM_OUTPUT_GS_BETA_NZ
536 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
537 	"                                            \n\t"
538 	"                                            \n\t"
539 	"vmovaps           %%ymm13, %%ymm0           \n\t"
540 	SGEMM_OUTPUT_GS_BETA_NZ
541 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
542 	"                                            \n\t"
543 	"                                            \n\t"
544 	"vmovaps           %%ymm15, %%ymm0           \n\t"
545 	SGEMM_OUTPUT_GS_BETA_NZ
546 	//"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
547 	"                                            \n\t"
548 	"                                            \n\t"
549 	"                                            \n\t"
550 	"jmp    .SDONE                               \n\t" // jump to end.
551 	"                                            \n\t"
552 	"                                            \n\t"
553 	"                                            \n\t"
554 	".SROWSTORBZ:                                \n\t"
555 	"                                            \n\t"
556 	"                                            \n\t"
557 	"vmovups          %%ymm4,  (%%rcx)           \n\t"
558 	"addq      %%rdi, %%rcx                      \n\t"
559 	"vmovups          %%ymm5,  (%%rdx)           \n\t"
560 	"addq      %%rdi, %%rdx                      \n\t"
561 	"                                            \n\t"
562 	"vmovups          %%ymm6,  (%%rcx)           \n\t"
563 	"addq      %%rdi, %%rcx                      \n\t"
564 	"vmovups          %%ymm7,  (%%rdx)           \n\t"
565 	"addq      %%rdi, %%rdx                      \n\t"
566 	"                                            \n\t"
567 	"                                            \n\t"
568 	"vmovups          %%ymm8,  (%%rcx)           \n\t"
569 	"addq      %%rdi, %%rcx                      \n\t"
570 	"vmovups          %%ymm9,  (%%rdx)           \n\t"
571 	"addq      %%rdi, %%rdx                      \n\t"
572 	"                                            \n\t"
573 	"                                            \n\t"
574 	"vmovups          %%ymm10, (%%rcx)           \n\t"
575 	"addq      %%rdi, %%rcx                      \n\t"
576 	"vmovups          %%ymm11, (%%rdx)           \n\t"
577 	"addq      %%rdi, %%rdx                      \n\t"
578 	"                                            \n\t"
579 	"                                            \n\t"
580 	"vmovups          %%ymm12, (%%rcx)           \n\t"
581 	"addq      %%rdi, %%rcx                      \n\t"
582 	"vmovups          %%ymm13, (%%rdx)           \n\t"
583 	"addq      %%rdi, %%rdx                      \n\t"
584 	"                                            \n\t"
585 	"                                            \n\t"
586 	"vmovups          %%ymm14, (%%rcx)           \n\t"
587 	//"addq      %%rdi, %%rcx                      \n\t"
588 	"vmovups          %%ymm15, (%%rdx)           \n\t"
589 	//"addq      %%rdi, %%rdx                      \n\t"
590 	"                                            \n\t"
591 	"                                            \n\t"
592 	"                                            \n\t"
593 	"                                            \n\t"
594 	"                                            \n\t"
595 	"                                            \n\t"
596 	"                                            \n\t"
597 	".SDONE:                                     \n\t"
598 	"                                            \n\t"
599 
600 	: // output operands (none)
601 	: // input operands
602 	  "m" (k_iter), // 0
603 	  "m" (k_left), // 1
604 	  "m" (a),      // 2
605 	  "m" (b),      // 3
606 	  "m" (alpha),  // 4
607 	  "m" (beta),   // 5
608 	  "m" (c),      // 6
609 	  "m" (rs_c),   // 7
610 	  "m" (cs_c)/*,   // 8
611 	  "m" (b_next), // 9
612 	  "m" (a_next)*/  // 10
613 	: // register clobber list
614 	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
615 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
616 	  "xmm0", "xmm1", "xmm2", "xmm3",
617 	  "xmm4", "xmm5", "xmm6", "xmm7",
618 	  "xmm8", "xmm9", "xmm10", "xmm11",
619 	  "xmm12", "xmm13", "xmm14", "xmm15",
620 	  "memory"
621 	);
622 }
623 
624 
/*
 * DGEMM_INPUT_GS_BETA_NZ
 *
 * Inline-assembly fragment (adjacent string literals meant to be pasted
 * into an extended-asm template, hence the doubled '%%' before register
 * names) that gathers four strided doubles into ymm0.
 *
 * Register contract, as set up by bli_dgemm_asm_6x8 before it expands
 * this macro:
 *   rcx  - address of the first element to load
 *   rsi  - byte stride between consecutive elements (cs_c * 8)
 *   r13  - 3 * rsi
 *
 * Result:
 *   ymm0 = { [rcx], [rcx + rsi], [rcx + 2*rsi], [rcx + 3*rsi] }
 *
 * Clobbers: ymm0, ymm1. No general-purpose register is modified.
 *
 * Each expansion covers four columns only. The kernel handles the
 * remaining four columns of a row by expanding the macro again with
 * rcx re-pointed at c + 4*cs_c, so no wider gather is needed here.
 */
#define DGEMM_INPUT_GS_BETA_NZ \
	"vmovlpd    (%%rcx        ),  %%xmm0,  %%xmm0  \n\t" /* xmm0 low  half <- element 0 */ \
	"vmovhpd    (%%rcx,%%rsi,1),  %%xmm0,  %%xmm0  \n\t" /* xmm0 high half <- element 1 */ \
	"vmovlpd    (%%rcx,%%rsi,2),  %%xmm1,  %%xmm1  \n\t" /* xmm1 low  half <- element 2 */ \
	"vmovhpd    (%%rcx,%%r13  ),  %%xmm1,  %%xmm1  \n\t" /* xmm1 high half <- element 3 */ \
	"vperm2f128 $0x20,   %%ymm1,  %%ymm0,  %%ymm0  \n\t" /* ymm0 <- { xmm0, xmm1 }      */
636 
/*
 * DGEMM_OUTPUT_GS_BETA_NZ
 *
 * Inline-assembly fragment (adjacent string literals meant to be pasted
 * into an extended-asm template, hence the doubled '%%' before register
 * names) that scatters the four doubles held in ymm0 to strided memory.
 *
 * Register contract, as set up by bli_dgemm_asm_6x8 before it expands
 * this macro:
 *   ymm0 - the four values to store
 *   rcx  - address of the first destination element
 *   rsi  - byte stride between consecutive elements (cs_c * 8)
 *   r13  - 3 * rsi
 *
 * Effect:
 *   [rcx], [rcx + rsi], [rcx + 2*rsi], [rcx + 3*rsi] <- ymm0[0..3]
 *
 * Clobbers: xmm1 (receives the upper 128 bits of ymm0). ymm0 and all
 * general-purpose registers are left unchanged.
 *
 * Despite the _BETA_NZ suffix, the kernel also expands this macro on its
 * beta == 0 general-stride path, after copying an accumulator into ymm0.
 */
#define DGEMM_OUTPUT_GS_BETA_NZ \
	"vextractf128  $1, %%ymm0,  %%xmm1           \n\t" /* xmm1 <- upper half of ymm0 */ \
	"vmovlpd           %%xmm0,  (%%rcx        )  \n\t" /* element 0 <- xmm0 low      */ \
	"vmovhpd           %%xmm0,  (%%rcx,%%rsi  )  \n\t" /* element 1 <- xmm0 high     */ \
	"vmovlpd           %%xmm1,  (%%rcx,%%rsi,2)  \n\t" /* element 2 <- xmm1 low      */ \
	"vmovhpd           %%xmm1,  (%%rcx,%%r13  )  \n\t" /* element 3 <- xmm1 high     */
648 
bli_dgemm_asm_6x8(dim_t k,double * restrict alpha,double * restrict a,double * restrict b,double * restrict beta,double * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)649 void bli_dgemm_asm_6x8
650      (
651        dim_t               k,
652        double*    restrict alpha,
653        double*    restrict a,
654        double*    restrict b,
655        double*    restrict beta,
656        double*    restrict c, inc_t rs_c, inc_t cs_c,
657        auxinfo_t* restrict data,
658        cntx_t*    restrict cntx
659      )
660 {
661 	//void*   a_next = bli_auxinfo_next_a( data );
662     //void*   b_next = bli_auxinfo_next_b( data );
663     void* c_prefetch = data->c_prefetch;
664 
665     uint64_t   k_iter = k / 4;
666     uint64_t   k_left = k % 4;
667 
668 	__asm__ volatile
669 	(
670 	"                                            \n\t"
671 	"vzeroall                                    \n\t" // zero all xmm/ymm registers.
672 	"                                            \n\t"
673 	"                                            \n\t"
674 	"movq                %2, %%rax               \n\t" // load address of a.
675 	"movq                %3, %%rbx               \n\t" // load address of b.
676 	//"movq                %9, %%r15               \n\t" // load address of b_next.
677 	"                                            \n\t"
678 	"addq           $32 * 4, %%rbx               \n\t"
679 	"                                            \n\t" // initialize loop by pre-loading
680 	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
681 	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
682 	"                                            \n\t"
683     "movq                %6, %%rcx               \n\t" // load address of c
684     "movq                %9, %%r8                \n\t" // load address of c_prefetch
685 	"movq                %7, %%rdi               \n\t" // load rs_c
686 	"leaq        (,%%rdi,8), %%rdi               \n\t" // rs_c *= sizeof(double)
687 	"                                            \n\t"
688 	"leaq   (%%rdi,%%rdi,2), %%r13               \n\t" // r13 = 3*rs_c;
689 	"leaq   (%%r8,%%r13,1), %%rdx                \n\t" // rdx = c_prefetch + 3*rs_c;
690 	"prefetcht0   7 * 8(%%r8)                    \n\t" // prefetch c + 0*rs_c
691 	"prefetcht0   7 * 8(%%r8,%%rdi)              \n\t" // prefetch c + 1*rs_c
692 	"prefetcht0   7 * 8(%%r8,%%rdi,2)            \n\t" // prefetch c + 2*rs_c
693 	"prefetcht0   7 * 8(%%rdx)                   \n\t" // prefetch c + 3*rs_c
694 	"prefetcht0   7 * 8(%%rdx,%%rdi)             \n\t" // prefetch c + 4*rs_c
695 	"prefetcht0   7 * 8(%%rdx,%%rdi,2)           \n\t" // prefetch c + 5*rs_c
696 	"                                            \n\t"
697 	"                                            \n\t"
698 	"                                            \n\t"
699 	"                                            \n\t"
700 	"movq      %0, %%rsi                         \n\t" // i = k_iter;
701 	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
702 	"je     .DCONSIDKLEFT                        \n\t" // if i == 0, jump to code that
703 	"                                            \n\t" // contains the k_left loop.
704 	"                                            \n\t"
705 	"                                            \n\t"
706 	".DLOOPKITER:                                \n\t" // MAIN LOOP
707 	"                                            \n\t"
708 	"                                            \n\t"
709 	"                                            \n\t" // iteration 0
710 	"prefetcht0   64 * 8(%%rax)                  \n\t"
711 	"                                            \n\t"
712 	"vbroadcastsd       0 *  8(%%rax), %%ymm2    \n\t"
713 	"vbroadcastsd       1 *  8(%%rax), %%ymm3    \n\t"
714 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
715 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
716 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
717 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
718 	"                                            \n\t"
719 	"vbroadcastsd       2 *  8(%%rax), %%ymm2    \n\t"
720 	"vbroadcastsd       3 *  8(%%rax), %%ymm3    \n\t"
721 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
722 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
723 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
724 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
725 	"                                            \n\t"
726 	"vbroadcastsd       4 *  8(%%rax), %%ymm2    \n\t"
727 	"vbroadcastsd       5 *  8(%%rax), %%ymm3    \n\t"
728 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
729 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
730 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
731 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
732 	"                                            \n\t"
733 	"vmovaps           -2 * 32(%%rbx), %%ymm0    \n\t"
734 	"vmovaps           -1 * 32(%%rbx), %%ymm1    \n\t"
735 	"                                            \n\t"
736 	"                                            \n\t" // iteration 1
737     "prefetcht0   72 * 8(%%rax)                  \n\t"
738     "                                            \n\t"
739 	"vbroadcastsd       6 *  8(%%rax), %%ymm2    \n\t"
740 	"vbroadcastsd       7 *  8(%%rax), %%ymm3    \n\t"
741 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
742 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
743 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
744 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
745 	"                                            \n\t"
746 	"vbroadcastsd       8 *  8(%%rax), %%ymm2    \n\t"
747 	"vbroadcastsd       9 *  8(%%rax), %%ymm3    \n\t"
748 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
749 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
750 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
751 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
752 	"                                            \n\t"
753 	"vbroadcastsd      10 *  8(%%rax), %%ymm2    \n\t"
754 	"vbroadcastsd      11 *  8(%%rax), %%ymm3    \n\t"
755 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
756 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
757 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
758 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
759 	"                                            \n\t"
760 	"vmovaps            0 * 32(%%rbx), %%ymm0    \n\t"
761 	"vmovaps            1 * 32(%%rbx), %%ymm1    \n\t"
762 	"                                            \n\t"
763 	"                                            \n\t" // iteration 2
764 	"prefetcht0   80 * 8(%%rax)                  \n\t"
765 	"                                            \n\t"
766 	"vbroadcastsd      12 *  8(%%rax), %%ymm2    \n\t"
767 	"vbroadcastsd      13 *  8(%%rax), %%ymm3    \n\t"
768 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
769 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
770 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
771 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
772 	"                                            \n\t"
773 	"vbroadcastsd      14 *  8(%%rax), %%ymm2    \n\t"
774 	"vbroadcastsd      15 *  8(%%rax), %%ymm3    \n\t"
775 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
776 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
777 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
778 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
779 	"                                            \n\t"
780 	"vbroadcastsd      16 *  8(%%rax), %%ymm2    \n\t"
781 	"vbroadcastsd      17 *  8(%%rax), %%ymm3    \n\t"
782 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
783 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
784 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
785 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
786 	"                                            \n\t"
787 	"vmovaps            2 * 32(%%rbx), %%ymm0    \n\t"
788 	"vmovaps            3 * 32(%%rbx), %%ymm1    \n\t"
789 	"                                            \n\t"
790 	"                                            \n\t" // iteration 3
791 	"vbroadcastsd      18 *  8(%%rax), %%ymm2    \n\t"
792 	"vbroadcastsd      19 *  8(%%rax), %%ymm3    \n\t"
793 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
794 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
795 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
796 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
797 	"                                            \n\t"
798 	"vbroadcastsd      20 *  8(%%rax), %%ymm2    \n\t"
799 	"vbroadcastsd      21 *  8(%%rax), %%ymm3    \n\t"
800 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
801 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
802 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
803 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
804 	"                                            \n\t"
805 	"vbroadcastsd      22 *  8(%%rax), %%ymm2    \n\t"
806 	"vbroadcastsd      23 *  8(%%rax), %%ymm3    \n\t"
807 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
808 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
809 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
810 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
811 	"                                            \n\t"
812 	"addq           $4 * 6 * 8, %%rax            \n\t" // a += 4*6 (unroll x mr)
813 	"addq           $4 * 8 * 8, %%rbx            \n\t" // b += 4*8 (unroll x nr)
814 	"                                            \n\t"
815 	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
816 	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
817 	"                                            \n\t"
818 	"                                            \n\t"
819 	"decq   %%rsi                                \n\t" // i -= 1;
820 	"jne    .DLOOPKITER                          \n\t" // iterate again if i != 0.
821 	"                                            \n\t"
822 	"                                            \n\t"
823 	"                                            \n\t"
824 	"                                            \n\t"
825 	"                                            \n\t"
826 	"                                            \n\t"
827 	".DCONSIDKLEFT:                              \n\t"
828 	"                                            \n\t"
829 	"movq      %1, %%rsi                         \n\t" // i = k_left;
830 	"testq  %%rsi, %%rsi                         \n\t" // check i via logical AND.
831 	"je     .DPOSTACCUM                          \n\t" // if i == 0, we're done; jump to end.
832 	"                                            \n\t" // else, we prepare to enter k_left loop.
833 	"                                            \n\t"
834 	"                                            \n\t"
835 	".DLOOPKLEFT:                                \n\t" // EDGE LOOP
836 	"                                            \n\t"
837 	"prefetcht0   64 * 8(%%rax)                  \n\t"
838 	"                                            \n\t"
839 	"vbroadcastsd       0 *  8(%%rax), %%ymm2    \n\t"
840 	"vbroadcastsd       1 *  8(%%rax), %%ymm3    \n\t"
841 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm4    \n\t"
842 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm5    \n\t"
843 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm6    \n\t"
844 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm7    \n\t"
845 	"                                            \n\t"
846 	"vbroadcastsd       2 *  8(%%rax), %%ymm2    \n\t"
847 	"vbroadcastsd       3 *  8(%%rax), %%ymm3    \n\t"
848 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm8    \n\t"
849 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm9    \n\t"
850 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm10   \n\t"
851 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm11   \n\t"
852 	"                                            \n\t"
853 	"vbroadcastsd       4 *  8(%%rax), %%ymm2    \n\t"
854 	"vbroadcastsd       5 *  8(%%rax), %%ymm3    \n\t"
855 	"vfmadd231pd       %%ymm0, %%ymm2, %%ymm12   \n\t"
856 	"vfmadd231pd       %%ymm1, %%ymm2, %%ymm13   \n\t"
857 	"vfmadd231pd       %%ymm0, %%ymm3, %%ymm14   \n\t"
858 	"vfmadd231pd       %%ymm1, %%ymm3, %%ymm15   \n\t"
859 	"                                            \n\t"
860 	"addq           $1 * 6 * 8, %%rax            \n\t" // a += 1*6 (unroll x mr)
861 	"addq           $1 * 8 * 8, %%rbx            \n\t" // b += 1*8 (unroll x nr)
862 	"                                            \n\t"
863 	"vmovaps           -4 * 32(%%rbx), %%ymm0    \n\t"
864 	"vmovaps           -3 * 32(%%rbx), %%ymm1    \n\t"
865 	"                                            \n\t"
866 	"                                            \n\t"
867 	"decq   %%rsi                                \n\t" // i -= 1;
868 	"jne    .DLOOPKLEFT                          \n\t" // iterate again if i != 0.
869 	"                                            \n\t"
870 	"                                            \n\t"
871 	"                                            \n\t"
872 	".DPOSTACCUM:                                \n\t"
873 	"                                            \n\t"
874 	"                                            \n\t"
875 	"                                            \n\t"
876 	"                                            \n\t"
877 	"movq         %4, %%rax                      \n\t" // load address of alpha
878 	"movq         %5, %%rbx                      \n\t" // load address of beta
879 	"vbroadcastsd    (%%rax), %%ymm0             \n\t" // load alpha and duplicate
880 	"vbroadcastsd    (%%rbx), %%ymm3             \n\t" // load beta and duplicate
881 	"                                            \n\t"
882 	"vmulpd           %%ymm0,  %%ymm4,  %%ymm4   \n\t" // scale by alpha
883 	"vmulpd           %%ymm0,  %%ymm5,  %%ymm5   \n\t"
884 	"vmulpd           %%ymm0,  %%ymm6,  %%ymm6   \n\t"
885 	"vmulpd           %%ymm0,  %%ymm7,  %%ymm7   \n\t"
886 	"vmulpd           %%ymm0,  %%ymm8,  %%ymm8   \n\t"
887 	"vmulpd           %%ymm0,  %%ymm9,  %%ymm9   \n\t"
888 	"vmulpd           %%ymm0,  %%ymm10, %%ymm10  \n\t"
889 	"vmulpd           %%ymm0,  %%ymm11, %%ymm11  \n\t"
890 	"vmulpd           %%ymm0,  %%ymm12, %%ymm12  \n\t"
891 	"vmulpd           %%ymm0,  %%ymm13, %%ymm13  \n\t"
892 	"vmulpd           %%ymm0,  %%ymm14, %%ymm14  \n\t"
893 	"vmulpd           %%ymm0,  %%ymm15, %%ymm15  \n\t"
894 	"                                            \n\t"
895 	"                                            \n\t"
896 	"                                            \n\t"
897 	"                                            \n\t"
898 	"                                            \n\t"
899 	"                                            \n\t"
900 	"movq                %8, %%rsi               \n\t" // load cs_c
901 	"leaq        (,%%rsi,8), %%rsi               \n\t" // rsi = cs_c * sizeof(double)
902 	"                                            \n\t"
903 	"leaq   (%%rcx,%%rsi,4), %%rdx               \n\t" // load address of c +  4*cs_c;
904 	"                                            \n\t"
905 	"leaq   (%%rsi,%%rsi,2), %%r13               \n\t" // r13 = 3*cs_c;
906 	//"leaq   (%%rsi,%%rsi,4), %%r15               \n\t" // r15 = 5*cs_c;
907 	//"leaq   (%%r13,%%rsi,4), %%r10               \n\t" // r10 = 7*cs_c;
908 	"                                            \n\t"
909 	"                                            \n\t"
910 	"                                            \n\t" // now avoid loading C if beta == 0
911 	"                                            \n\t"
912 	"vxorpd    %%ymm0,  %%ymm0,  %%ymm0          \n\t" // set ymm0 to zero.
913 	"vucomisd  %%xmm0,  %%xmm3                   \n\t" // set ZF if beta == 0.
914 	"je      .DBETAZERO                          \n\t" // if ZF = 1, jump to beta == 0 case
915 	"                                            \n\t"
916 	"                                            \n\t"
917     "cmpq       $8, %%rsi                        \n\t" // set ZF if (8*cs_c) == 8.
918 	"jz      .DROWSTORED                         \n\t" // jump to row storage case
919 	"                                            \n\t"
920 	"                                            \n\t"
921 	"                                            \n\t"
922 	".DGENSTORED:                                \n\t"
923 	"                                            \n\t"
924 	"                                            \n\t"
925 	DGEMM_INPUT_GS_BETA_NZ
926 	"vfmadd213pd      %%ymm4,  %%ymm3,  %%ymm0   \n\t"
927 	DGEMM_OUTPUT_GS_BETA_NZ
928 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
929 	"                                            \n\t"
930 	"                                            \n\t"
931 	DGEMM_INPUT_GS_BETA_NZ
932 	"vfmadd213pd      %%ymm6,  %%ymm3,  %%ymm0   \n\t"
933 	DGEMM_OUTPUT_GS_BETA_NZ
934 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
935 	"                                            \n\t"
936 	"                                            \n\t"
937 	DGEMM_INPUT_GS_BETA_NZ
938 	"vfmadd213pd      %%ymm8,  %%ymm3,  %%ymm0   \n\t"
939 	DGEMM_OUTPUT_GS_BETA_NZ
940 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
941 	"                                            \n\t"
942 	"                                            \n\t"
943 	DGEMM_INPUT_GS_BETA_NZ
944 	"vfmadd213pd      %%ymm10, %%ymm3,  %%ymm0   \n\t"
945 	DGEMM_OUTPUT_GS_BETA_NZ
946 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
947 	"                                            \n\t"
948 	"                                            \n\t"
949 	DGEMM_INPUT_GS_BETA_NZ
950 	"vfmadd213pd      %%ymm12, %%ymm3,  %%ymm0   \n\t"
951 	DGEMM_OUTPUT_GS_BETA_NZ
952 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
953 	"                                            \n\t"
954 	"                                            \n\t"
955 	DGEMM_INPUT_GS_BETA_NZ
956 	"vfmadd213pd      %%ymm14, %%ymm3,  %%ymm0   \n\t"
957 	DGEMM_OUTPUT_GS_BETA_NZ
958 	"                                            \n\t"
959 	"                                            \n\t"
960 	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 4*cs_c
961 	"                                            \n\t"
962 	"                                            \n\t"
963 	DGEMM_INPUT_GS_BETA_NZ
964 	"vfmadd213pd      %%ymm5,  %%ymm3,  %%ymm0   \n\t"
965 	DGEMM_OUTPUT_GS_BETA_NZ
966 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
967 	"                                            \n\t"
968 	"                                            \n\t"
969 	DGEMM_INPUT_GS_BETA_NZ
970 	"vfmadd213pd      %%ymm7,  %%ymm3,  %%ymm0   \n\t"
971 	DGEMM_OUTPUT_GS_BETA_NZ
972 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
973 	"                                            \n\t"
974 	"                                            \n\t"
975 	DGEMM_INPUT_GS_BETA_NZ
976 	"vfmadd213pd      %%ymm9,  %%ymm3,  %%ymm0   \n\t"
977 	DGEMM_OUTPUT_GS_BETA_NZ
978 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
979 	"                                            \n\t"
980 	"                                            \n\t"
981 	DGEMM_INPUT_GS_BETA_NZ
982 	"vfmadd213pd      %%ymm11, %%ymm3,  %%ymm0   \n\t"
983 	DGEMM_OUTPUT_GS_BETA_NZ
984 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
985 	"                                            \n\t"
986 	"                                            \n\t"
987 	DGEMM_INPUT_GS_BETA_NZ
988 	"vfmadd213pd      %%ymm13, %%ymm3,  %%ymm0   \n\t"
989 	DGEMM_OUTPUT_GS_BETA_NZ
990 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
991 	"                                            \n\t"
992 	"                                            \n\t"
993 	DGEMM_INPUT_GS_BETA_NZ
994 	"vfmadd213pd      %%ymm15, %%ymm3,  %%ymm0   \n\t"
995 	DGEMM_OUTPUT_GS_BETA_NZ
996 	"                                            \n\t"
997 	"                                            \n\t"
998 	"                                            \n\t"
999 	"jmp    .DDONE                               \n\t" // jump to end.
1000 	"                                            \n\t"
1001 	"                                            \n\t"
1002 	"                                            \n\t"
1003 	".DROWSTORED:                                \n\t"
1004 	"                                            \n\t"
1005 	"                                            \n\t"
1006 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm4    \n\t"
1007 	"vmovups          %%ymm4,  (%%rcx)           \n\t"
1008 	"addq      %%rdi, %%rcx                      \n\t"
1009 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm5    \n\t"
1010 	"vmovups          %%ymm5,  (%%rdx)           \n\t"
1011 	"addq      %%rdi, %%rdx                      \n\t"
1012 	"                                            \n\t"
1013 	"                                            \n\t"
1014 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm6    \n\t"
1015 	"vmovups          %%ymm6,  (%%rcx)           \n\t"
1016 	"addq      %%rdi, %%rcx                      \n\t"
1017 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm7    \n\t"
1018 	"vmovups          %%ymm7,  (%%rdx)           \n\t"
1019 	"addq      %%rdi, %%rdx                      \n\t"
1020 	"                                            \n\t"
1021 	"                                            \n\t"
1022 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm8    \n\t"
1023 	"vmovups          %%ymm8,  (%%rcx)           \n\t"
1024 	"addq      %%rdi, %%rcx                      \n\t"
1025 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm9    \n\t"
1026 	"vmovups          %%ymm9,  (%%rdx)           \n\t"
1027 	"addq      %%rdi, %%rdx                      \n\t"
1028 	"                                            \n\t"
1029 	"                                            \n\t"
1030 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm10   \n\t"
1031 	"vmovups          %%ymm10, (%%rcx)           \n\t"
1032 	"addq      %%rdi, %%rcx                      \n\t"
1033 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm11   \n\t"
1034 	"vmovups          %%ymm11, (%%rdx)           \n\t"
1035 	"addq      %%rdi, %%rdx                      \n\t"
1036 	"                                            \n\t"
1037 	"                                            \n\t"
1038 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm12   \n\t"
1039 	"vmovups          %%ymm12, (%%rcx)           \n\t"
1040 	"addq      %%rdi, %%rcx                      \n\t"
1041 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm13   \n\t"
1042 	"vmovups          %%ymm13, (%%rdx)           \n\t"
1043 	"addq      %%rdi, %%rdx                      \n\t"
1044 	"                                            \n\t"
1045 	"                                            \n\t"
1046 	"vfmadd231pd      (%%rcx), %%ymm3, %%ymm14   \n\t"
1047 	"vmovups          %%ymm14, (%%rcx)           \n\t"
1048 	//"addq      %%rdi, %%rcx                      \n\t"
1049 	"vfmadd231pd      (%%rdx), %%ymm3, %%ymm15   \n\t"
1050 	"vmovups          %%ymm15, (%%rdx)           \n\t"
1051 	//"addq      %%rdi, %%rdx                      \n\t"
1052 	"                                            \n\t"
1053 	"                                            \n\t"
1054 	"                                            \n\t"
1055 	"jmp    .DDONE                               \n\t" // jump to end.
1056 	"                                            \n\t"
1057 	"                                            \n\t"
1058 	"                                            \n\t"
1059 	".DBETAZERO:                                 \n\t"
1060     "                                            \n\t"
1061     "cmpq       $8, %%rsi                        \n\t" // set ZF if (8*cs_c) == 8.
1062 	"jz      .DROWSTORBZ                         \n\t" // jump to row storage case
1063 	"                                            \n\t"
1064 	"                                            \n\t"
1065 	"                                            \n\t"
1066 	".DGENSTORBZ:                                \n\t"
1067 	"                                            \n\t"
1068 	"                                            \n\t"
1069 	"vmovaps           %%ymm4,  %%ymm0           \n\t"
1070 	DGEMM_OUTPUT_GS_BETA_NZ
1071 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1072 	"                                            \n\t"
1073 	"                                            \n\t"
1074 	"vmovaps           %%ymm6,  %%ymm0           \n\t"
1075 	DGEMM_OUTPUT_GS_BETA_NZ
1076 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1077 	"                                            \n\t"
1078 	"                                            \n\t"
1079 	"vmovaps           %%ymm8,  %%ymm0           \n\t"
1080 	DGEMM_OUTPUT_GS_BETA_NZ
1081 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1082 	"                                            \n\t"
1083 	"                                            \n\t"
1084 	"vmovaps           %%ymm10, %%ymm0           \n\t"
1085 	DGEMM_OUTPUT_GS_BETA_NZ
1086 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1087 	"                                            \n\t"
1088 	"                                            \n\t"
1089 	"vmovaps           %%ymm12, %%ymm0           \n\t"
1090 	DGEMM_OUTPUT_GS_BETA_NZ
1091 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1092 	"                                            \n\t"
1093 	"                                            \n\t"
1094 	"vmovaps           %%ymm14, %%ymm0           \n\t"
1095 	DGEMM_OUTPUT_GS_BETA_NZ
1096 	"                                            \n\t"
1097 	"                                            \n\t"
1098 	"movq      %%rdx, %%rcx                      \n\t" // rcx = c + 4*cs_c
1099 	"                                            \n\t"
1100 	"                                            \n\t"
1101 	"vmovaps           %%ymm5,  %%ymm0           \n\t"
1102 	DGEMM_OUTPUT_GS_BETA_NZ
1103 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1104 	"                                            \n\t"
1105 	"                                            \n\t"
1106 	"vmovaps           %%ymm7,  %%ymm0           \n\t"
1107 	DGEMM_OUTPUT_GS_BETA_NZ
1108 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1109 	"                                            \n\t"
1110 	"                                            \n\t"
1111 	"vmovaps           %%ymm9,  %%ymm0           \n\t"
1112 	DGEMM_OUTPUT_GS_BETA_NZ
1113 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1114 	"                                            \n\t"
1115 	"                                            \n\t"
1116 	"vmovaps           %%ymm11, %%ymm0           \n\t"
1117 	DGEMM_OUTPUT_GS_BETA_NZ
1118 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1119 	"                                            \n\t"
1120 	"                                            \n\t"
1121 	"vmovaps           %%ymm13, %%ymm0           \n\t"
1122 	DGEMM_OUTPUT_GS_BETA_NZ
1123 	"addq      %%rdi, %%rcx                      \n\t" // c += rs_c;
1124 	"                                            \n\t"
1125 	"                                            \n\t"
1126 	"vmovaps           %%ymm15, %%ymm0           \n\t"
1127 	DGEMM_OUTPUT_GS_BETA_NZ
1128 	"                                            \n\t"
1129 	"                                            \n\t"
1130 	"                                            \n\t"
1131 	"jmp    .DDONE                               \n\t" // jump to end.
1132 	"                                            \n\t"
1133 	"                                            \n\t"
1134 	"                                            \n\t"
1135 	".DROWSTORBZ:                                \n\t"
1136 	"                                            \n\t"
1137 	"                                            \n\t"
1138 	"vmovups          %%ymm4,  (%%rcx)           \n\t"
1139 	"addq      %%rdi, %%rcx                      \n\t"
1140 	"vmovups          %%ymm5,  (%%rdx)           \n\t"
1141 	"addq      %%rdi, %%rdx                      \n\t"
1142 	"                                            \n\t"
1143 	"vmovups          %%ymm6,  (%%rcx)           \n\t"
1144 	"addq      %%rdi, %%rcx                      \n\t"
1145 	"vmovups          %%ymm7,  (%%rdx)           \n\t"
1146 	"addq      %%rdi, %%rdx                      \n\t"
1147 	"                                            \n\t"
1148 	"                                            \n\t"
1149 	"vmovups          %%ymm8,  (%%rcx)           \n\t"
1150 	"addq      %%rdi, %%rcx                      \n\t"
1151 	"vmovups          %%ymm9,  (%%rdx)           \n\t"
1152 	"addq      %%rdi, %%rdx                      \n\t"
1153 	"                                            \n\t"
1154 	"                                            \n\t"
1155 	"vmovups          %%ymm10, (%%rcx)           \n\t"
1156 	"addq      %%rdi, %%rcx                      \n\t"
1157 	"vmovups          %%ymm11, (%%rdx)           \n\t"
1158 	"addq      %%rdi, %%rdx                      \n\t"
1159 	"                                            \n\t"
1160 	"                                            \n\t"
1161 	"vmovups          %%ymm12, (%%rcx)           \n\t"
1162 	"addq      %%rdi, %%rcx                      \n\t"
1163 	"vmovups          %%ymm13, (%%rdx)           \n\t"
1164 	"addq      %%rdi, %%rdx                      \n\t"
1165 	"                                            \n\t"
1166 	"                                            \n\t"
1167 	"vmovups          %%ymm14, (%%rcx)           \n\t"
1168 	//"addq      %%rdi, %%rcx                      \n\t"
1169 	"vmovups          %%ymm15, (%%rdx)           \n\t"
1170 	//"addq      %%rdi, %%rdx                      \n\t"
1171 	"                                            \n\t"
1172 	"                                            \n\t"
1173 	"                                            \n\t"
1174 	"                                            \n\t"
1175 	"                                            \n\t"
1176 	"                                            \n\t"
1177 	".DDONE:                                     \n\t"
1178 	"                                            \n\t"
1179 
1180 	: // output operands (none)
1181 	: // input operands
1182 	  "m" (k_iter), // 0
1183 	  "m" (k_left), // 1
1184 	  "m" (a),      // 2
1185 	  "m" (b),      // 3
1186 	  "m" (alpha),  // 4
1187 	  "m" (beta),   // 5
1188 	  "m" (c),      // 6
1189 	  "m" (rs_c),   // 7
1190       "m" (cs_c),   // 8
1191       "m" (c_prefetch)/*,   // 9
1192 	  "m" (b_next), // 9
1193 	  "m" (a_next)*/  // 10
1194 	: // register clobber list
1195 	  "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
1196 	  "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
1197 	  "xmm0", "xmm1", "xmm2", "xmm3",
1198 	  "xmm4", "xmm5", "xmm6", "xmm7",
1199 	  "xmm8", "xmm9", "xmm10", "xmm11",
1200 	  "xmm12", "xmm13", "xmm14", "xmm15",
1201 	  "memory"
1202 	);
1203 }
1204 
1205 #if 0
1206 
// Placeholder for a single-precision complex (scomplex) gemm micro-kernel.
//
// This definition is enclosed by the surrounding "#if 0" / "#endif" pair, so
// it is excluded from compilation. Its body holds no executable statements,
// only commented-out hints; if it were enabled as written it would return
// immediately without reading a or b and without writing to c.
//
// Parameters (types as declared below; the roles are inferred from the
// names and from the operand list of the assembly kernel earlier in this
// file -- NOTE(review): confirm against the BLIS micro-kernel interface):
//   k          - presumably the inner (shared) dimension of the product.
//   alpha      - pointer to the scalar presumably applied to the a*b product.
//   a, b       - pointers to the two input operands.
//   beta       - pointer to the scalar presumably applied to c.
//   c          - pointer to the output operand.
//   rs_c, cs_c - presumably the row and column strides of c.
//   data       - auxiliary info; the commented-out lines would query it for
//                the next a/b addresses.
//   cntx       - context pointer; not referenced anywhere in this stub.
void bli_cgemm_asm_
     (
       dim_t               k,
       scomplex*  restrict alpha,
       scomplex*  restrict a,
       scomplex*  restrict b,
       scomplex*  restrict beta,
       scomplex*  restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	// Disabled hint: fetch the addresses of the next a and b operands from
	// the auxinfo object (NOTE(review): likely intended for prefetching --
	// confirm).
	//void*   a_next = bli_auxinfo_next_a( data );
	//void*   b_next = bli_auxinfo_next_b( data );

	// Disabled hint: split k into a count of groups of four iterations
	// (k / 4) and a count of leftover iterations (k % 4).
	//dim_t   k_iter = k / 4;
	//dim_t   k_left = k % 4;

	// Intentionally empty: no kernel has been written for this datatype.
}
1226 
1227 
1228 
// Placeholder for a double-precision complex (dcomplex) gemm micro-kernel.
//
// This definition is enclosed by the surrounding "#if 0" / "#endif" pair, so
// it is excluded from compilation. Its body holds no executable statements,
// only commented-out hints; if it were enabled as written it would return
// immediately without reading a or b and without writing to c.
//
// Parameters (types as declared below; the roles are inferred from the
// names and from the operand list of the assembly kernel earlier in this
// file -- NOTE(review): confirm against the BLIS micro-kernel interface):
//   k          - presumably the inner (shared) dimension of the product.
//   alpha      - pointer to the scalar presumably applied to the a*b product.
//   a, b       - pointers to the two input operands.
//   beta       - pointer to the scalar presumably applied to c.
//   c          - pointer to the output operand.
//   rs_c, cs_c - presumably the row and column strides of c.
//   data       - auxiliary info; the commented-out lines would query it for
//                the next a/b addresses.
//   cntx       - context pointer; not referenced anywhere in this stub.
void bli_zgemm_asm_
     (
       dim_t               k,
       dcomplex*  restrict alpha,
       dcomplex*  restrict a,
       dcomplex*  restrict b,
       dcomplex*  restrict beta,
       dcomplex*  restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	// Disabled hint: fetch the addresses of the next a and b operands from
	// the auxinfo object (NOTE(review): likely intended for prefetching --
	// confirm).
	//void*   a_next = bli_auxinfo_next_a( data );
	//void*   b_next = bli_auxinfo_next_b( data );

	// Disabled hint: split k into a count of groups of four iterations
	// (k / 4) and a count of leftover iterations (k % 4).
	//dim_t   k_iter = k / 4;
	//dim_t   k_left = k % 4;

	// Intentionally empty: no kernel has been written for this datatype.
}
1248 
1249 #endif
1250