1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 /* NOTE: The micro-kernels in this file were partially inspired by portions
36 of code found in OpenBLAS 0.2.8 (http://www.openblas.net/). -FGVZ */
37
38 #include "blis.h"
39
40 void bli_sgemm_asm_8x8
41 (
42 dim_t k,
43 float* restrict alpha,
44 float* restrict a,
45 float* restrict b,
46 float* restrict beta,
47 float* restrict c, inc_t rs_c, inc_t cs_c,
48 auxinfo_t* restrict data,
49 cntx_t* restrict cntx
50 )
51 {
52 //void* a_next = bli_auxinfo_next_a( data );
53 //void* b_next = bli_auxinfo_next_b( data );
54
55 uint64_t k_iter = k / 4;
56 uint64_t k_left = k % 4;
57
58 __asm__ volatile
59 (
60 " \n\t"
61 " \n\t"
62 "movq %2, %%rax \n\t" // load address of a.
63 "movq %3, %%rbx \n\t" // load address of b.
64 //"movq %9, %%r15 \n\t" // load address of b_next.
65 " \n\t"
66 "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading
67 "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b.
68 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
69 " \n\t"
70 "movq %6, %%rcx \n\t" // load address of c
71 "movq %8, %%rdi \n\t" // load cs_c
72 "leaq (,%%rdi,4), %%rdi \n\t" // cs_c *= sizeof(float)
73 "leaq (%%rcx,%%rdi,4), %%r10 \n\t" // load address of c + 4*cs_c;
74 " \n\t"
75 "leaq (%%rdi,%%rdi,2), %%r14 \n\t" // r14 = 3*cs_c;
76 "prefetcht0 7 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
77 "prefetcht0 7 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
78 "prefetcht0 7 * 8(%%rcx,%%rdi,2) \n\t" // prefetch c + 2*cs_c
79 "prefetcht0 7 * 8(%%rcx,%%r14) \n\t" // prefetch c + 3*cs_c
80 "prefetcht0 7 * 8(%%r10) \n\t" // prefetch c + 4*cs_c
81 "prefetcht0 7 * 8(%%r10,%%rdi) \n\t" // prefetch c + 5*cs_c
82 "prefetcht0 7 * 8(%%r10,%%rdi,2) \n\t" // prefetch c + 6*cs_c
83 "prefetcht0 7 * 8(%%r10,%%r14) \n\t" // prefetch c + 7*cs_c
84 " \n\t"
85 "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t"
86 "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t"
87 "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t"
88 "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t"
89 "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t"
90 "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
91 "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
92 "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
93 " \n\t"
94 " \n\t"
95 " \n\t"
96 "movq %0, %%rsi \n\t" // i = k_iter;
97 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
98 "je .SCONSIDKLEFT \n\t" // if i == 0, jump to code that
99 " \n\t" // contains the k_left loop.
100 " \n\t"
101 " \n\t"
102 ".SLOOPKITER: \n\t" // MAIN LOOP
103 " \n\t"
104 " \n\t"
105 " \n\t" // iteration 0
106 "prefetcht0 16 * 32(%%rax) \n\t"
107 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
108 "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
109 "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
110 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
111 "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
112 "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t"
113 "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t"
114 " \n\t"
115 "vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
116 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
117 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
118 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
119 "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t"
120 "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t"
121 " \n\t"
122 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
123 "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
124 "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t"
125 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
126 "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
127 "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t"
128 "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t"
129 " \n\t"
130 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
131 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
132 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
133 "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t"
134 "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t"
135 " \n\t"
136 " \n\t" // iteration 1
137 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
138 "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
139 "vmovshdup 1 * 32(%%rbx), %%ymm2 \n\t"
140 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
141 "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
142 "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t"
143 "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t"
144 " \n\t"
145 "vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
146 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
147 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
148 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
149 "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t"
150 "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t"
151 " \n\t"
152 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
153 "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
154 "vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t"
155 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
156 "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
157 "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t"
158 "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t"
159 " \n\t"
160 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
161 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
162 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
163 "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t"
164 "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t"
165 " \n\t"
166 " \n\t"
167 " \n\t" // iteration 2
168 "prefetcht0 18 * 32(%%rax) \n\t"
169 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
170 "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
171 "vmovshdup 2 * 32(%%rbx), %%ymm2 \n\t"
172 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
173 "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
174 "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t"
175 "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t"
176 " \n\t"
177 "vmovaps 3 * 32(%%rax), %%ymm1 \n\t"
178 "addq $4 * 8 * 4, %%rax \n\t" // a += 4*8 (unroll x mr)
179 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
180 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
181 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
182 "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t"
183 "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t"
184 " \n\t"
185 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
186 "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
187 "vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t"
188 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
189 "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
190 "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t"
191 "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t"
192 " \n\t"
193 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
194 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
195 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
196 "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t"
197 "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t"
198 " \n\t"
199 " \n\t"
200 " \n\t" // iteration 3
201 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
202 "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
203 "vmovshdup 3 * 32(%%rbx), %%ymm2 \n\t"
204 "addq $4 * 8 * 4, %%rbx \n\t" // b += 4*8 (unroll x nr)
205 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
206 "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
207 "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t"
208 "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t"
209 " \n\t"
210 "vmovaps 0 * 32(%%rax), %%ymm0 \n\t"
211 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
212 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
213 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
214 "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t"
215 "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t"
216 " \n\t"
217 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
218 "vperm2f128 $0x03, %%ymm2, %%ymm2, %%ymm4 \n\t"
219 "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t"
220 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
221 "vperm2f128 $0x03, %%ymm3, %%ymm3, %%ymm5 \n\t"
222 "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t"
223 "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t"
224 " \n\t"
225 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
226 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
227 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
228 "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t"
229 "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t"
230 " \n\t"
231 " \n\t"
232 " \n\t"
233 " \n\t"
234 "decq %%rsi \n\t" // i -= 1;
235 "jne .SLOOPKITER \n\t" // iterate again if i != 0.
236 " \n\t"
237 " \n\t"
238 " \n\t"
239 " \n\t"
240 " \n\t"
241 " \n\t"
242 ".SCONSIDKLEFT: \n\t"
243 " \n\t"
244 "movq %1, %%rsi \n\t" // i = k_left;
245 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
246 "je .SPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
247 " \n\t" // else, we prepare to enter k_left loop.
248 " \n\t"
249 " \n\t"
250 ".SLOOPKLEFT: \n\t" // EDGE LOOP
251 " \n\t"
252 " \n\t"
253 "prefetcht0 16 * 32(%%rax) \n\t"
254 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
255 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
256 "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
257 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
258 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
259 "vaddps %%ymm15, %%ymm6, %%ymm15 \n\t"
260 "vaddps %%ymm13, %%ymm7, %%ymm13 \n\t"
261 " \n\t"
262 "vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
263 "addq $8 * 1 * 4, %%rax \n\t" // a += 8 (1 x mr)
264 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
265 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
266 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
267 "vaddps %%ymm11, %%ymm6, %%ymm11 \n\t"
268 "vaddps %%ymm9, %%ymm7, %%ymm9 \n\t"
269 " \n\t"
270 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
271 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
272 "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t"
273 "addq $8 * 1 * 4, %%rbx \n\t" // b += 8 (1 x nr)
274 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
275 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
276 "vaddps %%ymm14, %%ymm6, %%ymm14 \n\t"
277 "vaddps %%ymm12, %%ymm7, %%ymm12 \n\t"
278 " \n\t"
279 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
280 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
281 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
282 "vmovaps %%ymm1, %%ymm0 \n\t"
283 "vaddps %%ymm10, %%ymm6, %%ymm10 \n\t"
284 "vaddps %%ymm8, %%ymm7, %%ymm8 \n\t"
285 " \n\t"
286 " \n\t"
287 " \n\t"
288 "decq %%rsi \n\t" // i -= 1;
289 "jne .SLOOPKLEFT \n\t" // iterate again if i != 0.
290 " \n\t"
291 " \n\t"
292 " \n\t"
293 ".SPOSTACCUM: \n\t"
294 " \n\t"
295 " \n\t" // ymm15: ymm13: ymm11: ymm9:
296 " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06
297 " \n\t" // ab10 ab12 ab14 ab16
298 " \n\t" // ab22 ab20 ab26 ab24
299 " \n\t" // ab32 ab30 ab36 ab34
300 " \n\t" // ab44 ab46 ab40 ab42
301 " \n\t" // ab54 ab56 ab50 ab52
302 " \n\t" // ab66 ab64 ab62 ab60
303 " \n\t" // ab76 ) ab74 ) ab72 ) ab70 )
304 " \n\t"
305 " \n\t" // ymm14: ymm12: ymm10: ymm8:
306 " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07
307 " \n\t" // ab11 ab13 ab15 ab17
308 " \n\t" // ab23 ab21 ab27 ab25
309 " \n\t" // ab33 ab31 ab37 ab35
310 " \n\t" // ab45 ab47 ab41 ab43
311 " \n\t" // ab55 ab57 ab51 ab53
312 " \n\t" // ab67 ab65 ab63 ab61
313 " \n\t" // ab77 ) ab75 ) ab73 ) ab71 )
314 " \n\t"
315 "vmovaps %%ymm15, %%ymm7 \n\t"
316 "vshufps $0xe4, %%ymm13, %%ymm15, %%ymm15 \n\t"
317 "vshufps $0xe4, %%ymm7, %%ymm13, %%ymm13 \n\t"
318 " \n\t"
319 "vmovaps %%ymm11, %%ymm7 \n\t"
320 "vshufps $0xe4, %%ymm9, %%ymm11, %%ymm11 \n\t"
321 "vshufps $0xe4, %%ymm7, %%ymm9, %%ymm9 \n\t"
322 " \n\t"
323 "vmovaps %%ymm14, %%ymm7 \n\t"
324 "vshufps $0xe4, %%ymm12, %%ymm14, %%ymm14 \n\t"
325 "vshufps $0xe4, %%ymm7, %%ymm12, %%ymm12 \n\t"
326 " \n\t"
327 "vmovaps %%ymm10, %%ymm7 \n\t"
328 "vshufps $0xe4, %%ymm8, %%ymm10, %%ymm10 \n\t"
329 "vshufps $0xe4, %%ymm7, %%ymm8, %%ymm8 \n\t"
330 " \n\t"
331 " \n\t" // ymm15: ymm13: ymm11: ymm9:
332 " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06
333 " \n\t" // ab10 ab12 ab14 ab16
334 " \n\t" // ab20 ab22 ab24 ab26
335 " \n\t" // ab30 ab32 ab34 ab36
336 " \n\t" // ab44 ab46 ab40 ab42
337 " \n\t" // ab54 ab56 ab50 ab52
338 " \n\t" // ab64 ab66 ab60 ab62
339 " \n\t" // ab74 ) ab76 ) ab70 ) ab72 )
340 " \n\t"
341 " \n\t" // ymm14: ymm12: ymm10: ymm8:
342 " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07
343 " \n\t" // ab11 ab13 ab15 ab17
344 " \n\t" // ab21 ab23 ab25 ab27
345 " \n\t" // ab31 ab33 ab35 ab37
346 " \n\t" // ab45 ab47 ab41 ab43
347 " \n\t" // ab55 ab57 ab51 ab53
348 " \n\t" // ab65 ab67 ab61 ab63
349 " \n\t" // ab75 ) ab77 ) ab71 ) ab73 )
350 " \n\t"
351 "vmovaps %%ymm15, %%ymm7 \n\t"
352 "vperm2f128 $0x30, %%ymm11, %%ymm15, %%ymm15 \n\t"
353 "vperm2f128 $0x12, %%ymm11, %%ymm7, %%ymm11 \n\t"
354 " \n\t"
355 "vmovaps %%ymm13, %%ymm7 \n\t"
356 "vperm2f128 $0x30, %%ymm9, %%ymm13, %%ymm13 \n\t"
357 "vperm2f128 $0x12, %%ymm9, %%ymm7, %%ymm9 \n\t"
358 " \n\t"
359 "vmovaps %%ymm14, %%ymm7 \n\t"
360 "vperm2f128 $0x30, %%ymm10, %%ymm14, %%ymm14 \n\t"
361 "vperm2f128 $0x12, %%ymm10, %%ymm7, %%ymm10 \n\t"
362 " \n\t"
363 "vmovaps %%ymm12, %%ymm7 \n\t"
364 "vperm2f128 $0x30, %%ymm8, %%ymm12, %%ymm12 \n\t"
365 "vperm2f128 $0x12, %%ymm8, %%ymm7, %%ymm8 \n\t"
366 " \n\t"
367 " \n\t" // ymm15: ymm13: ymm11: ymm9:
368 " \n\t" // ( ab00 ( ab02 ( ab04 ( ab06
369 " \n\t" // ab10 ab12 ab14 ab16
370 " \n\t" // ab20 ab22 ab24 ab26
371 " \n\t" // ab30 ab32 ab34 ab36
372 " \n\t" // ab40 ab42 ab44 ab46
373 " \n\t" // ab50 ab52 ab54 ab56
374 " \n\t" // ab60 ab62 ab64 ab66
375 " \n\t" // ab70 ) ab72 ) ab74 ) ab76 )
376 " \n\t"
377 " \n\t" // ymm14: ymm12: ymm10: ymm8:
378 " \n\t" // ( ab01 ( ab03 ( ab05 ( ab07
379 " \n\t" // ab11 ab13 ab15 ab17
380 " \n\t" // ab21 ab23 ab25 ab27
381 " \n\t" // ab31 ab33 ab35 ab37
382 " \n\t" // ab41 ab43 ab45 ab47
383 " \n\t" // ab51 ab53 ab55 ab57
384 " \n\t" // ab61 ab63 ab65 ab67
385 " \n\t" // ab71 ) ab73 ) ab75 ) ab77 )
386 " \n\t"
387 " \n\t"
388 " \n\t"
389 "movq %4, %%rax \n\t" // load address of alpha
390 "movq %5, %%rbx \n\t" // load address of beta
391 "vbroadcastss (%%rax), %%ymm0 \n\t" // load alpha and duplicate
392 "vbroadcastss (%%rbx), %%ymm4 \n\t" // load beta and duplicate
393 " \n\t"
394 "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" // scale by alpha
395 "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t"
396 "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t"
397 "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t"
398 "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t"
399 "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t"
400 "vmulps %%ymm0, %%ymm14, %%ymm14 \n\t"
401 "vmulps %%ymm0, %%ymm15, %%ymm15 \n\t"
402 " \n\t"
403 " \n\t"
404 " \n\t"
405 " \n\t"
406 " \n\t"
407 " \n\t"
408 "movq %7, %%rsi \n\t" // load rs_c
409 "leaq (,%%rsi,4), %%rsi \n\t" // rsi = rs_c * sizeof(float)
410 " \n\t"
411 "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c;
412 " \n\t"
413 "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c;
414 "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c;
415 " \n\t"
416 " \n\t"
417 " \n\t"
418 " \n\t" // determine if
419 " \n\t" // c % 32 == 0, AND
420 " \n\t" // 4*cs_c % 32 == 0, AND
421 " \n\t" // rs_c == 1
422 " \n\t" // ie: aligned, ldim aligned, and
423 " \n\t" // column-stored
424 " \n\t"
425 "cmpq $4, %%rsi \n\t" // set ZF if (4*rs_c) == 4.
426 "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
427 "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero.
428 "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 );
429 "testq $31, %%rdi \n\t" // set ZF if (4*cs_c) & 32 is zero.
430 "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 );
431 " \n\t" // and(bl,bh) followed by
432 " \n\t" // and(bh,al) will reveal result
433 " \n\t"
434 " \n\t" // now avoid loading C if beta == 0
435 " \n\t"
436 "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
437 "vucomiss %%xmm0, %%xmm4 \n\t" // set ZF if beta == 0.
438 "je .SBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
439 " \n\t"
440 " \n\t"
441 " \n\t" // check if aligned/column-stored
442 "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
443 "andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
444 "jne .SCOLSTORED \n\t" // jump to column storage case
445 " \n\t"
446 " \n\t"
447 " \n\t"
448 ".SGENSTORED: \n\t"
449 " \n\t"
450 " \n\t" // update c00:c70
451 "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
452 "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
453 "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
454 "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
455 "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
456 "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
457 "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
458 "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
459 "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
460 "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
461 "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
462 " \n\t"
463 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
464 "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result,
465 " \n\t"
466 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
467 "vmovss %%xmm0, (%%rcx) \n\t"
468 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
469 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
470 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
471 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
472 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
473 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
474 "vmovss %%xmm2, (%%rdx) \n\t"
475 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
476 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
477 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
478 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
479 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
480 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
481 " \n\t"
482 "addq %%rdi, %%rcx \n\t" // c += cs_c;
483 "addq %%rdi, %%rdx \n\t" // c += cs_c;
484 " \n\t"
485 " \n\t"
486 " \n\t" // update c01:c71
487 "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
488 "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
489 "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
490 "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
491 "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
492 "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
493 "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
494 "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
495 "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
496 "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
497 "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
498 " \n\t"
499 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
500 "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result,
501 " \n\t"
502 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
503 "vmovss %%xmm0, (%%rcx) \n\t"
504 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
505 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
506 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
507 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
508 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
509 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
510 "vmovss %%xmm2, (%%rdx) \n\t"
511 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
512 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
513 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
514 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
515 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
516 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
517 " \n\t"
518 "addq %%rdi, %%rcx \n\t" // c += cs_c;
519 "addq %%rdi, %%rdx \n\t" // c += cs_c;
520 " \n\t"
521 " \n\t"
522 " \n\t" // update c02:c72
523 "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
524 "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
525 "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
526 "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
527 "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
528 "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
529 "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
530 "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
531 "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
532 "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
533 "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
534 " \n\t"
535 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
536 "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result,
537 " \n\t"
538 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
539 "vmovss %%xmm0, (%%rcx) \n\t"
540 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
541 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
542 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
543 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
544 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
545 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
546 "vmovss %%xmm2, (%%rdx) \n\t"
547 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
548 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
549 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
550 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
551 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
552 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
553 " \n\t"
554 "addq %%rdi, %%rcx \n\t" // c += cs_c;
555 "addq %%rdi, %%rdx \n\t" // c += cs_c;
556 " \n\t"
557 " \n\t"
558 " \n\t" // update c03:c73
559 "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
560 "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
561 "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
562 "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
563 "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
564 "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
565 "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
566 "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
567 "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
568 "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
569 "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
570 " \n\t"
571 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
572 "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result,
573 " \n\t"
574 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
575 "vmovss %%xmm0, (%%rcx) \n\t"
576 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
577 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
578 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
579 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
580 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
581 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
582 "vmovss %%xmm2, (%%rdx) \n\t"
583 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
584 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
585 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
586 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
587 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
588 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
589 " \n\t"
590 "addq %%rdi, %%rcx \n\t" // c += cs_c;
591 "addq %%rdi, %%rdx \n\t" // c += cs_c;
592 " \n\t"
593 " \n\t"
594 " \n\t" // update c04:c74
595 "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
596 "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
597 "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
598 "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
599 "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
600 "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
601 "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
602 "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
603 "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
604 "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
605 "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
606 " \n\t"
607 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
608 "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result,
609 " \n\t"
610 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
611 "vmovss %%xmm0, (%%rcx) \n\t"
612 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
613 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
614 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
615 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
616 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
617 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
618 "vmovss %%xmm2, (%%rdx) \n\t"
619 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
620 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
621 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
622 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
623 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
624 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
625 " \n\t"
626 "addq %%rdi, %%rcx \n\t" // c += cs_c;
627 "addq %%rdi, %%rdx \n\t" // c += cs_c;
628 " \n\t"
629 " \n\t"
630 " \n\t" // update c05:c75
631 "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
632 "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
633 "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
634 "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
635 "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
636 "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
637 "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
638 "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
639 "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
640 "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
641 "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
642 " \n\t"
643 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
644 "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result,
645 " \n\t"
646 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
647 "vmovss %%xmm0, (%%rcx) \n\t"
648 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
649 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
650 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
651 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
652 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
653 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
654 "vmovss %%xmm2, (%%rdx) \n\t"
655 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
656 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
657 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
658 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
659 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
660 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
661 " \n\t"
662 "addq %%rdi, %%rcx \n\t" // c += cs_c;
663 "addq %%rdi, %%rdx \n\t" // c += cs_c;
664 " \n\t"
665 " \n\t"
666 " \n\t" // update c06:c76
667 "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
668 "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
669 "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
670 "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
671 "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
672 "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
673 "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
674 "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
675 "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
676 "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
677 "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
678 " \n\t"
679 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
680 "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result,
681 " \n\t"
682 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
683 "vmovss %%xmm0, (%%rcx) \n\t"
684 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
685 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
686 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
687 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
688 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
689 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
690 "vmovss %%xmm2, (%%rdx) \n\t"
691 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
692 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
693 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
694 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
695 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
696 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
697 " \n\t"
698 "addq %%rdi, %%rcx \n\t" // c += cs_c;
699 "addq %%rdi, %%rdx \n\t" // c += cs_c;
700 " \n\t"
701 " \n\t"
702 " \n\t" // update c07:c77
703 "vmovlps (%%rcx), %%xmm0, %%xmm0 \n\t"
704 "vmovhps (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
705 "vmovlps (%%rcx,%%r12), %%xmm1, %%xmm1 \n\t"
706 "vmovhps (%%rcx,%%r13), %%xmm1, %%xmm1 \n\t"
707 "vshufps $0x88, %%xmm1, %%xmm0, %%xmm0 \n\t"
708 "vmovlps (%%rdx), %%xmm2, %%xmm2 \n\t"
709 "vmovhps (%%rdx,%%rsi), %%xmm2, %%xmm2 \n\t"
710 "vmovlps (%%rdx,%%r12), %%xmm3, %%xmm3 \n\t"
711 "vmovhps (%%rdx,%%r13), %%xmm3, %%xmm3 \n\t"
712 "vshufps $0x88, %%xmm3, %%xmm2, %%xmm2 \n\t"
713 "vperm2f128 $0x20, %%ymm2, %%ymm0, %%ymm0 \n\t"
714 " \n\t"
715 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
716 "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result,
717 " \n\t"
718 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
719 "vmovss %%xmm0, (%%rcx) \n\t"
720 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
721 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
722 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
723 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
724 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
725 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
726 "vmovss %%xmm2, (%%rdx) \n\t"
727 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
728 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
729 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
730 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
731 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
732 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
733 " \n\t"
734 " \n\t"
735 " \n\t"
736 "jmp .SDONE \n\t" // jump to end.
737 " \n\t"
738 " \n\t"
739 " \n\t"
740 ".SCOLSTORED: \n\t"
741 " \n\t"
742 " \n\t"
743 "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70,
744 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
745 "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result,
746 "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
747 "addq %%rdi, %%rcx \n\t" // c += cs_c;
748 " \n\t"
749 "vmovaps (%%rcx), %%ymm1 \n\t" // load c01:c71,
750 "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
751 "vaddps %%ymm14, %%ymm1, %%ymm1 \n\t" // add the gemm result,
752 "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
753 "addq %%rdi, %%rcx \n\t" // c += cs_c;
754 " \n\t"
755 "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72,
756 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
757 "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result,
758 "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
759 "addq %%rdi, %%rcx \n\t" // c += cs_c;
760 " \n\t"
761 "vmovaps (%%rcx), %%ymm1 \n\t" // load c03:c73,
762 "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
763 "vaddps %%ymm12, %%ymm1, %%ymm1 \n\t" // add the gemm result,
764 "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
765 "addq %%rdi, %%rcx \n\t" // c += cs_c;
766 " \n\t"
767 "vmovaps (%%rcx), %%ymm0 \n\t" // load c04:c74,
768 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
769 "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result,
770 "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
771 "addq %%rdi, %%rcx \n\t" // c += cs_c;
772 " \n\t"
773 "vmovaps (%%rcx), %%ymm1 \n\t" // load c05:c75,
774 "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
775 "vaddps %%ymm10, %%ymm1, %%ymm1 \n\t" // add the gemm result,
776 "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
777 "addq %%rdi, %%rcx \n\t" // c += cs_c;
778 " \n\t"
779 "vmovaps (%%rcx), %%ymm0 \n\t" // load c06:c76,
780 "vmulps %%ymm4, %%ymm0, %%ymm0 \n\t" // scale by beta,
781 "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result,
782 "vmovaps %%ymm0, (%%rcx) \n\t" // and store back to memory.
783 "addq %%rdi, %%rcx \n\t" // c += cs_c;
784 " \n\t"
785 "vmovaps (%%rcx), %%ymm1 \n\t" // load c07:c77,
786 "vmulps %%ymm4, %%ymm1, %%ymm1 \n\t" // scale by beta,
787 "vaddps %%ymm8, %%ymm1, %%ymm1 \n\t" // add the gemm result,
788 "vmovaps %%ymm1, (%%rcx) \n\t" // and store back to memory.
789 " \n\t"
790 " \n\t"
791 "jmp .SDONE \n\t" // jump to end.
792 " \n\t"
793 " \n\t"
794 " \n\t"
795 " \n\t"
796 ".SBETAZERO: \n\t"
797 " \n\t" // check if aligned/column-stored
798 "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
799 "andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
800 "jne .SCOLSTORBZ \n\t" // jump to column storage case
801 " \n\t"
802 " \n\t"
803 " \n\t"
804 ".SGENSTORBZ: \n\t"
805 " \n\t"
806 " \n\t" // update c00:c70
807 "vmovapd %%ymm15, %%ymm0 \n\t"
808 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
809 "vmovss %%xmm0, (%%rcx) \n\t"
810 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
811 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
812 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
813 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
814 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
815 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
816 "vmovss %%xmm2, (%%rdx) \n\t"
817 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
818 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
819 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
820 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
821 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
822 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
823 " \n\t"
824 "addq %%rdi, %%rcx \n\t" // c += cs_c;
825 "addq %%rdi, %%rdx \n\t" // c += cs_c;
826 " \n\t"
827 " \n\t"
828 " \n\t" // update c01:c71
829 "vmovapd %%ymm14, %%ymm0 \n\t"
830 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
831 "vmovss %%xmm0, (%%rcx) \n\t"
832 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
833 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
834 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
835 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
836 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
837 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
838 "vmovss %%xmm2, (%%rdx) \n\t"
839 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
840 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
841 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
842 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
843 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
844 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
845 " \n\t"
846 "addq %%rdi, %%rcx \n\t" // c += cs_c;
847 "addq %%rdi, %%rdx \n\t" // c += cs_c;
848 " \n\t"
849 " \n\t"
850 " \n\t" // update c02:c72
851 "vmovapd %%ymm13, %%ymm0 \n\t"
852 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
853 "vmovss %%xmm0, (%%rcx) \n\t"
854 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
855 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
856 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
857 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
858 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
859 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
860 "vmovss %%xmm2, (%%rdx) \n\t"
861 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
862 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
863 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
864 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
865 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
866 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
867 " \n\t"
868 "addq %%rdi, %%rcx \n\t" // c += cs_c;
869 "addq %%rdi, %%rdx \n\t" // c += cs_c;
870 " \n\t"
871 " \n\t"
872 " \n\t" // update c03:c73
873 "vmovapd %%ymm12, %%ymm0 \n\t"
874 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
875 "vmovss %%xmm0, (%%rcx) \n\t"
876 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
877 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
878 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
879 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
880 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
881 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
882 "vmovss %%xmm2, (%%rdx) \n\t"
883 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
884 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
885 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
886 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
887 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
888 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
889 " \n\t"
890 "addq %%rdi, %%rcx \n\t" // c += cs_c;
891 "addq %%rdi, %%rdx \n\t" // c += cs_c;
892 " \n\t"
893 " \n\t"
894 " \n\t" // update c04:c74
895 "vmovapd %%ymm11, %%ymm0 \n\t"
896 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
897 "vmovss %%xmm0, (%%rcx) \n\t"
898 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
899 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
900 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
901 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
902 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
903 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
904 "vmovss %%xmm2, (%%rdx) \n\t"
905 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
906 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
907 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
908 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
909 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
910 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
911 " \n\t"
912 "addq %%rdi, %%rcx \n\t" // c += cs_c;
913 "addq %%rdi, %%rdx \n\t" // c += cs_c;
914 " \n\t"
915 " \n\t"
916 " \n\t" // update c05:c75
917 "vmovapd %%ymm10, %%ymm0 \n\t"
918 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
919 "vmovss %%xmm0, (%%rcx) \n\t"
920 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
921 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
922 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
923 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
924 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
925 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
926 "vmovss %%xmm2, (%%rdx) \n\t"
927 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
928 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
929 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
930 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
931 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
932 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
933 " \n\t"
934 "addq %%rdi, %%rcx \n\t" // c += cs_c;
935 "addq %%rdi, %%rdx \n\t" // c += cs_c;
936 " \n\t"
937 " \n\t"
938 " \n\t" // update c06:c76
939 "vmovapd %%ymm9, %%ymm0 \n\t"
940 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
941 "vmovss %%xmm0, (%%rcx) \n\t"
942 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
943 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
944 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
945 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
946 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
947 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
948 "vmovss %%xmm2, (%%rdx) \n\t"
949 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
950 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
951 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
952 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
953 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
954 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
955 " \n\t"
956 "addq %%rdi, %%rcx \n\t" // c += cs_c;
957 "addq %%rdi, %%rdx \n\t" // c += cs_c;
958 " \n\t"
959 " \n\t"
960 " \n\t" // update c07:c77
961 "vmovapd %%ymm8, %%ymm0 \n\t"
962 "vextractf128 $1, %%ymm0, %%xmm2 \n\t"
963 "vmovss %%xmm0, (%%rcx) \n\t"
964 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
965 "vmovss %%xmm1, (%%rcx,%%rsi) \n\t"
966 "vpermilps $0x39, %%xmm1, %%xmm0 \n\t"
967 "vmovss %%xmm0, (%%rcx,%%r12) \n\t"
968 "vpermilps $0x39, %%xmm0, %%xmm1 \n\t"
969 "vmovss %%xmm1, (%%rcx,%%r13) \n\t"
970 "vmovss %%xmm2, (%%rdx) \n\t"
971 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
972 "vmovss %%xmm3, (%%rdx,%%rsi) \n\t"
973 "vpermilps $0x39, %%xmm3, %%xmm2 \n\t"
974 "vmovss %%xmm2, (%%rdx,%%r12) \n\t"
975 "vpermilps $0x39, %%xmm2, %%xmm3 \n\t"
976 "vmovss %%xmm3, (%%rdx,%%r13) \n\t"
977 " \n\t"
978 " \n\t"
979 "jmp .SDONE \n\t" // jump to end.
980 " \n\t"
981 " \n\t"
982 " \n\t"
983 ".SCOLSTORBZ: \n\t"
984 " \n\t"
985 " \n\t"
986 "vmovaps %%ymm15, (%%rcx) \n\t" // and store back to memory.
987 "addq %%rdi, %%rcx \n\t" // c += cs_c;
988 " \n\t"
989 "vmovaps %%ymm14, (%%rcx) \n\t" // and store back to memory.
990 "addq %%rdi, %%rcx \n\t" // c += cs_c;
991 " \n\t"
992 "vmovaps %%ymm13, (%%rcx) \n\t" // and store back to memory.
993 "addq %%rdi, %%rcx \n\t" // c += cs_c;
994 " \n\t"
995 "vmovaps %%ymm12, (%%rcx) \n\t" // and store back to memory.
996 "addq %%rdi, %%rcx \n\t" // c += cs_c;
997 " \n\t"
998 "vmovaps %%ymm11, (%%rcx) \n\t" // and store back to memory.
999 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1000 " \n\t"
1001 "vmovaps %%ymm10, (%%rcx) \n\t" // and store back to memory.
1002 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1003 " \n\t"
1004 "vmovaps %%ymm9, (%%rcx) \n\t" // and store back to memory.
1005 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1006 " \n\t"
1007 "vmovaps %%ymm8, (%%rcx) \n\t" // and store back to memory.
1008 " \n\t"
1009 " \n\t"
1010 " \n\t"
1011 " \n\t"
1012 " \n\t"
1013 ".SDONE: \n\t"
1014 " \n\t"
1015
1016 : // output operands (none)
1017 : // input operands
1018 "m" (k_iter), // 0
1019 "m" (k_left), // 1
1020 "m" (a), // 2
1021 "m" (b), // 3
1022 "m" (alpha), // 4
1023 "m" (beta), // 5
1024 "m" (c), // 6
1025 "m" (rs_c), // 7
1026 "m" (cs_c)/*, // 8
1027 "m" (b_next), // 9
1028 "m" (a_next)*/ // 10
1029 : // register clobber list
1030 "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
1031 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
1032 "xmm0", "xmm1", "xmm2", "xmm3",
1033 "xmm4", "xmm5", "xmm6", "xmm7",
1034 "xmm8", "xmm9", "xmm10", "xmm11",
1035 "xmm12", "xmm13", "xmm14", "xmm15",
1036 "memory"
1037 );
1038 }
1039
bli_dgemm_asm_8x4(dim_t k,double * restrict alpha,double * restrict a,double * restrict b,double * restrict beta,double * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)1040 void bli_dgemm_asm_8x4
1041 (
1042 dim_t k,
1043 double* restrict alpha,
1044 double* restrict a,
1045 double* restrict b,
1046 double* restrict beta,
1047 double* restrict c, inc_t rs_c, inc_t cs_c,
1048 auxinfo_t* restrict data,
1049 cntx_t* restrict cntx
1050 )
1051 {
1052 //void* a_next = bli_auxinfo_next_a( data );
1053 void* b_next = bli_auxinfo_next_b( data );
1054
1055 uint64_t k_iter = k / 4;
1056 uint64_t k_left = k % 4;
1057
1058 __asm__ volatile
1059 (
1060 " \n\t"
1061 " \n\t"
1062 "movq %2, %%rax \n\t" // load address of a.
1063 "movq %3, %%rbx \n\t" // load address of b.
1064 "movq %9, %%r15 \n\t" // load address of b_next.
1065 //"movq %10, %%r14 \n\t" // load address of a_next.
1066 "addq $-4 * 64, %%r15 \n\t"
1067 " \n\t"
1068 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading
1069 "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t" // elements of a and b.
1070 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
1071 " \n\t"
1072 "movq %6, %%rcx \n\t" // load address of c
1073 "movq %8, %%rdi \n\t" // load cs_c
1074 "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(double)
1075 "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c;
1076 " \n\t"
1077 "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
1078 "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
1079 "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c
1080 "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
1081 " \n\t"
1082 "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t"
1083 "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t"
1084 "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t"
1085 "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t"
1086 "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
1087 "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
1088 "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
1089 "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
1090 " \n\t"
1091 " \n\t"
1092 " \n\t"
1093 "movq %0, %%rsi \n\t" // i = k_iter;
1094 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
1095 "je .DCONSIDKLEFT \n\t" // if i == 0, jump to code that
1096 " \n\t" // contains the k_left loop.
1097 " \n\t"
1098 " \n\t"
1099 ".DLOOPKITER: \n\t" // MAIN LOOP
1100 " \n\t"
1101 "addq $4 * 4 * 8, %%r15 \n\t" // b_next += 4*4 (unroll x nr)
1102 " \n\t"
1103 " \n\t" // iteration 0
1104 "vmovapd 1 * 32(%%rax), %%ymm1 \n\t"
1105 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
1106 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1107 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
1108 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1109 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
1110 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
1111 " \n\t"
1112 "prefetcht0 16 * 32(%%rax) \n\t"
1113 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
1114 "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t"
1115 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
1116 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
1117 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
1118 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
1119 " \n\t"
1120 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
1121 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
1122 "vmovapd 2 * 32(%%rax), %%ymm0 \n\t"
1123 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
1124 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
1125 "prefetcht0 0 * 32(%%r15) \n\t" // prefetch b_next[0*4]
1126 " \n\t"
1127 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
1128 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
1129 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
1130 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
1131 " \n\t"
1132 " \n\t"
1133 " \n\t" // iteration 1
1134 "vmovapd 3 * 32(%%rax), %%ymm1 \n\t"
1135 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
1136 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1137 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
1138 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1139 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
1140 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
1141 " \n\t"
1142 "prefetcht0 18 * 32(%%rax) \n\t"
1143 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
1144 "vmovapd 2 * 32(%%rbx), %%ymm2 \n\t"
1145 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
1146 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
1147 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
1148 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
1149 " \n\t"
1150 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
1151 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
1152 "vmovapd 4 * 32(%%rax), %%ymm0 \n\t"
1153 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
1154 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
1155 " \n\t"
1156 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
1157 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
1158 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
1159 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
1160 " \n\t"
1161 " \n\t"
1162 " \n\t" // iteration 2
1163 "vmovapd 5 * 32(%%rax), %%ymm1 \n\t"
1164 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
1165 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1166 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
1167 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1168 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
1169 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
1170 " \n\t"
1171 "prefetcht0 20 * 32(%%rax) \n\t"
1172 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
1173 "vmovapd 3 * 32(%%rbx), %%ymm2 \n\t"
1174 "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr)
1175 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
1176 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
1177 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
1178 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
1179 " \n\t"
1180 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
1181 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
1182 "vmovapd 6 * 32(%%rax), %%ymm0 \n\t"
1183 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
1184 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
1185 "prefetcht0 2 * 32(%%r15) \n\t" // prefetch b_next[2*4]
1186 " \n\t"
1187 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
1188 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
1189 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
1190 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
1191 " \n\t"
1192 " \n\t"
1193 " \n\t" // iteration 3
1194 "vmovapd 7 * 32(%%rax), %%ymm1 \n\t"
1195 "addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr)
1196 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
1197 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1198 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
1199 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1200 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
1201 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
1202 " \n\t"
1203 //"prefetcht0 22 * 32(%%rax) \n\t"
1204 "prefetcht0 14 * 32(%%rax) \n\t"
1205 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
1206 "vmovapd 0 * 32(%%rbx), %%ymm2 \n\t"
1207 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
1208 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
1209 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
1210 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
1211 " \n\t"
1212 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
1213 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
1214 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t"
1215 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
1216 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
1217 " \n\t"
1218 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
1219 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
1220 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
1221 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
1222 " \n\t"
1223 " \n\t"
1224 " \n\t"
1225 //"addq $4 * 8 * 8, %%rax \n\t" // a += 4*8 (unroll x mr)
1226 //"addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr)
1227 " \n\t"
1228 "decq %%rsi \n\t" // i -= 1;
1229 "jne .DLOOPKITER \n\t" // iterate again if i != 0.
1230 " \n\t"
1231 " \n\t"
1232 " \n\t"
1233 " \n\t"
1234 " \n\t"
1235 " \n\t"
1236 ".DCONSIDKLEFT: \n\t"
1237 " \n\t"
1238 "movq %1, %%rsi \n\t" // i = k_left;
1239 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
1240 "je .DPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
1241 " \n\t" // else, we prepare to enter k_left loop.
1242 " \n\t"
1243 " \n\t"
1244 ".DLOOPKLEFT: \n\t" // EDGE LOOP
1245 " \n\t"
1246 "vmovapd 1 * 32(%%rax), %%ymm1 \n\t"
1247 "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr)
1248 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
1249 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1250 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
1251 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1252 "vaddpd %%ymm15, %%ymm6, %%ymm15 \n\t"
1253 "vaddpd %%ymm13, %%ymm7, %%ymm13 \n\t"
1254 " \n\t"
1255 "prefetcht0 14 * 32(%%rax) \n\t"
1256 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
1257 "vmovapd 1 * 32(%%rbx), %%ymm2 \n\t"
1258 "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr)
1259 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
1260 "vpermilpd $0x5, %%ymm2, %%ymm3 \n\t"
1261 "vaddpd %%ymm14, %%ymm6, %%ymm14 \n\t"
1262 "vaddpd %%ymm12, %%ymm7, %%ymm12 \n\t"
1263 " \n\t"
1264 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
1265 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
1266 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t"
1267 "vaddpd %%ymm11, %%ymm6, %%ymm11 \n\t"
1268 "vaddpd %%ymm9, %%ymm7, %%ymm9 \n\t"
1269 " \n\t"
1270 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
1271 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
1272 "vaddpd %%ymm10, %%ymm6, %%ymm10 \n\t"
1273 "vaddpd %%ymm8, %%ymm7, %%ymm8 \n\t"
1274 " \n\t"
1275 " \n\t"
1276 "decq %%rsi \n\t" // i -= 1;
1277 "jne .DLOOPKLEFT \n\t" // iterate again if i != 0.
1278 " \n\t"
1279 " \n\t"
1280 " \n\t"
1281 ".DPOSTACCUM: \n\t"
1282 " \n\t"
1283 " \n\t"
1284 " \n\t" // ymm15: ymm13: ymm11: ymm9:
1285 " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
1286 " \n\t" // ab11 ab10 ab13 ab12
1287 " \n\t" // ab22 ab23 ab20 ab21
1288 " \n\t" // ab33 ) ab32 ) ab31 ) ab30 )
1289 " \n\t"
1290 " \n\t" // ymm14: ymm12: ymm10: ymm8:
1291 " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
1292 " \n\t" // ab51 ab50 ab53 ab52
1293 " \n\t" // ab62 ab63 ab60 ab61
1294 " \n\t" // ab73 ) ab72 ) ab71 ) ab70 )
1295 " \n\t"
1296 "vmovapd %%ymm15, %%ymm7 \n\t"
1297 "vshufpd $0xa, %%ymm15, %%ymm13, %%ymm15 \n\t"
1298 "vshufpd $0xa, %%ymm13, %%ymm7, %%ymm13 \n\t"
1299 " \n\t"
1300 "vmovapd %%ymm11, %%ymm7 \n\t"
1301 "vshufpd $0xa, %%ymm11, %%ymm9, %%ymm11 \n\t"
1302 "vshufpd $0xa, %%ymm9, %%ymm7, %%ymm9 \n\t"
1303 " \n\t"
1304 "vmovapd %%ymm14, %%ymm7 \n\t"
1305 "vshufpd $0xa, %%ymm14, %%ymm12, %%ymm14 \n\t"
1306 "vshufpd $0xa, %%ymm12, %%ymm7, %%ymm12 \n\t"
1307 " \n\t"
1308 "vmovapd %%ymm10, %%ymm7 \n\t"
1309 "vshufpd $0xa, %%ymm10, %%ymm8, %%ymm10 \n\t"
1310 "vshufpd $0xa, %%ymm8, %%ymm7, %%ymm8 \n\t"
1311 " \n\t"
1312 " \n\t" // ymm15: ymm13: ymm11: ymm9:
1313 " \n\t" // ( ab01 ( ab00 ( ab03 ( ab02
1314 " \n\t" // ab11 ab10 ab13 ab12
1315 " \n\t" // ab23 ab22 ab21 ab20
1316 " \n\t" // ab33 ) ab32 ) ab31 ) ab30 )
1317 " \n\t"
1318 " \n\t" // ymm14: ymm12: ymm10: ymm8:
1319 " \n\t" // ( ab41 ( ab40 ( ab43 ( ab42
1320 " \n\t" // ab51 ab50 ab53 ab52
1321 " \n\t" // ab63 ab62 ab61 ab60
1322 " \n\t" // ab73 ) ab72 ) ab71 ) ab70 )
1323 " \n\t"
1324 "vmovapd %%ymm15, %%ymm7 \n\t"
1325 "vperm2f128 $0x30, %%ymm15, %%ymm11, %%ymm15 \n\t"
1326 "vperm2f128 $0x12, %%ymm7, %%ymm11, %%ymm11 \n\t"
1327 " \n\t"
1328 "vmovapd %%ymm13, %%ymm7 \n\t"
1329 "vperm2f128 $0x30, %%ymm13, %%ymm9, %%ymm13 \n\t"
1330 "vperm2f128 $0x12, %%ymm7, %%ymm9, %%ymm9 \n\t"
1331 " \n\t"
1332 "vmovapd %%ymm14, %%ymm7 \n\t"
1333 "vperm2f128 $0x30, %%ymm14, %%ymm10, %%ymm14 \n\t"
1334 "vperm2f128 $0x12, %%ymm7, %%ymm10, %%ymm10 \n\t"
1335 " \n\t"
1336 "vmovapd %%ymm12, %%ymm7 \n\t"
1337 "vperm2f128 $0x30, %%ymm12, %%ymm8, %%ymm12 \n\t"
1338 "vperm2f128 $0x12, %%ymm7, %%ymm8, %%ymm8 \n\t"
1339 " \n\t"
1340 " \n\t" // ymm9: ymm11: ymm13: ymm15:
1341 " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
1342 " \n\t" // ab10 ab11 ab12 ab13
1343 " \n\t" // ab20 ab21 ab22 ab23
1344 " \n\t" // ab30 ) ab31 ) ab32 ) ab33 )
1345 " \n\t"
1346 " \n\t" // ymm8: ymm10: ymm12: ymm14:
1347 " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
1348 " \n\t" // ab50 ab51 ab52 ab53
1349 " \n\t" // ab60 ab61 ab62 ab63
1350 " \n\t" // ab70 ) ab71 ) ab72 ) ab73 )
1351 " \n\t"
1352 " \n\t"
1353 "movq %4, %%rax \n\t" // load address of alpha
1354 "movq %5, %%rbx \n\t" // load address of beta
1355 "vbroadcastsd (%%rax), %%ymm0 \n\t" // load alpha and duplicate
1356 "vbroadcastsd (%%rbx), %%ymm2 \n\t" // load beta and duplicate
1357 " \n\t"
1358 "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" // scale by alpha
1359 "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t"
1360 "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t"
1361 "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t"
1362 "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t"
1363 "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t"
1364 "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t"
1365 "vmulpd %%ymm0, %%ymm15, %%ymm15 \n\t"
1366 " \n\t"
1367 " \n\t"
1368 " \n\t"
1369 " \n\t"
1370 " \n\t"
1371 " \n\t"
1372 "movq %7, %%rsi \n\t" // load rs_c
1373 "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(double)
1374 " \n\t"
1375 "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c;
1376 " \n\t"
1377 "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c;
1378 "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c;
1379 " \n\t"
1380 " \n\t"
1381 " \n\t"
1382 " \n\t" // determine if
1383 " \n\t" // c % 32 == 0, AND
1384 " \n\t" // 8*cs_c % 32 == 0, AND
1385 " \n\t" // rs_c == 1
1386 " \n\t" // ie: aligned, ldim aligned, and
1387 " \n\t" // column-stored
1388 " \n\t"
1389 "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8.
1390 "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
1391 "testq $31, %%rcx                  \n\t" // set ZF if c & 31 is zero (32-byte aligned).
1392 "setz %%bh                         \n\t" // bh = ( ZF == 1 ? 1 : 0 );
1393 "testq $31, %%rdi                  \n\t" // set ZF if (8*cs_c) & 31 is zero (aligned).
1394 "setz %%al                         \n\t" // al = ( ZF == 1 ? 1 : 0 );
1395 " \n\t" // and(bl,bh) followed by
1396 " \n\t" // and(bh,al) will reveal result
1397 " \n\t"
1398 " \n\t" // now avoid loading C if beta == 0
1399 " \n\t"
1400 "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
1401 "vucomisd %%xmm0, %%xmm2 \n\t" // set ZF if beta == 0.
1402 "je .DBETAZERO \n\t" // if ZF = 1, jump to beta == 0 case
1403 " \n\t"
1404 " \n\t"
1405 " \n\t" // check if aligned/column-stored
1406 "andb %%bl, %%bh                   \n\t" // bh &= bl; ZF set if result is zero.
1407 "andb %%bh, %%al                   \n\t" // al &= bh; ZF set if result is zero.
1408 "jne .DCOLSTORED \n\t" // jump to column storage case
1409 " \n\t"
1410 " \n\t"
1411 " \n\t"
1412 ".DGENSTORED: \n\t"
1413 " \n\t" // update c00:c33
1414 " \n\t"
1415 "vextractf128 $1, %%ymm9, %%xmm1 \n\t"
1416 "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c00 and c10,
1417 "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
1418 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1419 "vaddpd %%xmm9, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1420 "vmovlpd %%xmm0, (%%rcx) \n\t" // and store back to memory.
1421 "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t"
1422 "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c20 and c30,
1423 "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t"
1424 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1425 "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1426 "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // and store back to memory.
1427 "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t"
1428 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1429 " \n\t"
1430 "vextractf128 $1, %%ymm11, %%xmm1 \n\t"
1431 "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c01 and c11,
1432 "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
1433 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1434 "vaddpd %%xmm11, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1435 "vmovlpd %%xmm0, (%%rcx) \n\t" // and store back to memory.
1436 "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t"
1437 "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c21 and c31,
1438 "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t"
1439 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1440 "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1441 "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // and store back to memory.
1442 "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t"
1443 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1444 " \n\t"
1445 "vextractf128 $1, %%ymm13, %%xmm1 \n\t"
1446 "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c02 and c12,
1447 "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
1448 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1449 "vaddpd %%xmm13, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1450 "vmovlpd %%xmm0, (%%rcx) \n\t" // and store back to memory.
1451 "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t"
1452 "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c22 and c32,
1453 "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t"
1454 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1455 "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1456 "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // and store back to memory.
1457 "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t"
1458 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1459 " \n\t"
1460 "vextractf128 $1, %%ymm15, %%xmm1 \n\t"
1461 "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load c03 and c13,
1462 "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t"
1463 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1464 "vaddpd %%xmm15, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1465 "vmovlpd %%xmm0, (%%rcx) \n\t" // and store back to memory.
1466 "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t"
1467 "vmovlpd (%%rcx,%%r12), %%xmm0, %%xmm0 \n\t" // load c23 and c33,
1468 "vmovhpd (%%rcx,%%r13), %%xmm0, %%xmm0 \n\t"
1469 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1470 "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1471 "vmovlpd %%xmm0, (%%rcx,%%r12) \n\t" // and store back to memory.
1472 "vmovhpd %%xmm0, (%%rcx,%%r13) \n\t"
1473 " \n\t"
1474 " \n\t" // update c40:c73
1475 " \n\t"
1476 "vextractf128 $1, %%ymm8, %%xmm1 \n\t"
1477 "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c40 and c50,
1478 "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t"
1479 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1480 "vaddpd %%xmm8, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1481 "vmovlpd %%xmm0, (%%rdx) \n\t" // and store back to memory.
1482 "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t"
1483 "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c60 and c70,
1484 "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t"
1485 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1486 "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1487 "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // and store back to memory.
1488 "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t"
1489 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1490 " \n\t"
1491 "vextractf128 $1, %%ymm10, %%xmm1 \n\t"
1492 "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c41 and c51,
1493 "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t"
1494 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1495 "vaddpd %%xmm10, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1496 "vmovlpd %%xmm0, (%%rdx) \n\t" // and store back to memory.
1497 "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t"
1498 "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c61 and c71,
1499 "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t"
1500 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1501 "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1502 "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // and store back to memory.
1503 "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t"
1504 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1505 " \n\t"
1506 "vextractf128 $1, %%ymm12, %%xmm1 \n\t"
1507 "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c42 and c52,
1508 "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t"
1509 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1510 "vaddpd %%xmm12, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1511 "vmovlpd %%xmm0, (%%rdx) \n\t" // and store back to memory.
1512 "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t"
1513 "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c62 and c72,
1514 "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t"
1515 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1516 "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1517 "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // and store back to memory.
1518 "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t"
1519 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1520 " \n\t"
1521 "vextractf128 $1, %%ymm14, %%xmm1 \n\t"
1522 "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load c43 and c53,
1523 "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t"
1524 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1525 "vaddpd %%xmm14, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1526 "vmovlpd %%xmm0, (%%rdx) \n\t" // and store back to memory.
1527 "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t"
1528 "vmovlpd (%%rdx,%%r12), %%xmm0, %%xmm0 \n\t" // load c63 and c73,
1529 "vmovhpd (%%rdx,%%r13), %%xmm0, %%xmm0 \n\t"
1530 "vmulpd %%xmm2, %%xmm0, %%xmm0 \n\t" // scale by beta,
1531 "vaddpd %%xmm1, %%xmm0, %%xmm0 \n\t" // add the gemm result,
1532 "vmovlpd %%xmm0, (%%rdx,%%r12) \n\t" // and store back to memory.
1533 "vmovhpd %%xmm0, (%%rdx,%%r13) \n\t"
1534 " \n\t"
1535 " \n\t"
1536 "jmp .DDONE \n\t" // jump to end.
1537 " \n\t"
1538 " \n\t"
1539 " \n\t"
1540 ".DCOLSTORED: \n\t"
1541 " \n\t" // update c00:c33
1542 " \n\t"
1543 "vmovapd (%%rcx), %%ymm0 \n\t" // load c00:c30,
1544 "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta,
1545 "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result,
1546 "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory.
1547 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1548 " \n\t"
1549 "vmovapd (%%rcx), %%ymm0 \n\t" // load c01:c31,
1550 "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta,
1551 "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result,
1552 "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory.
1553 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1554 " \n\t"
1555 "vmovapd (%%rcx), %%ymm0 \n\t" // load c02:c32,
1556 "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta,
1557 "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result,
1558 "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory.
1559 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1560 " \n\t"
1561 "vmovapd (%%rcx), %%ymm0 \n\t" // load c03:c33,
1562 "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta,
1563 "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result,
1564 "vmovapd %%ymm0, (%%rcx) \n\t" // and store back to memory.
1565 " \n\t"
1566 " \n\t" // update c40:c73
1567 " \n\t"
1568 "vmovapd (%%rdx), %%ymm0 \n\t" // load c40:c70,
1569 "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta,
1570 "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result,
1571 "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory.
1572 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1573 " \n\t"
1574 "vmovapd (%%rdx), %%ymm0 \n\t" // load c41:c71,
1575 "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta,
1576 "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result,
1577 "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory.
1578 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1579 " \n\t"
1580 "vmovapd (%%rdx), %%ymm0 \n\t" // load c42:c72,
1581 "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta,
1582 "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result,
1583 "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory.
1584 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1585 " \n\t"
1586 "vmovapd (%%rdx), %%ymm0 \n\t" // load c43:c73,
1587 "vmulpd %%ymm2, %%ymm0, %%ymm0 \n\t" // scale by beta,
1588 "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result,
1589 "vmovapd %%ymm0, (%%rdx) \n\t" // and store back to memory.
1590 " \n\t"
1591 " \n\t"
1592 "jmp .DDONE \n\t" // jump to end.
1593 " \n\t"
1594 " \n\t"
1595 " \n\t"
1596 " \n\t"
1597 ".DBETAZERO: \n\t"
1598 " \n\t" // check if aligned/column-stored
1599 "andb %%bl, %%bh                   \n\t" // bh &= bl; ZF set if result is zero.
1600 "andb %%bh, %%al                   \n\t" // al &= bh; ZF set if result is zero.
1601 "jne .DCOLSTORBZ \n\t" // jump to column storage case
1602 " \n\t"
1603 " \n\t"
1604 " \n\t"
1605 ".DGENSTORBZ: \n\t"
1606 " \n\t" // update c00:c33
1607 " \n\t"
1608 "vextractf128 $1, %%ymm9, %%xmm1 \n\t"
1609 "vmovlpd %%xmm9, (%%rcx) \n\t" // store to c00:c30
1610 "vmovhpd %%xmm9, (%%rcx,%%rsi) \n\t"
1611 "vmovlpd %%xmm1, (%%rcx,%%r12) \n\t"
1612 "vmovhpd %%xmm1, (%%rcx,%%r13) \n\t"
1613 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1614 " \n\t"
1615 "vextractf128 $1, %%ymm11, %%xmm1 \n\t"
1616 "vmovlpd %%xmm11, (%%rcx) \n\t" // store to c01:c31
1617 "vmovhpd %%xmm11, (%%rcx,%%rsi) \n\t"
1618 "vmovlpd %%xmm1, (%%rcx,%%r12) \n\t"
1619 "vmovhpd %%xmm1, (%%rcx,%%r13) \n\t"
1620 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1621 " \n\t"
1622 "vextractf128 $1, %%ymm13, %%xmm1 \n\t"
1623 "vmovlpd %%xmm13, (%%rcx) \n\t" // store to c02:c32
1624 "vmovhpd %%xmm13, (%%rcx,%%rsi) \n\t"
1625 "vmovlpd %%xmm1, (%%rcx,%%r12) \n\t"
1626 "vmovhpd %%xmm1, (%%rcx,%%r13) \n\t"
1627 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1628 " \n\t"
1629 "vextractf128 $1, %%ymm15, %%xmm1 \n\t"
1630 "vmovlpd %%xmm15, (%%rcx) \n\t" // store to c03:c33
1631 "vmovhpd %%xmm15, (%%rcx,%%rsi) \n\t"
1632 "vmovlpd %%xmm1, (%%rcx,%%r12) \n\t"
1633 "vmovhpd %%xmm1, (%%rcx,%%r13) \n\t"
1634 " \n\t"
1635 " \n\t" // update c40:c73
1636 " \n\t"
1637 "vextractf128 $1, %%ymm8, %%xmm1 \n\t"
1638 "vmovlpd %%xmm8, (%%rdx) \n\t" // store to c40:c70
1639 "vmovhpd %%xmm8, (%%rdx,%%rsi) \n\t"
1640 "vmovlpd %%xmm1, (%%rdx,%%r12) \n\t"
1641 "vmovhpd %%xmm1, (%%rdx,%%r13) \n\t"
1642 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1643 " \n\t"
1644 "vextractf128 $1, %%ymm10, %%xmm1 \n\t"
1645 "vmovlpd %%xmm10, (%%rdx) \n\t" // store to c41:c71
1646 "vmovhpd %%xmm10, (%%rdx,%%rsi) \n\t"
1647 "vmovlpd %%xmm1, (%%rdx,%%r12) \n\t"
1648 "vmovhpd %%xmm1, (%%rdx,%%r13) \n\t"
1649 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1650 " \n\t"
1651 "vextractf128 $1, %%ymm12, %%xmm1 \n\t"
1652 "vmovlpd %%xmm12, (%%rdx) \n\t" // store to c42:c72
1653 "vmovhpd %%xmm12, (%%rdx,%%rsi) \n\t"
1654 "vmovlpd %%xmm1, (%%rdx,%%r12) \n\t"
1655 "vmovhpd %%xmm1, (%%rdx,%%r13) \n\t"
1656 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1657 " \n\t"
1658 "vextractf128 $1, %%ymm14, %%xmm1 \n\t"
1659 "vmovlpd %%xmm14, (%%rdx) \n\t" // store to c43:c73
1660 "vmovhpd %%xmm14, (%%rdx,%%rsi) \n\t"
1661 "vmovlpd %%xmm1, (%%rdx,%%r12) \n\t"
1662 "vmovhpd %%xmm1, (%%rdx,%%r13) \n\t"
1663 " \n\t"
1664 " \n\t"
1665 "jmp .DDONE \n\t" // jump to end.
1666 " \n\t"
1667 " \n\t"
1668 " \n\t"
1669 ".DCOLSTORBZ: \n\t"
1670 " \n\t" // update c00:c33
1671 " \n\t"
1672 "vmovapd %%ymm9, (%%rcx) \n\t" // store c00:c30
1673 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1674 " \n\t"
1675 "vmovapd %%ymm11, (%%rcx) \n\t" // store c01:c31
1676 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1677 " \n\t"
1678 "vmovapd %%ymm13, (%%rcx) \n\t" // store c02:c32
1679 "addq %%rdi, %%rcx \n\t" // c += cs_c;
1680 " \n\t"
1681 "vmovapd %%ymm15, (%%rcx) \n\t" // store c03:c33
1682 " \n\t"
1683 " \n\t" // update c40:c73
1684 " \n\t"
1685 "vmovapd %%ymm8, (%%rdx) \n\t" // store c40:c70
1686 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1687 " \n\t"
1688 "vmovapd %%ymm10, (%%rdx) \n\t" // store c41:c71
1689 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1690 " \n\t"
1691 "vmovapd %%ymm12, (%%rdx) \n\t" // store c42:c72
1692 "addq %%rdi, %%rdx \n\t" // c += cs_c;
1693 " \n\t"
1694 "vmovapd %%ymm14, (%%rdx) \n\t" // store c43:c73
1695 " \n\t"
1696 " \n\t"
1697 " \n\t"
1698 " \n\t"
1699 " \n\t"
1700 ".DDONE: \n\t"
1701 " \n\t"
1702 // "vzeroupper \n\t"
1703 " \n\t"
1704
1705 : // output operands (none)
1706 : // input operands
1707 "m" (k_iter), // 0
1708 "m" (k_left), // 1
1709 "m" (a), // 2
1710 "m" (b), // 3
1711 "m" (alpha), // 4
1712 "m" (beta), // 5
1713 "m" (c), // 6
1714 "m" (rs_c), // 7
1715 "m" (cs_c), // 8
1716 "m" (b_next)/*, // 9
1717 "m" (a_next)*/ // 10
1718 : // register clobber list
1719 "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
1720 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
1721 "xmm0", "xmm1", "xmm2", "xmm3",
1722 "xmm4", "xmm5", "xmm6", "xmm7",
1723 "xmm8", "xmm9", "xmm10", "xmm11",
1724 "xmm12", "xmm13", "xmm14", "xmm15",
1725 "memory"
1726 );
1727 }
1728
bli_cgemm_asm_8x4(dim_t k,scomplex * restrict alpha,scomplex * restrict a,scomplex * restrict b,scomplex * restrict beta,scomplex * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)1729 void bli_cgemm_asm_8x4
1730 (
1731 dim_t k,
1732 scomplex* restrict alpha,
1733 scomplex* restrict a,
1734 scomplex* restrict b,
1735 scomplex* restrict beta,
1736 scomplex* restrict c, inc_t rs_c, inc_t cs_c,
1737 auxinfo_t* restrict data,
1738 cntx_t* restrict cntx
1739 )
1740 {
1741 //void* a_next = bli_auxinfo_next_a( data );
1742 void* b_next = bli_auxinfo_next_b( data );
1743
1744 uint64_t k_iter = k / 4;
1745 uint64_t k_left = k % 4;
1746
1747 __asm__ volatile
1748 (
1749 " \n\t"
1750 " \n\t"
1751 "movq %2, %%rax \n\t" // load address of a.
1752 "movq %3, %%rbx \n\t" // load address of b.
1753 "movq %9, %%r15 \n\t" // load address of b_next.
1754 //"movq %10, %%r14 \n\t" // load address of a_next.
1755 "addq $-4 * 64, %%r15 \n\t"
1756 " \n\t"
1757 "vmovaps 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading
1758 "vmovsldup 0 * 32(%%rbx), %%ymm2 \n\t"
1759 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
1760 " \n\t"
1761 "movq %6, %%rcx \n\t" // load address of c
1762 "movq %8, %%rdi \n\t" // load cs_c
1763 "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(scomplex)
1764 "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c;
1765 " \n\t"
1766 "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
1767 "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
1768 "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c
1769 "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
1770 " \n\t"
1771 "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t"
1772 "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t"
1773 "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t"
1774 "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t"
1775 "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t"
1776 "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t"
1777 "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t"
1778 "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
1779 " \n\t"
1780 " \n\t"
1781 " \n\t"
1782 "movq %0, %%rsi \n\t" // i = k_iter;
1783 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
1784 "je .CCONSIDKLEFT \n\t" // if i == 0, jump to code that
1785 " \n\t" // contains the k_left loop.
1786 " \n\t"
1787 " \n\t"
1788 ".CLOOPKITER: \n\t" // MAIN LOOP
1789 " \n\t"
1790 "addq $4 * 4 * 8, %%r15 \n\t" // b_next += 4*4 (unroll x nr)
1791 " \n\t"
1792 " \n\t" // iteration 0
1793 "prefetcht0 8 * 32(%%rax) \n\t"
1794 "vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
1795 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
1796 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1797 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
1798 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1799 "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t"
1800 "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t"
1801 " \n\t"
1802 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
1803 "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
1804 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
1805 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
1806 "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t"
1807 "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t"
1808 " \n\t"
1809 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
1810 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
1811 "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t"
1812 "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t"
1813 "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t"
1814 " \n\t"
1815 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
1816 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1817 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
1818 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1819 "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t"
1820 "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t"
1821 "prefetcht0 0 * 32(%%r15) \n\t" // prefetch b_next[0*4]
1822 " \n\t"
1823 "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t"
1824 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
1825 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
1826 "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t"
1827 "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t"
1828 " \n\t"
1829 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
1830 "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t"
1831 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
1832 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
1833 "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t"
1834 "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t"
1835 " \n\t"
1836 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
1837 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
1838 "vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
1839 "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t"
1840 "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t"
1841 " \n\t"
1842 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
1843 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
1844 "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t"
1845 "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t"
1846 " \n\t"
1847 " \n\t"
1848 " \n\t" // iteration 1
1849 "prefetcht0 10 * 32(%%rax) \n\t"
1850 "vmovaps 3 * 32(%%rax), %%ymm1 \n\t"
1851 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
1852 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1853 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
1854 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1855 "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t"
1856 "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t"
1857 " \n\t"
1858 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
1859 "vmovshdup 1 * 32(%%rbx), %%ymm2 \n\t"
1860 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
1861 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
1862 "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t"
1863 "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t"
1864 " \n\t"
1865 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
1866 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
1867 "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t"
1868 "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t"
1869 "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t"
1870 " \n\t"
1871 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
1872 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1873 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
1874 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1875 "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t"
1876 "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t"
1877 " \n\t"
1878 "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t"
1879 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
1880 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
1881 "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t"
1882 "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t"
1883 " \n\t"
1884 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
1885 "vmovsldup 2 * 32(%%rbx), %%ymm2 \n\t"
1886 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
1887 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
1888 "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t"
1889 "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t"
1890 " \n\t"
1891 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
1892 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
1893 "vmovaps 4 * 32(%%rax), %%ymm0 \n\t"
1894 "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t"
1895 "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t"
1896 " \n\t"
1897 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
1898 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
1899 "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t"
1900 "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t"
1901 " \n\t"
1902 " \n\t"
1903 " \n\t" // iteration 2
1904 "prefetcht0 12 * 32(%%rax) \n\t"
1905 "vmovaps 5 * 32(%%rax), %%ymm1 \n\t"
1906 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
1907 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1908 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
1909 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1910 "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t"
1911 "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t"
1912 " \n\t"
1913 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
1914 "vmovshdup 2 * 32(%%rbx), %%ymm2 \n\t"
1915 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
1916 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
1917 "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t"
1918 "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t"
1919 " \n\t"
1920 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
1921 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
1922 "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t"
1923 "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t"
1924 "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t"
1925 " \n\t"
1926 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
1927 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1928 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
1929 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1930 "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t"
1931 "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t"
1932 "prefetcht0 2 * 32(%%r15) \n\t" // prefetch b_next[2*4]
1933 " \n\t"
1934 "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t"
1935 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
1936 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
1937 "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t"
1938 "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t"
1939 " \n\t"
1940 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
1941 "vmovsldup 3 * 32(%%rbx), %%ymm2 \n\t"
1942 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
1943 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
1944 "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t"
1945 "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t"
1946 " \n\t"
1947 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
1948 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
1949 "vmovaps 6 * 32(%%rax), %%ymm0 \n\t"
1950 "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t"
1951 "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t"
1952 " \n\t"
1953 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
1954 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
1955 "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t"
1956 "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t"
1957 " \n\t"
1958 " \n\t"
1959 " \n\t" // iteration 3
1960 "prefetcht0 14 * 32(%%rax) \n\t"
1961 "vmovaps 7 * 32(%%rax), %%ymm1 \n\t"
1962 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
1963 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1964 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
1965 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1966 "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t"
1967 "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t"
1968 " \n\t"
1969 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
1970 "vmovshdup 3 * 32(%%rbx), %%ymm2 \n\t"
1971 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
1972 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
1973 "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t"
1974 "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t"
1975 " \n\t"
1976 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
1977 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
1978 "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t"
1979 "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t"
1980 "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t"
1981 " \n\t"
1982 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
1983 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
1984 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
1985 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
1986 "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t"
1987 "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t"
1988 " \n\t"
1989 "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t"
1990 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
1991 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
1992 "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t"
1993 "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t"
1994 " \n\t"
1995 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
1996 "vmovsldup 4 * 32(%%rbx), %%ymm2 \n\t"
1997 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
1998 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
1999 "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t"
2000 "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t"
2001 " \n\t"
2002 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
2003 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
2004 "vmovaps 8 * 32(%%rax), %%ymm0 \n\t"
2005 "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t"
2006 "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t"
2007 " \n\t"
2008 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
2009 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
2010 "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t"
2011 "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t"
2012 " \n\t"
2013 " \n\t"
2014 "addq $8 * 4 * 8, %%rax \n\t" // a += 8*4 (unroll x mr)
2015 "addq $4 * 4 * 8, %%rbx \n\t" // b += 4*4 (unroll x nr)
2016 " \n\t"
2017 " \n\t"
2018 "decq %%rsi \n\t" // i -= 1;
2019 "jne .CLOOPKITER \n\t" // iterate again if i != 0.
2020 " \n\t"
2021 " \n\t"
2022 " \n\t"
2023 " \n\t"
2024 " \n\t"
2025 " \n\t"
2026 ".CCONSIDKLEFT: \n\t"
2027 " \n\t"
2028 "movq %1, %%rsi \n\t" // i = k_left;
2029 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
2030 "je .CPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
2031 " \n\t" // else, we prepare to enter k_left loop.
2032 " \n\t"
2033 " \n\t"
2034 ".CLOOPKLEFT: \n\t" // EDGE LOOP
2035 " \n\t"
2036 " \n\t" // iteration 0
2037 "prefetcht0 8 * 32(%%rax) \n\t"
2038 "vmovaps 1 * 32(%%rax), %%ymm1 \n\t"
2039 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
2040 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2041 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
2042 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2043 "vaddps %%ymm6, %%ymm15, %%ymm15 \n\t"
2044 "vaddps %%ymm7, %%ymm13, %%ymm13 \n\t"
2045 " \n\t"
2046 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
2047 "vmovshdup 0 * 32(%%rbx), %%ymm2 \n\t"
2048 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
2049 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
2050 "vaddps %%ymm6, %%ymm14, %%ymm14 \n\t"
2051 "vaddps %%ymm7, %%ymm12, %%ymm12 \n\t"
2052 " \n\t"
2053 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
2054 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
2055 "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t"
2056 "vaddps %%ymm6, %%ymm11, %%ymm11 \n\t"
2057 "vaddps %%ymm7, %%ymm9, %%ymm9 \n\t"
2058 " \n\t"
2059 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
2060 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2061 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
2062 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2063 "vaddps %%ymm6, %%ymm10, %%ymm10 \n\t"
2064 "vaddps %%ymm7, %%ymm8, %%ymm8 \n\t"
2065 " \n\t"
2066 "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t"
2067 "vmulps %%ymm0, %%ymm2, %%ymm6 \n\t"
2068 "vmulps %%ymm0, %%ymm3, %%ymm7 \n\t"
2069 "vaddsubps %%ymm6, %%ymm15, %%ymm15 \n\t"
2070 "vaddsubps %%ymm7, %%ymm13, %%ymm13 \n\t"
2071 " \n\t"
2072 "vmulps %%ymm1, %%ymm2, %%ymm6 \n\t"
2073 "vmovsldup 1 * 32(%%rbx), %%ymm2 \n\t"
2074 "vmulps %%ymm1, %%ymm3, %%ymm7 \n\t"
2075 "vpermilps $0x4e, %%ymm2, %%ymm3 \n\t"
2076 "vaddsubps %%ymm6, %%ymm14, %%ymm14 \n\t"
2077 "vaddsubps %%ymm7, %%ymm12, %%ymm12 \n\t"
2078 " \n\t"
2079 "vmulps %%ymm0, %%ymm4, %%ymm6 \n\t"
2080 "vmulps %%ymm0, %%ymm5, %%ymm7 \n\t"
2081 "vmovaps 2 * 32(%%rax), %%ymm0 \n\t"
2082 "vaddsubps %%ymm6, %%ymm11, %%ymm11 \n\t"
2083 "vaddsubps %%ymm7, %%ymm9, %%ymm9 \n\t"
2084 " \n\t"
2085 "vmulps %%ymm1, %%ymm4, %%ymm6 \n\t"
2086 "vmulps %%ymm1, %%ymm5, %%ymm7 \n\t"
2087 "vaddsubps %%ymm6, %%ymm10, %%ymm10 \n\t"
2088 "vaddsubps %%ymm7, %%ymm8, %%ymm8 \n\t"
2089 " \n\t"
2090 " \n\t"
2091 "addq $8 * 1 * 8, %%rax \n\t" // a += 8 (1 x mr)
2092 "addq $4 * 1 * 8, %%rbx \n\t" // b += 4 (1 x nr)
2093 " \n\t"
2094 " \n\t"
2095 "decq %%rsi \n\t" // i -= 1;
2096 "jne .CLOOPKLEFT \n\t" // iterate again if i != 0.
2097 " \n\t"
2098 " \n\t"
2099 " \n\t"
2100 ".CPOSTACCUM: \n\t"
2101 " \n\t"
2102 " \n\t" // ymm15: ymm13: ymm11: ymm9:
2103 " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
2104 " \n\t" // ab10 ab11 ab12 ab13
2105 " \n\t" // ab21 ab20 ab23 ab22
2106 " \n\t" // ab31 ab30 ab33 ab32
2107 " \n\t" // ab42 ab43 ab40 ab41
2108 " \n\t" // ab52 ab53 ab50 ab51
2109 " \n\t" // ab63 ab62 ab61 ab60
2110 " \n\t" // ab73 ) ab72 ) ab71 ) ab70 )
2111 " \n\t"
2112 " \n\t" // ymm14: ymm12: ymm10: ymm8:
2113 " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83
2114 " \n\t" // ab90 ab91 ab92 ab93
2115 " \n\t" // aba1 aba0 aba3 aba2
2116 " \n\t" // abb1 abb0 abb3 abb2
2117 " \n\t" // abc2 abc3 abc0 abc1
2118 " \n\t" // abd2 abd3 abd0 abd1
2119 " \n\t" // abe3 abe2 abe1 abe0
2120 " \n\t" // abf3 abf2 abf1 abf0 )
2121 " \n\t"
2122 "vmovaps %%ymm15, %%ymm7 \n\t"
2123 "vshufps $0xe4, %%ymm13, %%ymm15, %%ymm15 \n\t"
2124 "vshufps $0xe4, %%ymm7, %%ymm13, %%ymm13 \n\t"
2125 " \n\t"
2126 "vmovaps %%ymm11, %%ymm7 \n\t"
2127 "vshufps $0xe4, %%ymm9, %%ymm11, %%ymm11 \n\t"
2128 "vshufps $0xe4, %%ymm7, %%ymm9, %%ymm9 \n\t"
2129 " \n\t"
2130 "vmovaps %%ymm14, %%ymm7 \n\t"
2131 "vshufps $0xe4, %%ymm12, %%ymm14, %%ymm14 \n\t"
2132 "vshufps $0xe4, %%ymm7, %%ymm12, %%ymm12 \n\t"
2133 " \n\t"
2134 "vmovaps %%ymm10, %%ymm7 \n\t"
2135 "vshufps $0xe4, %%ymm8, %%ymm10, %%ymm10 \n\t"
2136 "vshufps $0xe4, %%ymm7, %%ymm8, %%ymm8 \n\t"
2137 " \n\t"
2138 " \n\t" // ymm15: ymm13: ymm11: ymm9:
2139 " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
2140 " \n\t" // ab10 ab11 ab12 ab13
2141 " \n\t" // ab20 ab21 ab22 ab23
2142 " \n\t" // ab30 ab31 ab32 ab33
2143 " \n\t" // ab42 ab43 ab40 ab41
2144 " \n\t" // ab52 ab53 ab50 ab51
2145 " \n\t" // ab62 ab63 ab60 ab61
2146 " \n\t" // ab72 ) ab73 ) ab70 ) ab71 )
2147 " \n\t"
2148 " \n\t" // ymm14: ymm12: ymm10: ymm8:
2149 " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83
2150 " \n\t" // ab90 ab91 ab92 ab93
2151 " \n\t" // aba0 aba1 aba2 aba3
2152 " \n\t" // abb0 abb1 abb2 abb3
2153 " \n\t" // abc2 abc3 abc0 abc1
2154 " \n\t" // abd2 abd3 abd0 abd1
2155 " \n\t" // abe2 abe3 abe0 abe1
2156 " \n\t" // abf2 ) abf3 ) abf0 ) abf1 )
2157 " \n\t"
2158 "vmovaps %%ymm15, %%ymm7 \n\t"
2159 "vperm2f128 $0x12, %%ymm15, %%ymm11, %%ymm15 \n\t"
2160 "vperm2f128 $0x30, %%ymm7, %%ymm11, %%ymm11 \n\t"
2161 " \n\t"
2162 "vmovaps %%ymm13, %%ymm7 \n\t"
2163 "vperm2f128 $0x12, %%ymm13, %%ymm9, %%ymm13 \n\t"
2164 "vperm2f128 $0x30, %%ymm7, %%ymm9, %%ymm9 \n\t"
2165 " \n\t"
2166 "vmovaps %%ymm14, %%ymm7 \n\t"
2167 "vperm2f128 $0x12, %%ymm14, %%ymm10, %%ymm14 \n\t"
2168 "vperm2f128 $0x30, %%ymm7, %%ymm10, %%ymm10 \n\t"
2169 " \n\t"
2170 "vmovaps %%ymm12, %%ymm7 \n\t"
2171 "vperm2f128 $0x12, %%ymm12, %%ymm8, %%ymm12 \n\t"
2172 "vperm2f128 $0x30, %%ymm7, %%ymm8, %%ymm8 \n\t"
2173 " \n\t"
2174 " \n\t" // ymm15: ymm13: ymm11: ymm9:
2175 " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
2176 " \n\t" // ab10 ab11 ab12 ab13
2177 " \n\t" // ab20 ab21 ab22 ab23
2178 " \n\t" // ab30 ab31 ab32 ab33
2179 " \n\t" // ab40 ab41 ab42 ab43
2180 " \n\t" // ab50 ab51 ab52 ab53
2181 " \n\t" // ab60 ab61 ab62 ab63
2182 " \n\t" // ab70 ) ab71 ) ab72 ) ab73 )
2183 " \n\t"
2184 " \n\t" // ymm14: ymm12: ymm10: ymm8:
2185 " \n\t" // ( ab80 ( ab81 ( ab82 ( ab83
2186 " \n\t" // ab90 ab91 ab92 ab93
2187 " \n\t" // aba0 aba1 aba2 aba3
2188 " \n\t" // abb0 abb1 abb2 abb3
2189 " \n\t" // abc0 abc1 abc2 abc3
2190 " \n\t" // abd0 abd1 abd2 abd3
2191 " \n\t" // abe0 abe1 abe2 abe3
2192 " \n\t" // abf0 ) abf1 ) abf2 ) abf3 )
2193 " \n\t"
2194 " \n\t"
2195 " \n\t"
2196 " \n\t"
2197 " \n\t" // scale by alpha
2198 " \n\t"
2199 "movq %4, %%rax \n\t" // load address of alpha
2200 "vbroadcastss (%%rax), %%ymm7 \n\t" // load alpha_r and duplicate
2201 "vbroadcastss 4(%%rax), %%ymm6 \n\t" // load alpha_i and duplicate
2202 " \n\t"
2203 "vpermilps $0xb1, %%ymm15, %%ymm3 \n\t"
2204 "vmulps %%ymm7, %%ymm15, %%ymm15 \n\t"
2205 "vmulps %%ymm6, %%ymm3, %%ymm3 \n\t"
2206 "vaddsubps %%ymm3, %%ymm15, %%ymm15 \n\t"
2207 " \n\t"
2208 "vpermilps $0xb1, %%ymm14, %%ymm2 \n\t"
2209 "vmulps %%ymm7, %%ymm14, %%ymm14 \n\t"
2210 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2211 "vaddsubps %%ymm2, %%ymm14, %%ymm14 \n\t"
2212 " \n\t"
2213 "vpermilps $0xb1, %%ymm13, %%ymm1 \n\t"
2214 "vmulps %%ymm7, %%ymm13, %%ymm13 \n\t"
2215 "vmulps %%ymm6, %%ymm1, %%ymm1 \n\t"
2216 "vaddsubps %%ymm1, %%ymm13, %%ymm13 \n\t"
2217 " \n\t"
2218 "vpermilps $0xb1, %%ymm12, %%ymm0 \n\t"
2219 "vmulps %%ymm7, %%ymm12, %%ymm12 \n\t"
2220 "vmulps %%ymm6, %%ymm0, %%ymm0 \n\t"
2221 "vaddsubps %%ymm0, %%ymm12, %%ymm12 \n\t"
2222 " \n\t"
2223 "vpermilps $0xb1, %%ymm11, %%ymm3 \n\t"
2224 "vmulps %%ymm7, %%ymm11, %%ymm11 \n\t"
2225 "vmulps %%ymm6, %%ymm3, %%ymm3 \n\t"
2226 "vaddsubps %%ymm3, %%ymm11, %%ymm11 \n\t"
2227 " \n\t"
2228 "vpermilps $0xb1, %%ymm10, %%ymm2 \n\t"
2229 "vmulps %%ymm7, %%ymm10, %%ymm10 \n\t"
2230 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2231 "vaddsubps %%ymm2, %%ymm10, %%ymm10 \n\t"
2232 " \n\t"
2233 "vpermilps $0xb1, %%ymm9, %%ymm1 \n\t"
2234 "vmulps %%ymm7, %%ymm9, %%ymm9 \n\t"
2235 "vmulps %%ymm6, %%ymm1, %%ymm1 \n\t"
2236 "vaddsubps %%ymm1, %%ymm9, %%ymm9 \n\t"
2237 " \n\t"
2238 "vpermilps $0xb1, %%ymm8, %%ymm0 \n\t"
2239 "vmulps %%ymm7, %%ymm8, %%ymm8 \n\t"
2240 "vmulps %%ymm6, %%ymm0, %%ymm0 \n\t"
2241 "vaddsubps %%ymm0, %%ymm8, %%ymm8 \n\t"
2242 " \n\t"
2243 " \n\t"
2244 " \n\t"
2245 " \n\t"
2246 "movq %5, %%rbx \n\t" // load address of beta
2247 "vbroadcastss (%%rbx), %%ymm7 \n\t" // load beta_r and duplicate
2248 "vbroadcastss 4(%%rbx), %%ymm6 \n\t" // load beta_i and duplicate
2249 " \n\t"
2250 " \n\t"
2251 " \n\t"
2252 " \n\t"
2253 " \n\t"
2254 " \n\t"
2255 " \n\t"
2256 "movq %7, %%rsi \n\t" // load rs_c
2257 "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(scomplex)
2258 " \n\t"
2259 "leaq (%%rcx,%%rsi,4), %%rdx \n\t" // load address of c + 4*rs_c;
2260 " \n\t"
2261 "leaq (,%%rsi,2), %%r12 \n\t" // r12 = 2*rs_c;
2262 "leaq (%%r12,%%rsi,1), %%r13 \n\t" // r13 = 3*rs_c;
2263 " \n\t"
2264 " \n\t"
2265 " \n\t"
2266 " \n\t" // determine if
2267 " \n\t" // c % 32 == 0, AND
2268 " \n\t" // 8*cs_c % 32 == 0, AND
2269 " \n\t" // rs_c == 1
2270 " \n\t" // ie: aligned, ldim aligned, and
2271 " \n\t" // column-stored
2272 " \n\t"
2273 "cmpq $8, %%rsi \n\t" // set ZF if (8*rs_c) == 8.
2274 "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
2275 "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero.
2276 "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 );
2277 "testq $31, %%rdi \n\t" // set ZF if (8*cs_c) & 32 is zero.
2278 "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 );
2279 " \n\t" // and(bl,bh) followed by
2280 " \n\t" // and(bh,al) will reveal result
2281 " \n\t"
2282 " \n\t" // now avoid loading C if beta == 0
2283 " \n\t"
2284 "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
2285 "vucomiss %%xmm0, %%xmm7 \n\t" // set ZF if beta_r == 0.
2286 "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 );
2287 "vucomiss %%xmm0, %%xmm6 \n\t" // set ZF if beta_i == 0.
2288 "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 );
2289 "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1.
2290 "jne .CBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case
2291 " \n\t"
2292 " \n\t"
2293 " \n\t" // check if aligned/column-stored
2294 "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
2295 "andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
2296 "jne .CCOLSTORED \n\t" // jump to column storage case
2297 " \n\t"
2298 " \n\t"
2299 " \n\t"
2300 ".CGENSTORED: \n\t"
2301 " \n\t"
2302 " \n\t" // update c00:c70
2303 " \n\t"
2304 "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c00,10) into xmm0[0:1]
2305 "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c20,30) into xmm0[2:3]
2306 "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c40,50) into xmm2[0:1]
2307 "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c60,70) into xmm2[2:3]
2308 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2)
2309 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2310 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2311 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2312 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2313 "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2314 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7]
2315 "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c00,c10)
2316 "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c20,c30)
2317 "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c40,c50)
2318 "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c60,c70)
2319 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2320 " \n\t"
2321 " \n\t" // update c80:cf0
2322 " \n\t"
2323 "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c80,90) into xmm0[0:1]
2324 "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca0,b0) into xmm0[2:3]
2325 "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc0,d0) into xmm2[0:1]
2326 "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce0,f0) into xmm2[2:3]
2327 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2)
2328 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2329 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2330 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2331 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2332 "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2333 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7]
2334 "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c80,c90)
2335 "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca0,cb0)
2336 "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc0,cd0)
2337 "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce0,cf0)
2338 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2339 " \n\t"
2340 " \n\t" // update c01:c71
2341 " \n\t"
2342 "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c01,11) into xmm0[0:1]
2343 "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c21,31) into xmm0[2:3]
2344 "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c41,51) into xmm2[0:1]
2345 "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c61,71) into xmm2[2:3]
2346 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2)
2347 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2348 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2349 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2350 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2351 "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2352 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7]
2353 "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c01,c11)
2354 "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c21,c31)
2355 "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c41,c51)
2356 "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c61,c71)
2357 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2358 " \n\t"
2359 " \n\t" // update c81:cf1
2360 " \n\t"
2361 "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c81,91) into xmm0[0:1]
2362 "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca1,b1) into xmm0[2:3]
2363 "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc1,d1) into xmm2[0:1]
2364 "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce1,f1) into xmm2[2:3]
2365 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2)
2366 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2367 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2368 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2369 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2370 "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2371 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7]
2372 "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c81,c91)
2373 "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca1,cb1)
2374 "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc1,cd1)
2375 "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce1,cf1)
2376 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2377 " \n\t"
2378 " \n\t" // update c02:c72
2379 " \n\t"
2380 "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c02,12) into xmm0[0:1]
2381 "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c22,32) into xmm0[2:3]
2382 "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c42,52) into xmm2[0:1]
2383 "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c62,72) into xmm2[2:3]
2384 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2)
2385 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2386 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2387 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2388 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2389 "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2390 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7]
2391 "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c02,c12)
2392 "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c22,c32)
2393 "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c42,c52)
2394 "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c62,c72)
2395 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2396 " \n\t"
2397 " \n\t" // update c82:cf2
2398 " \n\t"
2399 "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c82,92) into xmm0[0:1]
2400 "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca2,b2) into xmm0[2:3]
2401 "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc2,d2) into xmm2[0:1]
2402 "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce2,f2) into xmm2[2:3]
2403 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2)
2404 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2405 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2406 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2407 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2408 "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2409 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7]
2410 "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c82,c92)
2411 "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca2,cb2)
2412 "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc2,cd2)
2413 "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce2,cf2)
2414 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2415 " \n\t"
2416 " \n\t" // update c03:c73
2417 " \n\t"
2418 "vmovlpd (%%rcx), %%xmm0, %%xmm0 \n\t" // load (c03,13) into xmm0[0:1]
2419 "vmovhpd (%%rcx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (c23,33) into xmm0[2:3]
2420 "vmovlpd (%%rcx,%%r12), %%xmm2, %%xmm2 \n\t" // load (c43,53) into xmm2[0:1]
2421 "vmovhpd (%%rcx,%%r13), %%xmm2, %%xmm2 \n\t" // load (c63,73) into xmm2[2:3]
2422 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2)
2423 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2424 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2425 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2426 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2427 "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2428 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7]
2429 "vmovlpd %%xmm0, (%%rcx) \n\t" // store (c03,c13)
2430 "vmovhpd %%xmm0, (%%rcx,%%rsi) \n\t" // store (c23,c33)
2431 "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c43,c53)
2432 "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c63,c73)
2433 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2434 " \n\t"
2435 " \n\t" // update c83:cf3
2436 " \n\t"
2437 "vmovlpd (%%rdx), %%xmm0, %%xmm0 \n\t" // load (c83,93) into xmm0[0:1]
2438 "vmovhpd (%%rdx,%%rsi), %%xmm0, %%xmm0 \n\t" // load (ca3,b3) into xmm0[2:3]
2439 "vmovlpd (%%rdx,%%r12), %%xmm2, %%xmm2 \n\t" // load (cc3,d3) into xmm2[0:1]
2440 "vmovhpd (%%rdx,%%r13), %%xmm2, %%xmm2 \n\t" // load (ce3,f3) into xmm2[2:3]
2441 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:3],xmm2)
2442 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2443 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2444 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2445 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2446 "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2447 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[4:7]
2448 "vmovlpd %%xmm0, (%%rdx) \n\t" // store (c83,c93)
2449 "vmovhpd %%xmm0, (%%rdx,%%rsi) \n\t" // store (ca3,cb3)
2450 "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc3,cd3)
2451 "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce3,cf3)
2452 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2453 " \n\t"
2454 " \n\t"
2455 " \n\t"
2456 "jmp .CDONE \n\t" // jump to end.
2457 " \n\t"
2458 " \n\t"
2459 " \n\t"
2460 ".CCOLSTORED: \n\t"
2461 " \n\t"
2462 " \n\t" // update c00:c70
2463 " \n\t"
2464 "vmovaps (%%rcx), %%ymm0 \n\t" // load c00:c70 into ymm0
2465 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2466 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2467 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2468 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2469 "vaddps %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2470 "vmovaps %%ymm0, (%%rcx) \n\t" // store c00:c70
2471 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2472 " \n\t"
2473 " \n\t" // update c80:cf0
2474 " \n\t"
2475 "vmovaps (%%rdx), %%ymm0 \n\t" // load c80:f0 into ymm0
2476 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2477 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2478 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2479 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2480 "vaddps %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2481 "vmovaps %%ymm0, (%%rdx) \n\t" // store c80:cf0
2482 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2483 " \n\t"
2484 " \n\t" // update c01:c71
2485 " \n\t"
2486 "vmovaps (%%rcx), %%ymm0 \n\t" // load c01:c71 into ymm0
2487 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2488 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2489 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2490 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2491 "vaddps %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2492 "vmovaps %%ymm0, (%%rcx) \n\t" // store c01:c71
2493 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2494 " \n\t"
2495 " \n\t" // update c81:cf1
2496 " \n\t"
2497 "vmovaps (%%rdx), %%ymm0 \n\t" // load c81:f1 into ymm0
2498 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2499 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2500 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2501 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2502 "vaddps %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2503 "vmovaps %%ymm0, (%%rdx) \n\t" // store c81:cf1
2504 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2505 " \n\t"
2506 " \n\t" // update c02:c72
2507 " \n\t"
2508 "vmovaps (%%rcx), %%ymm0 \n\t" // load c02:c72 into ymm0
2509 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2510 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2511 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2512 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2513 "vaddps %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2514 "vmovaps %%ymm0, (%%rcx) \n\t" // store c02:c72
2515 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2516 " \n\t"
2517 " \n\t" // update c82:cf2
2518 " \n\t"
2519 "vmovaps (%%rdx), %%ymm0 \n\t" // load c82:f2 into ymm0
2520 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2521 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2522 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2523 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2524 "vaddps %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2525 "vmovaps %%ymm0, (%%rdx) \n\t" // store c82:cf2
2526 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2527 " \n\t"
2528 " \n\t" // update c03:c73
2529 " \n\t"
2530 "vmovaps (%%rcx), %%ymm0 \n\t" // load c03:c73 into ymm0
2531 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2532 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2533 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2534 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2535 "vaddps %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2536 "vmovaps %%ymm0, (%%rcx) \n\t" // store c03:c73
2537 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2538 " \n\t"
2539 " \n\t" // update c83:cf3
2540 " \n\t"
2541 "vmovaps (%%rdx), %%ymm0 \n\t" // load c83:f3 into ymm0
2542 "vpermilps $0xb1, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
2543 "vmulps %%ymm7, %%ymm0, %%ymm0 \n\t"
2544 "vmulps %%ymm6, %%ymm2, %%ymm2 \n\t"
2545 "vaddsubps %%ymm2, %%ymm0, %%ymm0 \n\t"
2546 "vaddps %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
2547 "vmovaps %%ymm0, (%%rdx) \n\t" // store c83:cf3
2548 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2549 " \n\t"
2550 " \n\t"
2551 " \n\t"
2552 "jmp .CDONE \n\t" // jump to end.
2553 " \n\t"
2554 " \n\t"
2555 " \n\t"
2556 ".CBETAZERO: \n\t"
2557 " \n\t" // check if aligned/column-stored
2558 " \n\t"
2559 "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
2560 "andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
2561 "jne .CCOLSTORBZ \n\t" // jump to column storage case
2562 " \n\t"
2563 " \n\t"
2564 " \n\t"
2565 ".CGENSTORBZ: \n\t"
2566 " \n\t"
2567 " \n\t" // update c00:c70
2568 " \n\t"
2569 "vextractf128 $1, %%ymm15, %%xmm2 \n\t" // xmm2 := ymm15[4:7]
2570 "vmovlpd %%xmm15, (%%rcx) \n\t" // store (c00,c10)
2571 "vmovhpd %%xmm15, (%%rcx,%%rsi) \n\t" // store (c20,c30)
2572 "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c40,c50)
2573 "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c60,c70)
2574 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2575 " \n\t"
2576 " \n\t" // update c80:cf0
2577 " \n\t"
2578 "vextractf128 $1, %%ymm14, %%xmm2 \n\t" // xmm2 := ymm14[4:7]
2579 "vmovlpd %%xmm14, (%%rdx) \n\t" // store (c80,c90)
2580 "vmovhpd %%xmm14, (%%rdx,%%rsi) \n\t" // store (ca0,cb0)
2581 "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc0,cd0)
2582 "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce0,cf0)
2583 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2584 " \n\t"
2585 " \n\t" // update c01:c71
2586 " \n\t"
2587 "vextractf128 $1, %%ymm13, %%xmm2 \n\t" // xmm2 := ymm13[4:7]
2588 "vmovlpd %%xmm13, (%%rcx) \n\t" // store (c01,c11)
2589 "vmovhpd %%xmm13, (%%rcx,%%rsi) \n\t" // store (c21,c31)
2590 "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c41,c51)
2591 "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c61,c71)
2592 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2593 " \n\t"
2594 " \n\t" // update c81:cf1
2595 " \n\t"
2596 "vextractf128 $1, %%ymm12, %%xmm2 \n\t" // xmm2 := ymm12[4:7]
2597 "vmovlpd %%xmm12, (%%rdx) \n\t" // store (c81,c91)
2598 "vmovhpd %%xmm12, (%%rdx,%%rsi) \n\t" // store (ca1,cb1)
2599 "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc1,cd1)
2600 "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce1,cf1)
2601 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2602 " \n\t"
2603 " \n\t" // update c02:c72
2604 " \n\t"
2605 "vextractf128 $1, %%ymm11, %%xmm2 \n\t" // xmm2 := ymm11[4:7]
2606 "vmovlpd %%xmm11, (%%rcx) \n\t" // store (c02,c12)
2607 "vmovhpd %%xmm11, (%%rcx,%%rsi) \n\t" // store (c22,c32)
2608 "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c42,c52)
2609 "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c62,c72)
2610 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2611 " \n\t"
2612 " \n\t" // update c82:cf2
2613 " \n\t"
2614 "vextractf128 $1, %%ymm10, %%xmm2 \n\t" // xmm2 := ymm10[4:7]
2615 "vmovlpd %%xmm10, (%%rdx) \n\t" // store (c82,c92)
2616 "vmovhpd %%xmm10, (%%rdx,%%rsi) \n\t" // store (ca2,cb2)
2617 "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc2,cd2)
2618 "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce2,cf2)
2619 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2620 " \n\t"
2621 " \n\t" // update c03:c73
2622 " \n\t"
2623 "vextractf128 $1, %%ymm9, %%xmm2 \n\t" // xmm2 := ymm9[4:7]
2624 "vmovlpd %%xmm9, (%%rcx) \n\t" // store (c03,c13)
2625 "vmovhpd %%xmm9, (%%rcx,%%rsi) \n\t" // store (c23,c33)
2626 "vmovlpd %%xmm2, (%%rcx,%%r12) \n\t" // store (c43,c53)
2627 "vmovhpd %%xmm2, (%%rcx,%%r13) \n\t" // store (c63,c73)
2628 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2629 " \n\t"
2630 " \n\t" // update c83:cf3
2631 " \n\t"
2632 "vextractf128 $1, %%ymm8, %%xmm2 \n\t" // xmm2 := ymm8[4:7]
2633 "vmovlpd %%xmm8, (%%rdx) \n\t" // store (c83,c93)
2634 "vmovhpd %%xmm8, (%%rdx,%%rsi) \n\t" // store (ca3,cb3)
2635 "vmovlpd %%xmm2, (%%rdx,%%r12) \n\t" // store (cc3,cd3)
2636 "vmovhpd %%xmm2, (%%rdx,%%r13) \n\t" // store (ce3,cf3)
2637 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2638 " \n\t"
2639 " \n\t"
2640 " \n\t"
2641 "jmp .CDONE \n\t" // jump to end.
2642 " \n\t"
2643 " \n\t"
2644 " \n\t"
2645 ".CCOLSTORBZ: \n\t"
2646 " \n\t"
2647 " \n\t"
2648 "vmovaps %%ymm15, (%%rcx) \n\t" // store c00:c70
2649 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2650 " \n\t"
2651 "vmovaps %%ymm14, (%%rdx) \n\t" // store c80:cf0
2652 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2653 " \n\t"
2654 "vmovaps %%ymm13, (%%rcx) \n\t" // store c01:c71
2655 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2656 " \n\t"
2657 "vmovaps %%ymm12, (%%rdx) \n\t" // store c81:cf1
2658 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2659 " \n\t"
2660 "vmovaps %%ymm11, (%%rcx) \n\t" // store c02:c72
2661 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2662 " \n\t"
2663 "vmovaps %%ymm10, (%%rdx) \n\t" // store c82:cf2
2664 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2665 " \n\t"
2666 "vmovaps %%ymm9, (%%rcx) \n\t" // store c03:c73
2667 "addq %%rdi, %%rcx \n\t" // c += cs_c;
2668 " \n\t"
2669 "vmovaps %%ymm8, (%%rdx) \n\t" // store c83:cf3
2670 "addq %%rdi, %%rdx \n\t" // c += cs_c;
2671 " \n\t"
2672 " \n\t"
2673 " \n\t"
2674 " \n\t"
2675 " \n\t"
2676 ".CDONE: \n\t"
2677 " \n\t"
2678
2679 : // output operands (none)
2680 : // input operands
2681 "m" (k_iter), // 0
2682 "m" (k_left), // 1
2683 "m" (a), // 2
2684 "m" (b), // 3
2685 "m" (alpha), // 4
2686 "m" (beta), // 5
2687 "m" (c), // 6
2688 "m" (rs_c), // 7
2689 "m" (cs_c), // 8
2690 "m" (b_next)/*, // 9
2691 "m" (a_next)*/ // 10
2692 : // register clobber list
2693 "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
2694 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
2695 "xmm0", "xmm1", "xmm2", "xmm3",
2696 "xmm4", "xmm5", "xmm6", "xmm7",
2697 "xmm8", "xmm9", "xmm10", "xmm11",
2698 "xmm12", "xmm13", "xmm14", "xmm15",
2699 "memory"
2700 );
2701 }
2702
2703
2704
bli_zgemm_asm_4x4(dim_t k,dcomplex * restrict alpha,dcomplex * restrict a,dcomplex * restrict b,dcomplex * restrict beta,dcomplex * restrict c,inc_t rs_c,inc_t cs_c,auxinfo_t * restrict data,cntx_t * restrict cntx)2705 void bli_zgemm_asm_4x4
2706 (
2707 dim_t k,
2708 dcomplex* restrict alpha,
2709 dcomplex* restrict a,
2710 dcomplex* restrict b,
2711 dcomplex* restrict beta,
2712 dcomplex* restrict c, inc_t rs_c, inc_t cs_c,
2713 auxinfo_t* restrict data,
2714 cntx_t* restrict cntx
2715 )
2716 {
2717 //void* a_next = bli_auxinfo_next_a( data );
2718 //void* b_next = bli_auxinfo_next_b( data );
2719
2720 uint64_t k_iter = k / 4;
2721 uint64_t k_left = k % 4;
2722
2723 __asm__ volatile
2724 (
2725 " \n\t"
2726 " \n\t"
2727 "movq %2, %%rax \n\t" // load address of a.
2728 "movq %3, %%rbx \n\t" // load address of b.
2729 //"movq %9, %%r15 \n\t" // load address of b_next.
2730 //"movq %10, %%r14 \n\t" // load address of a_next.
2731 " \n\t"
2732 "vmovapd 0 * 32(%%rax), %%ymm0 \n\t" // initialize loop by pre-loading
2733 "vmovddup 0 + 0 * 32(%%rbx), %%ymm2 \n\t"
2734 "vmovddup 0 + 1 * 32(%%rbx), %%ymm3 \n\t"
2735 " \n\t"
2736 "movq %6, %%rcx \n\t" // load address of c
2737 "movq %8, %%rdi \n\t" // load cs_c
2738 "leaq (,%%rdi,8), %%rdi \n\t" // cs_c *= sizeof(dcomplex)
2739 "leaq (,%%rdi,2), %%rdi \n\t"
2740 "leaq (%%rcx,%%rdi,2), %%r10 \n\t" // load address of c + 2*cs_c;
2741 " \n\t"
2742 "prefetcht0 3 * 8(%%rcx) \n\t" // prefetch c + 0*cs_c
2743 "prefetcht0 3 * 8(%%rcx,%%rdi) \n\t" // prefetch c + 1*cs_c
2744 "prefetcht0 3 * 8(%%r10) \n\t" // prefetch c + 2*cs_c
2745 "prefetcht0 3 * 8(%%r10,%%rdi) \n\t" // prefetch c + 3*cs_c
2746 " \n\t"
2747 "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t"
2748 "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t"
2749 "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t"
2750 "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t"
2751 "vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t"
2752 "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t"
2753 "vxorpd %%ymm14, %%ymm14, %%ymm14 \n\t"
2754 "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
2755 " \n\t"
2756 " \n\t"
2757 " \n\t"
2758 "movq %0, %%rsi \n\t" // i = k_iter;
2759 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
2760 "je .ZCONSIDKLEFT \n\t" // if i == 0, jump to code that
2761 " \n\t" // contains the k_left loop.
2762 " \n\t"
2763 " \n\t"
2764 ".ZLOOPKITER: \n\t" // MAIN LOOP
2765 " \n\t"
2766 " \n\t"
2767 " \n\t" // iteration 0
2768 "vmovapd 1 * 32(%%rax), %%ymm1 \n\t"
2769 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
2770 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2771 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
2772 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2773 "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t"
2774 "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t"
2775 " \n\t"
2776 "prefetcht0 16 * 32(%%rax) \n\t"
2777 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
2778 "vmovddup 8 + 0 * 32(%%rbx), %%ymm2 \n\t"
2779 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
2780 "vmovddup 8 + 1 * 32(%%rbx), %%ymm3 \n\t"
2781 "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t"
2782 "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t"
2783 " \n\t"
2784 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
2785 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
2786 "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"
2787 "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t"
2788 "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t"
2789 " \n\t"
2790 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
2791 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2792 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
2793 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2794 "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t"
2795 "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t"
2796 " \n\t"
2797 "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t"
2798 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
2799 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
2800 "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t"
2801 "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t"
2802 " \n\t"
2803 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
2804 "vmovddup 0 + 2 * 32(%%rbx), %%ymm2 \n\t"
2805 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
2806 "vmovddup 0 + 3 * 32(%%rbx), %%ymm3 \n\t"
2807 "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t"
2808 "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t"
2809 " \n\t"
2810 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
2811 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
2812 "vmovapd 2 * 32(%%rax), %%ymm0 \n\t"
2813 "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t"
2814 "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t"
2815 " \n\t"
2816 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
2817 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
2818 "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t"
2819 "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t"
2820 " \n\t"
2821 " \n\t"
2822 " \n\t" // iteration 1
2823 "vmovapd 3 * 32(%%rax), %%ymm1 \n\t"
2824 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
2825 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2826 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
2827 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2828 "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t"
2829 "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t"
2830 " \n\t"
2831 "prefetcht0 18 * 32(%%rax) \n\t"
2832 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
2833 "vmovddup 8 + 2 * 32(%%rbx), %%ymm2 \n\t"
2834 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
2835 "vmovddup 8 + 3 * 32(%%rbx), %%ymm3 \n\t"
2836 "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t"
2837 "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t"
2838 " \n\t"
2839 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
2840 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
2841 "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"
2842 "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t"
2843 "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t"
2844 " \n\t"
2845 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
2846 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2847 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
2848 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2849 "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t"
2850 "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t"
2851 " \n\t"
2852 "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t"
2853 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
2854 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
2855 "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t"
2856 "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t"
2857 " \n\t"
2858 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
2859 "vmovddup 0 + 4 * 32(%%rbx), %%ymm2 \n\t"
2860 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
2861 "vmovddup 0 + 5 * 32(%%rbx), %%ymm3 \n\t"
2862 "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t"
2863 "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t"
2864 " \n\t"
2865 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
2866 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
2867 "vmovapd 4 * 32(%%rax), %%ymm0 \n\t"
2868 "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t"
2869 "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t"
2870 " \n\t"
2871 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
2872 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
2873 "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t"
2874 "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t"
2875 " \n\t"
2876 " \n\t"
2877 " \n\t" // iteration 2
2878 "vmovapd 5 * 32(%%rax), %%ymm1 \n\t"
2879 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
2880 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2881 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
2882 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2883 "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t"
2884 "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t"
2885 " \n\t"
2886 "prefetcht0 20 * 32(%%rax) \n\t"
2887 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
2888 "vmovddup 8 + 4 * 32(%%rbx), %%ymm2 \n\t"
2889 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
2890 "vmovddup 8 + 5 * 32(%%rbx), %%ymm3 \n\t"
2891 "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t"
2892 "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t"
2893 " \n\t"
2894 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
2895 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
2896 "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"
2897 "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t"
2898 "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t"
2899 " \n\t"
2900 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
2901 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2902 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
2903 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2904 "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t"
2905 "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t"
2906 " \n\t"
2907 "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t"
2908 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
2909 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
2910 "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t"
2911 "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t"
2912 " \n\t"
2913 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
2914 "vmovddup 0 + 6 * 32(%%rbx), %%ymm2 \n\t"
2915 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
2916 "vmovddup 0 + 7 * 32(%%rbx), %%ymm3 \n\t"
2917 "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t"
2918 "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t"
2919 " \n\t"
2920 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
2921 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
2922 "vmovapd 6 * 32(%%rax), %%ymm0 \n\t"
2923 "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t"
2924 "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t"
2925 " \n\t"
2926 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
2927 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
2928 "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t"
2929 "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t"
2930 " \n\t"
2931 " \n\t"
2932 " \n\t" // iteration 3
2933 "vmovapd 7 * 32(%%rax), %%ymm1 \n\t"
2934 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
2935 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2936 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
2937 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2938 "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t"
2939 "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t"
2940 " \n\t"
2941 "prefetcht0 22 * 32(%%rax) \n\t"
2942 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
2943 "vmovddup 8 + 6 * 32(%%rbx), %%ymm2 \n\t"
2944 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
2945 "vmovddup 8 + 7 * 32(%%rbx), %%ymm3 \n\t"
2946 "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t"
2947 "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t"
2948 " \n\t"
2949 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
2950 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
2951 "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"
2952 "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t"
2953 "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t"
2954 " \n\t"
2955 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
2956 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
2957 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
2958 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
2959 "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t"
2960 "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t"
2961 " \n\t"
2962 "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t"
2963 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
2964 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
2965 "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t"
2966 "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t"
2967 " \n\t"
2968 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
2969 "vmovddup 0 + 8 * 32(%%rbx), %%ymm2 \n\t"
2970 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
2971 "vmovddup 0 + 9 * 32(%%rbx), %%ymm3 \n\t"
2972 "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t"
2973 "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t"
2974 " \n\t"
2975 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
2976 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
2977 "vmovapd 8 * 32(%%rax), %%ymm0 \n\t"
2978 "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t"
2979 "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t"
2980 " \n\t"
2981 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
2982 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
2983 "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t"
2984 "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t"
2985 " \n\t"
2986 " \n\t"
2987 "addq $4 * 4 * 16, %%rbx \n\t" // b += 4*4 (unroll x nr)
2988 "addq $4 * 4 * 16, %%rax \n\t" // a += 4*4 (unroll x mr)
2989 " \n\t"
2990 " \n\t"
2991 "decq %%rsi \n\t" // i -= 1;
2992 "jne .ZLOOPKITER \n\t" // iterate again if i != 0.
2993 " \n\t"
2994 " \n\t"
2995 " \n\t"
2996 " \n\t"
2997 " \n\t"
2998 " \n\t"
2999 ".ZCONSIDKLEFT: \n\t"
3000 " \n\t"
3001 "movq %1, %%rsi \n\t" // i = k_left;
3002 "testq %%rsi, %%rsi \n\t" // check i via logical AND.
3003 "je .ZPOSTACCUM \n\t" // if i == 0, we're done; jump to end.
3004 " \n\t" // else, we prepare to enter k_left loop.
3005 " \n\t"
3006 " \n\t"
3007 ".ZLOOPKLEFT: \n\t" // EDGE LOOP
3008 " \n\t"
3009 " \n\t" // iteration 0
3010 "vmovapd 1 * 32(%%rax), %%ymm1 \n\t"
3011 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
3012 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
3013 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
3014 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
3015 "vaddpd %%ymm6, %%ymm15, %%ymm15 \n\t"
3016 "vaddpd %%ymm7, %%ymm11, %%ymm11 \n\t"
3017 " \n\t"
3018 "prefetcht0 16 * 32(%%rax) \n\t"
3019 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
3020 "vmovddup 8 + 0 * 32(%%rbx), %%ymm2 \n\t"
3021 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
3022 "vmovddup 8 + 1 * 32(%%rbx), %%ymm3 \n\t"
3023 "vaddpd %%ymm6, %%ymm14, %%ymm14 \n\t"
3024 "vaddpd %%ymm7, %%ymm10, %%ymm10 \n\t"
3025 " \n\t"
3026 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
3027 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
3028 "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t"
3029 "vaddpd %%ymm6, %%ymm13, %%ymm13 \n\t"
3030 "vaddpd %%ymm7, %%ymm9, %%ymm9 \n\t"
3031 " \n\t"
3032 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
3033 "vperm2f128 $0x3, %%ymm2, %%ymm2, %%ymm4 \n\t"
3034 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
3035 "vperm2f128 $0x3, %%ymm3, %%ymm3, %%ymm5 \n\t"
3036 "vaddpd %%ymm6, %%ymm12, %%ymm12 \n\t"
3037 "vaddpd %%ymm7, %%ymm8, %%ymm8 \n\t"
3038 " \n\t"
3039 "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t"
3040 "vmulpd %%ymm0, %%ymm2, %%ymm6 \n\t"
3041 "vmulpd %%ymm0, %%ymm3, %%ymm7 \n\t"
3042 "vaddsubpd %%ymm6, %%ymm15, %%ymm15 \n\t"
3043 "vaddsubpd %%ymm7, %%ymm11, %%ymm11 \n\t"
3044 " \n\t"
3045 "vmulpd %%ymm1, %%ymm2, %%ymm6 \n\t"
3046 "vmovddup 0 + 2 * 32(%%rbx), %%ymm2 \n\t"
3047 "vmulpd %%ymm1, %%ymm3, %%ymm7 \n\t"
3048 "vmovddup 0 + 3 * 32(%%rbx), %%ymm3 \n\t"
3049 "vaddsubpd %%ymm6, %%ymm14, %%ymm14 \n\t"
3050 "vaddsubpd %%ymm7, %%ymm10, %%ymm10 \n\t"
3051 " \n\t"
3052 "vmulpd %%ymm0, %%ymm4, %%ymm6 \n\t"
3053 "vmulpd %%ymm0, %%ymm5, %%ymm7 \n\t"
3054 "vmovapd 2 * 32(%%rax), %%ymm0 \n\t"
3055 "vaddsubpd %%ymm6, %%ymm13, %%ymm13 \n\t"
3056 "vaddsubpd %%ymm7, %%ymm9, %%ymm9 \n\t"
3057 " \n\t"
3058 "vmulpd %%ymm1, %%ymm4, %%ymm6 \n\t"
3059 "vmulpd %%ymm1, %%ymm5, %%ymm7 \n\t"
3060 "vaddsubpd %%ymm6, %%ymm12, %%ymm12 \n\t"
3061 "vaddsubpd %%ymm7, %%ymm8, %%ymm8 \n\t"
3062 " \n\t"
3063 " \n\t"
3064 "addq $4 * 1 * 16, %%rax \n\t" // a += 4 (1 x mr)
3065 "addq $4 * 1 * 16, %%rbx \n\t" // b += 4 (1 x nr)
3066 " \n\t"
3067 " \n\t"
3068 "decq %%rsi \n\t" // i -= 1;
3069 "jne .ZLOOPKLEFT \n\t" // iterate again if i != 0.
3070 " \n\t"
3071 " \n\t"
3072 " \n\t"
3073 ".ZPOSTACCUM: \n\t"
3074 " \n\t"
3075 " \n\t" // ymm15: ymm13: ymm11: ymm9:
3076 " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
3077 " \n\t" // ab10 ab11 ab12 ab13
3078 " \n\t" // ab21 ab20 ab23 ab22
3079 " \n\t" // ab31 ) ab30 ) ab33 ) ab32 )
3080 " \n\t"
3081 " \n\t" // ymm14: ymm12: ymm10: ymm8:
3082 " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
3083 " \n\t" // ab50 ab51 ab52 ab53
3084 " \n\t" // ab61 ab60 ab63 ab62
3085 " \n\t" // ab71 ) ab70 ) ab73 ) ab72 )
3086 " \n\t"
3087 " \n\t"
3088 "vmovapd %%ymm15, %%ymm7 \n\t"
3089 "vperm2f128 $0x12, %%ymm15, %%ymm13, %%ymm15 \n\t"
3090 "vperm2f128 $0x30, %%ymm7, %%ymm13, %%ymm13 \n\t"
3091 " \n\t"
3092 "vmovapd %%ymm11, %%ymm7 \n\t"
3093 "vperm2f128 $0x12, %%ymm11, %%ymm9, %%ymm11 \n\t"
3094 "vperm2f128 $0x30, %%ymm7, %%ymm9, %%ymm9 \n\t"
3095 " \n\t"
3096 "vmovapd %%ymm14, %%ymm7 \n\t"
3097 "vperm2f128 $0x12, %%ymm14, %%ymm12, %%ymm14 \n\t"
3098 "vperm2f128 $0x30, %%ymm7, %%ymm12, %%ymm12 \n\t"
3099 " \n\t"
3100 "vmovapd %%ymm10, %%ymm7 \n\t"
3101 "vperm2f128 $0x12, %%ymm10, %%ymm8, %%ymm10 \n\t"
3102 "vperm2f128 $0x30, %%ymm7, %%ymm8, %%ymm8 \n\t"
3103 " \n\t"
3104 " \n\t"
3105 " \n\t" // ymm15: ymm13: ymm11: ymm9:
3106 " \n\t" // ( ab00 ( ab01 ( ab02 ( ab03
3107 " \n\t" // ab10 ab11 ab12 ab13
3108 " \n\t" // ab20 ab21 ab22 ab23
3109 " \n\t" // ab30 ) ab31 ) ab32 ) ab33 )
3110 " \n\t"
3111 " \n\t" // ymm14: ymm12: ymm10: ymm8:
3112 " \n\t" // ( ab40 ( ab41 ( ab42 ( ab43
3113 " \n\t" // ab50 ab51 ab52 ab53
3114 " \n\t" // ab60 ab61 ab62 ab63
3115 " \n\t" // ab70 ) ab71 ) ab72 ) ab73 )
3116 " \n\t"
3117 " \n\t"
3118 " \n\t" // scale by alpha
3119 " \n\t"
3120 "movq %4, %%rax \n\t" // load address of alpha
3121 "vbroadcastsd (%%rax), %%ymm7 \n\t" // load alpha_r and duplicate
3122 "vbroadcastsd 8(%%rax), %%ymm6 \n\t" // load alpha_i and duplicate
3123 " \n\t"
3124 "vpermilpd $0x5, %%ymm15, %%ymm3 \n\t"
3125 "vmulpd %%ymm7, %%ymm15, %%ymm15 \n\t"
3126 "vmulpd %%ymm6, %%ymm3, %%ymm3 \n\t"
3127 "vaddsubpd %%ymm3, %%ymm15, %%ymm15 \n\t"
3128 " \n\t"
3129 "vpermilpd $0x5, %%ymm14, %%ymm2 \n\t"
3130 "vmulpd %%ymm7, %%ymm14, %%ymm14 \n\t"
3131 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3132 "vaddsubpd %%ymm2, %%ymm14, %%ymm14 \n\t"
3133 " \n\t"
3134 "vpermilpd $0x5, %%ymm13, %%ymm1 \n\t"
3135 "vmulpd %%ymm7, %%ymm13, %%ymm13 \n\t"
3136 "vmulpd %%ymm6, %%ymm1, %%ymm1 \n\t"
3137 "vaddsubpd %%ymm1, %%ymm13, %%ymm13 \n\t"
3138 " \n\t"
3139 "vpermilpd $0x5, %%ymm12, %%ymm0 \n\t"
3140 "vmulpd %%ymm7, %%ymm12, %%ymm12 \n\t"
3141 "vmulpd %%ymm6, %%ymm0, %%ymm0 \n\t"
3142 "vaddsubpd %%ymm0, %%ymm12, %%ymm12 \n\t"
3143 " \n\t"
3144 "vpermilpd $0x5, %%ymm11, %%ymm3 \n\t"
3145 "vmulpd %%ymm7, %%ymm11, %%ymm11 \n\t"
3146 "vmulpd %%ymm6, %%ymm3, %%ymm3 \n\t"
3147 "vaddsubpd %%ymm3, %%ymm11, %%ymm11 \n\t"
3148 " \n\t"
3149 "vpermilpd $0x5, %%ymm10, %%ymm2 \n\t"
3150 "vmulpd %%ymm7, %%ymm10, %%ymm10 \n\t"
3151 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3152 "vaddsubpd %%ymm2, %%ymm10, %%ymm10 \n\t"
3153 " \n\t"
3154 "vpermilpd $0x5, %%ymm9, %%ymm1 \n\t"
3155 "vmulpd %%ymm7, %%ymm9, %%ymm9 \n\t"
3156 "vmulpd %%ymm6, %%ymm1, %%ymm1 \n\t"
3157 "vaddsubpd %%ymm1, %%ymm9, %%ymm9 \n\t"
3158 " \n\t"
3159 "vpermilpd $0x5, %%ymm8, %%ymm0 \n\t"
3160 "vmulpd %%ymm7, %%ymm8, %%ymm8 \n\t"
3161 "vmulpd %%ymm6, %%ymm0, %%ymm0 \n\t"
3162 "vaddsubpd %%ymm0, %%ymm8, %%ymm8 \n\t"
3163 " \n\t"
3164 " \n\t"
3165 " \n\t"
3166 " \n\t"
3167 "movq %5, %%rbx \n\t" // load address of beta
3168 "vbroadcastsd (%%rbx), %%ymm7 \n\t" // load beta_r and duplicate
3169 "vbroadcastsd 8(%%rbx), %%ymm6 \n\t" // load beta_i and duplicate
3170 " \n\t"
3171 " \n\t"
3172 " \n\t"
3173 " \n\t"
3174 " \n\t"
3175 " \n\t"
3176 " \n\t"
3177 "movq %7, %%rsi \n\t" // load rs_c
3178 "leaq (,%%rsi,8), %%rsi \n\t" // rsi = rs_c * sizeof(dcomplex)
3179 "leaq (,%%rsi,2), %%rsi \n\t"
3180 "leaq (%%rcx,%%rsi,2), %%rdx \n\t" // load address of c + 2*rs_c;
3181 " \n\t"
3182 " \n\t"
3183 " \n\t"
3184 " \n\t"
3185 " \n\t"
3186 " \n\t"
3187 " \n\t" // determine if
3188 " \n\t" // c % 32 == 0, AND
3189 " \n\t" // 16*cs_c % 32 == 0, AND
3190 " \n\t" // rs_c == 1
3191 " \n\t" // ie: aligned, ldim aligned, and
3192 " \n\t" // column-stored
3193 " \n\t"
3194 "cmpq $16, %%rsi \n\t" // set ZF if (16*rs_c) == 16.
3195 "sete %%bl \n\t" // bl = ( ZF == 1 ? 1 : 0 );
3196 "testq $31, %%rcx \n\t" // set ZF if c & 32 is zero.
3197 "setz %%bh \n\t" // bh = ( ZF == 0 ? 1 : 0 );
3198 "testq $31, %%rdi \n\t" // set ZF if (16*cs_c) & 32 is zero.
3199 "setz %%al \n\t" // al = ( ZF == 0 ? 1 : 0 );
3200 " \n\t" // and(bl,bh) followed by
3201 " \n\t" // and(bh,al) will reveal result
3202 " \n\t"
3203 " \n\t" // now avoid loading C if beta == 0
3204 " \n\t"
3205 "vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" // set ymm0 to zero.
3206 "vucomisd %%xmm0, %%xmm7 \n\t" // set ZF if beta_r == 0.
3207 "sete %%r8b \n\t" // r8b = ( ZF == 1 ? 1 : 0 );
3208 "vucomisd %%xmm0, %%xmm6 \n\t" // set ZF if beta_i == 0.
3209 "sete %%r9b \n\t" // r9b = ( ZF == 1 ? 1 : 0 );
3210 "andb %%r8b, %%r9b \n\t" // set ZF if r8b & r9b == 1.
3211 "jne .ZBETAZERO \n\t" // if ZF = 0, jump to beta == 0 case
3212 " \n\t"
3213 " \n\t"
3214 " \n\t" // check if aligned/column-stored
3215 "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
3216 "andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
3217 "jne .ZCOLSTORED \n\t" // jump to column storage case
3218 " \n\t"
3219 " \n\t"
3220 " \n\t"
3221 ".ZGENSTORED: \n\t"
3222 " \n\t" // update c00:c30
3223 " \n\t"
3224 "vmovupd (%%rcx), %%xmm0 \n\t" // load (c00,c10) into xmm0
3225 "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c20,c30) into xmm2
3226 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2)
3227 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3228 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3229 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3230 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3231 "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3232 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3]
3233 "vmovupd %%xmm0, (%%rcx) \n\t" // store (c00,c10)
3234 "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c20,c30)
3235 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3236 " \n\t"
3237 " \n\t" // update c40:c70
3238 " \n\t"
3239 "vmovupd (%%rdx), %%xmm0 \n\t" // load (c40,c50) into xmm0
3240 "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c60,c70) into xmm2
3241 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2)
3242 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3243 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3244 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3245 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3246 "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3247 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3]
3248 "vmovupd %%xmm0, (%%rdx) \n\t" // store (c40,c50)
3249 "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c60,c70)
3250 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3251 " \n\t"
3252 " \n\t" // update c01:c31
3253 " \n\t"
3254 "vmovupd (%%rcx), %%xmm0 \n\t" // load (c01,c11) into xmm0
3255 "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c21,c31) into xmm2
3256 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2)
3257 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3258 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3259 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3260 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3261 "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3262 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3]
3263 "vmovupd %%xmm0, (%%rcx) \n\t" // store (c01,c11)
3264 "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c21,c31)
3265 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3266 " \n\t"
3267 " \n\t" // update c41:c71
3268 " \n\t"
3269 "vmovupd (%%rdx), %%xmm0 \n\t" // load (c41,c51) into xmm0
3270 "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c61,c71) into xmm2
3271 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2)
3272 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3273 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3274 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3275 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3276 "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3277 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3]
3278 "vmovupd %%xmm0, (%%rdx) \n\t" // store (c41,c51)
3279 "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c61,c71)
3280 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3281 " \n\t"
3282 " \n\t" // update c02:c32
3283 " \n\t"
3284 "vmovupd (%%rcx), %%xmm0 \n\t" // load (c02,c12) into xmm0
3285 "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c22,c32) into xmm2
3286 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2)
3287 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3288 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3289 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3290 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3291 "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3292 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3]
3293 "vmovupd %%xmm0, (%%rcx) \n\t" // store (c02,c12)
3294 "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c22,c32)
3295 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3296 " \n\t"
3297 " \n\t" // update c42:c72
3298 " \n\t"
3299 "vmovupd (%%rdx), %%xmm0 \n\t" // load (c42,c52) into xmm0
3300 "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c62,c72) into xmm2
3301 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2)
3302 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3303 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3304 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3305 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3306 "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3307 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3]
3308 "vmovupd %%xmm0, (%%rdx) \n\t" // store (c42,c52)
3309 "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c62,c72)
3310 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3311 " \n\t"
3312 " \n\t" // update c03:c33
3313 " \n\t"
3314 "vmovupd (%%rcx), %%xmm0 \n\t" // load (c03,c13) into xmm0
3315 "vmovupd (%%rcx,%%rsi), %%xmm2 \n\t" // load (c23,c33) into xmm2
3316 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2)
3317 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3318 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3319 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3320 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3321 "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3322 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3]
3323 "vmovupd %%xmm0, (%%rcx) \n\t" // store (c03,c13)
3324 "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c23,c33)
3325 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3326 " \n\t"
3327 " \n\t" // update c43:c73
3328 " \n\t"
3329 "vmovupd (%%rdx), %%xmm0 \n\t" // load (c43,c53) into xmm0
3330 "vmovupd (%%rdx,%%rsi), %%xmm2 \n\t" // load (c63,c73) into xmm2
3331 "vinsertf128 $1, %%xmm2, %%ymm0, %%ymm0 \n\t" // ymm0 := (ymm0[0:1],xmm2)
3332 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3333 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3334 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3335 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3336 "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3337 "vextractf128 $1, %%ymm0, %%xmm2 \n\t" // xmm2 := ymm0[2:3]
3338 "vmovupd %%xmm0, (%%rdx) \n\t" // store (c43,c53)
3339 "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c63,c73)
3340 " \n\t"
3341 " \n\t"
3342 " \n\t"
3343 "jmp .ZDONE \n\t" // jump to end.
3344 " \n\t"
3345 " \n\t"
3346 " \n\t"
3347 ".ZCOLSTORED: \n\t"
3348 " \n\t" // update c00:c30
3349 " \n\t"
3350 "vmovapd (%%rcx), %%ymm0 \n\t" // load c00:c30 into ymm0
3351 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3352 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3353 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3354 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3355 "vaddpd %%ymm15, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3356 "vmovapd %%ymm0, (%%rcx) \n\t" // store c00:c30
3357 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3358 " \n\t"
3359 " \n\t" // update c40:c70
3360 " \n\t"
3361 "vmovapd (%%rdx), %%ymm0 \n\t" // load c40:c70 into ymm0
3362 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3363 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3364 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3365 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3366 "vaddpd %%ymm14, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3367 "vmovapd %%ymm0, (%%rdx) \n\t" // store c40:c70
3368 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3369 " \n\t"
3370 " \n\t" // update c01:c31
3371 " \n\t"
3372 "vmovapd (%%rcx), %%ymm0 \n\t" // load c01:c31 into ymm0
3373 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3374 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3375 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3376 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3377 "vaddpd %%ymm13, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3378 "vmovapd %%ymm0, (%%rcx) \n\t" // store c01:c31
3379 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3380 " \n\t"
3381 " \n\t" // update c41:c71
3382 " \n\t"
3383 "vmovapd (%%rdx), %%ymm0 \n\t" // load c41:c71 into ymm0
3384 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3385 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3386 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3387 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3388 "vaddpd %%ymm12, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3389 "vmovapd %%ymm0, (%%rdx) \n\t" // store c41:c71
3390 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3391 " \n\t"
3392 " \n\t" // update c02:c32
3393 " \n\t"
3394 "vmovapd (%%rcx), %%ymm0 \n\t" // load c02:c32 into ymm0
3395 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3396 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3397 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3398 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3399 "vaddpd %%ymm11, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3400 "vmovapd %%ymm0, (%%rcx) \n\t" // store c02:c32
3401 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3402 " \n\t"
3403 " \n\t" // update c42:c72
3404 " \n\t"
3405 "vmovapd (%%rdx), %%ymm0 \n\t" // load c42:c72 into ymm0
3406 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3407 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3408 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3409 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3410 "vaddpd %%ymm10, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3411 "vmovapd %%ymm0, (%%rdx) \n\t" // store c42:c72
3412 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3413 " \n\t"
3414 " \n\t" // update c03:c33
3415 " \n\t"
3416 "vmovapd (%%rcx), %%ymm0 \n\t" // load c03:c33 into ymm0
3417 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3418 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3419 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3420 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3421 "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3422 "vmovapd %%ymm0, (%%rcx) \n\t" // store c03:c33
3423 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3424 " \n\t"
3425 " \n\t" // update c43:c73
3426 " \n\t"
3427 "vmovapd (%%rdx), %%ymm0 \n\t" // load c43:c73 into ymm0
3428 "vpermilpd $0x5, %%ymm0, %%ymm2 \n\t" // scale ymm0 by beta
3429 "vmulpd %%ymm7, %%ymm0, %%ymm0 \n\t"
3430 "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t"
3431 "vaddsubpd %%ymm2, %%ymm0, %%ymm0 \n\t"
3432 "vaddpd %%ymm8, %%ymm0, %%ymm0 \n\t" // add the gemm result to ymm0
3433 "vmovapd %%ymm0, (%%rdx) \n\t" // store c43:c73
3434 " \n\t"
3435 " \n\t"
3436 " \n\t"
3437 "jmp .ZDONE \n\t" // jump to end.
3438 " \n\t"
3439 " \n\t"
3440 " \n\t"
3441 ".ZBETAZERO: \n\t"
3442 " \n\t" // check if aligned/column-stored
3443 " \n\t" // check if aligned/column-stored
3444 "andb %%bl, %%bh \n\t" // set ZF if bl & bh == 1.
3445 "andb %%bh, %%al \n\t" // set ZF if bh & al == 1.
3446 "jne .ZCOLSTORBZ \n\t" // jump to column storage case
3447 " \n\t"
3448 " \n\t"
3449 " \n\t"
3450 ".ZGENSTORBZ: \n\t"
3451 " \n\t" // update c00:c30
3452 " \n\t"
3453 "vextractf128 $1, %%ymm15, %%xmm2 \n\t"
3454 "vmovupd %%xmm15, (%%rcx) \n\t" // store (c00,c10)
3455 "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c20,c30)
3456 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3457 " \n\t"
3458 " \n\t" // update c40:c70
3459 " \n\t"
3460 "vextractf128 $1, %%ymm14, %%xmm2 \n\t"
3461 "vmovupd %%xmm14, (%%rdx) \n\t" // store (c40,c50)
3462 "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c60,c70)
3463 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3464 " \n\t"
3465 " \n\t" // update c01:c31
3466 " \n\t"
3467 "vextractf128 $1, %%ymm13, %%xmm2 \n\t"
3468 "vmovupd %%xmm13, (%%rcx) \n\t" // store (c01,c11)
3469 "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c21,c31)
3470 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3471 " \n\t"
3472 " \n\t" // update c41:c71
3473 " \n\t"
3474 "vextractf128 $1, %%ymm12, %%xmm2 \n\t"
3475 "vmovupd %%xmm12, (%%rdx) \n\t" // store (c41,c51)
3476 "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c61,c71)
3477 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3478 " \n\t"
3479 " \n\t" // update c02:c32
3480 " \n\t"
3481 "vextractf128 $1, %%ymm11, %%xmm2 \n\t"
3482 "vmovupd %%xmm11, (%%rcx) \n\t" // store (c02,c12)
3483 "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c22,c32)
3484 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3485 " \n\t"
3486 " \n\t" // update c42:c72
3487 " \n\t"
3488 "vextractf128 $1, %%ymm10, %%xmm2 \n\t"
3489 "vmovupd %%xmm10, (%%rdx) \n\t" // store (c42,c52)
3490 "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c62,c72)
3491 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3492 " \n\t"
3493 " \n\t" // update c03:c33
3494 " \n\t"
3495 "vextractf128 $1, %%ymm9, %%xmm2 \n\t"
3496 "vmovupd %%xmm9, (%%rcx) \n\t" // store (c03,c13)
3497 "vmovupd %%xmm2, (%%rcx,%%rsi) \n\t" // store (c23,c33)
3498 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3499 " \n\t"
3500 " \n\t" // update c43:c73
3501 " \n\t"
3502 "vextractf128 $1, %%ymm8, %%xmm2 \n\t"
3503 "vmovupd %%xmm8, (%%rdx) \n\t" // store (c43,c53)
3504 "vmovupd %%xmm2, (%%rdx,%%rsi) \n\t" // store (c63,c73)
3505 " \n\t"
3506 " \n\t"
3507 " \n\t"
3508 "jmp .ZDONE \n\t" // jump to end.
3509 " \n\t"
3510 " \n\t"
3511 " \n\t"
3512 ".ZCOLSTORBZ: \n\t"
3513 " \n\t"
3514 " \n\t"
3515 "vmovapd %%ymm15, (%%rcx) \n\t" // store c00:c30
3516 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3517 " \n\t"
3518 "vmovapd %%ymm14, (%%rdx) \n\t" // store c40:c70
3519 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3520 " \n\t"
3521 "vmovapd %%ymm13, (%%rcx) \n\t" // store c01:c31
3522 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3523 " \n\t"
3524 "vmovapd %%ymm12, (%%rdx) \n\t" // store c41:c71
3525 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3526 " \n\t"
3527 "vmovapd %%ymm11, (%%rcx) \n\t" // store c02:c32
3528 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3529 " \n\t"
3530 "vmovapd %%ymm10, (%%rdx) \n\t" // store c42:c72
3531 "addq %%rdi, %%rdx \n\t" // c += cs_c;
3532 " \n\t"
3533 "vmovapd %%ymm9, (%%rcx) \n\t" // store c03:c33
3534 "addq %%rdi, %%rcx \n\t" // c += cs_c;
3535 " \n\t"
3536 "vmovapd %%ymm8, (%%rdx) \n\t" // store c43:c73
3537 " \n\t"
3538 " \n\t"
3539 " \n\t"
3540 " \n\t"
3541 " \n\t"
3542 ".ZDONE: \n\t"
3543 " \n\t"
3544
3545 : // output operands (none)
3546 : // input operands
3547 "m" (k_iter), // 0
3548 "m" (k_left), // 1
3549 "m" (a), // 2
3550 "m" (b), // 3
3551 "m" (alpha), // 4
3552 "m" (beta), // 5
3553 "m" (c), // 6
3554 "m" (rs_c), // 7
3555 "m" (cs_c)/*, // 8
3556 "m" (b_next), // 9
3557 "m" (a_next)*/ // 10
3558 : // register clobber list
3559 "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
3560 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
3561 "xmm0", "xmm1", "xmm2", "xmm3",
3562 "xmm4", "xmm5", "xmm6", "xmm7",
3563 "xmm8", "xmm9", "xmm10", "xmm11",
3564 "xmm12", "xmm13", "xmm14", "xmm15",
3565 "memory"
3566 );
3567 }
3568
3569