/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name of The University of Texas nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

#define BLIS_ASM_SYNTAX_ATT
#include "bli_x86_asm_macros.h"
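
// Note on the FMA4 instructions used throughout this file: judging from the
// commented-out vmulps/vaddps pairs they replace below, with the AT&T
// operand order emitted by these macros, vfmaddps(x, b, a, d) computes
// d = a*b + x; the last operand is the destination and the first is the
// additive input. For example, vfmaddps(ymm15, ymm0, ymm2, ymm15)
// accumulates ymm2*ymm0 into ymm15.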

#define GROUP_YMM_BY_4 \
vmovaps(ymm15, ymm7)\
vshufps(imm(0xe4), ymm13, ymm15, ymm15)\
vshufps(imm(0xe4), ymm7, ymm13, ymm13)\
\
vmovaps(ymm11, ymm7)\
vshufps(imm(0xe4), ymm9, ymm11, ymm11)\
vshufps(imm(0xe4), ymm7, ymm9, ymm9)\
\
vmovaps(ymm14, ymm7)\
vshufps(imm(0xe4), ymm12, ymm14, ymm14)\
vshufps(imm(0xe4), ymm7, ymm12, ymm12)\
\
vmovaps(ymm10, ymm7)\
vshufps(imm(0xe4), ymm8, ymm10, ymm10)\
vshufps(imm(0xe4), ymm7, ymm8, ymm8)\
\
vmovaps(ymm15, ymm7)\
vperm2f128(imm(0x12), ymm15, ymm11, ymm15)\
vperm2f128(imm(0x30), ymm7, ymm11, ymm11)\
\
vmovaps(ymm13, ymm7)\
vperm2f128(imm(0x12), ymm13, ymm9, ymm13)\
vperm2f128(imm(0x30), ymm7, ymm9, ymm9)\
\
vmovaps(ymm14, ymm7)\
vperm2f128(imm(0x12), ymm14, ymm10, ymm14)\
vperm2f128(imm(0x30), ymm7, ymm10, ymm10)\
\
vmovaps(ymm12, ymm7)\
vperm2f128(imm(0x12), ymm12, ymm8, ymm12)\
vperm2f128(imm(0x30), ymm7, ymm8, ymm8)
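
// GROUP_YMM_BY_4 reorders the eight accumulators in two stages: the vshufps
// steps exchange the high two floats of each 128-bit lane between register
// pairs (ymm15/ymm13, ymm11/ymm9, ymm14/ymm12, ymm10/ymm8), and the
// vperm2f128 steps then exchange 128-bit halves between pairs. The diagrams
// at .SPOSTACCUM and .CPOSTACCUM below show the element layout before and
// after this regrouping.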

#define STORE_SS \
vextractf128(imm(1), ymm0, xmm2)\
vmovss(xmm0, mem(rcx))\
vpermilps(imm(0x39), xmm0, xmm1)\
vmovss(xmm1, mem(rcx, rsi, 1))\
vpermilps(imm(0x39), xmm1, xmm0)\
vmovss(xmm0, mem(rcx, r12, 1))\
vpermilps(imm(0x39), xmm0, xmm1)\
vmovss(xmm1, mem(rcx, r13, 1))\
vmovss(xmm2, mem(rdx))\
vpermilps(imm(0x39), xmm2, xmm3)\
vmovss(xmm3, mem(rdx, rsi, 1))\
vpermilps(imm(0x39), xmm3, xmm2)\
vmovss(xmm2, mem(rdx, r12, 1))\
vpermilps(imm(0x39), xmm2, xmm3)\
vmovss(xmm3, mem(rdx, r13, 1))

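// STORE_SS scatters the eight floats of ymm0 to one column of c held with a
// general row stride: the low four elements are stored at rcx plus
// {0,1,2,3}*rs_c (via rsi, r12 = 2*rs_c, r13 = 3*rs_c, all in bytes), and,
// after vextractf128 moves the high half into xmm2, the high four go to the
// same offsets from rdx. vpermilps(0x39, ...) rotates the lane contents so
// that vmovss always stores element 0.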

void bli_sgemm_bulldozer_asm_8x8_fma4
     (
       dim_t               k0,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;
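
    // For example, k0 = 10 gives k_iter = 2 passes through the unrolled
    // main loop below (four rank-1 updates each) plus k_left = 2 passes
    // through the edge loop.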

    begin_asm()

    mov(var(a), rax) // load address of a.
    mov(var(b), rbx) // load address of b.

    vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
    vmovsldup(mem(rbx, 0*32), ymm2) // elements of a and b.
    vpermilps(imm(0x4e), ymm2, ymm3)

    mov(var(c), rcx) // load address of c
    mov(var(cs_c), rdi) // load cs_c
    lea(mem(, rdi, 4), rdi) // cs_c *= sizeof(float)
    lea(mem(rcx, rdi, 4), r10) // load address of c + 4*cs_c;

    lea(mem(rdi, rdi, 2), r14) // r14 = 3*cs_c;
    prefetch(0, mem(rcx, 7*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rdi, 1, 7*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(rcx, rdi, 2, 7*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(rcx, r14, 1, 7*8)) // prefetch c + 3*cs_c
    prefetch(0, mem(r10, 7*8)) // prefetch c + 4*cs_c
    prefetch(0, mem(r10, rdi, 1, 7*8)) // prefetch c + 5*cs_c
    prefetch(0, mem(r10, rdi, 2, 7*8)) // prefetch c + 6*cs_c
    prefetch(0, mem(r10, r14, 1, 7*8)) // prefetch c + 7*cs_c

    vxorps(ymm8, ymm8, ymm8)
    vxorps(ymm9, ymm9, ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
    vxorps(ymm12, ymm12, ymm12)
    vxorps(ymm13, ymm13, ymm13)
    vxorps(ymm14, ymm14, ymm14)
    vxorps(ymm15, ymm15, ymm15)


    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.SCONSIDKLEFT) // if i == 0, jump to code that
                      // contains the k_left loop.

    label(.SLOOPKITER) // MAIN LOOP

    // iteration 0
    prefetch(0, mem(rax, 16*32))
    vfmaddps(ymm15, ymm0, ymm2, ymm15)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 0*32), ymm2)
    vfmaddps(ymm13, ymm0, ymm3, ymm13)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 1*32), ymm1)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm0, ymm4, ymm11)
    vfmaddps(ymm9, ymm0, ymm5, ymm9)

    vfmaddps(ymm14, ymm0, ymm2, ymm14)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 1*32), ymm2)
    vfmaddps(ymm12, ymm0, ymm3, ymm12)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm0, ymm4, ymm10)
    vfmaddps(ymm8, ymm0, ymm5, ymm8)

    // iteration 1
    vfmaddps(ymm15, ymm1, ymm2, ymm15)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 1*32), ymm2)
    vfmaddps(ymm13, ymm1, ymm3, ymm13)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 2*32), ymm0)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm1, ymm4, ymm11)
    vfmaddps(ymm9, ymm1, ymm5, ymm9)

    vfmaddps(ymm14, ymm1, ymm2, ymm14)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 2*32), ymm2)
    vfmaddps(ymm12, ymm1, ymm3, ymm12)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm1, ymm4, ymm10)
    vfmaddps(ymm8, ymm1, ymm5, ymm8)

    // iteration 2
    prefetch(0, mem(rax, 18*32))
    vfmaddps(ymm15, ymm0, ymm2, ymm15)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 2*32), ymm2)
    vfmaddps(ymm13, ymm0, ymm3, ymm13)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 3*32), ymm1)
    add(imm(4*8*4), rax) // a += 4*8 (unroll x mr)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm0, ymm4, ymm11)
    vfmaddps(ymm9, ymm0, ymm5, ymm9)

    vfmaddps(ymm14, ymm0, ymm2, ymm14)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 3*32), ymm2)
    vfmaddps(ymm12, ymm0, ymm3, ymm12)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm0, ymm4, ymm10)
    vfmaddps(ymm8, ymm0, ymm5, ymm8)

    // iteration 3
    vfmaddps(ymm15, ymm1, ymm2, ymm15)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 3*32), ymm2)
    add(imm(4*8*4), rbx) // b += 4*8 (unroll x nr)
    vfmaddps(ymm13, ymm1, ymm3, ymm13)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 0*32), ymm0)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm1, ymm4, ymm11)
    vfmaddps(ymm9, ymm1, ymm5, ymm9)

    vfmaddps(ymm14, ymm1, ymm2, ymm14)
    vperm2f128(imm(0x03), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 0*32), ymm2)
    vfmaddps(ymm12, ymm1, ymm3, ymm12)
    vperm2f128(imm(0x03), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm1, ymm4, ymm10)
    vfmaddps(ymm8, ymm1, ymm5, ymm8)


    dec(rsi) // i -= 1;
    jne(.SLOOPKITER) // iterate again if i != 0.



    label(.SCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.SPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left loop.

    label(.SLOOPKLEFT) // EDGE LOOP

    prefetch(0, mem(rax, 16*32))
    vfmaddps(ymm15, ymm0, ymm2, ymm15)
    vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
    vmovshdup(mem(rbx, 0*32), ymm2)
    vfmaddps(ymm13, ymm0, ymm3, ymm13)
    vperm2f128(imm(0x3), ymm3, ymm3, ymm5)

    vmovaps(mem(rax, 1*32), ymm1)
    add(imm(8*1*4), rax) // a += 8 (1 x mr)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm11, ymm0, ymm4, ymm11)
    vfmaddps(ymm9, ymm0, ymm5, ymm9)

    vfmaddps(ymm14, ymm0, ymm2, ymm14)
    vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
    vmovsldup(mem(rbx, 1*32), ymm2)
    add(imm(8*1*4), rbx) // b += 8 (1 x nr)
    vfmaddps(ymm12, ymm0, ymm3, ymm12)
    vperm2f128(imm(0x3), ymm3, ymm3, ymm5)

    vpermilps(imm(0x4e), ymm2, ymm3)
    vfmaddps(ymm10, ymm0, ymm4, ymm10)
    vfmaddps(ymm8, ymm0, ymm5, ymm8)
    vmovaps(ymm1, ymm0)


    dec(rsi) // i -= 1;
    jne(.SLOOPKLEFT) // iterate again if i != 0.


    label(.SPOSTACCUM)
    // ymm15:  ymm13:  ymm11:  ymm9:
    // ( ab00  ( ab02  ( ab04  ( ab06
    //   ab10    ab12    ab14    ab16
    //   ab22    ab20    ab26    ab24
    //   ab32    ab30    ab36    ab34
    //   ab44    ab46    ab40    ab42
    //   ab54    ab56    ab50    ab52
    //   ab66    ab64    ab62    ab60
    //   ab76 )  ab74 )  ab72 )  ab70 )

    // ymm14:  ymm12:  ymm10:  ymm8:
    // ( ab01  ( ab03  ( ab05  ( ab07
    //   ab11    ab13    ab15    ab17
    //   ab23    ab21    ab27    ab25
    //   ab33    ab31    ab37    ab35
    //   ab45    ab47    ab41    ab43
    //   ab55    ab57    ab51    ab53
    //   ab67    ab65    ab63    ab61
    //   ab77 )  ab75 )  ab73 )  ab71 )
    GROUP_YMM_BY_4
    // ymm15:  ymm13:  ymm11:  ymm9:
    // ( ab00  ( ab02  ( ab04  ( ab06
    //   ab10    ab12    ab14    ab16
    //   ab20    ab22    ab24    ab26
    //   ab30    ab32    ab34    ab36
    //   ab44    ab46    ab40    ab42
    //   ab54    ab56    ab50    ab52
    //   ab64    ab66    ab60    ab62
    //   ab74 )  ab76 )  ab70 )  ab72 )

    // ymm14:  ymm12:  ymm10:  ymm8:
    // ( ab01  ( ab03  ( ab05  ( ab07
    //   ab11    ab13    ab15    ab17
    //   ab21    ab23    ab25    ab27
    //   ab31    ab33    ab35    ab37
    //   ab45    ab47    ab41    ab43
    //   ab55    ab57    ab51    ab53
    //   ab65    ab67    ab61    ab63
    //   ab75 )  ab77 )  ab71 )  ab73 )
    // ymm15:  ymm13:  ymm11:  ymm9:
    // ( ab00  ( ab02  ( ab04  ( ab06
    //   ab10    ab12    ab14    ab16
    //   ab20    ab22    ab24    ab26
    //   ab30    ab32    ab34    ab36
    //   ab40    ab42    ab44    ab46
    //   ab50    ab52    ab54    ab56
    //   ab60    ab62    ab64    ab66
    //   ab70 )  ab72 )  ab74 )  ab76 )

    // ymm14:  ymm12:  ymm10:  ymm8:
    // ( ab01  ( ab03  ( ab05  ( ab07
    //   ab11    ab13    ab15    ab17
    //   ab21    ab23    ab25    ab27
    //   ab31    ab33    ab35    ab37
    //   ab41    ab43    ab45    ab47
    //   ab51    ab53    ab55    ab57
    //   ab61    ab63    ab65    ab67
    //   ab71 )  ab73 )  ab75 )  ab77 )

    mov(var(alpha), rax) // load address of alpha
    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rax), ymm0) // load alpha and duplicate
    vbroadcastss(mem(rbx), ymm4) // load beta and duplicate

    vmulps(ymm0, ymm8, ymm8) // scale by alpha
    vmulps(ymm0, ymm9, ymm9)
    vmulps(ymm0, ymm10, ymm10)
    vmulps(ymm0, ymm11, ymm11)
    vmulps(ymm0, ymm12, ymm12)
    vmulps(ymm0, ymm13, ymm13)
    vmulps(ymm0, ymm14, ymm14)
    vmulps(ymm0, ymm15, ymm15)



    mov(var(rs_c), rsi) // load rs_c
    lea(mem(, rsi, 4), rsi) // rsi = rs_c * sizeof(float)

    lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;

    lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
    lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c;


    // determine if
    //     c    % 32 == 0, AND
    //   4*cs_c % 32 == 0, AND
    //     rs_c      == 1
    // ie: aligned, ldim aligned, and
    // column-stored

    cmp(imm(4), rsi) // set ZF if (4*rs_c) == 4.
    sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
    test(imm(31), rcx) // set ZF if c & 31 is zero.
    setz(bh) // bh = ( ZF == 1 ? 1 : 0 );
    test(imm(31), rdi) // set ZF if (4*cs_c) & 31 is zero.
    setz(al) // al = ( ZF == 1 ? 1 : 0 );
    // and(bl,bh) followed by
    // and(bh,al) will reveal result
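
    // In C terms, the predicate assembled into bl/bh/al is, as a sketch
    // (rcx holds c, and rsi/rdi hold the strides already scaled by
    // sizeof(float)):
    //
    //   bool use_col_stored = ( rs_c == 1 )
    //                      && ( (uintptr_t)c          % 32 == 0 )
    //                      && ( cs_c * sizeof(float)  % 32 == 0 );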

    // now avoid loading C if beta == 0

    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomiss(xmm0, xmm4) // set ZF if beta == 0.
    je(.SBETAZERO) // if ZF == 1, jump to beta == 0 case


    // check if aligned/column-stored
    and(bl, bh) // set ZF if bl & bh == 0.
    and(bh, al) // set ZF if bh & al == 0.
    jne(.SCOLSTORED) // jump to column storage case


    label(.SGENSTORED)
    // update c00:c70
    vmovlps(mem(rcx), xmm0, xmm0)
    vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
    vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
    vshufps(imm(0x88), xmm1, xmm0, xmm0)
    vmovlps(mem(rdx), xmm2, xmm2)
    vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
    vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
    vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
    vshufps(imm(0x88), xmm3, xmm2, xmm2)
    vperm2f128(imm(0x20), ymm2, ymm0, ymm0)

    vfmaddps(ymm15, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,

    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;

    // update c01:c71
    vmovlps(mem(rcx), xmm0, xmm0)
    vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
    vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
    vshufps(imm(0x88), xmm1, xmm0, xmm0)
    vmovlps(mem(rdx), xmm2, xmm2)
    vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
    vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
    vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
    vshufps(imm(0x88), xmm3, xmm2, xmm2)
    vperm2f128(imm(0x20), ymm2, ymm0, ymm0)

    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm14, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm14, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,

    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;


    // update c02:c72
    vmovlps(mem(rcx), xmm0, xmm0)
    vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
    vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
    vshufps(imm(0x88), xmm1, xmm0, xmm0)
    vmovlps(mem(rdx), xmm2, xmm2)
    vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
    vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
    vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
    vshufps(imm(0x88), xmm3, xmm2, xmm2)
    vperm2f128(imm(0x20), ymm2, ymm0, ymm0)

    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm13, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm13, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,

    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;


    // update c03:c73
    vmovlps(mem(rcx), xmm0, xmm0)
    vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
    vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
    vshufps(imm(0x88), xmm1, xmm0, xmm0)
    vmovlps(mem(rdx), xmm2, xmm2)
    vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
    vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
    vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
    vshufps(imm(0x88), xmm3, xmm2, xmm2)
    vperm2f128(imm(0x20), ymm2, ymm0, ymm0)

    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm12, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm12, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,

    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;


    // update c04:c74
    vmovlps(mem(rcx), xmm0, xmm0)
    vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
    vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
    vshufps(imm(0x88), xmm1, xmm0, xmm0)
    vmovlps(mem(rdx), xmm2, xmm2)
    vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
    vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
    vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
    vshufps(imm(0x88), xmm3, xmm2, xmm2)
    vperm2f128(imm(0x20), ymm2, ymm0, ymm0)

    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm11, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm11, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,

    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;


    // update c05:c75
    vmovlps(mem(rcx), xmm0, xmm0)
    vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
    vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
    vshufps(imm(0x88), xmm1, xmm0, xmm0)
    vmovlps(mem(rdx), xmm2, xmm2)
    vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
    vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
    vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
    vshufps(imm(0x88), xmm3, xmm2, xmm2)
    vperm2f128(imm(0x20), ymm2, ymm0, ymm0)

    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm10, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm10, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,

    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;


    // update c06:c76
    vmovlps(mem(rcx), xmm0, xmm0)
    vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
    vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
    vshufps(imm(0x88), xmm1, xmm0, xmm0)
    vmovlps(mem(rdx), xmm2, xmm2)
    vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
    vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
    vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
    vshufps(imm(0x88), xmm3, xmm2, xmm2)
    vperm2f128(imm(0x20), ymm2, ymm0, ymm0)

    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm9, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm9, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,

    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;


    // update c07:c77
    vmovlps(mem(rcx), xmm0, xmm0)
    vmovhps(mem(rcx, rsi, 1), xmm0, xmm0)
    vmovlps(mem(rcx, r12, 1), xmm1, xmm1)
    vmovhps(mem(rcx, r13, 1), xmm1, xmm1)
    vshufps(imm(0x88), xmm1, xmm0, xmm0)
    vmovlps(mem(rdx), xmm2, xmm2)
    vmovhps(mem(rdx, rsi, 1), xmm2, xmm2)
    vmovlps(mem(rdx, r12, 1), xmm3, xmm3)
    vmovhps(mem(rdx, r13, 1), xmm3, xmm3)
    vshufps(imm(0x88), xmm3, xmm2, xmm2)
    vperm2f128(imm(0x20), ymm2, ymm0, ymm0)

    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm8, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm8, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,

    STORE_SS


    jmp(.SDONE) // jump to end.



    label(.SCOLSTORED)

    vmovaps(mem(rcx), ymm0) // load c00:c70,
    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm15, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm15, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
    vmovaps(ymm0, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm1) // load c01:c71,
    // vmulps(ymm4, ymm1, ymm1) // scale by beta,
    // vaddps(ymm14, ymm1, ymm1) // add the gemm result,
    vfmaddps(ymm14, ymm1, ymm4, ymm1) // scale by beta and add the gemm result,
    vmovaps(ymm1, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm0) // load c02:c72,
    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm13, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm13, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
    vmovaps(ymm0, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm1) // load c03:c73,
    // vmulps(ymm4, ymm1, ymm1) // scale by beta,
    // vaddps(ymm12, ymm1, ymm1) // add the gemm result,
    vfmaddps(ymm12, ymm1, ymm4, ymm1) // scale by beta and add the gemm result,
    vmovaps(ymm1, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm0) // load c04:c74,
    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm11, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm11, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
    vmovaps(ymm0, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm1) // load c05:c75,
    // vmulps(ymm4, ymm1, ymm1) // scale by beta,
    // vaddps(ymm10, ymm1, ymm1) // add the gemm result,
    vfmaddps(ymm10, ymm1, ymm4, ymm1) // scale by beta and add the gemm result,
    vmovaps(ymm1, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm0) // load c06:c76,
    // vmulps(ymm4, ymm0, ymm0) // scale by beta,
    // vaddps(ymm9, ymm0, ymm0) // add the gemm result,
    vfmaddps(ymm9, ymm0, ymm4, ymm0) // scale by beta and add the gemm result,
    vmovaps(ymm0, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(mem(rcx), ymm1) // load c07:c77,
    // vmulps(ymm4, ymm1, ymm1) // scale by beta,
    // vaddps(ymm8, ymm1, ymm1) // add the gemm result,
    vfmaddps(ymm8, ymm1, ymm4, ymm1) // scale by beta and add the gemm result,
    vmovaps(ymm1, mem(rcx)) // and store back to memory.

    jmp(.SDONE) // jump to end.


    label(.SBETAZERO)
    // check if aligned/column-stored
    and(bl, bh) // set ZF if bl & bh == 0.
    and(bh, al) // set ZF if bh & al == 0.
    jne(.SCOLSTORBZ) // jump to column storage case


    label(.SGENSTORBZ)
    // update c00:c70
    vmovapd(ymm15, ymm0)
    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;

    // update c01:c71
    vmovapd(ymm14, ymm0)
    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;

    // update c02:c72
    vmovapd(ymm13, ymm0)
    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;

    // update c03:c73
    vmovapd(ymm12, ymm0)
    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;

    // update c04:c74
    vmovapd(ymm11, ymm0)
    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;

    // update c05:c75
    vmovapd(ymm10, ymm0)
    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;

    // update c06:c76
    vmovapd(ymm9, ymm0)
    STORE_SS

    add(rdi, rcx) // c += cs_c;
    add(rdi, rdx) // c += cs_c;

    // update c07:c77
    vmovapd(ymm8, ymm0)
    STORE_SS

    jmp(.SDONE) // jump to end.


    label(.SCOLSTORBZ)

    vmovaps(ymm15, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(ymm14, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(ymm13, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(ymm12, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(ymm11, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(ymm10, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(ymm9, mem(rcx)) // and store back to memory.
    add(rdi, rcx) // c += cs_c;

    vmovaps(ymm8, mem(rcx)) // and store back to memory.

    label(.SDONE)


    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "m" (k_iter), // 0
      [k_left] "m" (k_left), // 1
      [a]      "m" (a), // 2
      [b]      "m" (b), // 3
      [alpha]  "m" (alpha), // 4
      [beta]   "m" (beta), // 5
      [c]      "m" (c), // 6
      [rs_c]   "m" (rs_c), // 7
      [cs_c]   "m" (cs_c)/*, // 8
      [b_next] "m" (b_next), // 9
      [a_next] "m" (a_next)*/ // 10
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
      "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}

#undef KERNEL4x6_1
#undef KERNEL4x6_2
#undef KERNEL4x6_3
#undef KERNEL4x6_4

#define KERNEL4x6_1(xx) \
ALIGN4\
vmovddup(mem(rax, -8*8), xmm0)\
vfmaddpd(xmm4, xmm1, xmm0, xmm4)\
vfmaddpd(xmm5, xmm2, xmm0, xmm5)\
vfmaddpd(xmm6, xmm3, xmm0, xmm6)\
vmovddup(mem(rax, -7*8), xmm0)\
vfmaddpd(xmm7, xmm1, xmm0, xmm7)\
prefetch(0, mem(rax, 128))\
vfmaddpd(xmm8, xmm2, xmm0, xmm8)\
vfmaddpd(xmm9, xmm3, xmm0, xmm9)\
vmovddup(mem(rax, -6*8), xmm0)\
vfmaddpd(xmm10, xmm1, xmm0, xmm10)\
vfmaddpd(xmm11, xmm2, xmm0, xmm11)\
vfmaddpd(xmm12, xmm3, xmm0, xmm12)\
vmovddup(mem(rax, -5*8), xmm0)\
vfmaddpd(xmm13, xmm1, xmm0, xmm13)\
vmovaps(mem(rbx, -6*8), xmm1)\
vfmaddpd(xmm14, xmm2, xmm0, xmm14)\
vmovaps(mem(rbx, -4*8), xmm2)\
vfmaddpd(xmm15, xmm3, xmm0, xmm15)\
vmovaps(mem(rbx, -2*8), xmm3)

#define KERNEL4x6_2(xx) \
vmovddup(mem(rax, -4*8), xmm0)\
vfmaddpd(xmm4, xmm1, xmm0, xmm4)\
prefetch(0, mem(rax, 192))\
vfmaddpd(xmm5, xmm2, xmm0, xmm5)\
vfmaddpd(xmm6, xmm3, xmm0, xmm6)\
vmovddup(mem(rax, -3*8), xmm0)\
vfmaddpd(xmm7, xmm1, xmm0, xmm7)\
vfmaddpd(xmm8, xmm2, xmm0, xmm8)\
vfmaddpd(xmm9, xmm3, xmm0, xmm9)\
vmovddup(mem(rax, -2*8), xmm0)\
vfmaddpd(xmm10, xmm1, xmm0, xmm10)\
vfmaddpd(xmm11, xmm2, xmm0, xmm11)\
vfmaddpd(xmm12, xmm3, xmm0, xmm12)\
vmovddup(mem(rax, -1*8), xmm0)\
vfmaddpd(xmm13, xmm1, xmm0, xmm13)\
vmovaps(mem(rbx, 0*8), xmm1)\
vfmaddpd(xmm14, xmm2, xmm0, xmm14)\
vmovaps(mem(rbx, 2*8), xmm2)\
vfmaddpd(xmm15, xmm3, xmm0, xmm15)\
vmovaps(mem(rbx, 4*8), xmm3)

#define KERNEL4x6_3(xx) \
vmovddup(mem(rax, 0*8), xmm0)\
vfmaddpd(xmm4, xmm1, xmm0, xmm4)\
vfmaddpd(xmm5, xmm2, xmm0, xmm5)\
vfmaddpd(xmm6, xmm3, xmm0, xmm6)\
vmovddup(mem(rax, 1*8), xmm0)\
vfmaddpd(xmm7, xmm1, xmm0, xmm7)\
prefetch(0, mem(rax, 224))\
vfmaddpd(xmm8, xmm2, xmm0, xmm8)\
vfmaddpd(xmm9, xmm3, xmm0, xmm9)\
vmovddup(mem(rax, 2*8), xmm0)\
vfmaddpd(xmm10, xmm1, xmm0, xmm10)\
vfmaddpd(xmm11, xmm2, xmm0, xmm11)\
vfmaddpd(xmm12, xmm3, xmm0, xmm12)\
vmovddup(mem(rax, 3*8), xmm0)\
vfmaddpd(xmm13, xmm1, xmm0, xmm13)\
vmovaps(mem(rbx, 6*8), xmm1)\
vfmaddpd(xmm14, xmm2, xmm0, xmm14)\
vmovaps(mem(rbx, 8*8), xmm2)\
vfmaddpd(xmm15, xmm3, xmm0, xmm15)\
vmovaps(mem(rbx, 10*8), xmm3)

#define KERNEL4x6_4(xx) \
vmovddup(mem(rax, 4*8), xmm0)\
vfmaddpd(xmm4, xmm1, xmm0, xmm4)\
prefetch(0, mem(rax, 224))\
vfmaddpd(xmm5, xmm2, xmm0, xmm5)\
vfmaddpd(xmm6, xmm3, xmm0, xmm6)\
vmovddup(mem(rax, 5*8), xmm0)\
vfmaddpd(xmm7, xmm1, xmm0, xmm7)\
vfmaddpd(xmm8, xmm2, xmm0, xmm8)\
vfmaddpd(xmm9, xmm3, xmm0, xmm9)\
vmovddup(mem(rax, 6*8), xmm0)\
vfmaddpd(xmm10, xmm1, xmm0, xmm10)\
vfmaddpd(xmm11, xmm2, xmm0, xmm11)\
vfmaddpd(xmm12, xmm3, xmm0, xmm12)\
vmovddup(mem(rax, 7*8), xmm0)\
vfmaddpd(xmm13, xmm1, xmm0, xmm13)\
vmovaps(mem(rbx, 12*8), xmm1)\
vfmaddpd(xmm14, xmm2, xmm0, xmm14)\
vmovaps(mem(rbx, 14*8), xmm2)\
vfmaddpd(xmm15, xmm3, xmm0, xmm15)\
add(imm(16*8), rax)\
vmovaps(mem(rbx, 16*8), xmm3)\
add(imm(24*8), rbx)
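
// Each KERNEL4x6_x macro above performs one rank-1 update of the 4x6 block
// of C accumulated in xmm4:xmm15: vmovddup broadcasts one element of a at a
// time, vfmaddpd multiplies it against the three two-wide column pairs of b
// held in xmm1:xmm3, and the next b values are loaded as the accumulators
// drain. The four variants differ only in their offsets from rax/rbx;
// KERNEL4x6_4 also advances both pointers.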

void bli_dgemm_bulldozer_asm_4x6_fma4
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 12;
    uint64_t k_left = k0 % 12;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;
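
    // The main loop below is unrolled by 12 (three passes through the four
    // KERNEL4x6_x macros), so, for example, k0 = 30 gives k_iter = 2 and
    // k_left = 6 remaining rank-1 updates for the edge loop.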

    begin_asm()


    vzeroall()
    mov(var(b), rbx) // load address of b.
    mov(var(a), rax) // load address of a.
    prefetch(0, mem(rax, 64))


    vmovaps(mem(rbx, 0*8), xmm1)
    vmovaps(mem(rbx, 2*8), xmm2)
    vmovaps(mem(rbx, 4*8), xmm3)
    add(imm(12*8), rbx)
    add(imm(8*8), rax)

    mov(var(k_iter), rsi) // i = k_iter; notice var(k_iter), not $0
    test(rsi, rsi)
    je(.CONSIDERKLEFT)

    ALIGN32
    label(.LOOPKITER) // MAIN LOOP

    KERNEL4x6_1(xx)
    KERNEL4x6_2(xx)
    KERNEL4x6_3(xx)
    KERNEL4x6_4(xx)
    KERNEL4x6_1(xx)
    KERNEL4x6_2(xx)
    KERNEL4x6_3(xx)
    KERNEL4x6_4(xx)
    KERNEL4x6_1(xx)
    KERNEL4x6_2(xx)
    KERNEL4x6_3(xx)
    KERNEL4x6_4(xx)

    dec(rsi)
    jne(.LOOPKITER)

    label(.CONSIDERKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi)
    label(.LOOPKLEFT) // EDGE LOOP
    je(.POSTACCUM) // exit if i == 0 (flags from the test above or the dec below).

    KERNEL4x6_1(xx)
    add(imm(6*8), rbx)
    add(imm(4*8), rax)

    dec(rsi) // i -= 1;
    jmp(.LOOPKLEFT) // jump back to the top; the je above exits the loop.

    label(.POSTACCUM)


    mov(var(rs_c), rsi) // load rs_c
    mov(var(cs_c), rdi) // load cs_c
    vmovddup(mem(var(alpha)), xmm2) // load alpha
    vmovddup(mem(var(beta)), xmm3) // load beta
    mov(var(c), rcx) // load address of c
    sal(imm(3), rsi) // rs_c *= sizeof(double)
    sal(imm(3), rdi) // cs_c *= sizeof(double)
    lea(mem(rcx, rdi, 2), rdx)

    vmovlpd(mem(rcx), xmm0, xmm0)
    vmovlpd(mem(rdx), xmm1, xmm1)
    vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0)
    vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1)
    lea(mem(rdx, rdi, 2), r8)
    vmulpd(xmm2, xmm4, xmm4) // scale by alpha,
    vmulpd(xmm2, xmm5, xmm5) // scale by alpha,
    vfmaddpd(xmm4, xmm0, xmm3, xmm4) // scale by beta, and add the gemm result
    vmovlpd(mem(r8), xmm0, xmm0)
    vfmaddpd(xmm5, xmm1, xmm3, xmm5) // scale by beta, and add the gemm result
    vmovhpd(mem(r8, rdi, 1), xmm0, xmm0)
    vmovlpd(xmm4, mem(rcx)) // and store back to memory.
    vmovlpd(xmm5, mem(rdx)) // and store back to memory.
    vmovhpd(xmm4, mem(rcx, rdi, 1))
    add(rsi, rcx)
    vmovhpd(xmm5, mem(rdx, rdi, 1))
    add(rsi, rdx)

    vmulpd(xmm2, xmm6, xmm6) // scale by alpha,
    vfmaddpd(xmm6, xmm0, xmm3, xmm6) // scale by beta, and add the gemm result
    vmovlpd(xmm6, mem(r8)) // and store back to memory.
    vmovhpd(xmm6, mem(r8, rdi, 1))
    add(rsi, r8)


    vmovlpd(mem(rcx), xmm0, xmm0)
    vmovlpd(mem(rdx), xmm1, xmm1)
    vmovlpd(mem(r8), xmm4, xmm4)
    vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0)
    vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1)
    vmovhpd(mem(r8, rdi, 1), xmm4, xmm4)
    vmulpd(xmm2, xmm7, xmm7) // scale by alpha,
    vmulpd(xmm2, xmm8, xmm8) // scale by alpha,
    vmulpd(xmm2, xmm9, xmm9) // scale by alpha,
    vfmaddpd(xmm7, xmm0, xmm3, xmm7) // scale by beta, and add the gemm result
    vfmaddpd(xmm8, xmm1, xmm3, xmm8) // scale by beta, and add the gemm result
    vfmaddpd(xmm9, xmm4, xmm3, xmm9) // scale by beta, and add the gemm result
    vmovlpd(xmm7, mem(rcx)) // and store back to memory.
    vmovlpd(xmm8, mem(rdx)) // and store back to memory.
    vmovlpd(xmm9, mem(r8)) // and store back to memory.
    vmovhpd(xmm7, mem(rcx, rdi, 1))
    add(rsi, rcx)
    vmovhpd(xmm8, mem(rdx, rdi, 1))
    add(rsi, rdx)
    vmovhpd(xmm9, mem(r8, rdi, 1))
    add(rsi, r8)


    vmovlpd(mem(rcx), xmm0, xmm0)
    vmovlpd(mem(rdx), xmm1, xmm1)
    vmovlpd(mem(r8), xmm4, xmm4)
    vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0)
    vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1)
    vmovhpd(mem(r8, rdi, 1), xmm4, xmm4)
    vmulpd(xmm2, xmm10, xmm10) // scale by alpha,
    vmulpd(xmm2, xmm11, xmm11) // scale by alpha,
    vmulpd(xmm2, xmm12, xmm12) // scale by alpha,
    vfmaddpd(xmm10, xmm0, xmm3, xmm10) // scale by beta, and add the gemm result
    vfmaddpd(xmm11, xmm1, xmm3, xmm11) // scale by beta, and add the gemm result
    vfmaddpd(xmm12, xmm4, xmm3, xmm12) // scale by beta, and add the gemm result
    vmovlpd(xmm10, mem(rcx)) // and store back to memory.
    vmovlpd(xmm11, mem(rdx)) // and store back to memory.
    vmovlpd(xmm12, mem(r8)) // and store back to memory.
    vmovhpd(xmm10, mem(rcx, rdi, 1))
    add(rsi, rcx)
    vmovhpd(xmm11, mem(rdx, rdi, 1))
    add(rsi, rdx)
    vmovhpd(xmm12, mem(r8, rdi, 1))
    add(rsi, r8)


    vmovlpd(mem(rcx), xmm0, xmm0)
    vmovlpd(mem(rdx), xmm1, xmm1)
    vmovlpd(mem(r8), xmm4, xmm4)
    vmovhpd(mem(rcx, rdi, 1), xmm0, xmm0)
    vmovhpd(mem(rdx, rdi, 1), xmm1, xmm1)
    vmovhpd(mem(r8, rdi, 1), xmm4, xmm4)
    vmulpd(xmm2, xmm13, xmm13) // scale by alpha,
    vmulpd(xmm2, xmm14, xmm14) // scale by alpha,
    vmulpd(xmm2, xmm15, xmm15) // scale by alpha,
    vfmaddpd(xmm13, xmm0, xmm3, xmm13) // scale by beta, and add the gemm result
    vfmaddpd(xmm14, xmm1, xmm3, xmm14) // scale by beta, and add the gemm result
    vfmaddpd(xmm15, xmm4, xmm3, xmm15) // scale by beta, and add the gemm result
    vmovlpd(xmm13, mem(rcx)) // and store back to memory.
    vmovlpd(xmm14, mem(rdx)) // and store back to memory.
    vmovlpd(xmm15, mem(r8)) // and store back to memory.
    vmovhpd(xmm13, mem(rcx, rdi, 1))
    vmovhpd(xmm14, mem(rdx, rdi, 1))
    vmovhpd(xmm15, mem(r8, rdi, 1))

    end_asm(
    : // output operands (none)
    : // input operands
      [k_iter] "r" (k_iter), // 0
      [k_left] "r" (k_left), // 1
      [a]      "r" (a), // 2
      [b]      "r" (b), // 3
      [alpha]  "r" (alpha), // 4
      [beta]   "r" (beta), // 5
      [c]      "r" (c), // 6
      [rs_c]   "m" (rs_c), // 7
      [cs_c]   "m" (cs_c)/*, // 8
      [b_next] "m" (b_next), // 9
      [a_next] "m" (a_next)*/ // 10
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8",
      "xmm0", "xmm1", "xmm2", "xmm3",
      "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11",
      "xmm12", "xmm13", "xmm14", "xmm15",
      "memory"
    )
}
// The parameter "i" is the iteration number, i.e., which values of b to read.
#define MADD_TO_YMM(i) \
vfmaddps(ymm15, ymm0, ymm2, ymm15)\
vperm2f128(imm(0x3), ymm2, ymm2, ymm4)\
vfmaddps(ymm13, ymm0, ymm3, ymm13)\
vperm2f128(imm(0x3), ymm3, ymm3, ymm5)\
vfmaddps(ymm14, ymm1, ymm2, ymm14)\
vmovshdup(mem(rbx, i*32), ymm2)\
vfmaddps(ymm12, ymm1, ymm3, ymm12)\
vpermilps(imm(0x4e), ymm2, ymm3)\
vfmaddps(ymm11, ymm0, ymm4, ymm11)\
vfmaddps(ymm9, ymm0, ymm5, ymm9)\
vpermilps(imm(0xb1), ymm0, ymm0)\
vfmaddps(ymm10, ymm1, ymm4, ymm10)\
vperm2f128(imm(0x3), ymm2, ymm2, ymm4)\
vfmaddps(ymm8, ymm1, ymm5, ymm8)\
vperm2f128(imm(0x3), ymm3, ymm3, ymm5)

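// MADD_TO_YMM performs the multiplies involving the (duplicated) real parts
// of b for one iteration: ymm2/ymm3 hold those values in two permutations
// (from vmovsldup/vpermilps), vmovshdup then reloads ymm2 with the
// duplicated imaginary parts used by the vmulps/vaddsubps sequences that
// follow each use of this macro, and vpermilps(0xb1, ymm0, ymm0) swaps the
// (real,imag) pairs of a in place.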
void bli_cgemm_bulldozer_asm_8x4_fma4
     (
       dim_t               k0,
       scomplex*  restrict alpha,
       scomplex*  restrict a,
       scomplex*  restrict b,
       scomplex*  restrict beta,
       scomplex*  restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    //void* a_next = bli_auxinfo_next_a( data );
    void* b_next = bli_auxinfo_next_b( data );

    // Typecast local copies of integers in case dim_t and inc_t are a
    // different size than is expected by load instructions.
    uint64_t k_iter = k0 / 4;
    uint64_t k_left = k0 % 4;
    uint64_t rs_c   = rs_c0;
    uint64_t cs_c   = cs_c0;

    begin_asm()

    mov(var(a), rax) // load address of a.
    mov(var(b), rbx) // load address of b.
    mov(var(b_next), r15) // load address of b_next.
    //mov(var(a_next), r14) // load address of a_next.
    sub(imm(4*64), r15)

    vmovaps(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
    vmovsldup(mem(rbx, 0*32), ymm2)
    vpermilps(imm(0x4e), ymm2, ymm3)

    mov(var(c), rcx) // load address of c
    mov(var(cs_c), rdi) // load cs_c
    lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(scomplex)
    lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;

    prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c
    prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
    prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c
    prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c

    vxorps(ymm8, ymm8, ymm8)
    vxorps(ymm9, ymm9, ymm9)
    vxorps(ymm10, ymm10, ymm10)
    vxorps(ymm11, ymm11, ymm11)
    vxorps(ymm12, ymm12, ymm12)
    vxorps(ymm13, ymm13, ymm13)
    vxorps(ymm14, ymm14, ymm14)
    vxorps(ymm15, ymm15, ymm15)

    mov(var(k_iter), rsi) // i = k_iter;
    test(rsi, rsi) // check i via logical AND.
    je(.CCONSIDKLEFT) // if i == 0, jump to code that
                      // contains the k_left loop.

    label(.CLOOPKITER) // MAIN LOOP

    add(imm(4*4*8), r15) // b_next += 4*4 (unroll x nr)

    // iteration 0
    prefetch(0, mem(rax, 8*32))
    vmovaps(mem(rax, 1*32), ymm1)
    MADD_TO_YMM(0)

    vpermilps(imm(0xb1), ymm1, ymm1)
    vmulps(ymm0, ymm2, ymm6)
    vaddsubps(ymm6, ymm15, ymm15)
    vmulps(ymm0, ymm3, ymm7)
    vaddsubps(ymm7, ymm13, ymm13)

    vmulps(ymm1, ymm2, ymm6)
    vmovsldup(mem(rbx, 1*32), ymm2)
    vmulps(ymm1, ymm3, ymm7)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vaddsubps(ymm6, ymm14, ymm14)
    vaddsubps(ymm7, ymm12, ymm12)

    vmulps(ymm0, ymm4, ymm6)
    vmulps(ymm0, ymm5, ymm7)
    vmovaps(mem(rax, 2*32), ymm0)
    vaddsubps(ymm6, ymm11, ymm11)
    vaddsubps(ymm7, ymm9, ymm9)

    vmulps(ymm1, ymm4, ymm6)
    vmulps(ymm1, ymm5, ymm7)
    vaddsubps(ymm6, ymm10, ymm10)
    vaddsubps(ymm7, ymm8, ymm8)


    // iteration 1
    prefetch(0, mem(rax, 10*32))
    vmovaps(mem(rax, 3*32), ymm1)
    MADD_TO_YMM(1)

    vpermilps(imm(0xb1), ymm1, ymm1)
    vmulps(ymm0, ymm2, ymm6)
    vmulps(ymm0, ymm3, ymm7)
    vaddsubps(ymm6, ymm15, ymm15)
    vaddsubps(ymm7, ymm13, ymm13)

    vmulps(ymm1, ymm2, ymm6)
    vmovsldup(mem(rbx, 2*32), ymm2)
    vmulps(ymm1, ymm3, ymm7)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vaddsubps(ymm6, ymm14, ymm14)
    vaddsubps(ymm7, ymm12, ymm12)

    vmulps(ymm0, ymm4, ymm6)
    vmulps(ymm0, ymm5, ymm7)
    vmovaps(mem(rax, 4*32), ymm0)
    vaddsubps(ymm6, ymm11, ymm11)
    vaddsubps(ymm7, ymm9, ymm9)

    vmulps(ymm1, ymm4, ymm6)
    vmulps(ymm1, ymm5, ymm7)
    vaddsubps(ymm6, ymm10, ymm10)
    vaddsubps(ymm7, ymm8, ymm8)

    // iteration 2
    prefetch(0, mem(rax, 12*32))
    vmovaps(mem(rax, 5*32), ymm1)
    MADD_TO_YMM(2)
    prefetch(0, mem(r15, 2*32)) // prefetch b_next[2*4]

    vpermilps(imm(0xb1), ymm1, ymm1)
    vmulps(ymm0, ymm2, ymm6)
    vmulps(ymm0, ymm3, ymm7)
    vaddsubps(ymm6, ymm15, ymm15)
    vaddsubps(ymm7, ymm13, ymm13)

    vmulps(ymm1, ymm2, ymm6)
    vmovsldup(mem(rbx, 3*32), ymm2)
    vmulps(ymm1, ymm3, ymm7)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vaddsubps(ymm6, ymm14, ymm14)
    vaddsubps(ymm7, ymm12, ymm12)

    vmulps(ymm0, ymm4, ymm6)
    vmulps(ymm0, ymm5, ymm7)
    vmovaps(mem(rax, 6*32), ymm0)
    vaddsubps(ymm6, ymm11, ymm11)
    vaddsubps(ymm7, ymm9, ymm9)

    vmulps(ymm1, ymm4, ymm6)
    vmulps(ymm1, ymm5, ymm7)
    vaddsubps(ymm6, ymm10, ymm10)
    vaddsubps(ymm7, ymm8, ymm8)


    // iteration 3
    prefetch(0, mem(rax, 14*32))
    vmovaps(mem(rax, 7*32), ymm1)
    MADD_TO_YMM(3)

    vpermilps(imm(0xb1), ymm1, ymm1)
    vmulps(ymm0, ymm2, ymm6)
    vmulps(ymm0, ymm3, ymm7)
    vaddsubps(ymm6, ymm15, ymm15)
    vaddsubps(ymm7, ymm13, ymm13)

    vmulps(ymm1, ymm2, ymm6)
    vmovsldup(mem(rbx, 4*32), ymm2)
    vmulps(ymm1, ymm3, ymm7)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vaddsubps(ymm6, ymm14, ymm14)
    vaddsubps(ymm7, ymm12, ymm12)

    vmulps(ymm0, ymm4, ymm6)
    vmulps(ymm0, ymm5, ymm7)
    vmovaps(mem(rax, 8*32), ymm0)
    vaddsubps(ymm6, ymm11, ymm11)
    vaddsubps(ymm7, ymm9, ymm9)

    vmulps(ymm1, ymm4, ymm6)
    vmulps(ymm1, ymm5, ymm7)
    vaddsubps(ymm6, ymm10, ymm10)
    vaddsubps(ymm7, ymm8, ymm8)


    add(imm(8*4*8), rax) // a += 8*4 (unroll x mr)
    add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr)


    dec(rsi) // i -= 1;
    jne(.CLOOPKITER) // iterate again if i != 0.



    label(.CCONSIDKLEFT)

    mov(var(k_left), rsi) // i = k_left;
    test(rsi, rsi) // check i via logical AND.
    je(.CPOSTACCUM) // if i == 0, we're done; jump to end.
                    // else, we prepare to enter k_left loop.


    label(.CLOOPKLEFT) // EDGE LOOP

    // iteration 0
    prefetch(0, mem(rax, 8*32))
    vmovaps(mem(rax, 1*32), ymm1)
    MADD_TO_YMM(0)

    vpermilps(imm(0xb1), ymm1, ymm1)
    vmulps(ymm0, ymm2, ymm6)
    vmulps(ymm0, ymm3, ymm7)
    vaddsubps(ymm6, ymm15, ymm15)
    vaddsubps(ymm7, ymm13, ymm13)

    vmulps(ymm1, ymm2, ymm6)
    vmovsldup(mem(rbx, 1*32), ymm2)
    vmulps(ymm1, ymm3, ymm7)
    vpermilps(imm(0x4e), ymm2, ymm3)
    vaddsubps(ymm6, ymm14, ymm14)
    vaddsubps(ymm7, ymm12, ymm12)

    vmulps(ymm0, ymm4, ymm6)
    vmulps(ymm0, ymm5, ymm7)
    vmovaps(mem(rax, 2*32), ymm0)
    vaddsubps(ymm6, ymm11, ymm11)
    vaddsubps(ymm7, ymm9, ymm9)

    vmulps(ymm1, ymm4, ymm6)
    vmulps(ymm1, ymm5, ymm7)
    vaddsubps(ymm6, ymm10, ymm10)
    vaddsubps(ymm7, ymm8, ymm8)


    add(imm(8*1*8), rax) // a += 8 (1 x mr)
    add(imm(4*1*8), rbx) // b += 4 (1 x nr)


    dec(rsi) // i -= 1;
    jne(.CLOOPKLEFT) // iterate again if i != 0.



    label(.CPOSTACCUM)

    // ymm15:  ymm13:  ymm11:  ymm9:
    // ( ab00  ( ab01  ( ab02  ( ab03
    //   ab10    ab11    ab12    ab13
    //   ab21    ab20    ab23    ab22
    //   ab31    ab30    ab33    ab32
    //   ab42    ab43    ab40    ab41
    //   ab52    ab53    ab50    ab51
    //   ab63    ab62    ab61    ab60
    //   ab73 )  ab72 )  ab71 )  ab70 )

    // ymm14:  ymm12:  ymm10:  ymm8:
    // ( ab80  ( ab81  ( ab82  ( ab83
    //   ab90    ab91    ab92    ab93
    //   aba1    aba0    aba3    aba2
    //   abb1    abb0    abb3    abb2
    //   abc2    abc3    abc0    abc1
    //   abd2    abd3    abd0    abd1
    //   abe3    abe2    abe1    abe0
    //   abf3 )  abf2 )  abf1 )  abf0 )
    GROUP_YMM_BY_4
    // ymm15:  ymm13:  ymm11:  ymm9:
    // ( ab00  ( ab01  ( ab02  ( ab03
    //   ab10    ab11    ab12    ab13
    //   ab20    ab21    ab22    ab23
    //   ab30    ab31    ab32    ab33
    //   ab42    ab43    ab40    ab41
    //   ab52    ab53    ab50    ab51
    //   ab62    ab63    ab60    ab61
    //   ab72 )  ab73 )  ab70 )  ab71 )

    // ymm14:  ymm12:  ymm10:  ymm8:
    // ( ab80  ( ab81  ( ab82  ( ab83
    //   ab90    ab91    ab92    ab93
    //   aba0    aba1    aba2    aba3
    //   abb0    abb1    abb2    abb3
    //   abc2    abc3    abc0    abc1
    //   abd2    abd3    abd0    abd1
    //   abe2    abe3    abe0    abe1
    //   abf2 )  abf3 )  abf0 )  abf1 )

    // ymm15:  ymm13:  ymm11:  ymm9:
    // ( ab00  ( ab01  ( ab02  ( ab03
    //   ab10    ab11    ab12    ab13
    //   ab20    ab21    ab22    ab23
    //   ab30    ab31    ab32    ab33
    //   ab40    ab41    ab42    ab43
    //   ab50    ab51    ab52    ab53
    //   ab60    ab61    ab62    ab63
    //   ab70 )  ab71 )  ab72 )  ab73 )

    // ymm14:  ymm12:  ymm10:  ymm8:
    // ( ab80  ( ab81  ( ab82  ( ab83
    //   ab90    ab91    ab92    ab93
    //   aba0    aba1    aba2    aba3
    //   abb0    abb1    abb2    abb3
    //   abc0    abc1    abc2    abc3
    //   abd0    abd1    abd2    abd3
    //   abe0    abe1    abe2    abe3
    //   abf0 )  abf1 )  abf2 )  abf3 )

    // scale by alpha

    mov(var(alpha), rax) // load address of alpha
    vbroadcastss(mem(rax), ymm7) // load alpha_r and duplicate
    vbroadcastss(mem(rax, 4), ymm6) // load alpha_i and duplicate

    vpermilps(imm(0xb1), ymm15, ymm3)
    vmulps(ymm7, ymm15, ymm15)
    vmulps(ymm6, ymm3, ymm3)
    vaddsubps(ymm3, ymm15, ymm15)

    vpermilps(imm(0xb1), ymm14, ymm2)
    vmulps(ymm7, ymm14, ymm14)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm14, ymm14)

    vpermilps(imm(0xb1), ymm13, ymm1)
    vmulps(ymm7, ymm13, ymm13)
    vmulps(ymm6, ymm1, ymm1)
    vaddsubps(ymm1, ymm13, ymm13)

    vpermilps(imm(0xb1), ymm12, ymm0)
    vmulps(ymm7, ymm12, ymm12)
    vmulps(ymm6, ymm0, ymm0)
    vaddsubps(ymm0, ymm12, ymm12)

    vpermilps(imm(0xb1), ymm11, ymm3)
    vmulps(ymm7, ymm11, ymm11)
    vmulps(ymm6, ymm3, ymm3)
    vaddsubps(ymm3, ymm11, ymm11)

    vpermilps(imm(0xb1), ymm10, ymm2)
    vmulps(ymm7, ymm10, ymm10)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm10, ymm10)

    vpermilps(imm(0xb1), ymm9, ymm1)
    vmulps(ymm7, ymm9, ymm9)
    vmulps(ymm6, ymm1, ymm1)
    vaddsubps(ymm1, ymm9, ymm9)

    vpermilps(imm(0xb1), ymm8, ymm0)
    vmulps(ymm7, ymm8, ymm8)
    vmulps(ymm6, ymm0, ymm0)
    vaddsubps(ymm0, ymm8, ymm8)
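
    // Each block above applies alpha to one accumulator with the
    // swap/multiply/addsub idiom: for x = (xr,xi) and alpha = (ar,ai),
    // vpermilps(0xb1) forms (xi,xr), the two vmulps produce (ar*xr, ar*xi)
    // and (ai*xi, ai*xr), and vaddsubps combines them into
    // (ar*xr - ai*xi, ar*xi + ai*xr), i.e. the complex product alpha*x.
    // The beta scaling in the store paths below uses the same idiom.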



    mov(var(beta), rbx) // load address of beta
    vbroadcastss(mem(rbx), ymm7) // load beta_r and duplicate
    vbroadcastss(mem(rbx, 4), ymm6) // load beta_i and duplicate



    mov(var(rs_c), rsi) // load rs_c
    lea(mem(, rsi, 8), rsi) // rsi = rs_c * sizeof(scomplex)

    lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*rs_c;

    lea(mem(, rsi, 2), r12) // r12 = 2*rs_c;
    lea(mem(r12, rsi, 1), r13) // r13 = 3*rs_c;


    // determine if
    //     c    % 32 == 0, AND
    //   8*cs_c % 32 == 0, AND
    //     rs_c      == 1
    // ie: aligned, ldim aligned, and
    // column-stored

    cmp(imm(8), rsi) // set ZF if (8*rs_c) == 8.
    sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
    test(imm(31), rcx) // set ZF if c & 31 is zero.
    setz(bh) // bh = ( ZF == 1 ? 1 : 0 );
    test(imm(31), rdi) // set ZF if (8*cs_c) & 31 is zero.
    setz(al) // al = ( ZF == 1 ? 1 : 0 );
    // and(bl,bh) followed by
    // and(bh,al) will reveal result

    // now avoid loading C if beta == 0

    vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero.
    vucomiss(xmm0, xmm7) // set ZF if beta_r == 0.
    sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
    vucomiss(xmm0, xmm6) // set ZF if beta_i == 0.
    sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
    and(r8b, r9b) // set ZF if r8b & r9b == 0.
    jne(.CBETAZERO) // if ZF == 0, jump to beta == 0 case


    // check if aligned/column-stored
    and(bl, bh) // set ZF if bl & bh == 0.
    and(bh, al) // set ZF if bh & al == 0.
    jne(.CCOLSTORED) // jump to column storage case


    label(.CGENSTORED)

    // update c00:c70

    vmovlpd(mem(rcx), xmm0, xmm0) // load (c00,10) into xmm0[0:1]
    vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c20,30) into xmm0[2:3]
    vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c40,50) into xmm2[0:1]
    vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c60,70) into xmm2[2:3]
    vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0
    vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
    vmovlpd(xmm0, mem(rcx)) // store (c00,c10)
    vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c20,c30)
    vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50)
    vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70)
    add(rdi, rcx) // c += cs_c;

    // update c80:cf0

    vmovlpd(mem(rdx), xmm0, xmm0) // load (c80,90) into xmm0[0:1]
    vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca0,b0) into xmm0[2:3]
    vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc0,d0) into xmm2[0:1]
    vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce0,f0) into xmm2[2:3]
    vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0
    vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
    vmovlpd(xmm0, mem(rdx)) // store (c80,c90)
    vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca0,cb0)
    vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0)
    vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0)
    add(rdi, rdx) // c += cs_c;

    // update c01:c71

    vmovlpd(mem(rcx), xmm0, xmm0) // load (c01,11) into xmm0[0:1]
    vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c21,31) into xmm0[2:3]
    vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c41,51) into xmm2[0:1]
    vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c61,71) into xmm2[2:3]
    vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0
    vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
    vmovlpd(xmm0, mem(rcx)) // store (c01,c11)
    vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c21,c31)
    vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51)
    vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71)
    add(rdi, rcx) // c += cs_c;

    // update c81:cf1

    vmovlpd(mem(rdx), xmm0, xmm0) // load (c81,91) into xmm0[0:1]
    vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca1,b1) into xmm0[2:3]
    vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc1,d1) into xmm2[0:1]
    vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce1,f1) into xmm2[2:3]
    vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0
    vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
    vmovlpd(xmm0, mem(rdx)) // store (c81,c91)
    vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca1,cb1)
    vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1)
    vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1)
    add(rdi, rdx) // c += cs_c;

    // update c02:c72

    vmovlpd(mem(rcx), xmm0, xmm0) // load (c02,12) into xmm0[0:1]
    vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c22,32) into xmm0[2:3]
    vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c42,52) into xmm2[0:1]
    vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c62,72) into xmm2[2:3]
    vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0
    vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
    vmovlpd(xmm0, mem(rcx)) // store (c02,c12)
    vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c22,c32)
    vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52)
    vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72)
    add(rdi, rcx) // c += cs_c;

    // update c82:cf2

    vmovlpd(mem(rdx), xmm0, xmm0) // load (c82,92) into xmm0[0:1]
    vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca2,b2) into xmm0[2:3]
    vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc2,d2) into xmm2[0:1]
    vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce2,f2) into xmm2[2:3]
    vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0
    vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
    vmovlpd(xmm0, mem(rdx)) // store (c82,c92)
    vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca2,cb2)
    vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2)
    vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2)
    add(rdi, rdx) // c += cs_c;

    // update c03:c73

    vmovlpd(mem(rcx), xmm0, xmm0) // load (c03,13) into xmm0[0:1]
    vmovhpd(mem(rcx, rsi, 1), xmm0, xmm0) // load (c23,33) into xmm0[2:3]
    vmovlpd(mem(rcx, r12, 1), xmm2, xmm2) // load (c43,53) into xmm2[0:1]
    vmovhpd(mem(rcx, r13, 1), xmm2, xmm2) // load (c63,73) into xmm2[2:3]
    vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0
    vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
    vmovlpd(xmm0, mem(rcx)) // store (c03,c13)
    vmovhpd(xmm0, mem(rcx, rsi, 1)) // store (c23,c33)
    vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53)
    vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73)
    add(rdi, rcx) // c += cs_c;

    // update c83:cf3

    vmovlpd(mem(rdx), xmm0, xmm0) // load (c83,93) into xmm0[0:1]
    vmovhpd(mem(rdx, rsi, 1), xmm0, xmm0) // load (ca3,b3) into xmm0[2:3]
    vmovlpd(mem(rdx, r12, 1), xmm2, xmm2) // load (cc3,d3) into xmm2[0:1]
    vmovhpd(mem(rdx, r13, 1), xmm2, xmm2) // load (ce3,f3) into xmm2[2:3]
    vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:3],xmm2)
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0
    vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[4:7]
    vmovlpd(xmm0, mem(rdx)) // store (c83,c93)
    vmovhpd(xmm0, mem(rdx, rsi, 1)) // store (ca3,cb3)
    vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3)
    vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3)
    add(rdi, rdx) // c += cs_c;


    jmp(.CDONE) // jump to end.



    label(.CCOLSTORED)

    // update c00:c70

    vmovaps(mem(rcx), ymm0) // load c00:c70 into ymm0
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm15, ymm0, ymm0) // add the gemm result to ymm0
    vmovaps(ymm0, mem(rcx)) // store c00:c70
    add(rdi, rcx) // c += cs_c;

    // update c80:cf0

    vmovaps(mem(rdx), ymm0) // load c80:cf0 into ymm0
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm14, ymm0, ymm0) // add the gemm result to ymm0
    vmovaps(ymm0, mem(rdx)) // store c80:cf0
    add(rdi, rdx) // c += cs_c;

    // update c01:c71

    vmovaps(mem(rcx), ymm0) // load c01:c71 into ymm0
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm13, ymm0, ymm0) // add the gemm result to ymm0
    vmovaps(ymm0, mem(rcx)) // store c01:c71
    add(rdi, rcx) // c += cs_c;

    // update c81:cf1

    vmovaps(mem(rdx), ymm0) // load c81:cf1 into ymm0
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm12, ymm0, ymm0) // add the gemm result to ymm0
    vmovaps(ymm0, mem(rdx)) // store c81:cf1
    add(rdi, rdx) // c += cs_c;

    // update c02:c72
    vmovaps(mem(rcx), ymm0) // load c02:c72 into ymm0
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm11, ymm0, ymm0) // add the gemm result to ymm0
    vmovaps(ymm0, mem(rcx)) // store c02:c72
    add(rdi, rcx) // c += cs_c;

    // update c82:cf2
    vmovaps(mem(rdx), ymm0) // load c82:cf2 into ymm0
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm10, ymm0, ymm0) // add the gemm result to ymm0
    vmovaps(ymm0, mem(rdx)) // store c82:cf2
    add(rdi, rdx) // c += cs_c;

    // update c03:c73
    vmovaps(mem(rcx), ymm0) // load c03:c73 into ymm0
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm9, ymm0, ymm0) // add the gemm result to ymm0
    vmovaps(ymm0, mem(rcx)) // store c03:c73
    add(rdi, rcx) // c += cs_c;

    // update c83:cf3
    vmovaps(mem(rdx), ymm0) // load c83:cf3 into ymm0
    vpermilps(imm(0xb1), ymm0, ymm2) // scale ymm0 by beta
    vmulps(ymm7, ymm0, ymm0)
    vmulps(ymm6, ymm2, ymm2)
    vaddsubps(ymm2, ymm0, ymm0)
    vaddps(ymm8, ymm0, ymm0) // add the gemm result to ymm0
    vmovaps(ymm0, mem(rdx)) // store c83:cf3
    add(rdi, rdx) // c += cs_c;

    jmp(.CDONE) // jump to end.


1731 	label(.CBETAZERO)
1732 	// check if aligned/column-stored
1734 	and(bl, bh) // set ZF if bl & bh == 0.
1735 	and(bh, al) // set ZF if bh & al == 0.
1736 	jne(.CCOLSTORBZ) // jump to column storage case
1737
1738
1739 label(.CGENSTORBZ)
1740 // update c00:c70
1741 	vextractf128(imm(1), ymm15, xmm2) // xmm2 := ymm15[4:7]
1742 vmovlpd(xmm15, mem(rcx)) // store (c00,c10)
1743 vmovhpd(xmm15, mem(rcx, rsi, 1)) // store (c20,c30)
1744 vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c40,c50)
1745 vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c60,c70)
1746 add(rdi, rcx) // c += cs_c;
1747
1748 // update c80:cf0
1749 	vextractf128(imm(1), ymm14, xmm2) // xmm2 := ymm14[4:7]
1750 vmovlpd(xmm14, mem(rdx)) // store (c80,c90)
1751 vmovhpd(xmm14, mem(rdx, rsi, 1)) // store (ca0,cb0)
1752 vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc0,cd0)
1753 vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce0,cf0)
1754 add(rdi, rdx) // c += cs_c;
1755
1756 // update c01:c71
1757 	vextractf128(imm(1), ymm13, xmm2) // xmm2 := ymm13[4:7]
1758 vmovlpd(xmm13, mem(rcx)) // store (c01,c11)
1759 vmovhpd(xmm13, mem(rcx, rsi, 1)) // store (c21,c31)
1760 vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c41,c51)
1761 vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c61,c71)
1762 add(rdi, rcx) // c += cs_c;
1763
1764 // update c81:cf1
1765 	vextractf128(imm(1), ymm12, xmm2) // xmm2 := ymm12[4:7]
1766 vmovlpd(xmm12, mem(rdx)) // store (c81,c91)
1767 vmovhpd(xmm12, mem(rdx, rsi, 1)) // store (ca1,cb1)
1768 vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc1,cd1)
1769 vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce1,cf1)
1770 add(rdi, rdx) // c += cs_c;
1771
1772 // update c02:c72
1773 	vextractf128(imm(1), ymm11, xmm2) // xmm2 := ymm11[4:7]
1774 vmovlpd(xmm11, mem(rcx)) // store (c02,c12)
1775 vmovhpd(xmm11, mem(rcx, rsi, 1)) // store (c22,c32)
1776 vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c42,c52)
1777 vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c62,c72)
1778 add(rdi, rcx) // c += cs_c;
1779
1780 // update c82:cf2
1781 	vextractf128(imm(1), ymm10, xmm2) // xmm2 := ymm10[4:7]
1782 vmovlpd(xmm10, mem(rdx)) // store (c82,c92)
1783 vmovhpd(xmm10, mem(rdx, rsi, 1)) // store (ca2,cb2)
1784 vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc2,cd2)
1785 vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce2,cf2)
1786 add(rdi, rdx) // c += cs_c;
1787
1788 // update c03:c73
1789 	vextractf128(imm(1), ymm9, xmm2) // xmm2 := ymm9[4:7]
1790 vmovlpd(xmm9, mem(rcx)) // store (c03,c13)
1791 vmovhpd(xmm9, mem(rcx, rsi, 1)) // store (c23,c33)
1792 vmovlpd(xmm2, mem(rcx, r12, 1)) // store (c43,c53)
1793 vmovhpd(xmm2, mem(rcx, r13, 1)) // store (c63,c73)
1794 add(rdi, rcx) // c += cs_c;
1795
1796 // update c83:cf3
1797 	vextractf128(imm(1), ymm8, xmm2) // xmm2 := ymm8[4:7]
1798 vmovlpd(xmm8, mem(rdx)) // store (c83,c93)
1799 vmovhpd(xmm8, mem(rdx, rsi, 1)) // store (ca3,cb3)
1800 vmovlpd(xmm2, mem(rdx, r12, 1)) // store (cc3,cd3)
1801 vmovhpd(xmm2, mem(rdx, r13, 1)) // store (ce3,cf3)
1802 add(rdi, rdx) // c += cs_c;
1803
1804
1805 jmp(.CDONE) // jump to end.
1806
1807
1808 label(.CCOLSTORBZ)
1809
1810 vmovaps(ymm15, mem(rcx)) // store c00:c70
1811 add(rdi, rcx) // c += cs_c;
1812
1813 vmovaps(ymm14, mem(rdx)) // store c80:cf0
1814 add(rdi, rdx) // c += cs_c;
1815
1816 vmovaps(ymm13, mem(rcx)) // store c01:c71
1817 add(rdi, rcx) // c += cs_c;
1818
1819 vmovaps(ymm12, mem(rdx)) // store c81:cf1
1820 add(rdi, rdx) // c += cs_c;
1821
1822 vmovaps(ymm11, mem(rcx)) // store c02:c72
1823 add(rdi, rcx) // c += cs_c;
1824
1825 vmovaps(ymm10, mem(rdx)) // store c82:cf2
1826 add(rdi, rdx) // c += cs_c;
1827
1828 vmovaps(ymm9, mem(rcx)) // store c03:c73
1829 add(rdi, rcx) // c += cs_c;
1830
1831 vmovaps(ymm8, mem(rdx)) // store c83:cf3
1832 add(rdi, rdx) // c += cs_c;
1833
1834
1835
1836 label(.CDONE)
1837
1838
1839 end_asm(
1840 : // output operands (none)
1841 : // input operands
1842 [k_iter] "m" (k_iter), // 0
1843 [k_left] "m" (k_left), // 1
1844 [a] "m" (a), // 2
1845 [b] "m" (b), // 3
1846 [alpha] "m" (alpha), // 4
1847 [beta] "m" (beta), // 5
1848 [c] "m" (c), // 6
1849 [rs_c] "m" (rs_c), // 7
1850 [cs_c] "m" (cs_c), // 8
1851 [b_next] "m" (b_next)/*, // 9
1852 [a_next] "m" (a_next)*/ // 10
1853 : // register clobber list
1854 "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
1855 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
1856 "ymm0", "ymm1", "ymm2", "ymm3",
1857 "ymm4", "ymm5", "ymm6", "ymm7",
1858 "ymm8", "ymm9", "ymm10", "ymm11",
1859 "ymm12", "ymm13", "ymm14", "ymm15",
1860 "memory"
1861 )
1862 }
1863
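// MADDSUBPD_TO_YMM factors out the middle of each rank-1 update in the
// zgemm kernel below: the FMA4 vfmaddpd instructions finish accumulating
// the duplicated-real b products into ymm13/ymm9 and ymm12/ymm8, the
// vpermilpd/vperm2f128 shuffles swap (re,im) within a and the 128-bit
// halves of the freshly loaded duplicated-imaginary b values, and the
// closing vmulpd/vaddsubpd pairs fold the imaginary-part products into
// ymm15/ymm11.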
1864 #define MADDSUBPD_TO_YMM \
1865 vfmaddpd(ymm13, ymm0, ymm4, ymm13)\
1866 vfmaddpd(ymm9, ymm0, ymm5, ymm9)\
1867 vpermilpd(imm(0x5), ymm0, ymm0)\
1868 \
1869 vfmaddpd(ymm12, ymm1, ymm4, ymm12)\
1870 vperm2f128(imm(0x3), ymm2, ymm2, ymm4)\
1871 vfmaddpd(ymm8, ymm1, ymm5, ymm8)\
1872 vperm2f128(imm(0x3), ymm3, ymm3, ymm5)\
1873 \
1874 vpermilpd(imm(0x5), ymm1, ymm1)\
1875 vmulpd(ymm0, ymm2, ymm6)\
1876 vmulpd(ymm0, ymm3, ymm7)\
1877 vaddsubpd(ymm6, ymm15, ymm15)\
1878 	vaddsubpd(ymm7, ymm11, ymm11)
1880
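// Z_ALPHA(i, j) multiplies the four dcomplex values in ymm(i) by the
// complex scalar whose real part is broadcast in ymm7 and imaginary part
// in ymm6, using ymm(j) as scratch; it is reused below for both the alpha
// and the beta scaling.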
1881 #define Z_ALPHA(i, j) \
1882 vpermilpd(imm(0x5), ymm(i), ymm(j))\
1883 vmulpd(ymm7, ymm(i), ymm(i))\
1884 vmulpd(ymm6, ymm(j), ymm(j))\
1885 	vaddsubpd(ymm(j), ymm(i), ymm(i))
1886
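// A minimal scalar sketch (hypothetical reference code, kept out of the
// build) of the complex multiply Z_ALPHA performs on each (re,im) pair:
#if 0
static void z_scal_ref( int n, double* restrict x, double s_r, double s_i )
{
	// x holds n interleaved (re,im) pairs; scale each by s = s_r + s_i*i.
	for ( int p = 0; p < n; ++p )
	{
		const double re = x[ 2*p + 0 ];
		const double im = x[ 2*p + 1 ];
		// vaddsubpd merges the two vmulpd results exactly this way:
		x[ 2*p + 0 ] = s_r * re - s_i * im;
		x[ 2*p + 1 ] = s_r * im + s_i * re;
	}
}
#endif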
1887
1888 void bli_zgemm_bulldozer_asm_4x4_fma4
1889 (
1890 dim_t k0,
1891 dcomplex* restrict alpha,
1892 dcomplex* restrict a,
1893 dcomplex* restrict b,
1894 dcomplex* restrict beta,
1895 dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0,
1896 auxinfo_t* restrict data,
1897 cntx_t* restrict cntx
1898 )
1899 {
1900 //void* a_next = bli_auxinfo_next_a( data );
1901 //void* b_next = bli_auxinfo_next_b( data );
1902
1903 // Typecast local copies of integers in case dim_t and inc_t are a
1904 // different size than is expected by load instructions.
1905 uint64_t k_iter = k0 / 4;
1906 uint64_t k_left = k0 % 4;
1907 uint64_t rs_c = rs_c0;
1908 uint64_t cs_c = cs_c0;
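	// A reference sketch (hypothetical, excluded from the build) of what
	// the assembly below computes: C := beta*C + alpha*A*B for one 4x4
	// block, with a and b packed in panels of width mr = nr = 4. It
	// ignores the aligned/column-stored fast paths and the beta == 0
	// special case.
#if 0
	{
		dcomplex ab[4][4] = { { { 0.0, 0.0 } } };
		for ( dim_t p = 0; p < k0; ++p )
			for ( dim_t i = 0; i < 4; ++i )
				for ( dim_t j = 0; j < 4; ++j )
				{
					const dcomplex ai = a[ 4*p + i ];
					const dcomplex bj = b[ 4*p + j ];
					ab[i][j].real += ai.real * bj.real - ai.imag * bj.imag;
					ab[i][j].imag += ai.real * bj.imag + ai.imag * bj.real;
				}
		for ( dim_t i = 0; i < 4; ++i )
			for ( dim_t j = 0; j < 4; ++j )
			{
				dcomplex* cij = c + i*rs_c + j*cs_c;
				const double tr = alpha->real * ab[i][j].real
				                - alpha->imag * ab[i][j].imag
				                + beta->real * cij->real
				                - beta->imag * cij->imag;
				const double ti = alpha->real * ab[i][j].imag
				                + alpha->imag * ab[i][j].real
				                + beta->real * cij->imag
				                + beta->imag * cij->real;
				cij->real = tr; cij->imag = ti;
			}
	}
#endif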
1909
1910 begin_asm()
1911
1912
1913 mov(var(a), rax) // load address of a.
1914 mov(var(b), rbx) // load address of b.
1915 //mov(var(b_next), r15) // load address of b_next.
1916 //mov(var(a_next), r14) // load address of a_next.
1917
1918 vmovapd(mem(rax, 0*32), ymm0) // initialize loop by pre-loading
1919 vmovddup(mem(rbx, 0+0*32), ymm2)
1920 vmovddup(mem(rbx, 0+1*32), ymm3)
1921
1922 mov(var(c), rcx) // load address of c
1923 mov(var(cs_c), rdi) // load cs_c
1924 	lea(mem(, rdi, 8), rdi) // cs_c *= 8
1925 	lea(mem(, rdi, 2), rdi) // cs_c *= 2 (total 16 = sizeof(dcomplex))
1926 lea(mem(rcx, rdi, 2), r10) // load address of c + 2*cs_c;
1927
1928 prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*cs_c
1929 prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*cs_c
1930 prefetch(0, mem(r10, 3*8)) // prefetch c + 2*cs_c
1931 prefetch(0, mem(r10, rdi, 1, 3*8)) // prefetch c + 3*cs_c
1932
1933 vxorpd(ymm8, ymm8, ymm8)
1934 vxorpd(ymm9, ymm9, ymm9)
1935 vxorpd(ymm10, ymm10, ymm10)
1936 vxorpd(ymm11, ymm11, ymm11)
1937 vxorpd(ymm12, ymm12, ymm12)
1938 vxorpd(ymm13, ymm13, ymm13)
1939 vxorpd(ymm14, ymm14, ymm14)
1940 vxorpd(ymm15, ymm15, ymm15)
1941
1942
1943 mov(var(k_iter), rsi) // i = k_iter;
1944 test(rsi, rsi) // check i via logical AND.
1945 je(.ZCONSIDKLEFT) // if i == 0, jump to code that
1946 // contains the k_left loop.
1947
1948 label(.ZLOOPKITER) // MAIN LOOP
1949
1950 // iteration 0
1951 vmovapd(mem(rax, 1*32), ymm1)
1952 vfmaddpd(ymm15, ymm0, ymm2, ymm15)
1953 vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
1954 vfmaddpd(ymm11, ymm0, ymm3, ymm11)
1955 vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
1956
1957 prefetch(0, mem(rax, 16*32))
1958 vfmaddpd(ymm14, ymm1, ymm2, ymm14)
1959 vmovddup(mem(rbx, 8+0*32), ymm2)
1960 vfmaddpd(ymm10, ymm1, ymm3, ymm10)
1961 vmovddup(mem(rbx, 8+1*32), ymm3)
1962
1963 MADDSUBPD_TO_YMM
1964 vmulpd(ymm1, ymm2, ymm6)
1965 vmovddup(mem(rbx, 0+2*32), ymm2)
1966 vmulpd(ymm1, ymm3, ymm7)
1967 vmovddup(mem(rbx, 0+3*32), ymm3)
1968 vaddsubpd(ymm6, ymm14, ymm14)
1969 vaddsubpd(ymm7, ymm10, ymm10)
1970
1971 vmulpd(ymm0, ymm4, ymm6)
1972 vmulpd(ymm0, ymm5, ymm7)
1973 vmovapd(mem(rax, 2*32), ymm0)
1974 vaddsubpd(ymm6, ymm13, ymm13)
1975 vaddsubpd(ymm7, ymm9, ymm9)
1976
1977 vmulpd(ymm1, ymm4, ymm6)
1978 vmulpd(ymm1, ymm5, ymm7)
1979 vaddsubpd(ymm6, ymm12, ymm12)
1980 vaddsubpd(ymm7, ymm8, ymm8)
1981
1982 // iteration 1
1983 vmovapd(mem(rax, 3*32), ymm1)
1984 vfmaddpd(ymm15, ymm0, ymm2, ymm15)
1985 vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
1986 vfmaddpd(ymm11, ymm0, ymm3, ymm11)
1987 vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
1988
1989 prefetch(0, mem(rax, 18*32))
1990 vfmaddpd(ymm14, ymm1, ymm2, ymm14)
1991 vmovddup(mem(rbx, 8+2*32), ymm2)
1992 vfmaddpd(ymm10, ymm1, ymm3, ymm10)
1993 vmovddup(mem(rbx, 8+3*32), ymm3)
1994
1995 MADDSUBPD_TO_YMM
1996 vmulpd(ymm1, ymm2, ymm6)
1997 vmovddup(mem(rbx, 0+4*32), ymm2)
1998 vmulpd(ymm1, ymm3, ymm7)
1999 vmovddup(mem(rbx, 0+5*32), ymm3)
2000 vaddsubpd(ymm6, ymm14, ymm14)
2001 vaddsubpd(ymm7, ymm10, ymm10)
2002
2003 vmulpd(ymm0, ymm4, ymm6)
2004 vmulpd(ymm0, ymm5, ymm7)
2005 vmovapd(mem(rax, 4*32), ymm0)
2006 vaddsubpd(ymm6, ymm13, ymm13)
2007 vaddsubpd(ymm7, ymm9, ymm9)
2008
2009 vmulpd(ymm1, ymm4, ymm6)
2010 vmulpd(ymm1, ymm5, ymm7)
2011 vaddsubpd(ymm6, ymm12, ymm12)
2012 vaddsubpd(ymm7, ymm8, ymm8)
2013
2014 // iteration 2
2015 vmovapd(mem(rax, 5*32), ymm1)
2016 vfmaddpd(ymm15, ymm0, ymm2, ymm15)
2017 vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
2018 vfmaddpd(ymm11, ymm0, ymm3, ymm11)
2019 vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
2020
2021 prefetch(0, mem(rax, 20*32))
2022 vfmaddpd(ymm14, ymm1, ymm2, ymm14)
2023 vmovddup(mem(rbx, 8+4*32), ymm2)
2024 vfmaddpd(ymm10, ymm1, ymm3, ymm10)
2025 vmovddup(mem(rbx, 8+5*32), ymm3)
2026
2027 MADDSUBPD_TO_YMM
2028 vmulpd(ymm1, ymm2, ymm6)
2029 vmovddup(mem(rbx, 0+6*32), ymm2)
2030 vmulpd(ymm1, ymm3, ymm7)
2031 vmovddup(mem(rbx, 0+7*32), ymm3)
2032 vaddsubpd(ymm6, ymm14, ymm14)
2033 vaddsubpd(ymm7, ymm10, ymm10)
2034
2035 vmulpd(ymm0, ymm4, ymm6)
2036 vmulpd(ymm0, ymm5, ymm7)
2037 vmovapd(mem(rax, 6*32), ymm0)
2038 vaddsubpd(ymm6, ymm13, ymm13)
2039 vaddsubpd(ymm7, ymm9, ymm9)
2040
2041 vmulpd(ymm1, ymm4, ymm6)
2042 vmulpd(ymm1, ymm5, ymm7)
2043 vaddsubpd(ymm6, ymm12, ymm12)
2044 vaddsubpd(ymm7, ymm8, ymm8)
2045
2046 // iteration 3
2047 vmovapd(mem(rax, 7*32), ymm1)
2048 vfmaddpd(ymm15, ymm0, ymm2, ymm15)
2049 vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
2050 vfmaddpd(ymm11, ymm0, ymm3, ymm11)
2051 vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
2052
2053 prefetch(0, mem(rax, 22*32))
2054 vfmaddpd(ymm14, ymm1, ymm2, ymm14)
2055 vmovddup(mem(rbx, 8+6*32), ymm2)
2056 vfmaddpd(ymm10, ymm1, ymm3, ymm10)
2057 vmovddup(mem(rbx, 8+7*32), ymm3)
2058
2059 MADDSUBPD_TO_YMM
2060 vmulpd(ymm1, ymm2, ymm6)
2061 vmovddup(mem(rbx, 0+8*32), ymm2)
2062 vmulpd(ymm1, ymm3, ymm7)
2063 vmovddup(mem(rbx, 0+9*32), ymm3)
2064 vaddsubpd(ymm6, ymm14, ymm14)
2065 vaddsubpd(ymm7, ymm10, ymm10)
2066
2067 vmulpd(ymm0, ymm4, ymm6)
2068 vmulpd(ymm0, ymm5, ymm7)
2069 vmovapd(mem(rax, 8*32), ymm0)
2070 vaddsubpd(ymm6, ymm13, ymm13)
2071 vaddsubpd(ymm7, ymm9, ymm9)
2072
2073 vmulpd(ymm1, ymm4, ymm6)
2074 vmulpd(ymm1, ymm5, ymm7)
2075 vaddsubpd(ymm6, ymm12, ymm12)
2076 vaddsubpd(ymm7, ymm8, ymm8)
2077
2078 add(imm(4*4*16), rbx) // b += 4*4 (unroll x nr)
2079 add(imm(4*4*16), rax) // a += 4*4 (unroll x mr)
2080
2081 dec(rsi) // i -= 1;
2082 jne(.ZLOOPKITER) // iterate again if i != 0.
2083
2084
2085 label(.ZCONSIDKLEFT)
2086
2087 mov(var(k_left), rsi) // i = k_left;
2088 test(rsi, rsi) // check i via logical AND.
2089 je(.ZPOSTACCUM) // if i == 0, we're done; jump to end.
2090 // else, we prepare to enter k_left loop.
2091
2092
2093 label(.ZLOOPKLEFT) // EDGE LOOP
2094
2095 // iteration 0
2096 vmovapd(mem(rax, 1*32), ymm1)
2097 vfmaddpd(ymm15, ymm0, ymm2, ymm15)
2098 vperm2f128(imm(0x3), ymm2, ymm2, ymm4)
2099 vfmaddpd(ymm11, ymm0, ymm3, ymm11)
2100 vperm2f128(imm(0x3), ymm3, ymm3, ymm5)
2101
2102 prefetch(0, mem(rax, 16*32))
2103 vfmaddpd(ymm14, ymm1, ymm2, ymm14)
2104 vmovddup(mem(rbx, 8+0*32), ymm2)
2105 vfmaddpd(ymm10, ymm1, ymm3, ymm10)
2106 vmovddup(mem(rbx, 8+1*32), ymm3)
2107
2108 MADDSUBPD_TO_YMM
2109 vmulpd(ymm1, ymm2, ymm6)
2110 vmovddup(mem(rbx, 0+2*32), ymm2)
2111 vmulpd(ymm1, ymm3, ymm7)
2112 vmovddup(mem(rbx, 0+3*32), ymm3)
2113 vaddsubpd(ymm6, ymm14, ymm14)
2114 vaddsubpd(ymm7, ymm10, ymm10)
2115
2116 vmulpd(ymm0, ymm4, ymm6)
2117 vmulpd(ymm0, ymm5, ymm7)
2118 vmovapd(mem(rax, 2*32), ymm0)
2119 vaddsubpd(ymm6, ymm13, ymm13)
2120 vaddsubpd(ymm7, ymm9, ymm9)
2121
2122 vmulpd(ymm1, ymm4, ymm6)
2123 vmulpd(ymm1, ymm5, ymm7)
2124 vaddsubpd(ymm6, ymm12, ymm12)
2125 vaddsubpd(ymm7, ymm8, ymm8)
2126
2127
2128 add(imm(4*1*16), rax) // a += 4 (1 x mr)
2129 add(imm(4*1*16), rbx) // b += 4 (1 x nr)
2130
2131 dec(rsi) // i -= 1;
2132 jne(.ZLOOPKLEFT) // iterate again if i != 0.
2133
2134
2135 label(.ZPOSTACCUM)
2136 // ymm15: ymm13: ymm11: ymm9:
2137 // ( ab00 ( ab01 ( ab02 ( ab03
2138 // ab10 ab11 ab12 ab13
2139 // ab21 ab20 ab23 ab22
2140 // ab31 ) ab30 ) ab33 ) ab32 )
2141
2142 // ymm14: ymm12: ymm10: ymm8:
2143 // ( ab40 ( ab41 ( ab42 ( ab43
2144 // ab50 ab51 ab52 ab53
2145 // ab61 ab60 ab63 ab62
2146 // ab71 ) ab70 ) ab73 ) ab72 )
2147
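	// Each vperm2f128 pair below exchanges the high 128-bit lanes of two
	// accumulators, converting the interleaved layout diagrammed above
	// into the true column layout diagrammed below.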
2148 vmovapd(ymm15, ymm7)
2149 vperm2f128(imm(0x12), ymm15, ymm13, ymm15)
2150 vperm2f128(imm(0x30), ymm7, ymm13, ymm13)
2151
2152 vmovapd(ymm11, ymm7)
2153 vperm2f128(imm(0x12), ymm11, ymm9, ymm11)
2154 vperm2f128(imm(0x30), ymm7, ymm9, ymm9)
2155
2156 vmovapd(ymm14, ymm7)
2157 vperm2f128(imm(0x12), ymm14, ymm12, ymm14)
2158 vperm2f128(imm(0x30), ymm7, ymm12, ymm12)
2159
2160 vmovapd(ymm10, ymm7)
2161 vperm2f128(imm(0x12), ymm10, ymm8, ymm10)
2162 vperm2f128(imm(0x30), ymm7, ymm8, ymm8)
2163
2164
2165 // ymm15: ymm13: ymm11: ymm9:
2166 // ( ab00 ( ab01 ( ab02 ( ab03
2167 // ab10 ab11 ab12 ab13
2168 // ab20 ab21 ab22 ab23
2169 // ab30 ) ab31 ) ab32 ) ab33 )
2170
2171 // ymm14: ymm12: ymm10: ymm8:
2172 // ( ab40 ( ab41 ( ab42 ( ab43
2173 // ab50 ab51 ab52 ab53
2174 // ab60 ab61 ab62 ab63
2175 // ab70 ) ab71 ) ab72 ) ab73 )
2176
2177
2178 // scale by alpha
2179
2180 mov(var(alpha), rax) // load address of alpha
2181 vbroadcastsd(mem(rax), ymm7) // load alpha_r and duplicate
2182 vbroadcastsd(mem(rax, 8), ymm6) // load alpha_i and duplicate
2183
2184 Z_ALPHA(15, 3)
2185 Z_ALPHA(14, 2)
2186 Z_ALPHA(13, 1)
2187 Z_ALPHA(12, 0)
2188
2189 Z_ALPHA(11, 3)
2190 Z_ALPHA(10, 2)
2191 Z_ALPHA(9, 1)
2192 Z_ALPHA(8, 0)
2193
2194 mov(var(beta), rbx) // load address of beta
2195 vbroadcastsd(mem(rbx), ymm7) // load beta_r and duplicate
2196 vbroadcastsd(mem(rbx, 8), ymm6) // load beta_i and duplicate
2197
2198
2199
2200 mov(var(rs_c), rsi) // load rs_c
2201 	lea(mem(, rsi, 8), rsi) // rsi = rs_c * 8
2202 	lea(mem(, rsi, 2), rsi) // rsi = rs_c * 16 = rs_c*sizeof(dcomplex)
2203 lea(mem(rcx, rsi, 2), rdx) // load address of c + 2*rs_c;
2204
2205
2206
2207 // determine if
2208 // c % 32 == 0, AND
2209 // 16*cs_c % 32 == 0, AND
2210 // rs_c == 1
2211 // ie: aligned, ldim aligned, and
2212 // column-stored
2213
2214 cmp(imm(16), rsi) // set ZF if (16*rs_c) == 16.
2215 sete(bl) // bl = ( ZF == 1 ? 1 : 0 );
2216 	test(imm(31), rcx) // set ZF if c & 31 is zero.
2217 	setz(bh) // bh = ( ZF == 1 ? 1 : 0 );
2218 	test(imm(31), rdi) // set ZF if (16*cs_c) & 31 is zero.
2219 	setz(al) // al = ( ZF == 1 ? 1 : 0 );
2220 // and(bl,bh) followed by
2221 // and(bh,al) will reveal result
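	// In C terms, the predicate these flag bytes encode is (sketch):
	//   rs_c == 1 && ((uintptr_t)c & 31) == 0 && ((16*cs_c) & 31) == 0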
2222
2223 // now avoid loading C if beta == 0
2224
2225 vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero.
2226 vucomisd(xmm0, xmm7) // set ZF if beta_r == 0.
2227 sete(r8b) // r8b = ( ZF == 1 ? 1 : 0 );
2228 vucomisd(xmm0, xmm6) // set ZF if beta_i == 0.
2229 sete(r9b) // r9b = ( ZF == 1 ? 1 : 0 );
2230 	and(r8b, r9b) // set ZF if r8b & r9b == 0.
2231 	jne(.ZBETAZERO) // if ZF == 0, jump to beta == 0 case
2232
2233
2234 // check if aligned/column-stored
2235 	and(bl, bh) // set ZF if bl & bh == 0.
2236 	and(bh, al) // set ZF if bh & al == 0.
2237 jne(.ZCOLSTORED) // jump to column storage case
2238
2239
2240
2241 label(.ZGENSTORED)
2242 // update c00:c30
2243
2244 vmovupd(mem(rcx), xmm0) // load (c00,c10) into xmm0
2245 vmovupd(mem(rcx, rsi, 1), xmm2) // load (c20,c30) into xmm2
2246 vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
2247 Z_ALPHA(0, 2) // scale ymm0 by beta
2248 vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0
2249 vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
2250 vmovupd(xmm0, mem(rcx)) // store (c00,c10)
2251 vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30)
2252 add(rdi, rcx) // c += cs_c;
2253
2254 // update c40:c70
2255
2256 vmovupd(mem(rdx), xmm0) // load (c40,c50) into xmm0
2257 vmovupd(mem(rdx, rsi, 1), xmm2) // load (c60,c70) into xmm2
2258 vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
2259 Z_ALPHA(0, 2) // scale ymm0 by beta
2260 vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0
2261 vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
2262 vmovupd(xmm0, mem(rdx)) // store (c40,c50)
2263 vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70)
2264 add(rdi, rdx) // c += cs_c;
2265
2266 // update c01:c31
2267
2268 vmovupd(mem(rcx), xmm0) // load (c01,c11) into xmm0
2269 vmovupd(mem(rcx, rsi, 1), xmm2) // load (c21,c31) into xmm2
2270 vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
2271 Z_ALPHA(0, 2) // scale ymm0 by beta
2272 vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0
2273 vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
2274 vmovupd(xmm0, mem(rcx)) // store (c01,c11)
2275 vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31)
2276 add(rdi, rcx) // c += cs_c;
2277
2278 // update c41:c71
2279
2280 vmovupd(mem(rdx), xmm0) // load (c41,c51) into xmm0
2281 vmovupd(mem(rdx, rsi, 1), xmm2) // load (c61,c71) into xmm2
2282 vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
2283 Z_ALPHA(0, 2) // scale ymm0 by beta
2284 vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0
2285 vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
2286 vmovupd(xmm0, mem(rdx)) // store (c41,c51)
2287 vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71)
2288 add(rdi, rdx) // c += cs_c;
2289
2290 // update c02:c32
2291
2292 vmovupd(mem(rcx), xmm0) // load (c02,c12) into xmm0
2293 vmovupd(mem(rcx, rsi, 1), xmm2) // load (c22,c32) into xmm2
2294 vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
2295 Z_ALPHA(0, 2) // scale ymm0 by beta
2296 vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0
2297 vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
2298 vmovupd(xmm0, mem(rcx)) // store (c02,c12)
2299 vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32)
2300 add(rdi, rcx) // c += cs_c;
2301
2302 // update c42:c72
2303
2304 vmovupd(mem(rdx), xmm0) // load (c42,c52) into xmm0
2305 vmovupd(mem(rdx, rsi, 1), xmm2) // load (c62,c72) into xmm2
2306 vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
2307 Z_ALPHA(0, 2) // scale ymm0 by beta
2308 vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0
2309 vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
2310 vmovupd(xmm0, mem(rdx)) // store (c42,c52)
2311 vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72)
2312 add(rdi, rdx) // c += cs_c;
2313
2314 // update c03:c33
2315
2316 vmovupd(mem(rcx), xmm0) // load (c03,c13) into xmm0
2317 vmovupd(mem(rcx, rsi, 1), xmm2) // load (c23,c33) into xmm2
2318 vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
2319 Z_ALPHA(0, 2) // scale ymm0 by beta
2320 vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0
2321 vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
2322 vmovupd(xmm0, mem(rcx)) // store (c03,c13)
2323 vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33)
2324 add(rdi, rcx) // c += cs_c;
2325
2326 // update c43:c73
2327
2328 vmovupd(mem(rdx), xmm0) // load (c43,c53) into xmm0
2329 vmovupd(mem(rdx, rsi, 1), xmm2) // load (c63,c73) into xmm2
2330 vinsertf128(imm(1), xmm2, ymm0, ymm0) // ymm0 := (ymm0[0:1],xmm2)
2331 Z_ALPHA(0, 2) // scale ymm0 by beta
2332 vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0
2333 vextractf128(imm(1), ymm0, xmm2) // xmm2 := ymm0[2:3]
2334 vmovupd(xmm0, mem(rdx)) // store (c43,c53)
2335 vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73)
2336
2337
2338
2339 jmp(.ZDONE) // jump to end.
2340
2341
2342
2343 label(.ZCOLSTORED)
2344 // update c00:c30
2345
2346 vmovapd(mem(rcx), ymm0) // load c00:c30 into ymm0
2347 Z_ALPHA(0, 2) // scale ymm0 by beta
2348 vaddpd(ymm15, ymm0, ymm0) // add the gemm result to ymm0
2349 vmovapd(ymm0, mem(rcx)) // store c00:c30
2350 add(rdi, rcx) // c += cs_c;
2351
2352 // update c40:c70
2353
2354 vmovapd(mem(rdx), ymm0) // load c40:c70 into ymm0
2355 Z_ALPHA(0, 2) // scale ymm0 by beta
2356 vaddpd(ymm14, ymm0, ymm0) // add the gemm result to ymm0
2357 vmovapd(ymm0, mem(rdx)) // store c40:c70
2358 add(rdi, rdx) // c += cs_c;
2359
2360 // update c01:c31
2361
2362 vmovapd(mem(rcx), ymm0) // load c01:c31 into ymm0
2363 Z_ALPHA(0, 2) // scale ymm0 by beta
2364 vaddpd(ymm13, ymm0, ymm0) // add the gemm result to ymm0
2365 vmovapd(ymm0, mem(rcx)) // store c01:c31
2366 add(rdi, rcx) // c += cs_c;
2367
2368 // update c41:c71
2369
2370 vmovapd(mem(rdx), ymm0) // load c41:c71 into ymm0
2371 Z_ALPHA(0, 2) // scale ymm0 by beta
2372 vaddpd(ymm12, ymm0, ymm0) // add the gemm result to ymm0
2373 vmovapd(ymm0, mem(rdx)) // store c41:c71
2374 add(rdi, rdx) // c += cs_c;
2375
2376 // update c02:c32
2377
2378 vmovapd(mem(rcx), ymm0) // load c02:c32 into ymm0
2379 Z_ALPHA(0, 2) // scale ymm0 by beta
2380 vaddpd(ymm11, ymm0, ymm0) // add the gemm result to ymm0
2381 vmovapd(ymm0, mem(rcx)) // store c02:c32
2382 add(rdi, rcx) // c += cs_c;
2383
2384 // update c42:c72
2385
2386 vmovapd(mem(rdx), ymm0) // load c42:c72 into ymm0
2387 Z_ALPHA(0, 2) // scale ymm0 by beta
2388 vaddpd(ymm10, ymm0, ymm0) // add the gemm result to ymm0
2389 vmovapd(ymm0, mem(rdx)) // store c42:c72
2390 add(rdi, rdx) // c += cs_c;
2391
2392 // update c03:c33
2393
2394 vmovapd(mem(rcx), ymm0) // load c03:c33 into ymm0
2395 Z_ALPHA(0, 2) // scale ymm0 by beta
2396 vaddpd(ymm9, ymm0, ymm0) // add the gemm result to ymm0
2397 vmovapd(ymm0, mem(rcx)) // store c03:c33
2398 add(rdi, rcx) // c += cs_c;
2399
2400 // update c43:c73
2401
2402 vmovapd(mem(rdx), ymm0) // load c43:c73 into ymm0
2403 Z_ALPHA(0, 2) // scale ymm0 by beta
2404 vaddpd(ymm8, ymm0, ymm0) // add the gemm result to ymm0
2405 vmovapd(ymm0, mem(rdx)) // store c43:c73
2406
2407
2408
2409 jmp(.ZDONE) // jump to end.
2410
2411
2412
2413 	label(.ZBETAZERO)
2414 	// check if aligned/column-stored
2416 	and(bl, bh) // set ZF if bl & bh == 0.
2417 	and(bh, al) // set ZF if bh & al == 0.
2418 	jne(.ZCOLSTORBZ) // jump to column storage case
2419
2420
2421
2422 label(.ZGENSTORBZ)
2423 // update c00:c30
2424
2425 vextractf128(imm(1), ymm15, xmm2)
2426 vmovupd(xmm15, mem(rcx)) // store (c00,c10)
2427 vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c20,c30)
2428 add(rdi, rcx) // c += cs_c;
2429
2430 // update c40:c70
2431
2432 vextractf128(imm(1), ymm14, xmm2)
2433 vmovupd(xmm14, mem(rdx)) // store (c40,c50)
2434 vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c60,c70)
2435 add(rdi, rdx) // c += cs_c;
2436
2437 // update c01:c31
2438
2439 vextractf128(imm(1), ymm13, xmm2)
2440 vmovupd(xmm13, mem(rcx)) // store (c01,c11)
2441 vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c21,c31)
2442 add(rdi, rcx) // c += cs_c;
2443
2444 // update c41:c71
2445
2446 vextractf128(imm(1), ymm12, xmm2)
2447 vmovupd(xmm12, mem(rdx)) // store (c41,c51)
2448 vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c61,c71)
2449 add(rdi, rdx) // c += cs_c;
2450
2451 // update c02:c32
2452
2453 vextractf128(imm(1), ymm11, xmm2)
2454 vmovupd(xmm11, mem(rcx)) // store (c02,c12)
2455 vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c22,c32)
2456 add(rdi, rcx) // c += cs_c;
2457
2458 // update c42:c72
2459
2460 vextractf128(imm(1), ymm10, xmm2)
2461 vmovupd(xmm10, mem(rdx)) // store (c42,c52)
2462 vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c62,c72)
2463 add(rdi, rdx) // c += cs_c;
2464
2465 // update c03:c33
2466
2467 vextractf128(imm(1), ymm9, xmm2)
2468 vmovupd(xmm9, mem(rcx)) // store (c03,c13)
2469 vmovupd(xmm2, mem(rcx, rsi, 1)) // store (c23,c33)
2470 add(rdi, rcx) // c += cs_c;
2471
2472 // update c43:c73
2473
2474 vextractf128(imm(1), ymm8, xmm2)
2475 vmovupd(xmm8, mem(rdx)) // store (c43,c53)
2476 vmovupd(xmm2, mem(rdx, rsi, 1)) // store (c63,c73)
2477
2478
2479 jmp(.ZDONE) // jump to end.
2480
2481
2482 label(.ZCOLSTORBZ)
2483
2484
2485 vmovapd(ymm15, mem(rcx)) // store c00:c30
2486 add(rdi, rcx) // c += cs_c;
2487
2488 vmovapd(ymm14, mem(rdx)) // store c40:c70
2489 add(rdi, rdx) // c += cs_c;
2490
2491 vmovapd(ymm13, mem(rcx)) // store c01:c31
2492 add(rdi, rcx) // c += cs_c;
2493
2494 vmovapd(ymm12, mem(rdx)) // store c41:c71
2495 add(rdi, rdx) // c += cs_c;
2496
2497 vmovapd(ymm11, mem(rcx)) // store c02:c32
2498 add(rdi, rcx) // c += cs_c;
2499
2500 vmovapd(ymm10, mem(rdx)) // store c42:c72
2501 add(rdi, rdx) // c += cs_c;
2502
2503 vmovapd(ymm9, mem(rcx)) // store c03:c33
2504 add(rdi, rcx) // c += cs_c;
2505
2506 vmovapd(ymm8, mem(rdx)) // store c43:c73
2507
2508
2509 label(.ZDONE)
2510
2511
2512 end_asm(
2513 : // output operands (none)
2514 : // input operands
2515 [k_iter] "m" (k_iter), // 0
2516 [k_left] "m" (k_left), // 1
2517 [a] "m" (a), // 2
2518 [b] "m" (b), // 3
2519 [alpha] "m" (alpha), // 4
2520 [beta] "m" (beta), // 5
2521 [c] "m" (c), // 6
2522 [rs_c] "m" (rs_c), // 7
2523 [cs_c] "m" (cs_c)/*, // 8
2524 [b_next] "m" (b_next), // 9
2525 [a_next] "m" (a_next)*/ // 10
2526 : // register clobber list
2527 "rax", "rbx", "rcx", "rdx", "rsi", "rdi",
2528 "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
2529 "ymm0", "ymm1", "ymm2", "ymm3",
2530 "ymm4", "ymm5", "ymm6", "ymm7",
2531 "ymm8", "ymm9", "ymm10", "ymm11",
2532 "ymm12", "ymm13", "ymm14", "ymm15",
2533 "memory"
2534 )
2535 }
2536
2537