/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name of The University of Texas at Austin nor the names
      of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include <assert.h>

#include "../knl/bli_avx512_macros.h"
#include "common.h"

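// Note: the assembly macros (MEM, ZMM, PREFETCH, VFMADD231PS, ...) come from
// bli_avx512_macros.h. The tuning parameters A_L1_PREFETCH_DIST,
// B_L1_PREFETCH_DIST, and TAIL_NITER, as well as the optional
// PREFETCH_{A,B}_{BEFORE,AFTER} and PREFETCH_C_L2 toggles, are not defined in
// this file and are presumed to be provided by common.h.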
#define CACHELINE_SIZE 64 // size of a cache line in bytes

/* During each subiteration, prefetch the two cache lines of B that will be
 * needed B_L1_PREFETCH_DIST unrolled iterations ahead. Two cache lines =
 * 32 floats (NR).
 */
#define PREFETCH_B_L1(n, k) \
    PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST*32*4 + (2*n+k) * CACHELINE_SIZE))

#define LOOP_ALIGN ALIGN16

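// UPDATE_C(R1,R2,R3,R4): update two rows of the 12x32 C microtile held in
// accumulators R1..R4 (R1/R2 = columns 0-15/16-31 of the first row, R3/R4 =
// the same for the next row): scale by alpha (broadcast in ZMM0), add beta
// (ZMM1) times the existing C values, store, and advance RCX by two rows
// (RAX = rs_c in bytes). UPDATE_C_BZ is the beta == 0 variant, which skips
// reading C.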
#define UPDATE_C(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \
    VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \
    VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \
    VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \
    VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,2))

#define UPDATE_C_BZ(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,2))

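// General-stride ("scattered") C update, 16 elements of one row at a time.
// ZMM2/ZMM3 hold the byte offsets of columns 0-7 and 8-15 (offsets[] scaled
// by cs_c*4); each 16-float accumulator half is gathered/scattered through
// two YMM halves using qword indices and mask registers k1-k4.
// UPDATE_C_RS_ONE(R,C) computes C := alpha*R + beta*C; UPDATE_C_BZ_RS_ONE is
// the beta == 0 variant that only scales and scatters.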
#define UPDATE_C_BZ_RS_ONE(R,C) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPS(ZMM(R), ZMM(R), ZMM(0)) \
    VMOVAPS(ZMM(4), ZMM(R)) \
    VEXTRACTF64X4(YMM(5), ZMM(R), IMM(1)) \
    VSCATTERQPS(MEM(C,ZMM(2),1) MASK_K(1), YMM(4)) \
    VSCATTERQPS(MEM(C,ZMM(3),1) MASK_K(2), YMM(5))

#define UPDATE_C_RS_ONE(R,C) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPS(ZMM(R), ZMM(R), ZMM(0)) \
    VMOVAPS(ZMM(4), ZMM(R)) \
    VEXTRACTF64X4(YMM(5), ZMM(R), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(C,ZMM(2),1)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(C,ZMM(3),1)) \
    VFMADD231PS(YMM(4), YMM(6), YMM(1)) \
    VFMADD231PS(YMM(5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(C,ZMM(2),1) MASK_K(3), YMM(4)) \
    VSCATTERQPS(MEM(C,ZMM(3),1) MASK_K(4), YMM(5))

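// Scattered update of two full rows of the microtile: R1/R3 hold columns
// 0-15 of two consecutive rows (base pointer RCX) and R2/R4 hold columns
// 16-31 (base pointer RDX = c + 16*cs_c); RAX = rs_c in bytes.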
#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \
\
    UPDATE_C_RS_ONE(R1,RCX) \
    UPDATE_C_RS_ONE(R2,RDX) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1)) \
\
    UPDATE_C_RS_ONE(R3,RCX) \
    UPDATE_C_RS_ONE(R4,RDX) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1))

#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \
\
    UPDATE_C_BZ_RS_ONE(R1,RCX) \
    UPDATE_C_BZ_RS_ONE(R2,RDX) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1)) \
\
    UPDATE_C_BZ_RS_ONE(R3,RCX) \
    UPDATE_C_BZ_RS_ONE(R4,RDX) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1))

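// When the PREFETCH_C_L2 toggle is defined, prefetch all 24 cache lines of
// the 12x32 C microtile into the L2 cache before entering the k loop.
// R12, R13, R14, R15 hold rs_c, 3*rs_c, 5*rs_c, 7*rs_c and RDX = c + 8*rs_c,
// so rows 0-7 are reached through RCX and rows 8-11 through RDX.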
#ifdef PREFETCH_C_L2
#undef PREFETCH_C_L2
#define PREFETCH_C_L2 \
\
    PREFETCH(1, MEM(RCX,      0*64)) \
    PREFETCH(1, MEM(RCX,      1*64)) \
    \
    PREFETCH(1, MEM(RCX,R12,1,0*64)) \
    PREFETCH(1, MEM(RCX,R12,1,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R12,2,0*64)) \
    PREFETCH(1, MEM(RCX,R12,2,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R13,1,0*64)) \
    PREFETCH(1, MEM(RCX,R13,1,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R12,4,0*64)) \
    PREFETCH(1, MEM(RCX,R12,4,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R14,1,0*64)) \
    PREFETCH(1, MEM(RCX,R14,1,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R13,2,0*64)) \
    PREFETCH(1, MEM(RCX,R13,2,1*64)) \
    \
    PREFETCH(1, MEM(RCX,R15,1,0*64)) \
    PREFETCH(1, MEM(RCX,R15,1,1*64)) \
    \
    PREFETCH(1, MEM(RDX,      0*64)) \
    PREFETCH(1, MEM(RDX,      1*64)) \
    \
    PREFETCH(1, MEM(RDX,R12,1,0*64)) \
    PREFETCH(1, MEM(RDX,R12,1,1*64)) \
    \
    PREFETCH(1, MEM(RDX,R12,2,0*64)) \
    PREFETCH(1, MEM(RDX,R12,2,1*64)) \
    \
    PREFETCH(1, MEM(RDX,R13,1,0*64)) \
    PREFETCH(1, MEM(RDX,R13,1,1*64))

#else
#undef PREFETCH_C_L2
#define PREFETCH_C_L2
#endif


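// Prefetch the same 24 cache lines of C into L1 with write intent
// (PREFETCHW0), issued just before the last TAIL_NITER unrolled iterations
// of the k loop so that C is resident when the accumulators are written out.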
#define PREFETCH_C_L1 \
\
    PREFETCHW0(MEM(RCX,      0*64)) \
    PREFETCHW0(MEM(RCX,      1*64)) \
    PREFETCHW0(MEM(RCX,R12,1,0*64)) \
    PREFETCHW0(MEM(RCX,R12,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,2,0*64)) \
    PREFETCHW0(MEM(RCX,R12,2,1*64)) \
    PREFETCHW0(MEM(RCX,R13,1,0*64)) \
    PREFETCHW0(MEM(RCX,R13,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,4,0*64)) \
    PREFETCHW0(MEM(RCX,R12,4,1*64)) \
    PREFETCHW0(MEM(RCX,R14,1,0*64)) \
    PREFETCHW0(MEM(RCX,R14,1,1*64)) \
    PREFETCHW0(MEM(RCX,R13,2,0*64)) \
    PREFETCHW0(MEM(RCX,R13,2,1*64)) \
    PREFETCHW0(MEM(RCX,R15,1,0*64)) \
    PREFETCHW0(MEM(RCX,R15,1,1*64)) \
    PREFETCHW0(MEM(RDX,      0*64)) \
    PREFETCHW0(MEM(RDX,      1*64)) \
    PREFETCHW0(MEM(RDX,R12,1,0*64)) \
    PREFETCHW0(MEM(RDX,R12,1,1*64)) \
    PREFETCHW0(MEM(RDX,R12,2,0*64)) \
    PREFETCHW0(MEM(RDX,R12,2,1*64)) \
    PREFETCHW0(MEM(RDX,R13,1,0*64)) \
    PREFETCHW0(MEM(RDX,R13,1,1*64))

//
// SUBITER(n): one rank-1 update of the 12x32 microtile for unrolled
// iteration n. The twelve A elements of the current iteration are broadcast
// two at a time into ZMM3/ZMM4 and multiplied against the current 32 floats
// of B held in ZMM0/ZMM1, accumulating into ZMM8-ZMM31. The next 32 floats
// of B are then loaded into ZMM0/ZMM1 for the following subiteration.
//
// n: index within the unrolled loop, used to offset into A and B.
//
#define SUBITER(n) \
\
    PREFETCH_B_L1(n, 0) \
    \
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 0)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 1)*4)) \
    VFMADD231PS(ZMM( 8), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM( 9), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(10), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(11), ZMM(1), ZMM(4)) \
    \
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 2)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 3)*4)) \
    VFMADD231PS(ZMM(12), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(13), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(14), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(15), ZMM(1), ZMM(4)) \
    \
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 4)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 5)*4)) \
    VFMADD231PS(ZMM(16), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(17), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(18), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(19), ZMM(1), ZMM(4)) \
    \
    PREFETCH_B_L1(n, 1) \
    \
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 6)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 7)*4)) \
    VFMADD231PS(ZMM(20), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(21), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(22), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(23), ZMM(1), ZMM(4)) \
    \
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 8)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 9)*4)) \
    VFMADD231PS(ZMM(24), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(25), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(26), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(27), ZMM(1), ZMM(4)) \
    \
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+10)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+11)*4)) \
    VFMADD231PS(ZMM(28), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(29), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(30), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(31), ZMM(1), ZMM(4)) \
    \
    VMOVAPS(ZMM(0), MEM(RBX,(32*n+ 0)*4)) \
    VMOVAPS(ZMM(1), MEM(RBX,(32*n+16)*4))
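
// Accumulator register map: row i (i = 0..11) of the 12x32 microtile lives
// in ZMM(8+2*i) (columns 0-15) and ZMM(9+2*i) (columns 16-31). ZMM0/ZMM1
// hold the current 32 floats of B, and ZMM3/ZMM4 hold broadcast A elements.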

// Offsets 0..15 used to build the index vectors for the scatter/gather
// instructions (scaled by cs_c at run time in the SCATTEREDUPDATE path).
static int64_t offsets[16] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};

void bli_sgemm_opt_12x32_l2(
                             dim_t            k_,
                             float* restrict alpha,
                             float* restrict a,
                             float* restrict b,
                             float* restrict beta,
                             float* restrict c, inc_t rs_c_, inc_t cs_c_,
                             auxinfo_t*       data,
                             cntx_t* restrict cntx
                           )
{
    (void)data;
    (void)cntx;

    const int64_t* offsetPtr = &offsets[0];
    const int64_t k = k_;
    const int64_t rs_c = rs_c_;
    const int64_t cs_c = cs_c_;

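    // Kernel outline:
    //   1. Zero the 24 accumulator registers while loading a, b, c and the
    //      strides, and pre-load the first 32 floats of B into ZMM0/ZMM1.
    //   2. Run the k loop four subiterations at a time: MAIN_LOOP covers
    //      k/4 - TAIL_NITER unrolled iterations, SMALL_LOOP the remaining
    //      TAIL_NITER (after prefetching C into L1), and TAIL_LOOP the
    //      k % 4 leftover subiterations.
    //   3. Scale by alpha and beta and write the 12x32 microtile back to C,
    //      using contiguous stores when C is row-stored (cs_c == 1) and
    //      gather/scatter otherwise, with separate beta == 0 paths.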
    __asm__ volatile
    (

    VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
    VMOVAPD(YMM( 7), YMM(8)) //ymm7 stays zero; used below to test beta == 0
    VMOVAPD(YMM( 9), YMM(8))
    VMOVAPD(YMM(10), YMM(8))   MOV(RSI, VAR(k)) //loop index
    VMOVAPD(YMM(11), YMM(8))   MOV(RAX, VAR(a)) //load address of a
    VMOVAPD(YMM(12), YMM(8))   MOV(RBX, VAR(b)) //load address of b
    VMOVAPD(YMM(13), YMM(8))   MOV(RCX, VAR(c)) //load address of c
    VMOVAPD(YMM(14), YMM(8))
    VMOVAPD(YMM(15), YMM(8))   VMOVAPS(ZMM(0), MEM(RBX,  0*4)) //pre-load b
    VMOVAPD(YMM(16), YMM(8))   VMOVAPS(ZMM(1), MEM(RBX, 16*4)) //pre-load b
    VMOVAPD(YMM(17), YMM(8))
    VMOVAPD(YMM(18), YMM(8))
    VMOVAPD(YMM(19), YMM(8))   MOV(R12, VAR(rs_c))      //rs_c
    VMOVAPD(YMM(20), YMM(8))   LEA(R13, MEM(R12,R12,2)) //*3
    VMOVAPD(YMM(21), YMM(8))   LEA(R14, MEM(R12,R12,4)) //*5
    VMOVAPD(YMM(22), YMM(8))   LEA(R15, MEM(R14,R12,2)) //*7
    VMOVAPD(YMM(23), YMM(8))   LEA(RDX, MEM(RCX,R12,8)) //c + 8*rs_c
    VMOVAPD(YMM(24), YMM(8))
    VMOVAPD(YMM(25), YMM(8))   MOV(R8, IMM(12*4)) //mr*sizeof(float)
    VMOVAPD(YMM(26), YMM(8))   MOV(R9, IMM(32*4)) //nr*sizeof(float)
    VMOVAPD(YMM(27), YMM(8))
    VMOVAPD(YMM(28), YMM(8))   LEA(RBX, MEM(RBX,R9,1)) //adjust b for pre-load
    VMOVAPD(YMM(29), YMM(8))
    VMOVAPD(YMM(30), YMM(8))
    VMOVAPD(YMM(31), YMM(8))

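    // At this point ZMM8-ZMM31 are zero, ZMM0/ZMM1 hold the first 32 floats
    // of B, and RSI holds k; skip straight to POSTACCUM if k == 0.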
    TEST(RSI, RSI)
    JZ(POSTACCUM)

#ifdef PREFETCH_A_BEFORE
    PREFETCH(0, MEM(RAX,0*64))
    PREFETCH(0, MEM(RAX,1*64))
    PREFETCH(0, MEM(RAX,2*64))
#endif

#ifdef PREFETCH_B_BEFORE
    PREFETCH(0, MEM(RBX,0*64))
    PREFETCH(0, MEM(RBX,1*64))
    PREFETCH(0, MEM(RBX,2*64))
    PREFETCH(0, MEM(RBX,3*64))
    PREFETCH(0, MEM(RBX,4*64))
    PREFETCH(0, MEM(RBX,5*64))
    PREFETCH(0, MEM(RBX,6*64))
    PREFETCH(0, MEM(RBX,7*64))
#endif

    PREFETCH_C_L2

    MOV(RDI, RSI)
    AND(RSI, IMM(3))        //RSI = k % 4 (TAIL_LOOP count)
    SAR(RDI, IMM(2))        //RDI = k / 4 (unrolled iteration count)

    SUB(RDI, IMM(0+TAIL_NITER)) //reserve TAIL_NITER unrolled iterations
    JLE(K_SMALL)                //for SMALL_LOOP (after the C L1 prefetch)

    LOOP_ALIGN
    LABEL(MAIN_LOOP)

        PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4))
        SUBITER(0)
        PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4+64))
        SUBITER(1)
        PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4+128))
        SUBITER(2)
        SUBITER(3)

        LEA(RAX, MEM(RAX,R8,4))
        LEA(RBX, MEM(RBX,R9,4))

        DEC(RDI)

    JNZ(MAIN_LOOP)

    LABEL(K_SMALL)
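    // The last TAIL_NITER unrolled iterations (plus the k % 4 remainder) run
    // below, after prefetching the C microtile into L1.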

    PREFETCH_C_L1

    ADD(RDI, IMM(0+TAIL_NITER))
    JZ(TAIL_LOOP)

    LOOP_ALIGN
    LABEL(SMALL_LOOP)

        PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4))
        SUBITER(0)
        PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4+64))
        SUBITER(1)
        PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4+128))
        SUBITER(2)
        SUBITER(3)

        LEA(RAX, MEM(RAX,R8,4))
        LEA(RBX, MEM(RBX,R9,4))

        DEC(RDI)

    JNZ(SMALL_LOOP)

    TEST(RSI, RSI)
    JZ(POSTACCUM)

    LOOP_ALIGN
    LABEL(TAIL_LOOP)

        PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4))
        SUBITER(0)

        ADD(RAX, R8)
        ADD(RBX, R9)

        DEC(RSI)

    JNZ(TAIL_LOOP)

    LABEL(POSTACCUM)
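
    // The k loop is done: ZMM8-ZMM31 hold the 12x32 product A*B. Scale by
    // alpha and beta and write the result back to C below.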

#ifdef PREFETCH_A_AFTER
    MOV(R8, VAR(a))
    PREFETCH(0, MEM(R8,0*64))
    PREFETCH(0, MEM(R8,1*64))
    PREFETCH(0, MEM(R8,2*64))
#endif

#ifdef PREFETCH_B_AFTER
    MOV(R9, VAR(b))
    PREFETCH(0, MEM(R9,0*64))
    PREFETCH(0, MEM(R9,1*64))
    PREFETCH(0, MEM(R9,2*64))
    PREFETCH(0, MEM(R9,3*64))
    PREFETCH(0, MEM(R9,4*64))
    PREFETCH(0, MEM(R9,5*64))
    PREFETCH(0, MEM(R9,6*64))
    PREFETCH(0, MEM(R9,7*64))
#endif

    MOV(RAX, VAR(alpha))
    MOV(RBX, VAR(beta))
    VBROADCASTSS(ZMM(0), MEM(RAX)) //broadcast alpha
    VBROADCASTSS(ZMM(1), MEM(RBX)) //broadcast beta

    MOV(RAX, VAR(rs_c))
    LEA(RAX, MEM(,RAX,4)) //rs_c *= sizeof(float)
    MOV(RBX, VAR(cs_c))
    LEA(RBX, MEM(,RBX,4)) //cs_c *= sizeof(float)

    // Check whether C is row-stored (cs_c == 1). If not, jump to the slow
    // scattered update.
    CMP(RBX, IMM(4))
    JNE(SCATTEREDUPDATE)

        VCOMISS(XMM(1), XMM(7)) //compare beta against zero
        JE(COLSTORBZ)
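
            // beta != 0: general row-stored update, two rows per UPDATE_C.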

            UPDATE_C( 8, 9,10,11)
            UPDATE_C(12,13,14,15)
            UPDATE_C(16,17,18,19)
            UPDATE_C(20,21,22,23)
            UPDATE_C(24,25,26,27)
            UPDATE_C(28,29,30,31)

        JMP(END)
        LABEL(COLSTORBZ)

            UPDATE_C_BZ( 8, 9,10,11)
            UPDATE_C_BZ(12,13,14,15)
            UPDATE_C_BZ(16,17,18,19)
            UPDATE_C_BZ(20,21,22,23)
            UPDATE_C_BZ(24,25,26,27)
            UPDATE_C_BZ(28,29,30,31)

    JMP(END)
    LABEL(SCATTEREDUPDATE)

        LEA(RDX, MEM(RCX,RBX,8))
        LEA(RDX, MEM(RDX,RBX,8)) //RDX = c + 16*cs_c (columns 16-31)

        MOV(RDI, VAR(offsetPtr))
        VMOVDQA64(ZMM(2), MEM(RDI,0*64)) //offsets 0..7
        VMOVDQA64(ZMM(3), MEM(RDI,1*64)) //offsets 8..15
        VPBROADCASTQ(ZMM(6), RBX)
        VPMULLQ(ZMM(2), ZMM(6), ZMM(2)) //scale offsets by cs_c*sizeof(float)
        VPMULLQ(ZMM(3), ZMM(6), ZMM(3))

        VCOMISS(XMM(1), XMM(7)) //compare beta against zero
        JE(SCATTERBZ)

            UPDATE_C_ROW_SCATTERED( 8, 9,10,11)
            UPDATE_C_ROW_SCATTERED(12,13,14,15)
            UPDATE_C_ROW_SCATTERED(16,17,18,19)
            UPDATE_C_ROW_SCATTERED(20,21,22,23)
            UPDATE_C_ROW_SCATTERED(24,25,26,27)
            UPDATE_C_ROW_SCATTERED(28,29,30,31)

        JMP(END)
        LABEL(SCATTERBZ)

            UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11)
            UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15)
            UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19)
            UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23)
            UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27)
            UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31)

    LABEL(END)

    VZEROUPPER()

    : // output operands
    : // input operands
      [k]         "m" (k),
      [a]         "m" (a),
      [b]         "m" (b),
      [alpha]     "m" (alpha),
      [beta]      "m" (beta),
      [c]         "m" (c),
      [rs_c]      "m" (rs_c),
      [cs_c]      "m" (cs_c),
      [offsetPtr] "m" (offsetPtr)
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
      "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
      "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
      "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
      "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
      "zmm30", "zmm31", "memory"
    );
}