1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
25 OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
29 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "blis.h"
36 #include <assert.h>
37
38 #include "../knl/bli_avx512_macros.h"
39 #include "common.h"
40
#define CACHELINE_SIZE 64 //size of cache line in bytes

/* During each subiteration, prefetch 2 cache lines of B
 * UNROLL factor ahead. 2 cache lines = 32 floats (NR).
 * n is the subiteration index (0-3), k selects the first or
 * second cache line of that row of B. */
#define PREFETCH_B_L1(n, k) \
    PREFETCH(0, MEM(RBX, B_L1_PREFETCH_DIST*32*4 + (2*n+k) * CACHELINE_SIZE))

#define LOOP_ALIGN ALIGN16
50
/* Scale four accumulators by alpha (zmm0), accumulate beta (zmm1) times
 * the corresponding two rows of C, store back, then advance RCX by two
 * rows (RAX = rs_c in bytes).  Each row of C spans two zmm vectors
 * (32 floats): R1/R2 hold one row, R3/R4 the next. */
#define UPDATE_C(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX,0*64)) \
    VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,1*64)) \
    VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,1,0*64)) \
    VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RAX,1,1*64)) \
    VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,2))
66
/* Beta == 0 variant of UPDATE_C: scale four accumulators by alpha
 * (zmm0) and store two rows of C without reading the old contents,
 * then advance RCX by two rows (RAX = rs_c in bytes). */
#define UPDATE_C_BZ(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPS(MEM(RCX,0*64), ZMM(R1)) \
    VMOVUPS(MEM(RCX,1*64), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,1,0*64), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RAX,1,1*64), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,2))
78
/* Beta == 0 scattered store of one 16-float vector R to C at base
 * register C, using byte offsets precomputed in zmm2/zmm3 (column
 * index * cs_c * 4).  The KXNORW instructions set k1/k2 to all-ones
 * as the scatter write masks; the scatters clear them as they go. */
#define UPDATE_C_BZ_RS_ONE(R,C) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPS(ZMM(R), ZMM(R), ZMM(0)) \
    VMOVAPS(ZMM(4), ZMM(R)) \
    VEXTRACTF64X4(YMM(5), ZMM(R), IMM(1)) \
    VSCATTERQPS(MEM(C,ZMM(2),1) MASK_K(1), YMM(4)) \
    VSCATTERQPS(MEM(C,ZMM(3),1) MASK_K(2), YMM(5))
88
/* General scattered update of one 16-float vector R against C at base
 * register C:  C[j] = alpha*R[j] + beta*C[j], with byte offsets in
 * zmm2/zmm3.  k1/k2 mask the gathers, k3/k4 the scatters (gather and
 * scatter both consume/clear their mask, so separate masks are set up
 * with KXNORW, which produces all-ones from K0). */
#define UPDATE_C_RS_ONE(R,C) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    KXNORW(K(3), K(0), K(0)) \
    KXNORW(K(4), K(0), K(0)) \
    VMULPS(ZMM(R), ZMM(R), ZMM(0)) \
    VMOVAPS(ZMM(4), ZMM(R)) \
    VEXTRACTF64X4(YMM(5), ZMM(R), IMM(1)) \
    VGATHERQPS(YMM(6) MASK_K(1), MEM(C,ZMM(2),1)) \
    VGATHERQPS(YMM(7) MASK_K(2), MEM(C,ZMM(3),1)) \
    VFMADD231PS(YMM(4), YMM(6), YMM(1)) \
    VFMADD231PS(YMM(5), YMM(7), YMM(1)) \
    VSCATTERQPS(MEM(C,ZMM(2),1) MASK_K(3), YMM(4)) \
    VSCATTERQPS(MEM(C,ZMM(3),1) MASK_K(4), YMM(5))
104
/* Scattered update of two rows of C.  RCX points at columns 0-15 and
 * RDX at columns 16-31 of the current row; R1/R2 are the two vector
 * halves of one row, R3/R4 of the next.  RAX = rs_c in bytes. */
#define UPDATE_C_ROW_SCATTERED(R1,R2,R3,R4) \
\
    UPDATE_C_RS_ONE(R1,RCX) \
    UPDATE_C_RS_ONE(R2,RDX) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1)) \
\
    UPDATE_C_RS_ONE(R3,RCX) \
    UPDATE_C_RS_ONE(R4,RDX) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1))
118
/* Beta == 0 variant of UPDATE_C_ROW_SCATTERED: store two rows of C
 * via scatters without reading the old contents. */
#define UPDATE_C_BZ_ROW_SCATTERED(R1,R2,R3,R4) \
\
    UPDATE_C_BZ_RS_ONE(R1,RCX) \
    UPDATE_C_BZ_RS_ONE(R2,RDX) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1)) \
\
    UPDATE_C_BZ_RS_ONE(R3,RCX) \
    UPDATE_C_BZ_RS_ONE(R4,RDX) \
\
    LEA(RCX, MEM(RCX,RAX,1)) \
    LEA(RDX, MEM(RDX,RAX,1))
132
/* If the build (see common.h) defines the PREFETCH_C_L2 flag, redefine
 * it as the instruction sequence that prefetches all 24 cache lines of
 * the 12x32 C microtile into L2; otherwise it expands to nothing.
 * RCX addresses rows 0-7 (via R12 = rs_c and its *2/*3/*4/*5/*7
 * multiples in R13-R15) and RDX rows 8-11.
 * NOTE(review): at the point of use R12 holds rs_c in *elements*, while
 * these address forms treat it as a byte stride -- the prefetch targets
 * are therefore approximate (harmless; prefetch only).  Confirm against
 * upstream. */
#ifdef PREFETCH_C_L2
#undef PREFETCH_C_L2
#define PREFETCH_C_L2 \
\
    PREFETCH(1, MEM(RCX, 0*64)) \
    PREFETCH(1, MEM(RCX, 1*64)) \
\
    PREFETCH(1, MEM(RCX,R12,1,0*64)) \
    PREFETCH(1, MEM(RCX,R12,1,1*64)) \
\
    PREFETCH(1, MEM(RCX,R12,2,0*64)) \
    PREFETCH(1, MEM(RCX,R12,2,1*64)) \
\
    PREFETCH(1, MEM(RCX,R13,1,0*64)) \
    PREFETCH(1, MEM(RCX,R13,1,1*64)) \
\
    PREFETCH(1, MEM(RCX,R12,4,0*64)) \
    PREFETCH(1, MEM(RCX,R12,4,1*64)) \
\
    PREFETCH(1, MEM(RCX,R14,1,0*64)) \
    PREFETCH(1, MEM(RCX,R14,1,1*64)) \
\
    PREFETCH(1, MEM(RCX,R13,2,0*64)) \
    PREFETCH(1, MEM(RCX,R13,2,1*64)) \
\
    PREFETCH(1, MEM(RCX,R15,1,0*64)) \
    PREFETCH(1, MEM(RCX,R15,1,1*64)) \
\
    PREFETCH(1, MEM(RDX, 0*64)) \
    PREFETCH(1, MEM(RDX, 1*64)) \
\
    PREFETCH(1, MEM(RDX,R12,1,0*64)) \
    PREFETCH(1, MEM(RDX,R12,1,1*64)) \
\
    PREFETCH(1, MEM(RDX,R12,2,0*64)) \
    PREFETCH(1, MEM(RDX,R12,2,1*64)) \
\
    PREFETCH(1, MEM(RDX,R13,1,0*64)) \
    PREFETCH(1, MEM(RDX,R13,1,1*64))

#else
#undef PREFETCH_C_L2
#define PREFETCH_C_L2
#endif
177
178
/* Prefetch all 24 cache lines of the 12x32 C microtile into L1 with
 * write intent, shortly before the accumulation phase finishes.  Same
 * addressing scheme (and the same rs_c elements-vs-bytes caveat) as
 * PREFETCH_C_L2 above. */
#define PREFETCH_C_L1 \
\
    PREFETCHW0(MEM(RCX, 0*64)) \
    PREFETCHW0(MEM(RCX, 1*64)) \
    PREFETCHW0(MEM(RCX,R12,1,0*64)) \
    PREFETCHW0(MEM(RCX,R12,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,2,0*64)) \
    PREFETCHW0(MEM(RCX,R12,2,1*64)) \
    PREFETCHW0(MEM(RCX,R13,1,0*64)) \
    PREFETCHW0(MEM(RCX,R13,1,1*64)) \
    PREFETCHW0(MEM(RCX,R12,4,0*64)) \
    PREFETCHW0(MEM(RCX,R12,4,1*64)) \
    PREFETCHW0(MEM(RCX,R14,1,0*64)) \
    PREFETCHW0(MEM(RCX,R14,1,1*64)) \
    PREFETCHW0(MEM(RCX,R13,2,0*64)) \
    PREFETCHW0(MEM(RCX,R13,2,1*64)) \
    PREFETCHW0(MEM(RCX,R15,1,0*64)) \
    PREFETCHW0(MEM(RCX,R15,1,1*64)) \
    PREFETCHW0(MEM(RDX, 0*64)) \
    PREFETCHW0(MEM(RDX, 1*64)) \
    PREFETCHW0(MEM(RDX,R12,1,0*64)) \
    PREFETCHW0(MEM(RDX,R12,1,1*64)) \
    PREFETCHW0(MEM(RDX,R12,2,0*64)) \
    PREFETCHW0(MEM(RDX,R12,2,1*64)) \
    PREFETCHW0(MEM(RDX,R13,1,0*64)) \
    PREFETCHW0(MEM(RDX,R13,1,1*64))
205
//
// One rank-1 update of the 12x32 microtile.
//
// n: index in the unrolled loop (0-3).
//
// On entry zmm0/zmm1 hold the current 32-float row of B (pre-loaded by
// the previous subiteration).  Twelve elements of the current column of
// A are broadcast in turn into zmm3/zmm4 and accumulated via FMA into
// the 24 accumulators zmm8-zmm31 (row i of C lives in zmm(8+2i) and
// zmm(9+2i)).  The next row of B is loaded at the end, and two B
// prefetches are interleaved with the FMAs.
//
#define SUBITER(n) \
\
    PREFETCH_B_L1(n, 0) \
\
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 0)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 1)*4)) \
    VFMADD231PS(ZMM( 8), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM( 9), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(10), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(11), ZMM(1), ZMM(4)) \
\
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 2)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 3)*4)) \
    VFMADD231PS(ZMM(12), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(13), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(14), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(15), ZMM(1), ZMM(4)) \
\
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 4)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 5)*4)) \
    VFMADD231PS(ZMM(16), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(17), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(18), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(19), ZMM(1), ZMM(4)) \
\
    PREFETCH_B_L1(n, 1) \
\
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 6)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 7)*4)) \
    VFMADD231PS(ZMM(20), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(21), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(22), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(23), ZMM(1), ZMM(4)) \
\
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+ 8)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+ 9)*4)) \
    VFMADD231PS(ZMM(24), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(25), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(26), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(27), ZMM(1), ZMM(4)) \
\
    VBROADCASTSS(ZMM(3), MEM(RAX,(12*n+10)*4)) \
    VBROADCASTSS(ZMM(4), MEM(RAX,(12*n+11)*4)) \
    VFMADD231PS(ZMM(28), ZMM(0), ZMM(3)) \
    VFMADD231PS(ZMM(29), ZMM(1), ZMM(3)) \
    VFMADD231PS(ZMM(30), ZMM(0), ZMM(4)) \
    VFMADD231PS(ZMM(31), ZMM(1), ZMM(4)) \
\
    VMOVAPS(ZMM(0), MEM(RBX,(32*n+ 0)*4)) \
    VMOVAPS(ZMM(1), MEM(RBX,(32*n+16)*4))
264
// Column indices 0..15, used by the scatter/gather update paths:
// loaded as two zmm vectors of qwords and multiplied by cs_c (in bytes)
// to form the byte offsets of one 16-column half-row of C.  Aligned to
// 64 bytes so VMOVDQA64 may load it.
static int64_t offsets[16] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15};
268
//
// 12x32 single-precision GEMM microkernel (AVX-512, Knights Landing),
// variant that prefetches the C microtile into L2.
//
// Computes  C := beta*C + alpha*A*B  for an MR x NR = 12x32 microtile,
// with A packed as 12 x k and B packed as k x 32.  rs_c_/cs_c_ are the
// row/column strides of C in elements.  A fast vectorized update is
// used when cs_c == 1; otherwise a gather/scatter update handles
// general strides.  data and cntx are unused.
//
void bli_sgemm_opt_12x32_l2
     (
       dim_t            k_,
       float* restrict  alpha,
       float* restrict  a,
       float* restrict  b,
       float* restrict  beta,
       float* restrict  c, inc_t rs_c_, inc_t cs_c_,
       auxinfo_t*       data,
       cntx_t* restrict cntx
     )
{
    (void)data;
    (void)cntx;

    // Copy into local int64_t variables so the "m" asm constraints have
    // a fixed 64-bit size regardless of how dim_t/inc_t are configured.
    const int64_t* offsetPtr = &offsets[0];
    const int64_t k = k_;
    const int64_t rs_c = rs_c_;
    const int64_t cs_c = cs_c_;

    __asm__ volatile
    (
        // Clear the 24 accumulators zmm8-zmm31 and zmm7 (zero operand
        // for the beta == 0 test below); independent setup instructions
        // are interleaved with the clears.
        VXORPD(YMM(8), YMM(8), YMM(8)) //clear out registers
        VMOVAPD(YMM( 7), YMM(8))
        VMOVAPD(YMM( 9), YMM(8))
        VMOVAPD(YMM(10), YMM(8)) MOV(RSI, VAR(k)) //loop index
        VMOVAPD(YMM(11), YMM(8)) MOV(RAX, VAR(a)) //load address of a
        VMOVAPD(YMM(12), YMM(8)) MOV(RBX, VAR(b)) //load address of b
        VMOVAPD(YMM(13), YMM(8)) MOV(RCX, VAR(c)) //load address of c
        VMOVAPD(YMM(14), YMM(8))
        VMOVAPD(YMM(15), YMM(8)) VMOVAPS(ZMM(0), MEM(RBX, 0*4)) //pre-load b
        VMOVAPD(YMM(16), YMM(8)) VMOVAPS(ZMM(1), MEM(RBX, 16*4)) //pre-load b
        VMOVAPD(YMM(17), YMM(8))
        VMOVAPD(YMM(18), YMM(8))
        VMOVAPD(YMM(19), YMM(8)) MOV(R12, VAR(rs_c)) //rs_c
        VMOVAPD(YMM(20), YMM(8)) LEA(R13, MEM(R12,R12,2)) //*3
        VMOVAPD(YMM(21), YMM(8)) LEA(R14, MEM(R12,R12,4)) //*5
        VMOVAPD(YMM(22), YMM(8)) LEA(R15, MEM(R14,R12,2)) //*7
        VMOVAPD(YMM(23), YMM(8)) LEA(RDX, MEM(RCX,R12,8)) //c + 8*rs_c
        VMOVAPD(YMM(24), YMM(8))
        VMOVAPD(YMM(25), YMM(8)) MOV(R8, IMM(12*4)) //mr*sizeof(float)
        VMOVAPD(YMM(26), YMM(8)) MOV(R9, IMM(32*4)) //nr*sizeof(float)
        VMOVAPD(YMM(27), YMM(8))
        VMOVAPD(YMM(28), YMM(8)) LEA(RBX, MEM(RBX,R9,1)) //adjust b for pre-load
        VMOVAPD(YMM(29), YMM(8))
        VMOVAPD(YMM(30), YMM(8))
        VMOVAPD(YMM(31), YMM(8))

        // NOTE(review): R12 holds rs_c in elements, yet the
        // PREFETCH_C_L1/L2 macros (and the RDX computation above) use
        // it as if it were a byte stride; the affected addresses are
        // prefetch-only, so results are unaffected, but confirm the
        // intent against upstream BLIS.

        TEST(RSI, RSI)
        JZ(POSTACCUM)       // k == 0: skip straight to the C update

#ifdef PREFETCH_A_BEFORE
        PREFETCH(0, MEM(RAX,0*64))
        PREFETCH(0, MEM(RAX,1*64))
        PREFETCH(0, MEM(RAX,2*64))
#endif

#ifdef PREFETCH_B_BEFORE
        PREFETCH(0, MEM(RBX,0*64))
        PREFETCH(0, MEM(RBX,1*64))
        PREFETCH(0, MEM(RBX,2*64))
        PREFETCH(0, MEM(RBX,3*64))
        PREFETCH(0, MEM(RBX,4*64))
        PREFETCH(0, MEM(RBX,5*64))
        PREFETCH(0, MEM(RBX,6*64))
        PREFETCH(0, MEM(RBX,7*64))
#endif

        PREFETCH_C_L2

        // Split k into 4-way-unrolled groups (RDI) plus a k%4 remainder
        // (RSI).  The last TAIL_NITER groups run in a separate loop so
        // PREFETCH_C_L1 fires shortly before the update phase.
        MOV(RDI, RSI)
        AND(RSI, IMM(3))
        SAR(RDI, IMM(2))

        SUB(RDI, IMM(0+TAIL_NITER))
        JLE(K_SMALL)

        LOOP_ALIGN
        LABEL(MAIN_LOOP)

            PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4))
            SUBITER(0)
            PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4+64))
            SUBITER(1)
            PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4+128))
            SUBITER(2)
            SUBITER(3)

            LEA(RAX, MEM(RAX,R8,4))     //a += 4*mr*sizeof(float)
            LEA(RBX, MEM(RBX,R9,4))     //b += 4*nr*sizeof(float)

            DEC(RDI)

        JNZ(MAIN_LOOP)

        LABEL(K_SMALL)

        PREFETCH_C_L1

        ADD(RDI, IMM(0+TAIL_NITER))
        JZ(TAIL_LOOP)

        LOOP_ALIGN
        LABEL(SMALL_LOOP)

            PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4))
            SUBITER(0)
            PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4+64))
            SUBITER(1)
            PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4+128))
            SUBITER(2)
            SUBITER(3)

            LEA(RAX, MEM(RAX,R8,4))
            LEA(RBX, MEM(RBX,R9,4))

            DEC(RDI)

        JNZ(SMALL_LOOP)

        TEST(RSI, RSI)
        JZ(POSTACCUM)

        // Remainder loop: one subiteration per k.
        LOOP_ALIGN
        LABEL(TAIL_LOOP)

            PREFETCH(0, MEM(RAX,A_L1_PREFETCH_DIST*12*4))
            SUBITER(0)

            ADD(RAX, R8)
            ADD(RBX, R9)

            DEC(RSI)

        JNZ(TAIL_LOOP)

        LABEL(POSTACCUM)

#ifdef PREFETCH_A_AFTER
        MOV(R8, VAR(a))
        PREFETCH(0, MEM(R8,0*64))
        PREFETCH(0, MEM(R8,1*64))
        PREFETCH(0, MEM(R8,2*64))
#endif

#ifdef PREFETCH_B_AFTER
        MOV(R9, VAR(b))
        PREFETCH(0, MEM(R9,0*64))
        PREFETCH(0, MEM(R9,1*64))
        PREFETCH(0, MEM(R9,2*64))
        PREFETCH(0, MEM(R9,3*64))
        PREFETCH(0, MEM(R9,4*64))
        PREFETCH(0, MEM(R9,5*64))
        PREFETCH(0, MEM(R9,6*64))
        PREFETCH(0, MEM(R9,7*64))
#endif

        // zmm0 = alpha (broadcast), zmm1 = beta (broadcast).
        MOV(RAX, VAR(alpha))
        MOV(RBX, VAR(beta))
        VBROADCASTSS(ZMM(0), MEM(RAX))
        VBROADCASTSS(ZMM(1), MEM(RBX))

        // RAX = rs_c, RBX = cs_c, both converted to bytes.
        MOV(RAX, VAR(rs_c))
        LEA(RAX, MEM(,RAX,4))
        MOV(RBX, VAR(cs_c))
        LEA(RBX, MEM(,RBX,4))

        // Check if C is row stride. If not, jump to the slow scattered update
        CMP(RBX, IMM(4))
        JNE(SCATTEREDUPDATE)

        // beta == 0 (compare against zeroed zmm7)? Skip reading C.
        VCOMISS(XMM(1), XMM(7))
        JE(COLSTORBZ)

            UPDATE_C( 8, 9,10,11)
            UPDATE_C(12,13,14,15)
            UPDATE_C(16,17,18,19)
            UPDATE_C(20,21,22,23)
            UPDATE_C(24,25,26,27)
            UPDATE_C(28,29,30,31)

        JMP(END)
        LABEL(COLSTORBZ)

            UPDATE_C_BZ( 8, 9,10,11)
            UPDATE_C_BZ(12,13,14,15)
            UPDATE_C_BZ(16,17,18,19)
            UPDATE_C_BZ(20,21,22,23)
            UPDATE_C_BZ(24,25,26,27)
            UPDATE_C_BZ(28,29,30,31)

        JMP(END)
        LABEL(SCATTEREDUPDATE)

            // RDX = c + 16*cs_c (second 16-column half of each row).
            LEA(RDX, MEM(RCX,RBX,8))
            LEA(RDX, MEM(RDX,RBX,8))

            // zmm2/zmm3 = byte offsets of columns 0-7 / 8-15:
            // offsets[] * cs_c (bytes).
            MOV(RDI, VAR(offsetPtr))
            VMOVDQA64(ZMM(2), MEM(RDI,0*64))
            VMOVDQA64(ZMM(3), MEM(RDI,1*64))
            VPBROADCASTQ(ZMM(6), RBX)
            VPMULLQ(ZMM(2), ZMM(6), ZMM(2))
            VPMULLQ(ZMM(3), ZMM(6), ZMM(3))

            VCOMISS(XMM(1), XMM(7))
            JE(SCATTERBZ)

                UPDATE_C_ROW_SCATTERED( 8, 9,10,11)
                UPDATE_C_ROW_SCATTERED(12,13,14,15)
                UPDATE_C_ROW_SCATTERED(16,17,18,19)
                UPDATE_C_ROW_SCATTERED(20,21,22,23)
                UPDATE_C_ROW_SCATTERED(24,25,26,27)
                UPDATE_C_ROW_SCATTERED(28,29,30,31)

            JMP(END)
            LABEL(SCATTERBZ)

                UPDATE_C_BZ_ROW_SCATTERED( 8, 9,10,11)
                UPDATE_C_BZ_ROW_SCATTERED(12,13,14,15)
                UPDATE_C_BZ_ROW_SCATTERED(16,17,18,19)
                UPDATE_C_BZ_ROW_SCATTERED(20,21,22,23)
                UPDATE_C_BZ_ROW_SCATTERED(24,25,26,27)
                UPDATE_C_BZ_ROW_SCATTERED(28,29,30,31)

        LABEL(END)

        VZEROUPPER()

        : // output operands
        : // input operands
          [k]         "m" (k),
          [a]         "m" (a),
          [b]         "m" (b),
          [alpha]     "m" (alpha),
          [beta]      "m" (beta),
          [c]         "m" (c),
          [rs_c]      "m" (rs_c),
          [cs_c]      "m" (cs_c),
          [offsetPtr] "m" (offsetPtr)
        : // register clobber list
          // NOTE(review): the scattered-update paths also write opmask
          // registers k1-k4, which are not listed here; confirm the
          // toolchain treats them as asm-clobberable without declaration
          // or add "k1".."k4" to the list.
          "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
          "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
          "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
          "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31", "memory"
    );
}
517