/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include <assert.h>

#include "bli_avx512_macros.h"

#define A_L1_PREFETCH_DIST 4
#define B_L1_PREFETCH_DIST 2
#define L2_PREFETCH_DIST  16 // Must be greater than 10, because of the way the loop is constructed.

//Alternate code path, used if C is not row-major (cs_c != 1)
// r9 = c
// zmm30 = cs_c * {0...15}
// r11 = rs_c
// r12 = &alpha
// r13 = &beta
#define UPDATE_C_ROW_SCATTERED_(NUM,BNZ1,BNZ2) \
\
    BNZ1 KXNORW(K(2), K(0), K(0)) BNZ2 \
    KXNORW(K(3), K(0), K(0)) \
    BNZ1 VGATHERDPS(ZMM(31) MASK_K(2), MEM(R(9),ZMM(30),4)) BNZ2 \
    VMULPS(ZMM(NUM), ZMM(NUM), MEM_1TO16(R(12))) /*scale by alpha*/ \
    BNZ1 VFMADD231PS(ZMM(NUM), ZMM(31), MEM_1TO16(R(13))) BNZ2 /*scale by beta, add in result*/ \
    VSCATTERDPS(MEM(R(9),ZMM(30),4) MASK_K(3), ZMM(NUM)) \
    ADD(R(9), R(11))

#define UPDATE_C_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,,)
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,COMMENT_BEGIN,COMMENT_END)
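
// Illustrative reference (not compiled): roughly what UPDATE_C_ROW_SCATTERED
// does for one row of the 30x16 micro-tile when C has a general column
// stride. The names below (update_c_row_scattered_ref, ab_row, c_row) are
// hypothetical and exist only for this sketch; the kernel keeps ab_row in a
// ZMM register and walks C with r9/r11 as commented above. The _BZ variant
// simply omits the gather and the beta term.
#if 0
static void update_c_row_scattered_ref( const float* ab_row, // 16 accumulated results for this row
                                        float*       c_row,  // &C(i,0) for this row
                                        int64_t      cs_c,   // column stride of C, in elements
                                        float        alpha,
                                        float        beta )
{
    // gather / scale by alpha / accumulate beta*C / scatter, one lane per column j
    for ( int64_t j = 0; j < 16; ++j )
        c_row[ j*cs_c ] = alpha * ab_row[ j ] + beta * c_row[ j*cs_c ];
}
#endif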

// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 = rs_c
// r10 = 3*rs_c
// rdi = 4*rs_c
#define UPDATE_C_4_ROWS_(R1,R2,R3,R4,BNZ1,BNZ2) \
\
    VMULPS(ZMM(R1), ZMM(R1), MEM_1TO16(R(12))) \
    VMULPS(ZMM(R2), ZMM(R2), MEM_1TO16(R(12))) \
    VMULPS(ZMM(R3), ZMM(R3), MEM_1TO16(R(12))) \
    VMULPS(ZMM(R4), ZMM(R4), MEM_1TO16(R(12))) \
    BNZ1 VFMADD231PS(ZMM(R1), ZMM(31), MEM(R(9)        )) BNZ2 \
    BNZ1 VFMADD231PS(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
    BNZ1 VFMADD231PS(ZMM(R3), ZMM(31), MEM(R(9),R(11),2)) BNZ2 \
    BNZ1 VFMADD231PS(ZMM(R4), ZMM(31), MEM(R(9),R(10),1)) BNZ2 \
    VMOVUPS(MEM(R(9)        ), ZMM(R1)) \
    VMOVUPS(MEM(R(9),R(11),1), ZMM(R2)) \
    VMOVUPS(MEM(R(9),R(11),2), ZMM(R3)) \
    VMOVUPS(MEM(R(9),R(10),1), ZMM(R4)) \
    ADD(R(9), RDI)

// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 = rs_c
#define UPDATE_C_2_ROWS_(R1,R2,BNZ1,BNZ2) \
\
    VMULPS(ZMM(R1), ZMM(R1), MEM_1TO16(R(12))) \
    VMULPS(ZMM(R2), ZMM(R2), MEM_1TO16(R(12))) \
    BNZ1 VFMADD231PS(ZMM(R1), ZMM(31), MEM(R(9)        )) BNZ2 \
    BNZ1 VFMADD231PS(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
    VMOVUPS(MEM(R(9)        ), ZMM(R1)) \
    VMOVUPS(MEM(R(9),R(11),1), ZMM(R2))

#define UPDATE_C_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,,)
#define UPDATE_C_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,,)
#define UPDATE_C_BZ_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,COMMENT_BEGIN,COMMENT_END)
#define UPDATE_C_BZ_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,COMMENT_BEGIN,COMMENT_END)
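
// Illustrative reference (not compiled): the contiguous (row-major C) update
// performed by UPDATE_C_4_ROWS / UPDATE_C_2_ROWS over the whole 30x16
// micro-tile. Each ZMM register holds one 16-wide row of the accumulated
// product; the *_BZ variants omit the beta term. The names below are
// hypothetical, for this sketch only.
#if 0
static void update_c_rows_ref( const float* ab,   // 30x16 accumulated product, row-major
                               float*       c,    // &C(0,0)
                               int64_t      rs_c, // row stride of C, in elements (cs_c == 1 here)
                               float        alpha,
                               float        beta )
{
    for ( int64_t i = 0; i < 30; ++i )
        for ( int64_t j = 0; j < 16; ++j )
            c[ i*rs_c + j ] = alpha * ab[ i*16 + j ] + beta * c[ i*rs_c + j ];
}
#endif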

#define A_TIMES_B_ROW(n) VFMADD231PS(ZMM(n), ZMM(31), MEM_1TO16(R(15),n*4))
#define A_TIMES_B_ROW_PREV(n) VFMADD231PS(ZMM(n), ZMM(31), MEM_1TO16(R(15),(n-32)*4))
#define PREFETCH_A_L1(n) PREFETCH(0, MEM(R(15),A_L1_PREFETCH_DIST*4*32+n*64))
#define PREFETCH_A_L2(n) PREFETCH(1, MEM(R(15),R(14),1,n*64))
#define PREFETCH_B_L1 PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*4*16))
#define PREFETCH_B_L2 PREFETCH(1, MEM(RBX,R(13),1))
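// Note: A_TIMES_B_ROW_PREV (above) is used for rows 22-29 of the unrolled
// body, after the A pointer (r15) has already been advanced by 32*4 bytes to
// the next sliver; the (n-32)*4 offset therefore still addresses element n of
// the current sliver.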

//One iteration of the k_r loop.
//Each iteration, we prefetch A into L1 and into L2
// r15 = a
// rbx = b
// rcx = c
// r11 = rs_c
// r13 = L2_PREFETCH_DIST*4*16
// r14 = L2_PREFETCH_DIST*4*32
// r12 = 32*4 = dist. to next sliver of a
// r9  = 16*4 = dist. to next sliver of b
#define MAIN_LOOP_(COUNTER, PC_L1_1, PC_L1_2, PC_L2_1, PC_L2_2) \
\
    /* Can this be pre-loaded for next it. in zmm30? */              \
    VMOVAPS(ZMM(31), MEM(RBX))                                       \
                                                                     \
    A_TIMES_B_ROW     ( 0)                                           \
    A_TIMES_B_ROW     ( 1)    PREFETCH_A_L1(0)                       \
    A_TIMES_B_ROW     ( 2)    PREFETCH_A_L1(1)                       \
    A_TIMES_B_ROW     ( 3)    PREFETCH_A_L1(2)                       \
    A_TIMES_B_ROW     ( 4)    PREFETCH_A_L1(3)                       \
    A_TIMES_B_ROW     ( 5)    PREFETCH_A_L2(0)                       \
    A_TIMES_B_ROW     ( 6)    PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2  \
    A_TIMES_B_ROW     ( 7)    PC_L1_1 ADD(RCX, R(11))       PC_L1_2  \
    A_TIMES_B_ROW     ( 8)                                           \
    A_TIMES_B_ROW     ( 9)    PC_L2_1 PREFETCH(1, MEM(RCX)) PC_L2_2  \
    A_TIMES_B_ROW     (10)    PREFETCH_A_L2(1)                       \
    A_TIMES_B_ROW     (11)    PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2  \
    A_TIMES_B_ROW     (12)    PC_L1_1 ADD(RCX, R(11))       PC_L1_2  \
    A_TIMES_B_ROW     (13)                                           \
    A_TIMES_B_ROW     (14)                                           \
    A_TIMES_B_ROW     (15)    PREFETCH_A_L2(2)                       \
    A_TIMES_B_ROW     (16)    PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2  \
    A_TIMES_B_ROW     (17)    PC_L1_1 ADD(RCX, R(11))       PC_L1_2  \
    A_TIMES_B_ROW     (18)                                           \
    A_TIMES_B_ROW     (19)                                           \
    A_TIMES_B_ROW     (20)    PREFETCH_A_L2(3)                       \
    A_TIMES_B_ROW     (21)    ADD(R(15), R(12))                      \
    A_TIMES_B_ROW_PREV(22)                                           \
    A_TIMES_B_ROW_PREV(23)    PC_L2_1 ADD(RCX, R(11))       PC_L2_2  \
    A_TIMES_B_ROW_PREV(24)    DEC(COUNTER)                           \
    A_TIMES_B_ROW_PREV(25)    PREFETCH_B_L2                          \
    A_TIMES_B_ROW_PREV(26)    PREFETCH_B_L1                          \
    A_TIMES_B_ROW_PREV(27)    ADD(RBX, R(9))                         \
    A_TIMES_B_ROW_PREV(28)    CMP(COUNTER, IMM(0))                   \
    A_TIMES_B_ROW_PREV(29)

#define MAIN_LOOP(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L1(COUNTER) MAIN_LOOP_(COUNTER,,,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L2(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,,)
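
// Illustrative reference (not compiled): the arithmetic performed by one
// MAIN_LOOP iteration, stripped of prefetching and pointer bookkeeping. Each
// iteration is a rank-1 update of the 30x16 accumulator: one 16-wide row of B
// (kept in zmm31) times 30 broadcast elements from the current sliver of A.
// The names below are hypothetical, for this sketch only.
#if 0
static void main_loop_iter_ref( float       ab[30][16], // accumulators zmm0..zmm29
                                const float a[30],      // current 30-element sliver of A
                                const float b[16] )     // current 16-element row of B
{
    for ( int i = 0; i < 30; ++i )
        for ( int j = 0; j < 16; ++j )
            ab[ i ][ j ] += a[ i ] * b[ j ];
}
#endif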

//This is an array used for the scatter/gather instructions.
int32_t offsets[32] __attribute__((aligned(0x1000))) = { 0,  1,  2,  3,  4,  5,  6,  7,
                                                         8,  9, 10, 11, 12, 13, 14, 15,
                                                        16, 17, 18, 19, 20, 21, 22, 23,
                                                        24, 25, 26, 27, 28, 29, 30, 31};
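
// Illustrative reference (not compiled): how the scatter/gather index vector
// is formed in the SCATTEREDUPDATE path below. zmm30 is loaded with
// offsets[0..15] and multiplied elementwise by cs_c, so lane j holds the
// element offset of C(i,j) relative to &C(i,0). Hypothetical name, sketch only.
#if 0
static void build_scatter_indices_ref( int32_t idx[16], int32_t cs_c )
{
    for ( int j = 0; j < 16; ++j )
        idx[ j ] = j * cs_c;   // corresponds to VPMULLD(ZMM(30), ZMM(31), ZMM(30))
}
#endif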

//#define MONITORS
//#define LOOPMON
void bli_sgemm_knl_asm_30x16_knc
     (
       dim_t               k_,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c_, inc_t cs_c_,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    (void)data;
    (void)cntx;

    const float * a_next = bli_auxinfo_next_a( data );
    const float * b_next = bli_auxinfo_next_b( data );

    const int32_t * offsetPtr = &offsets[0];
    const int64_t k = k_;
    const int64_t rs_c = rs_c_;
    const int64_t cs_c = cs_c_;

#ifdef MONITORS
    int toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
#ifdef LOOPMON
    int tlooph, tloopl, blooph, bloopl;
#endif

    __asm__ volatile
    (
#ifdef MONITORS
    RDTSC
    MOV(VAR(topl), EAX)
    MOV(VAR(toph), EDX)
#endif

    VPXORD(ZMM(0), ZMM(0), ZMM(0)) //clear out registers

    VMOVAPS(ZMM( 1), ZMM(0))
    VMOVAPS(ZMM( 2), ZMM(0))    MOV(RSI, VAR(k)) //loop index
    VMOVAPS(ZMM( 3), ZMM(0))    MOV(R(11), VAR(rs_c)) //load row stride
    VMOVAPS(ZMM( 4), ZMM(0))    SAL(R(11), IMM(2)) //scale row stride
    VMOVAPS(ZMM( 5), ZMM(0))    MOV(R(15), VAR(a)) //load address of a
    VMOVAPS(ZMM( 6), ZMM(0))    MOV(RBX, VAR(b)) //load address of b
    VMOVAPS(ZMM( 7), ZMM(0))
    VMOVAPS(ZMM( 8), ZMM(0))    LEA(R(10), MEM(R(11),R(11),2)) //r10 has 3 * r11
    VMOVAPS(ZMM( 9), ZMM(0))
    VMOVAPS(ZMM(10), ZMM(0))    MOV(RDI, R(11))
    VMOVAPS(ZMM(11), ZMM(0))    SAL(RDI, IMM(2)) //rdi has 4*r11
    VMOVAPS(ZMM(12), ZMM(0))    MOV(RCX, VAR(c)) //load address of c for prefetching
    VMOVAPS(ZMM(13), ZMM(0))
    VMOVAPS(ZMM(14), ZMM(0))    MOV(R(8), VAR(k))
    VMOVAPS(ZMM(15), ZMM(0))
    VMOVAPS(ZMM(16), ZMM(0))
    VMOVAPS(ZMM(17), ZMM(0))    MOV(R(13), IMM(4*16*L2_PREFETCH_DIST))
    VMOVAPS(ZMM(18), ZMM(0))    MOV(R(14), IMM(4*32*L2_PREFETCH_DIST))
    VMOVAPS(ZMM(19), ZMM(0))
    VMOVAPS(ZMM(20), ZMM(0))
    VMOVAPS(ZMM(21), ZMM(0))
    VMOVAPS(ZMM(22), ZMM(0))
    VMOVAPS(ZMM(23), ZMM(0))    SUB(R(8), IMM(30+L2_PREFETCH_DIST)) //Check if we have more than 30+L2_PREFETCH_DIST iterations to do.
    VMOVAPS(ZMM(24), ZMM(0))    MOV(R(8), IMM(30))
    VMOVAPS(ZMM(25), ZMM(0))    MOV(R(9), IMM(4*16)) //amount to increment b* by each iteration
    VMOVAPS(ZMM(26), ZMM(0))    MOV(R(12), IMM(4*32)) //amount to increment a* by each iteration
    VMOVAPS(ZMM(27), ZMM(0))
    VMOVAPS(ZMM(28), ZMM(0))
    VMOVAPS(ZMM(29), ZMM(0))

#ifdef MONITORS
    RDTSC
    MOV(VAR(midl), EAX)
    MOV(VAR(midh), EDX)
#endif

    JLE(CONSIDER_UNDER_40)
    SUB(RSI, IMM(30+L2_PREFETCH_DIST))

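    //Iteration bookkeeping: with k > 30+L2_PREFETCH_DIST we run 30 iterations
    //that prefetch C into L2, then k-30-L2_PREFETCH_DIST plain iterations,
    //then L2_PREFETCH_DIST-10 iterations whose A/B L2 prefetches are
    //redirected at a_next/b_next, and finally 10 iterations that prefetch C
    //into L1; the four counts sum to k.
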
    //First 30 iterations
    LABEL(LOOPREFETCHCL2)
    MAIN_LOOP_PC_L2(R(8))
    JNZ(LOOPREFETCHCL2)
    MOV(RCX, VAR(c))

    //Main Loop.
    LABEL(LOOPMAIN)
    MAIN_LOOP(RSI)
    JNZ(LOOPMAIN)

    //Penultimate L2_PREFETCH_DIST-10 iterations.
    //Break these off from the main loop to avoid prefetching extra data.
    MOV(R(14), VAR(a_next))
    MOV(R(13), VAR(b_next))
    SUB(R(14), R(15))
    SUB(R(13), RBX)
    //Yes, I know 10-20 = -10
    MOV(RSI, IMM(10+L2_PREFETCH_DIST-20))

    LABEL(LOOPMAIN2)
    MAIN_LOOP(RSI)
    JNZ(LOOPMAIN2)

    //Last 10 iterations
    MOV(R(8), IMM(10))

    LABEL(LOOPREFETCHCL1)
    MAIN_LOOP_PC_L1(R(8))
    JNZ(LOOPREFETCHCL1)

    JMP(POSTACCUM)

    //Alternate main loop, with no prefetching of C
    //Used when there are at most 30+L2_PREFETCH_DIST iterations
    LABEL(CONSIDER_UNDER_40)

    MOV(RSI, VAR(k))
    TEST(RSI, RSI)
    JZ(POSTACCUM)

    LABEL(LOOP_UNDER_40)
    MAIN_LOOP(RSI)
    JNZ(LOOP_UNDER_40)

    LABEL(POSTACCUM)

#ifdef MONITORS
    RDTSC
    MOV(VAR(mid2l), EAX)
    MOV(VAR(mid2h), EDX)
#endif

    MOV(R(9), VAR(c)) //load address of c for update
    MOV(R(12), VAR(alpha)) //load address of alpha

    // Check whether C is row-major (cs_c == 1). If not, jump to the slow scattered update.
    MOV(R(14), VAR(cs_c))
    DEC(R(14))
    JNZ(SCATTEREDUPDATE)

    MOV(R(14), VAR(beta))
    VBROADCASTSS(ZMM(31), MEM(R(14)))

    MOV(EBX, MEM(R(14)))
    TEST(EBX, EBX)
    JZ(COLSTORBZ)

    UPDATE_C_4_ROWS( 0, 1, 2, 3)
    UPDATE_C_4_ROWS( 4, 5, 6, 7)
    UPDATE_C_4_ROWS( 8, 9,10,11)
    UPDATE_C_4_ROWS(12,13,14,15)
    UPDATE_C_4_ROWS(16,17,18,19)
    UPDATE_C_4_ROWS(20,21,22,23)
    UPDATE_C_4_ROWS(24,25,26,27)
    UPDATE_C_2_ROWS(28,29)

    JMP(END)

    LABEL(COLSTORBZ)

    UPDATE_C_BZ_4_ROWS( 0, 1, 2, 3)
    UPDATE_C_BZ_4_ROWS( 4, 5, 6, 7)
    UPDATE_C_BZ_4_ROWS( 8, 9,10,11)
    UPDATE_C_BZ_4_ROWS(12,13,14,15)
    UPDATE_C_BZ_4_ROWS(16,17,18,19)
    UPDATE_C_BZ_4_ROWS(20,21,22,23)
    UPDATE_C_BZ_4_ROWS(24,25,26,27)
    UPDATE_C_BZ_2_ROWS(28,29)

    JMP(END)

    LABEL(SCATTEREDUPDATE)

    MOV(R(13), VAR(beta))
    MOV(R(10), VAR(offsetPtr))
    VMOVAPS(ZMM(30), MEM(R(10)))
    MOV(EBX, MEM(R(13)))
    /* Note that this ignores the upper 32 bits in cs_c */
    VPBROADCASTD(ZMM(31), VAR(cs_c))
    VPMULLD(ZMM(30), ZMM(31), ZMM(30))

    TEST(EBX, EBX)
    JZ(SCATTERBZ)

    UPDATE_C_ROW_SCATTERED( 0)
    UPDATE_C_ROW_SCATTERED( 1)
    UPDATE_C_ROW_SCATTERED( 2)
    UPDATE_C_ROW_SCATTERED( 3)
    UPDATE_C_ROW_SCATTERED( 4)
    UPDATE_C_ROW_SCATTERED( 5)
    UPDATE_C_ROW_SCATTERED( 6)
    UPDATE_C_ROW_SCATTERED( 7)
    UPDATE_C_ROW_SCATTERED( 8)
    UPDATE_C_ROW_SCATTERED( 9)
    UPDATE_C_ROW_SCATTERED(10)
    UPDATE_C_ROW_SCATTERED(11)
    UPDATE_C_ROW_SCATTERED(12)
    UPDATE_C_ROW_SCATTERED(13)
    UPDATE_C_ROW_SCATTERED(14)
    UPDATE_C_ROW_SCATTERED(15)
    UPDATE_C_ROW_SCATTERED(16)
    UPDATE_C_ROW_SCATTERED(17)
    UPDATE_C_ROW_SCATTERED(18)
    UPDATE_C_ROW_SCATTERED(19)
    UPDATE_C_ROW_SCATTERED(20)
    UPDATE_C_ROW_SCATTERED(21)
    UPDATE_C_ROW_SCATTERED(22)
    UPDATE_C_ROW_SCATTERED(23)
    UPDATE_C_ROW_SCATTERED(24)
    UPDATE_C_ROW_SCATTERED(25)
    UPDATE_C_ROW_SCATTERED(26)
    UPDATE_C_ROW_SCATTERED(27)
    UPDATE_C_ROW_SCATTERED(28)
    UPDATE_C_ROW_SCATTERED(29)

    JMP(END)

    LABEL(SCATTERBZ)

    UPDATE_C_BZ_ROW_SCATTERED( 0)
    UPDATE_C_BZ_ROW_SCATTERED( 1)
    UPDATE_C_BZ_ROW_SCATTERED( 2)
    UPDATE_C_BZ_ROW_SCATTERED( 3)
    UPDATE_C_BZ_ROW_SCATTERED( 4)
    UPDATE_C_BZ_ROW_SCATTERED( 5)
    UPDATE_C_BZ_ROW_SCATTERED( 6)
    UPDATE_C_BZ_ROW_SCATTERED( 7)
    UPDATE_C_BZ_ROW_SCATTERED( 8)
    UPDATE_C_BZ_ROW_SCATTERED( 9)
    UPDATE_C_BZ_ROW_SCATTERED(10)
    UPDATE_C_BZ_ROW_SCATTERED(11)
    UPDATE_C_BZ_ROW_SCATTERED(12)
    UPDATE_C_BZ_ROW_SCATTERED(13)
    UPDATE_C_BZ_ROW_SCATTERED(14)
    UPDATE_C_BZ_ROW_SCATTERED(15)
    UPDATE_C_BZ_ROW_SCATTERED(16)
    UPDATE_C_BZ_ROW_SCATTERED(17)
    UPDATE_C_BZ_ROW_SCATTERED(18)
    UPDATE_C_BZ_ROW_SCATTERED(19)
    UPDATE_C_BZ_ROW_SCATTERED(20)
    UPDATE_C_BZ_ROW_SCATTERED(21)
    UPDATE_C_BZ_ROW_SCATTERED(22)
    UPDATE_C_BZ_ROW_SCATTERED(23)
    UPDATE_C_BZ_ROW_SCATTERED(24)
    UPDATE_C_BZ_ROW_SCATTERED(25)
    UPDATE_C_BZ_ROW_SCATTERED(26)
    UPDATE_C_BZ_ROW_SCATTERED(27)
    UPDATE_C_BZ_ROW_SCATTERED(28)
    UPDATE_C_BZ_ROW_SCATTERED(29)

    LABEL(END)

#ifdef MONITORS
    RDTSC
    MOV(VAR(botl), EAX)
    MOV(VAR(both), EDX)
#endif
    : // output operands
#ifdef MONITORS
      [topl]  "=m" (topl),
      [toph]  "=m" (toph),
      [midl]  "=m" (midl),
      [midh]  "=m" (midh),
      [mid2l] "=m" (mid2l),
      [mid2h] "=m" (mid2h),
      [botl]  "=m" (botl),
      [both]  "=m" (both)
#endif
    : // input operands
      [k]         "m" (k),
      [a]         "m" (a),
      [b]         "m" (b),
      [alpha]     "m" (alpha),
      [beta]      "m" (beta),
      [c]         "m" (c),
      [rs_c]      "m" (rs_c),
      [cs_c]      "m" (cs_c),
      [a_next]    "m" (a_next),
      [b_next]    "m" (b_next),
      [offsetPtr] "m" (offsetPtr)
    : // register clobber list (k2/k3 are written by the scattered C update)
      "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
      "r13", "r14", "r15", "k2", "k3", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4",
      "zmm5", "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12",
      "zmm13", "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20",
      "zmm21", "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28",
      "zmm29", "zmm30", "zmm31", "memory"
    );

#ifdef LOOPMON
    printf("looptime = \t%d\n", bloopl - tloopl);
#endif
#ifdef MONITORS
    uint64_t top  = ( ( uint64_t )( uint32_t )toph  << 32 ) | ( uint32_t )topl;
    uint64_t mid  = ( ( uint64_t )( uint32_t )midh  << 32 ) | ( uint32_t )midl;
    uint64_t mid2 = ( ( uint64_t )( uint32_t )mid2h << 32 ) | ( uint32_t )mid2l;
    uint64_t bot  = ( ( uint64_t )( uint32_t )both  << 32 ) | ( uint32_t )botl;
    printf("setup =\t%llu\tmain loop =\t%llu\tcleanup=\t%llu\ttotal=\t%llu\n",
           (unsigned long long)(mid - top), (unsigned long long)(mid2 - mid),
           (unsigned long long)(bot - mid2), (unsigned long long)(bot - top));
#endif
}