1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name of The University of Texas at Austin nor the names
18       of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
25    OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
29    OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "blis.h"
36 #include <assert.h>
37 
38 #include "bli_avx512_macros.h"
39 
// Prefetch distances, measured in iterations of the k loop (each iteration
// consumes one 32-element sliver of A and one 8-element sliver of B).
#define A_L1_PREFETCH_DIST 4 // how far ahead (in k iterations) A is prefetched into L1
#define B_L1_PREFETCH_DIST 2 // how far ahead (in k iterations) B is prefetched into L1
#define L2_PREFETCH_DIST  16 // Must be greater than 10, because of the way the loop is constructed.
43 
//Alternate code path used if C is not row-major
//
// Update one row of C (the 8 doubles accumulated in ZMM(NUM)) via
// gather/scatter, for a general (non-unit) column stride:
//   C_row = alpha*acc + beta*C_row
// BNZ1/BNZ2 are either empty (general beta) or comment delimiters that
// elide the beta-dependent instructions (beta == 0 case), so C is never
// read in that case.
//
// Register contract on entry:
// r9 = c
// ymm0 = cs_c * {0,1,...,7} (32-bit element offsets within the row)
// r11 = rs_c
// r12 = &alpha
// r13 = &beta
#define UPDATE_C_ROW_SCATTERED_(NUM,BNZ1,BNZ2) \
\
    BNZ1 KXNORW(K(2), K(0), K(0)) BNZ2 /*set all bits of the gather mask*/ \
    KXNORW(K(3), K(0), K(0)) /*set all bits of the scatter mask*/ \
    BNZ1 VGATHERDPD(ZMM(31) MASK_K(2), MEM(R(9),YMM(0),8)) BNZ2 /*load current row of C*/ \
    VMULPD(ZMM(NUM), ZMM(NUM), MEM_1TO8(R(12))) /*scale by alpha*/ \
    BNZ1 VFMADD231PD(ZMM(NUM), ZMM(31), MEM_1TO8(R(13))) BNZ2 /*scale by beta, add in result*/ \
    VSCATTERDPD(MEM(R(9),YMM(0),8) MASK_K(3), ZMM(NUM)) /*store the row back*/ \
    ADD(R(9), R(11)) /*advance c to the next row*/

// General-beta and beta == 0 (BZ) instantiations.
#define UPDATE_C_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,,)
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,COMMENT_BEGIN,COMMENT_END)
62 
// Update four consecutive rows of C in place (row-major case, unit column
// stride):  C_row_i = alpha*acc_i + beta*C_row_i  for accumulators R1..R4.
// BNZ1/BNZ2 elide the beta-dependent FMAs (beta == 0 case), so C is only
// written, never read.
//
// Register contract on entry:
// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 =   rs_c
// r10 = 3*rs_c
// rdi = 4*rs_c
#define UPDATE_C_4_ROWS_(R1,R2,R3,R4,BNZ1,BNZ2) \
\
    VMULPD(ZMM(R1), ZMM(R1), MEM_1TO8(R(12))) \
    VMULPD(ZMM(R2), ZMM(R2), MEM_1TO8(R(12))) \
    VMULPD(ZMM(R3), ZMM(R3), MEM_1TO8(R(12))) \
    VMULPD(ZMM(R4), ZMM(R4), MEM_1TO8(R(12))) \
    BNZ1 VFMADD231PD(ZMM(R1), ZMM(31), MEM(R(9)        )) BNZ2 \
    BNZ1 VFMADD231PD(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
    BNZ1 VFMADD231PD(ZMM(R3), ZMM(31), MEM(R(9),R(11),2)) BNZ2 \
    BNZ1 VFMADD231PD(ZMM(R4), ZMM(31), MEM(R(9),R(10),1)) BNZ2 \
    VMOVUPD(MEM(R(9)        ), ZMM(R1)) \
    VMOVUPD(MEM(R(9),R(11),1), ZMM(R2)) \
    VMOVUPD(MEM(R(9),R(11),2), ZMM(R3)) \
    VMOVUPD(MEM(R(9),R(10),1), ZMM(R4)) \
    ADD(R(9), RDI) /*advance c by four rows*/

// Same as above for the final two rows; note r9 is not advanced afterward.
// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 = rs_c
#define UPDATE_C_2_ROWS_(R1,R2,BNZ1,BNZ2) \
\
    VMULPD(ZMM(R1), ZMM(R1), MEM_1TO8(R(12))) \
    VMULPD(ZMM(R2), ZMM(R2), MEM_1TO8(R(12))) \
    BNZ1 VFMADD231PD(ZMM(R1), ZMM(31), MEM(R(9)        )) BNZ2 \
    BNZ1 VFMADD231PD(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
    VMOVUPD(MEM(R(9)        ), ZMM(R1)) \
    VMOVUPD(MEM(R(9),R(11),1), ZMM(R2)) \

// General-beta and beta == 0 (BZ) instantiations.
#define UPDATE_C_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,,)
#define UPDATE_C_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,,)
#define UPDATE_C_BZ_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,COMMENT_BEGIN,COMMENT_END)
#define UPDATE_C_BZ_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,COMMENT_BEGIN,COMMENT_END)
102 
// One row of the rank-1 update:
//   zmm(n) += a[n-1] (broadcast from the current sliver of A at r15)
//             * zmm31 (the current 8-wide row of B).
// The _PREV variant is used after r15 has already been advanced to the next
// 32-element sliver of A (by 32*8 bytes), hence the -32 element correction.
#define A_TIMES_B_ROW(n) VFMADD231PD(ZMM(n), ZMM(31), MEM_1TO8(R(15),(n-1)*8))
#define A_TIMES_B_ROW_PREV(n) VFMADD231PD(ZMM(n), ZMM(31), MEM_1TO8(R(15),((n-1)-32)*8))
// Prefetch pieces of the A and B panels into L1/L2 at the distances defined
// above.  The L2 forms index off r14/r13, which hold the L2 prefetch byte
// distances during the main loop phases.
#define PREFETCH_A_L1(n) PREFETCH(0, MEM(R(15),A_L1_PREFETCH_DIST*8*32+n*64))
#define PREFETCH_A_L2(n) PREFETCH(1, MEM(R(15),R(14),1,n*64))
#define PREFETCH_B_L1 PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*8*8))
#define PREFETCH_B_L2 PREFETCH(1, MEM(RBX,R(13),1))
109 
//One iteration of the k_r loop.
//Each iteration, we prefetch A into L1 and into L2.
//
//PC_L1_1/PC_L1_2 (resp. PC_L2_1/PC_L2_2) either expand to nothing, enabling
//the interleaved prefetch of C into L1 (resp. L2), or expand to comment
//delimiters that elide those prefetches.  Loop bookkeeping (pointer
//advances, counter decrement/compare) is interleaved with the 30 FMAs to
//hide latency; rows 23-30 use the _PREV form because r15 is advanced to the
//next sliver of A after row 22.  The caller follows this macro with a JNZ
//keyed to the CMP at the bottom.
// r15 = a
// rbx = b
// rcx = c
// r11 = rs_c
// r13 = L2_PREFETCH_DIST*8*8
// r14 = L2_PREFETCH_DIST*8*32
// r12 = 32*8 = dist. to next sliver of a
// r9  =  8*8 = dist. to next sliver of b
#define MAIN_LOOP_(COUNTER, PC_L1_1, PC_L1_2, PC_L2_1, PC_L2_2) \
\
    /* Can this be pre-loaded for next it. in zmm0? */               \
    VMOVAPD(ZMM(31), MEM(RBX)) /*zmm31 = current row of b*/          \
                                                                     \
    A_TIMES_B_ROW     ( 1)                                           \
    A_TIMES_B_ROW     ( 2)    PREFETCH_A_L1(0)                       \
    A_TIMES_B_ROW     ( 3)    PREFETCH_A_L1(1)                       \
    A_TIMES_B_ROW     ( 4)    PREFETCH_A_L1(2)                       \
    A_TIMES_B_ROW     ( 5)    PREFETCH_A_L1(3)                       \
    A_TIMES_B_ROW     ( 6)    PREFETCH_A_L2(0)                       \
    A_TIMES_B_ROW     ( 7)    PC_L1_1 PREFETCH(0, MEM(RCX))  PC_L1_2 \
    A_TIMES_B_ROW     ( 8)    PC_L1_1 ADD(RCX, R(11))        PC_L1_2 \
    A_TIMES_B_ROW     ( 9)                                           \
    A_TIMES_B_ROW     (10)    PC_L2_1 PREFETCH(1, MEM(RCX))  PC_L2_2 \
    A_TIMES_B_ROW     (11)    PREFETCH_A_L2(1)                       \
    A_TIMES_B_ROW     (12)    PC_L1_1 PREFETCH(0, MEM(RCX))  PC_L1_2 \
    A_TIMES_B_ROW     (13)    PC_L1_1 ADD(RCX, R(11))        PC_L1_2 \
    A_TIMES_B_ROW     (14)                                           \
    A_TIMES_B_ROW     (15)                                           \
    A_TIMES_B_ROW     (16)    PREFETCH_A_L2(2)                       \
    A_TIMES_B_ROW     (17)    PC_L1_1 PREFETCH(0, MEM(RCX))  PC_L1_2 \
    A_TIMES_B_ROW     (18)    PC_L1_1 ADD(RCX, R(11))        PC_L1_2 \
    A_TIMES_B_ROW     (19)                                           \
    A_TIMES_B_ROW     (20)                                           \
    A_TIMES_B_ROW     (21)    PREFETCH_A_L2(3)                       \
    A_TIMES_B_ROW     (22)    ADD(R(15), R(12)) /*next sliver of a*/ \
    A_TIMES_B_ROW_PREV(23)                                           \
    A_TIMES_B_ROW_PREV(24)    PC_L2_1 ADD(RCX, R(11))        PC_L2_2 \
    A_TIMES_B_ROW_PREV(25)    DEC(COUNTER)                           \
    A_TIMES_B_ROW_PREV(26)    PREFETCH_B_L2                          \
    A_TIMES_B_ROW_PREV(27)    PREFETCH_B_L1                          \
    A_TIMES_B_ROW_PREV(28)    ADD(RBX, R(9)) /*next sliver of b*/    \
    A_TIMES_B_ROW_PREV(29)    CMP(COUNTER, IMM(0))                   \
    A_TIMES_B_ROW_PREV(30)

// Instantiations: no C prefetch, C->L1 prefetch, and C->L2 prefetch.
#define MAIN_LOOP(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L1(COUNTER) MAIN_LOOP_(COUNTER,,,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L2(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,,)
159 
//Index vector {0, 1, ..., 31} used to build the scaled element offsets for
//the scatter/gather instructions in the general-stride update of C.
//Aligned to 64 bytes so it can be loaded with an aligned vector move.
static int32_t offsets[32] __attribute__((aligned(64))) =
{
     0,  1,  2,  3,  4,  5,  6,  7,
     8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23,
    24, 25, 26, 27, 28, 29, 30, 31
};
164 
165 //#define MONITORS
166 //#define LOOPMON
//-----------------------------------------------------------------------------
// 30x8 double-precision GEMM microkernel for the Intel Xeon Phi (KNC):
//
//   C := beta*C + alpha * A*B
//
// where A is a packed 30xk micropanel (stored in 32-element slivers), B is a
// packed kx8 micropanel, and C is 30x8 with strides rs_c/cs_c.  The k loop is
// software-pipelined in three phases (prefetch C into L2, steady state,
// prefetch C into L1), with a single plain loop used when k is small.  A
// scatter/gather code path handles cs_c != 1.  Accumulators live in
// zmm1-zmm30 (one register per row of the microtile).
//-----------------------------------------------------------------------------
void bli_dgemm_opt_30x8_knc(
                    dim_t            k,
                    double* restrict alpha,
                    double* restrict a,
                    double* restrict b,
                    double* restrict beta,
                    double* restrict c, inc_t rs_c, inc_t cs_c,
                    auxinfo_t*       data,
                    cntx_t* restrict cntx
                  )
{
    // Addresses of the next micropanels of A and B, prefetched near the end
    // of the k loop.
    const double * a_next = bli_auxinfo_next_a( data );
    const double * b_next = bli_auxinfo_next_b( data );

    const int32_t * offsetPtr = &offsets[0];

#ifdef MONITORS
    // RDTSC timestamps (low/high halves) taken at setup / loop / cleanup.
    int toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
#ifdef LOOPMON
    int tlooph, tloopl, blooph, bloopl;
#endif

    __asm__ volatile
    (
#ifdef MONITORS
    RDTSC
    MOV(VAR(topl), EAX)
    MOV(VAR(toph), EDX)
#endif

    VPXORD(ZMM(1), ZMM(1), ZMM(1)) //clear out registers

    // Zero the 30 accumulators, interleaving the scalar setup to hide latency.
    VMOVAPS(ZMM( 2), ZMM(1))
    VMOVAPS(ZMM( 3), ZMM(1))    MOV(RSI, VAR(k)) //loop index
    VMOVAPS(ZMM( 4), ZMM(1))    MOV(R(11), VAR(rs_c)) //load row stride
    VMOVAPS(ZMM( 5), ZMM(1))    SAL(R(11), IMM(3)) //scale row stride to bytes
    VMOVAPS(ZMM( 6), ZMM(1))    MOV(R(15), VAR(a)) //load address of a
    VMOVAPS(ZMM( 7), ZMM(1))    MOV(RBX, VAR(b)) //load address of b
    VMOVAPS(ZMM( 8), ZMM(1))
    VMOVAPS(ZMM( 9), ZMM(1))    LEA(R(10), MEM(R(11),R(11),2)) //r10 has 3 * r11
    VMOVAPS(ZMM(10), ZMM(1))
    VMOVAPS(ZMM(11), ZMM(1))    MOV(RDI, R(11))
    VMOVAPS(ZMM(12), ZMM(1))    SAL(RDI, IMM(2)) //rdi has 4*r11
    VMOVAPS(ZMM(13), ZMM(1))    MOV(RCX, VAR(c)) //load address of c for prefetching
    VMOVAPS(ZMM(14), ZMM(1))
    VMOVAPS(ZMM(15), ZMM(1))    MOV(R(8), VAR(k))
    VMOVAPS(ZMM(16), ZMM(1))
    VMOVAPS(ZMM(17), ZMM(1))
    VMOVAPS(ZMM(18), ZMM(1))    MOV(R(13), IMM(8*8*L2_PREFETCH_DIST)) //L2 prefetch byte distance for b
    VMOVAPS(ZMM(19), ZMM(1))    MOV(R(14), IMM(8*32*L2_PREFETCH_DIST)) //L2 prefetch byte distance for a
    VMOVAPS(ZMM(20), ZMM(1))
    VMOVAPS(ZMM(21), ZMM(1))
    VMOVAPS(ZMM(22), ZMM(1))
    VMOVAPS(ZMM(23), ZMM(1))    SUB(R(8), IMM(30+L2_PREFETCH_DIST)) //set flags for the k <= 30+L2_PREFETCH_DIST test (JLE below)
    VMOVAPS(ZMM(24), ZMM(1))    //NOTE: the flags survive — VMOVAPS and MOV do not modify them.
    VMOVAPS(ZMM(25), ZMM(1))    MOV(R(8), IMM(30)) //r8 = trip count of the first (C->L2 prefetch) phase
    VMOVAPS(ZMM(26), ZMM(1))    MOV(R(9), IMM(8*8)) //amount to increment b* by each iteration
    VMOVAPS(ZMM(27), ZMM(1))    MOV(R(12), IMM(8*32)) //amount to increment a* by each iteration
    VMOVAPS(ZMM(28), ZMM(1))
    VMOVAPS(ZMM(29), ZMM(1))
    VMOVAPS(ZMM(30), ZMM(1))

#ifdef MONITORS
    RDTSC
    MOV(VAR(midl), EAX)
    MOV(VAR(midh), EDX)
#endif

    JLE(CONSIDER_UNDER_40) //too few iterations for the pipelined phases
    SUB(RSI, IMM(30+L2_PREFETCH_DIST)) //rsi = steady-state trip count

    //First 30 iterations: also prefetch C into L2.
    LABEL(LOOPREFECHCL2)
    MAIN_LOOP_PC_L2(R(8))
    JNZ(LOOPREFECHCL2)
    MOV(RCX, VAR(c)) //rewind the C prefetch pointer

    //Main Loop.
    LABEL(LOOPMAIN)
    MAIN_LOOP(RSI)
    JNZ(LOOPMAIN)

    //Penultimate 10+L2_PREFETCH_DIST-20 iterations (6 with the current
    //L2_PREFETCH_DIST).  Break these off from the main loop to avoid
    //prefetching extra data past the end of the current panels: r14/r13 are
    //repurposed as the distances to the NEXT micropanels, so PREFETCH_A_L2 /
    //PREFETCH_B_L2 now target a_next/b_next instead.
    MOV(R(14), VAR(a_next))
    MOV(R(13), VAR(b_next))
    SUB(R(14), R(15))
    SUB(R(13), RBX)
    //Yes, I know 10-20 = -10
    MOV(RSI, IMM(10+L2_PREFETCH_DIST-20))

    LABEL(LOOPMAIN2)
    MAIN_LOOP(RSI)
    JNZ(LOOPMAIN2)

    //Last 10 iterations: also prefetch C into L1 ahead of the update below.
    MOV(R(8), IMM(10))

    LABEL(LOOPREFETCHCL1)
    MAIN_LOOP_PC_L1(R(8))
    JNZ(LOOPREFETCHCL1)

    JMP(POSTACCUM)

    //Alternate main loop, with no prefetching of C
    //Used when k <= 30+L2_PREFETCH_DIST iterations
    LABEL(CONSIDER_UNDER_40)

    MOV(RSI, VAR(k))
    TEST(RSI, RSI)
    JZ(POSTACCUM) //k == 0: accumulators stay zero

    LABEL(LOOP_UNDER_40)
    MAIN_LOOP(RSI)
    JNZ(LOOP_UNDER_40)

    LABEL(POSTACCUM)

#ifdef MONITORS
    RDTSC
    MOV(VAR(mid2l), EAX)
    MOV(VAR(mid2h), EDX)
#endif

    MOV(R(9), VAR(c)) //load address of c for update
    MOV(R(12), VAR(alpha)) //load address of alpha

    // Check if cs_c == 1 (rows of C are contiguous). If not, jump to the
    // slow scattered update.
    MOV(R(14), VAR(cs_c))
    DEC(R(14))
    JNZ(SCATTEREDUPDATE)

    MOV(R(14), VAR(beta))
    VBROADCASTSD(ZMM(31), MEM(R(14)))

    // rbx = bit pattern of beta; zero iff beta == +0.0, selecting the
    // write-only (BZ) update path.
    MOV(RBX, MEM(R(14)))
    TEST(RBX, RBX)
    JZ(COLSTORBZ)

    // General-beta row-major update, 4 rows at a time (2 for the last pair).
    UPDATE_C_4_ROWS( 1, 2, 3, 4)
    UPDATE_C_4_ROWS( 5, 6, 7, 8)
    UPDATE_C_4_ROWS( 9,10,11,12)
    UPDATE_C_4_ROWS(13,14,15,16)
    UPDATE_C_4_ROWS(17,18,19,20)
    UPDATE_C_4_ROWS(21,22,23,24)
    UPDATE_C_4_ROWS(25,26,27,28)
    UPDATE_C_2_ROWS(29,30)

    JMP(END)

    // beta == 0 row-major update: C is overwritten, never read.
    LABEL(COLSTORBZ)

    UPDATE_C_BZ_4_ROWS( 1, 2, 3, 4)
    UPDATE_C_BZ_4_ROWS( 5, 6, 7, 8)
    UPDATE_C_BZ_4_ROWS( 9,10,11,12)
    UPDATE_C_BZ_4_ROWS(13,14,15,16)
    UPDATE_C_BZ_4_ROWS(17,18,19,20)
    UPDATE_C_BZ_4_ROWS(21,22,23,24)
    UPDATE_C_BZ_4_ROWS(25,26,27,28)
    UPDATE_C_BZ_2_ROWS(29,30)

    JMP(END)

    // General-stride update: build zmm0 = cs_c * {0..15} for gather/scatter.
    LABEL(SCATTEREDUPDATE)

    MOV(R(13), VAR(beta))
    MOV(R(10), VAR(offsetPtr))
    VMOVAPS(ZMM(0), MEM(R(10)))
    MOV(RBX, MEM(R(13))) //rbx = bit pattern of beta (zero iff beta == +0.0)
    /* Note that this ignores the upper 32 bits in cs_c */
    VPBROADCASTD(ZMM(31), VAR(cs_c))
    VPMULLD(ZMM(0), ZMM(31), ZMM(0))

    TEST(RBX, RBX)
    JZ(SCATTERBZ)

    UPDATE_C_ROW_SCATTERED( 1)
    UPDATE_C_ROW_SCATTERED( 2)
    UPDATE_C_ROW_SCATTERED( 3)
    UPDATE_C_ROW_SCATTERED( 4)
    UPDATE_C_ROW_SCATTERED( 5)
    UPDATE_C_ROW_SCATTERED( 6)
    UPDATE_C_ROW_SCATTERED( 7)
    UPDATE_C_ROW_SCATTERED( 8)
    UPDATE_C_ROW_SCATTERED( 9)
    UPDATE_C_ROW_SCATTERED(10)
    UPDATE_C_ROW_SCATTERED(11)
    UPDATE_C_ROW_SCATTERED(12)
    UPDATE_C_ROW_SCATTERED(13)
    UPDATE_C_ROW_SCATTERED(14)
    UPDATE_C_ROW_SCATTERED(15)
    UPDATE_C_ROW_SCATTERED(16)
    UPDATE_C_ROW_SCATTERED(17)
    UPDATE_C_ROW_SCATTERED(18)
    UPDATE_C_ROW_SCATTERED(19)
    UPDATE_C_ROW_SCATTERED(20)
    UPDATE_C_ROW_SCATTERED(21)
    UPDATE_C_ROW_SCATTERED(22)
    UPDATE_C_ROW_SCATTERED(23)
    UPDATE_C_ROW_SCATTERED(24)
    UPDATE_C_ROW_SCATTERED(25)
    UPDATE_C_ROW_SCATTERED(26)
    UPDATE_C_ROW_SCATTERED(27)
    UPDATE_C_ROW_SCATTERED(28)
    UPDATE_C_ROW_SCATTERED(29)
    UPDATE_C_ROW_SCATTERED(30)

    JMP(END)

    LABEL(SCATTERBZ)

    UPDATE_C_BZ_ROW_SCATTERED( 1)
    UPDATE_C_BZ_ROW_SCATTERED( 2)
    UPDATE_C_BZ_ROW_SCATTERED( 3)
    UPDATE_C_BZ_ROW_SCATTERED( 4)
    UPDATE_C_BZ_ROW_SCATTERED( 5)
    UPDATE_C_BZ_ROW_SCATTERED( 6)
    UPDATE_C_BZ_ROW_SCATTERED( 7)
    UPDATE_C_BZ_ROW_SCATTERED( 8)
    UPDATE_C_BZ_ROW_SCATTERED( 9)
    UPDATE_C_BZ_ROW_SCATTERED(10)
    UPDATE_C_BZ_ROW_SCATTERED(11)
    UPDATE_C_BZ_ROW_SCATTERED(12)
    UPDATE_C_BZ_ROW_SCATTERED(13)
    UPDATE_C_BZ_ROW_SCATTERED(14)
    UPDATE_C_BZ_ROW_SCATTERED(15)
    UPDATE_C_BZ_ROW_SCATTERED(16)
    UPDATE_C_BZ_ROW_SCATTERED(17)
    UPDATE_C_BZ_ROW_SCATTERED(18)
    UPDATE_C_BZ_ROW_SCATTERED(19)
    UPDATE_C_BZ_ROW_SCATTERED(20)
    UPDATE_C_BZ_ROW_SCATTERED(21)
    UPDATE_C_BZ_ROW_SCATTERED(22)
    UPDATE_C_BZ_ROW_SCATTERED(23)
    UPDATE_C_BZ_ROW_SCATTERED(24)
    UPDATE_C_BZ_ROW_SCATTERED(25)
    UPDATE_C_BZ_ROW_SCATTERED(26)
    UPDATE_C_BZ_ROW_SCATTERED(27)
    UPDATE_C_BZ_ROW_SCATTERED(28)
    UPDATE_C_BZ_ROW_SCATTERED(29)
    UPDATE_C_BZ_ROW_SCATTERED(30)

    LABEL(END)

#ifdef MONITORS
    RDTSC
    MOV(VAR(botl), EAX)
    MOV(VAR(both), EDX)
#endif
    : // output operands
#ifdef MONITORS
      [topl]  "=m" (topl),
      [toph]  "=m" (toph),
      [midl]  "=m" (midl),
      [midh]  "=m" (midh),
      [mid2l] "=m" (mid2l),
      [mid2h] "=m" (mid2h),
      [botl]  "=m" (botl),
      [both]  "=m" (both)
#endif
    : // input operands
      [k]         "m" (k),
      [a]         "m" (a),
      [b]         "m" (b),
      [alpha]     "m" (alpha),
      [beta]      "m" (beta),
      [c]         "m" (c),
      [rs_c]      "m" (rs_c),
      [cs_c]      "m" (cs_c),
      [a_next]    "m" (a_next),
      [b_next]    "m" (b_next),
      [offsetPtr] "m" (offsetPtr)
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
      "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
      "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
      "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
      "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
      "zmm30", "zmm31", "memory"
    );

#ifdef LOOPMON
    printf("looptime = \t%d\n", bloopl - tloopl);
#endif
#ifdef MONITORS
    // Reassemble the 64-bit timestamps from the 32-bit RDTSC halves.
    dim_t top = ((dim_t)toph << 32) | topl;
    dim_t mid = ((dim_t)midh << 32) | midl;
    dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
    dim_t bot = ((dim_t)both << 32) | botl;
    printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top);
#endif
}
460