1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
25 OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
29 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "blis.h"
36 #include <assert.h>
37
38 #include "bli_avx512_macros.h"
39
40 #define A_L1_PREFETCH_DIST 4
41 #define B_L1_PREFETCH_DIST 2
42 #define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed.
43
//Alternate code path used if C is not row-major.
//Updates one 8-element row of C via gather/scatter, using the per-column
//byte-offset index vector in ymm0. Advances r9 to the next row of C.
// r9 = c
// ymm0 = cs_c * {0,...,7} (element offsets of the 8 columns; built in the
// scattered-update setup from the offsets[] table below)
// r11 = rs_c (in bytes)
// r12 = &alpha
// r13 = &beta
//BNZ1/BNZ2 wrap the beta-dependent instructions: they expand to nothing in
//the general case, or to COMMENT_BEGIN/COMMENT_END to elide the gather and
//the beta FMA when beta == 0.
#define UPDATE_C_ROW_SCATTERED_(NUM,BNZ1,BNZ2) \
\
BNZ1 KXNORW(K(2), K(0), K(0)) BNZ2 /*set all mask bits (gather clears them)*/ \
KXNORW(K(3), K(0), K(0)) /*set all mask bits for the scatter*/ \
BNZ1 VGATHERDPD(ZMM(31) MASK_K(2), MEM(R(9),YMM(0),8)) BNZ2 /*load old C row*/ \
VMULPD(ZMM(NUM), ZMM(NUM), MEM_1TO8(R(12))) /*scale by alpha*/ \
BNZ1 VFMADD231PD(ZMM(NUM), ZMM(31), MEM_1TO8(R(13))) BNZ2 /*scale by beta, add in result*/ \
VSCATTERDPD(MEM(R(9),YMM(0),8) MASK_K(3), ZMM(NUM)) /*store updated row*/ \
ADD(R(9), R(11)) /*advance c to the next row*/

#define UPDATE_C_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,,)
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,COMMENT_BEGIN,COMMENT_END)
62
//Update four consecutive rows of row-stored C:
// C_row := alpha*acc + beta*C_row (the beta term is elided in the BZ variant).
//Advances r9 by four rows.
// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 = rs_c (in bytes)
// r10 = 3*rs_c
// rdi = 4*rs_c
//BNZ1/BNZ2 expand to nothing in the general case, or to
//COMMENT_BEGIN/COMMENT_END to comment out the beta FMAs when beta == 0.
#define UPDATE_C_4_ROWS_(R1,R2,R3,R4,BNZ1,BNZ2) \
\
VMULPD(ZMM(R1), ZMM(R1), MEM_1TO8(R(12))) /*scale accumulators by alpha*/ \
VMULPD(ZMM(R2), ZMM(R2), MEM_1TO8(R(12))) \
VMULPD(ZMM(R3), ZMM(R3), MEM_1TO8(R(12))) \
VMULPD(ZMM(R4), ZMM(R4), MEM_1TO8(R(12))) \
BNZ1 VFMADD231PD(ZMM(R1), ZMM(31), MEM(R(9) )) BNZ2 /*add beta*C*/ \
BNZ1 VFMADD231PD(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
BNZ1 VFMADD231PD(ZMM(R3), ZMM(31), MEM(R(9),R(11),2)) BNZ2 \
BNZ1 VFMADD231PD(ZMM(R4), ZMM(31), MEM(R(9),R(10),1)) BNZ2 \
VMOVUPD(MEM(R(9) ), ZMM(R1)) /*store the updated rows*/ \
VMOVUPD(MEM(R(9),R(11),1), ZMM(R2)) \
VMOVUPD(MEM(R(9),R(11),2), ZMM(R3)) \
VMOVUPD(MEM(R(9),R(10),1), ZMM(R4)) \
ADD(R(9), RDI) /*advance c by four rows*/

//Same as above but for the final two rows of the 30-row block; does not
//advance r9 (nothing follows it).
// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 = rs_c (in bytes)
#define UPDATE_C_2_ROWS_(R1,R2,BNZ1,BNZ2) \
\
VMULPD(ZMM(R1), ZMM(R1), MEM_1TO8(R(12))) \
VMULPD(ZMM(R2), ZMM(R2), MEM_1TO8(R(12))) \
BNZ1 VFMADD231PD(ZMM(R1), ZMM(31), MEM(R(9) )) BNZ2 \
BNZ1 VFMADD231PD(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
VMOVUPD(MEM(R(9) ), ZMM(R1)) \
VMOVUPD(MEM(R(9),R(11),1), ZMM(R2)) \

#define UPDATE_C_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,,)
#define UPDATE_C_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,,)
#define UPDATE_C_BZ_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,COMMENT_BEGIN,COMMENT_END)
#define UPDATE_C_BZ_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,COMMENT_BEGIN,COMMENT_END)
102
//One row of the rank-1 update: zmm(n) += broadcast(a[n-1]) * zmm31, where
//zmm31 holds the current 8-wide row of B and r15 points at the current
//32-element sliver of A (MEM_1TO8 broadcasts one double to all 8 lanes).
#define A_TIMES_B_ROW(n) VFMADD231PD(ZMM(n), ZMM(31), MEM_1TO8(R(15),(n-1)*8))
//Same, but for use after r15 has already been advanced by 32*8 bytes
//mid-iteration; the -32 compensates for that early increment.
#define A_TIMES_B_ROW_PREV(n) VFMADD231PD(ZMM(n), ZMM(31), MEM_1TO8(R(15),((n-1)-32)*8))
//Prefetch into L1 (hint 0) the sliver of A needed A_L1_PREFETCH_DIST
//iterations from now; n selects which 64-byte line of the sliver (0-3).
#define PREFETCH_A_L1(n) PREFETCH(0, MEM(R(15),A_L1_PREFETCH_DIST*8*32+n*64))
//Prefetch A into L2 (hint 1); r14 = L2_PREFETCH_DIST*8*32.
#define PREFETCH_A_L2(n) PREFETCH(1, MEM(R(15),R(14),1,n*64))
//Prefetch B into L1 / L2; r13 = L2_PREFETCH_DIST*8*8.
#define PREFETCH_B_L1 PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*8*8))
#define PREFETCH_B_L2 PREFETCH(1, MEM(RBX,R(13),1))
109
//One iteration of the k_r loop: a full 30x8 rank-1 update
// zmm(n) += broadcast(a[n-1]) * b_row for n = 1..30,
//with A/B (and optionally C) prefetches interleaved between the FMAs.
//Each iteration, we prefetch A into L1 and into L2.
// r15 = a
// rbx = b
// rcx = c
// r11 = rs_c (in bytes)
// r13 = L2_PREFETCH_DIST*8*8
// r14 = L2_PREFETCH_DIST*8*32
// r12 = 32*8 = dist. to next sliver of a
// r9 = 8*8 = dist. to next sliver of b
//PC_L1_1/PC_L1_2 and PC_L2_1/PC_L2_2 enable (empty) or disable
//(COMMENT_BEGIN/COMMENT_END) the prefetches of C into L1 / L2; see the
//MAIN_LOOP* wrappers below. COUNTER is decremented and compared to zero,
//so the caller follows this macro with a JNZ to loop.
#define MAIN_LOOP_(COUNTER, PC_L1_1, PC_L1_2, PC_L2_1, PC_L2_2) \
\
/* Can this be pre-loaded for next it. in zmm0? */ \
VMOVAPD(ZMM(31), MEM(RBX)) /*load the 8-wide row of B*/ \
\
A_TIMES_B_ROW ( 1) \
A_TIMES_B_ROW ( 2) PREFETCH_A_L1(0) \
A_TIMES_B_ROW ( 3) PREFETCH_A_L1(1) \
A_TIMES_B_ROW ( 4) PREFETCH_A_L1(2) \
A_TIMES_B_ROW ( 5) PREFETCH_A_L1(3) \
A_TIMES_B_ROW ( 6) PREFETCH_A_L2(0) \
A_TIMES_B_ROW ( 7) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \
A_TIMES_B_ROW ( 8) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \
A_TIMES_B_ROW ( 9) \
A_TIMES_B_ROW (10) PC_L2_1 PREFETCH(1, MEM(RCX)) PC_L2_2 \
A_TIMES_B_ROW (11) PREFETCH_A_L2(1) \
A_TIMES_B_ROW (12) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \
A_TIMES_B_ROW (13) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \
A_TIMES_B_ROW (14) \
A_TIMES_B_ROW (15) \
A_TIMES_B_ROW (16) PREFETCH_A_L2(2) \
A_TIMES_B_ROW (17) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \
A_TIMES_B_ROW (18) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \
A_TIMES_B_ROW (19) \
A_TIMES_B_ROW (20) \
A_TIMES_B_ROW (21) PREFETCH_A_L2(3) \
A_TIMES_B_ROW (22) ADD(R(15), R(12)) /*advance a early; rows 23-30 use _PREV*/ \
A_TIMES_B_ROW_PREV(23) \
A_TIMES_B_ROW_PREV(24) PC_L2_1 ADD(RCX, R(11)) PC_L2_2 \
A_TIMES_B_ROW_PREV(25) DEC(COUNTER) \
A_TIMES_B_ROW_PREV(26) PREFETCH_B_L2 \
A_TIMES_B_ROW_PREV(27) PREFETCH_B_L1 \
A_TIMES_B_ROW_PREV(28) ADD(RBX, R(9)) \
A_TIMES_B_ROW_PREV(29) CMP(COUNTER, IMM(0)) /*set ZF for the caller's JNZ*/ \
A_TIMES_B_ROW_PREV(30)

//No prefetching of C / prefetch C into L1 / prefetch C into L2.
#define MAIN_LOOP(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L1(COUNTER) MAIN_LOOP_(COUNTER,,,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L2(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,,)
159
//This is an array used for the scatter/gather instructions.
//Entry i holds i; at run time it is loaded into zmm0 and multiplied
//elementwise by cs_c to form the column-offset index vector used by the
//scattered C update. The 64-byte alignment is required because it is read
//with an aligned vector load (VMOVAPS).
static int32_t offsets[32] __attribute__((aligned(64))) =
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
164
165 //#define MONITORS
166 //#define LOOPMON
//Double-precision GEMM micro-kernel for the Intel Knights Corner (KNC):
//computes C := beta*C + alpha*A*B for a 30x8 block of C, where a and b are
//packed micro-panels and the k-dimension is iterated in assembly.
//
// k           length of the k dimension
// alpha, beta scalar multipliers, passed by address
// a, b        packed micro-panels of A (30-wide) and B (8-wide)
// c           the 30x8 block of C to update
// rs_c, cs_c  row/column strides of C, in elements
// data        auxinfo; supplies next-panel addresses for prefetching
// cntx        context (unused here)
//
//Row-stored C (cs_c == 1) takes the fast vector-load/store update path;
//any other layout falls back to the gather/scatter path.
void bli_dgemm_opt_30x8_knc(
dim_t k,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
auxinfo_t* data,
cntx_t* restrict cntx
)
{
// Addresses of the next micro-panels of A and B, prefetched toward the
// end of the k loop so the next micro-kernel call starts warm.
const double * a_next = bli_auxinfo_next_a( data );
const double * b_next = bli_auxinfo_next_b( data );

const int32_t * offsetPtr = &offsets[0];

#ifdef MONITORS
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
#ifdef LOOPMON
int tlooph, tloopl, blooph, bloopl;
#endif

__asm__ volatile
(
#ifdef MONITORS
RDTSC
MOV(VAR(topl), EAX)
MOV(VAR(toph), EDX)
#endif

VPXORD(ZMM(1), ZMM(1), ZMM(1)) //clear out registers

// Zero the 30 accumulators (zmm1-zmm30) while interleaving the scalar
// setup so the two instruction streams overlap.
VMOVAPS(ZMM( 2), ZMM(1))
VMOVAPS(ZMM( 3), ZMM(1)) MOV(RSI, VAR(k)) //loop index
VMOVAPS(ZMM( 4), ZMM(1)) MOV(R(11), VAR(rs_c)) //load row stride
VMOVAPS(ZMM( 5), ZMM(1)) SAL(R(11), IMM(3)) //scale row stride to bytes
VMOVAPS(ZMM( 6), ZMM(1)) MOV(R(15), VAR(a)) //load address of a
VMOVAPS(ZMM( 7), ZMM(1)) MOV(RBX, VAR(b)) //load address of b
VMOVAPS(ZMM( 8), ZMM(1))
VMOVAPS(ZMM( 9), ZMM(1)) LEA(R(10), MEM(R(11),R(11),2)) //r10 has 3 * r11
VMOVAPS(ZMM(10), ZMM(1))
VMOVAPS(ZMM(11), ZMM(1)) MOV(RDI, R(11))
VMOVAPS(ZMM(12), ZMM(1)) SAL(RDI, IMM(2)) //rdi has 4*r11
VMOVAPS(ZMM(13), ZMM(1)) MOV(RCX, VAR(c)) //load address of c for prefetching
VMOVAPS(ZMM(14), ZMM(1))
VMOVAPS(ZMM(15), ZMM(1)) MOV(R(8), VAR(k))
VMOVAPS(ZMM(16), ZMM(1))
VMOVAPS(ZMM(17), ZMM(1))
VMOVAPS(ZMM(18), ZMM(1)) MOV(R(13), IMM(8*8*L2_PREFETCH_DIST))
VMOVAPS(ZMM(19), ZMM(1)) MOV(R(14), IMM(8*32*L2_PREFETCH_DIST))
VMOVAPS(ZMM(20), ZMM(1))
VMOVAPS(ZMM(21), ZMM(1))
VMOVAPS(ZMM(22), ZMM(1))
VMOVAPS(ZMM(23), ZMM(1))
// NOTE: this SUB sets the flags consumed by JLE(CONSIDER_UNDER_40) below;
// the intervening MOVs do not modify flags.
VMOVAPS(ZMM(24), ZMM(1)) SUB(R(8), IMM(30+L2_PREFETCH_DIST)) //Check if we have over 30+L2_PREFETCH_DIST operations to do.
VMOVAPS(ZMM(25), ZMM(1)) MOV(R(8), IMM(30)) //counter for the first-30 loop
VMOVAPS(ZMM(26), ZMM(1)) MOV(R(9), IMM(8*8)) //amount to increment b* by each iteration
VMOVAPS(ZMM(27), ZMM(1)) MOV(R(12), IMM(8*32)) //amount to increment a* by each iteration
VMOVAPS(ZMM(28), ZMM(1))
VMOVAPS(ZMM(29), ZMM(1))
VMOVAPS(ZMM(30), ZMM(1))

#ifdef MONITORS
RDTSC
MOV(VAR(midl), EAX)
MOV(VAR(midh), EDX)
#endif

JLE(CONSIDER_UNDER_40)
SUB(RSI, IMM(30+L2_PREFETCH_DIST)) //rsi = iterations for the main loop

//First 30 iterations: also prefetch the C block into L2.
LABEL(LOOPREFECHCL2)
MAIN_LOOP_PC_L2(R(8))
JNZ(LOOPREFECHCL2)
MOV(RCX, VAR(c)) //rewind c prefetch pointer for the L1 pass later

//Main Loop: no C prefetching.
LABEL(LOOPMAIN)
MAIN_LOOP(RSI)
JNZ(LOOPMAIN)

//Penultimate L2_PREFETCH_DIST-10 iterations.
//Break these off from the main loop to avoid prefetching past the end of
//the current panels: r14/r13 are repointed at the distance to the NEXT
//micro-panels of a and b, so PREFETCH_A_L2/PREFETCH_B_L2 warm those instead.
MOV(R(14), VAR(a_next))
MOV(R(13), VAR(b_next))
SUB(R(14), R(15))
SUB(R(13), RBX)
//Yes, I know 10+L2_PREFETCH_DIST-20 = L2_PREFETCH_DIST-10
MOV(RSI, IMM(10+L2_PREFETCH_DIST-20))

LABEL(LOOPMAIN2)
MAIN_LOOP(RSI)
JNZ(LOOPMAIN2)

//Last 10 iterations: also prefetch the C block into L1 for the update below.
MOV(R(8), IMM(10))

LABEL(LOOPREFETCHCL1)
MAIN_LOOP_PC_L1(R(8))
JNZ(LOOPREFETCHCL1)

JMP(POSTACCUM)

//Alternate main loop, with no prefetching of C
//Used when k <= 30+L2_PREFETCH_DIST iterations
LABEL(CONSIDER_UNDER_40)

MOV(RSI, VAR(k))
TEST(RSI, RSI)
JZ(POSTACCUM) //k == 0: accumulators stay zero

LABEL(LOOP_UNDER_40)
MAIN_LOOP(RSI)
JNZ(LOOP_UNDER_40)

LABEL(POSTACCUM)

#ifdef MONITORS
RDTSC
MOV(VAR(mid2l), EAX)
MOV(VAR(mid2h), EDX)
#endif

MOV(R(9), VAR(c)) //load address of c for update
MOV(R(12), VAR(alpha)) //load address of alpha

// Check if C is row-stored (cs_c == 1). If not, jump to the slow scattered update.
MOV(R(14), VAR(cs_c))
DEC(R(14))
JNZ(SCATTEREDUPDATE)

MOV(R(14), VAR(beta))
VBROADCASTSD(ZMM(31), MEM(R(14)))

// Load beta's raw bit pattern; an all-zero pattern (+0.0) selects the
// beta == 0 path, which skips reading C entirely.
MOV(RBX, MEM(R(14)))
TEST(RBX, RBX)
JZ(COLSTORBZ)

//General case: C := alpha*AB + beta*C, four rows at a time.
UPDATE_C_4_ROWS( 1, 2, 3, 4)
UPDATE_C_4_ROWS( 5, 6, 7, 8)
UPDATE_C_4_ROWS( 9,10,11,12)
UPDATE_C_4_ROWS(13,14,15,16)
UPDATE_C_4_ROWS(17,18,19,20)
UPDATE_C_4_ROWS(21,22,23,24)
UPDATE_C_4_ROWS(25,26,27,28)
UPDATE_C_2_ROWS(29,30)

JMP(END)

//beta == 0: C := alpha*AB without reading C.
LABEL(COLSTORBZ)

UPDATE_C_BZ_4_ROWS( 1, 2, 3, 4)
UPDATE_C_BZ_4_ROWS( 5, 6, 7, 8)
UPDATE_C_BZ_4_ROWS( 9,10,11,12)
UPDATE_C_BZ_4_ROWS(13,14,15,16)
UPDATE_C_BZ_4_ROWS(17,18,19,20)
UPDATE_C_BZ_4_ROWS(21,22,23,24)
UPDATE_C_BZ_4_ROWS(25,26,27,28)
UPDATE_C_BZ_2_ROWS(29,30)

JMP(END)

//General-stride C: build zmm0 = cs_c * {0..} and gather/scatter each row.
LABEL(SCATTEREDUPDATE)

MOV(R(13), VAR(beta))
MOV(R(10), VAR(offsetPtr))
VMOVAPS(ZMM(0), MEM(R(10))) //load the 0..15 offset table
MOV(RBX, MEM(R(13))) //beta's bit pattern, tested below
/* Note that this ignores the upper 32 bits in cs_c */
VPBROADCASTD(ZMM(31), VAR(cs_c))
VPMULLD(ZMM(0), ZMM(31), ZMM(0)) //zmm0 = cs_c * {0,1,2,...}

TEST(RBX, RBX)
JZ(SCATTERBZ)

UPDATE_C_ROW_SCATTERED( 1)
UPDATE_C_ROW_SCATTERED( 2)
UPDATE_C_ROW_SCATTERED( 3)
UPDATE_C_ROW_SCATTERED( 4)
UPDATE_C_ROW_SCATTERED( 5)
UPDATE_C_ROW_SCATTERED( 6)
UPDATE_C_ROW_SCATTERED( 7)
UPDATE_C_ROW_SCATTERED( 8)
UPDATE_C_ROW_SCATTERED( 9)
UPDATE_C_ROW_SCATTERED(10)
UPDATE_C_ROW_SCATTERED(11)
UPDATE_C_ROW_SCATTERED(12)
UPDATE_C_ROW_SCATTERED(13)
UPDATE_C_ROW_SCATTERED(14)
UPDATE_C_ROW_SCATTERED(15)
UPDATE_C_ROW_SCATTERED(16)
UPDATE_C_ROW_SCATTERED(17)
UPDATE_C_ROW_SCATTERED(18)
UPDATE_C_ROW_SCATTERED(19)
UPDATE_C_ROW_SCATTERED(20)
UPDATE_C_ROW_SCATTERED(21)
UPDATE_C_ROW_SCATTERED(22)
UPDATE_C_ROW_SCATTERED(23)
UPDATE_C_ROW_SCATTERED(24)
UPDATE_C_ROW_SCATTERED(25)
UPDATE_C_ROW_SCATTERED(26)
UPDATE_C_ROW_SCATTERED(27)
UPDATE_C_ROW_SCATTERED(28)
UPDATE_C_ROW_SCATTERED(29)
UPDATE_C_ROW_SCATTERED(30)

JMP(END)

//Scattered update with beta == 0: no gather of the old C values.
LABEL(SCATTERBZ)

UPDATE_C_BZ_ROW_SCATTERED( 1)
UPDATE_C_BZ_ROW_SCATTERED( 2)
UPDATE_C_BZ_ROW_SCATTERED( 3)
UPDATE_C_BZ_ROW_SCATTERED( 4)
UPDATE_C_BZ_ROW_SCATTERED( 5)
UPDATE_C_BZ_ROW_SCATTERED( 6)
UPDATE_C_BZ_ROW_SCATTERED( 7)
UPDATE_C_BZ_ROW_SCATTERED( 8)
UPDATE_C_BZ_ROW_SCATTERED( 9)
UPDATE_C_BZ_ROW_SCATTERED(10)
UPDATE_C_BZ_ROW_SCATTERED(11)
UPDATE_C_BZ_ROW_SCATTERED(12)
UPDATE_C_BZ_ROW_SCATTERED(13)
UPDATE_C_BZ_ROW_SCATTERED(14)
UPDATE_C_BZ_ROW_SCATTERED(15)
UPDATE_C_BZ_ROW_SCATTERED(16)
UPDATE_C_BZ_ROW_SCATTERED(17)
UPDATE_C_BZ_ROW_SCATTERED(18)
UPDATE_C_BZ_ROW_SCATTERED(19)
UPDATE_C_BZ_ROW_SCATTERED(20)
UPDATE_C_BZ_ROW_SCATTERED(21)
UPDATE_C_BZ_ROW_SCATTERED(22)
UPDATE_C_BZ_ROW_SCATTERED(23)
UPDATE_C_BZ_ROW_SCATTERED(24)
UPDATE_C_BZ_ROW_SCATTERED(25)
UPDATE_C_BZ_ROW_SCATTERED(26)
UPDATE_C_BZ_ROW_SCATTERED(27)
UPDATE_C_BZ_ROW_SCATTERED(28)
UPDATE_C_BZ_ROW_SCATTERED(29)
UPDATE_C_BZ_ROW_SCATTERED(30)

LABEL(END)

#ifdef MONITORS
RDTSC
MOV(VAR(botl), EAX)
MOV(VAR(both), EDX)
#endif
: // output operands
#ifdef MONITORS
[topl] "=m" (topl),
[toph] "=m" (toph),
[midl] "=m" (midl),
[midh] "=m" (midh),
[mid2l] "=m" (mid2l),
[mid2h] "=m" (mid2h),
[botl] "=m" (botl),
[both] "=m" (both)
#endif
: // input operands
[k] "m" (k),
[a] "m" (a),
[b] "m" (b),
[alpha] "m" (alpha),
[beta] "m" (beta),
[c] "m" (c),
[rs_c] "m" (rs_c),
[cs_c] "m" (cs_c),
[a_next] "m" (a_next),
[b_next] "m" (b_next),
[offsetPtr] "m" (offsetPtr)
: // register clobber list
"rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
"r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
"zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
"zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
"zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
"zmm30", "zmm31", "memory"
);

#ifdef LOOPMON
// NOTE(review): tloopl/bloopl are declared above but never written by the
// asm block; enabling LOOPMON would print indeterminate values -- confirm.
printf("looptime = \t%d\n", bloopl - tloopl);
#endif
#ifdef MONITORS
// NOTE(review): the %u specifiers assume 32-bit values, but dim_t is
// typically 64 bits -- confirm before relying on these timings.
dim_t top = ((dim_t)toph << 32) | topl;
dim_t mid = ((dim_t)midh << 32) | midl;
dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
dim_t bot = ((dim_t)both << 32) | botl;
printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top);
#endif
}
460