1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name(s) of the copyright holder(s) nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
25 OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
29 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "blis.h"
36 #include <assert.h>
37
38 #include "bli_avx512_macros.h"
39
// Prefetch distances, measured in k iterations ahead of the current one.
#define A_L1_PREFETCH_DIST 4
#define B_L1_PREFETCH_DIST 2
#define L2_PREFETCH_DIST 16 // Must be greater than 10, because of the way the loop is constructed.

//Alternate code path used if C is not row-major
// r9 = c
// zmm30 = cs_c * {0,1,...,15} (element byte-offsets within one row of C)
// r11 = rs_c
// r12 = &alpha
// r13 = &beta
//
// Updates one row of C from accumulator register NUM:
//   C_row := beta * C_row + alpha * zmm(NUM)
// using VGATHERDPS/VSCATTERDPS with the index vector in zmm30, then
// advances r9 to the next row. The BNZ1/BNZ2 arguments bracket the
// beta-dependent instructions so they can be compiled out (by passing
// COMMENT_BEGIN/COMMENT_END) for the beta == 0 case, in which C is
// never read.
#define UPDATE_C_ROW_SCATTERED_(NUM,BNZ1,BNZ2) \
\
BNZ1 KXNORW(K(2), K(0), K(0)) BNZ2 /*set all 16 bits of the gather mask*/ \
KXNORW(K(3), K(0), K(0)) /*set all 16 bits of the scatter mask*/ \
BNZ1 VGATHERDPS(ZMM(31) MASK_K(2), MEM(R(9),ZMM(30),4)) BNZ2 \
VMULPS(ZMM(NUM), ZMM(NUM), MEM_1TO16(R(12))) /*scale by alpha*/ \
BNZ1 VFMADD231PS(ZMM(NUM), ZMM(31), MEM_1TO16(R(13))) BNZ2 /*scale by beta, add in result*/ \
VSCATTERDPS(MEM(R(9),ZMM(30),4) MASK_K(3), ZMM(NUM)) \
ADD(R(9), R(11)) /*advance c to the next row*/

#define UPDATE_C_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,,)
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) UPDATE_C_ROW_SCATTERED_(NUM,COMMENT_BEGIN,COMMENT_END)
62
// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 = rs_c
// r10 = 3*rs_c
// rdi = 4*rs_c
//
// Row-major C update for four accumulator rows R1..R4:
//   C_row := beta * C_row + alpha * zmm(Rn)
// with contiguous (unaligned) 16-float loads/stores. BNZ1/BNZ2 bracket
// the beta-dependent FMAs so the beta == 0 variant never reads C.
// Advances r9 by four rows at the end.
#define UPDATE_C_4_ROWS_(R1,R2,R3,R4,BNZ1,BNZ2) \
\
VMULPS(ZMM(R1), ZMM(R1), MEM_1TO16(R(12))) /*scale by alpha*/ \
VMULPS(ZMM(R2), ZMM(R2), MEM_1TO16(R(12))) \
VMULPS(ZMM(R3), ZMM(R3), MEM_1TO16(R(12))) \
VMULPS(ZMM(R4), ZMM(R4), MEM_1TO16(R(12))) \
BNZ1 VFMADD231PS(ZMM(R1), ZMM(31), MEM(R(9) )) BNZ2 /*add beta*C*/ \
BNZ1 VFMADD231PS(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
BNZ1 VFMADD231PS(ZMM(R3), ZMM(31), MEM(R(9),R(11),2)) BNZ2 \
BNZ1 VFMADD231PS(ZMM(R4), ZMM(31), MEM(R(9),R(10),1)) BNZ2 \
VMOVUPS(MEM(R(9) ), ZMM(R1)) \
VMOVUPS(MEM(R(9),R(11),1), ZMM(R2)) \
VMOVUPS(MEM(R(9),R(11),2), ZMM(R3)) \
VMOVUPS(MEM(R(9),R(10),1), ZMM(R4)) \
ADD(R(9), RDI) /*advance c by four rows*/

// Same as UPDATE_C_4_ROWS_ but for the final two accumulator rows;
// does not advance r9 since no rows follow.
// r12 = &alpha
// zmm31 = beta
// r9 = c
// r11 = rs_c
#define UPDATE_C_2_ROWS_(R1,R2,BNZ1,BNZ2) \
\
VMULPS(ZMM(R1), ZMM(R1), MEM_1TO16(R(12))) /*scale by alpha*/ \
VMULPS(ZMM(R2), ZMM(R2), MEM_1TO16(R(12))) \
BNZ1 VFMADD231PS(ZMM(R1), ZMM(31), MEM(R(9) )) BNZ2 /*add beta*C*/ \
BNZ1 VFMADD231PS(ZMM(R2), ZMM(31), MEM(R(9),R(11),1)) BNZ2 \
VMOVUPS(MEM(R(9) ), ZMM(R1)) \
VMOVUPS(MEM(R(9),R(11),1), ZMM(R2))

#define UPDATE_C_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,,)
#define UPDATE_C_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,,)
#define UPDATE_C_BZ_4_ROWS(R1,R2,R3,R4) UPDATE_C_4_ROWS_(R1,R2,R3,R4,COMMENT_BEGIN,COMMENT_END)
#define UPDATE_C_BZ_2_ROWS(R1,R2) UPDATE_C_2_ROWS_(R1,R2,COMMENT_BEGIN,COMMENT_END)
102
// zmm(n) += broadcast(a[n]) * zmm31, where zmm31 holds the current
// 16-float sliver of B and r15 points at the current 32-float sliver of A.
#define A_TIMES_B_ROW(n) VFMADD231PS(ZMM(n), ZMM(31), MEM_1TO16(R(15),n*4))
// Same FMA, but used after r15 has already been advanced by one sliver
// (32 floats) inside MAIN_LOOP_, hence the (n-32) element correction.
#define A_TIMES_B_ROW_PREV(n) VFMADD231PS(ZMM(n), ZMM(31), MEM_1TO16(R(15),(n-32)*4))
// Prefetch into L1 the sliver of A that is A_L1_PREFETCH_DIST iterations
// ahead, one 64-byte cache line (n) at a time.
#define PREFETCH_A_L1(n) PREFETCH(0, MEM(R(15),A_L1_PREFETCH_DIST*4*32+n*64))
// Prefetch A into L2, r14 (= L2_PREFETCH_DIST*4*32) bytes ahead of r15.
#define PREFETCH_A_L2(n) PREFETCH(1, MEM(R(15),R(14),1,n*64))
// Prefetch into L1 the sliver of B that is B_L1_PREFETCH_DIST iterations ahead.
#define PREFETCH_B_L1 PREFETCH(0, MEM(RBX,B_L1_PREFETCH_DIST*4*16))
// Prefetch B into L2, r13 (= L2_PREFETCH_DIST*4*16) bytes ahead of rbx.
#define PREFETCH_B_L2 PREFETCH(1, MEM(RBX,R(13),1))
109
//One iteration of the k_r loop.
//Each iteration, we prefetch A into L1 and into L2
// r15 = a
// rbx = b
// rcx = c
// r11 = rs_c
// r13 = L2_PREFETCH_DIST*4*16
// r14 = L2_PREFETCH_DIST*4*32
// r12 = 32*4 = dist. to next sliver of a
// r9 = 16*4 = dist. to next sliver of b
//
// Performs one rank-1 update of the 30x16 accumulator (zmm0..zmm29):
// loads one 16-float sliver of B into zmm31, then issues 30 FMAs, one per
// element of the current A sliver, with A/B prefetches interleaved into the
// FMA stream. PC_L1_1/PC_L1_2 and PC_L2_1/PC_L2_2 optionally enable
// prefetching of C into L1 or L2 respectively (pass COMMENT_BEGIN /
// COMMENT_END to compile either out). COUNTER is the loop-count register;
// it is decremented and compared to zero inside the macro so that a JNZ
// placed immediately after the macro closes the loop.
#define MAIN_LOOP_(COUNTER, PC_L1_1, PC_L1_2, PC_L2_1, PC_L2_2) \
\
/* Can this be pre-loaded for next it. in zmm30? */ \
VMOVAPS(ZMM(31), MEM(RBX)) \
\
A_TIMES_B_ROW ( 0) \
A_TIMES_B_ROW ( 1) PREFETCH_A_L1(0) \
A_TIMES_B_ROW ( 2) PREFETCH_A_L1(1) \
A_TIMES_B_ROW ( 3) PREFETCH_A_L1(2) \
A_TIMES_B_ROW ( 4) PREFETCH_A_L1(3) \
A_TIMES_B_ROW ( 5) PREFETCH_A_L2(0) \
A_TIMES_B_ROW ( 6) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \
A_TIMES_B_ROW ( 7) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \
A_TIMES_B_ROW ( 8) \
A_TIMES_B_ROW ( 9) PC_L2_1 PREFETCH(1, MEM(RCX)) PC_L2_2 \
A_TIMES_B_ROW (10) PREFETCH_A_L2(1) \
A_TIMES_B_ROW (11) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \
A_TIMES_B_ROW (12) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \
A_TIMES_B_ROW (13) \
A_TIMES_B_ROW (14) \
A_TIMES_B_ROW (15) PREFETCH_A_L2(2) \
A_TIMES_B_ROW (16) PC_L1_1 PREFETCH(0, MEM(RCX)) PC_L1_2 \
A_TIMES_B_ROW (17) PC_L1_1 ADD(RCX, R(11)) PC_L1_2 \
A_TIMES_B_ROW (18) \
A_TIMES_B_ROW (19) \
A_TIMES_B_ROW (20) PREFETCH_A_L2(3) \
A_TIMES_B_ROW (21) ADD(R(15), R(12)) /*advance a; remaining FMAs use _PREV*/ \
A_TIMES_B_ROW_PREV(22) \
A_TIMES_B_ROW_PREV(23) PC_L2_1 ADD(RCX, R(11)) PC_L2_2 \
A_TIMES_B_ROW_PREV(24) DEC(COUNTER) \
A_TIMES_B_ROW_PREV(25) PREFETCH_B_L2 \
A_TIMES_B_ROW_PREV(26) PREFETCH_B_L1 \
A_TIMES_B_ROW_PREV(27) ADD(RBX, R(9)) /*advance b*/ \
A_TIMES_B_ROW_PREV(28) CMP(COUNTER, IMM(0)) /*set flags for the caller's JNZ*/ \
A_TIMES_B_ROW_PREV(29)

#define MAIN_LOOP(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L1(COUNTER) MAIN_LOOP_(COUNTER,,,COMMENT_BEGIN,COMMENT_END)
#define MAIN_LOOP_PC_L2(COUNTER) MAIN_LOOP_(COUNTER,COMMENT_BEGIN,COMMENT_END,,)
159
//This is an array used for the scatter/gather instructions.
//The kernel loads the first 16 entries into zmm30 and multiplies them
//elementwise by cs_c (VPMULLD) to form the index vector for
//VGATHERDPS/VSCATTERDPS in the non-row-major C update path.
//NOTE(review): this definition has external linkage; if other kernel
//translation units also define `offsets`, the symbols will collide at
//link time -- confirm whether this should be static.
int32_t offsets[32] __attribute__((aligned(0x1000))) = { 0, 1, 2, 3, 4, 5, 6, 7,
                                                         8, 9, 10, 11, 12, 13, 14, 15,
                                                         16, 17, 18, 19, 20, 21, 22, 23,
                                                         24, 25, 26, 27, 28, 29, 30, 31};
165
166 //#define MONITORS
167 //#define LOOPMON
//Computes C := beta*C + alpha*A*B for one 30x16 micro-tile of C, where A is
//a packed 30 x k micropanel and B is a packed k x 16 micropanel of floats.
//The k-loop is split into phases: the first iterations prefetch C into L2,
//the bulk runs without C prefetching, and the last iterations prefetch C
//into L1 and the next A/B micropanels into L2. The accumulated tile is then
//written back contiguously when cs_c == 1, or via gather/scatter otherwise,
//with separate beta == 0 paths that never read C.
void bli_sgemm_knl_asm_30x16_knc
     (
       dim_t k_,
       float* restrict alpha,
       float* restrict a,
       float* restrict b,
       float* restrict beta,
       float* restrict c, inc_t rs_c_, inc_t cs_c_,
       auxinfo_t* restrict data,
       cntx_t* restrict cntx
     )
{
    (void)data; //suppresses unused-parameter warnings; data is in fact read below
    (void)cntx;

    //Addresses of the next micropanels of A and B, prefetched into L2 near
    //the end of the k-loop.
    const float * a_next = bli_auxinfo_next_a( data );
    const float * b_next = bli_auxinfo_next_b( data );

    const int32_t * offsetPtr = &offsets[0];
    //Widen the loop count and strides to 64 bits for use inside the asm.
    const int64_t k = k_;
    const int64_t rs_c = rs_c_;
    const int64_t cs_c = cs_c_;

#ifdef MONITORS
    int toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
#ifdef LOOPMON
    int tlooph, tloopl, blooph, bloopl;
#endif

    __asm__ volatile
    (
#ifdef MONITORS
    RDTSC
    MOV(VAR(topl), EAX)
    MOV(VAR(toph), EDX)
#endif

    VPXORD(ZMM(0), ZMM(0), ZMM(0)) //clear out registers

    //Zero the 30 accumulator registers, interleaved with scalar setup.
    VMOVAPS(ZMM( 1), ZMM(0))
    VMOVAPS(ZMM( 2), ZMM(0)) MOV(RSI, VAR(k)) //loop index
    VMOVAPS(ZMM( 3), ZMM(0)) MOV(R(11), VAR(rs_c)) //load row stride
    VMOVAPS(ZMM( 4), ZMM(0)) SAL(R(11), IMM(2)) //scale row stride (elements -> bytes)
    VMOVAPS(ZMM( 5), ZMM(0)) MOV(R(15), VAR(a)) //load address of a
    VMOVAPS(ZMM( 6), ZMM(0)) MOV(RBX, VAR(b)) //load address of b
    VMOVAPS(ZMM( 7), ZMM(0))
    VMOVAPS(ZMM( 8), ZMM(0)) LEA(R(10), MEM(R(11),R(11),2)) //r10 has 3 * r11
    VMOVAPS(ZMM( 9), ZMM(0))
    VMOVAPS(ZMM(10), ZMM(0)) MOV(RDI, R(11))
    VMOVAPS(ZMM(11), ZMM(0)) SAL(RDI, IMM(2)) //rdi has 4*r11
    VMOVAPS(ZMM(12), ZMM(0)) MOV(RCX, VAR(c)) //load address of c for prefetching
    VMOVAPS(ZMM(13), ZMM(0))
    VMOVAPS(ZMM(14), ZMM(0)) MOV(R(8), VAR(k))
    VMOVAPS(ZMM(15), ZMM(0))
    VMOVAPS(ZMM(16), ZMM(0))
    VMOVAPS(ZMM(17), ZMM(0)) MOV(R(13), IMM(4*16*L2_PREFETCH_DIST)) //L2 prefetch offset for b
    VMOVAPS(ZMM(18), ZMM(0)) MOV(R(14), IMM(4*32*L2_PREFETCH_DIST)) //L2 prefetch offset for a
    VMOVAPS(ZMM(19), ZMM(0))
    VMOVAPS(ZMM(20), ZMM(0))
    VMOVAPS(ZMM(21), ZMM(0))
    VMOVAPS(ZMM(22), ZMM(0))
    VMOVAPS(ZMM(23), ZMM(0)) SUB(R(8), IMM(30+L2_PREFETCH_DIST)) //Check if we have more than 30+L2_PREFETCH_DIST iterations to do.
    VMOVAPS(ZMM(24), ZMM(0)) MOV(R(8), IMM(30)) //MOV leaves flags alone; the SUB's flags survive to the JLE below
    VMOVAPS(ZMM(25), ZMM(0)) MOV(R(9), IMM(4*16)) //amount to increment b* by each iteration
    VMOVAPS(ZMM(26), ZMM(0)) MOV(R(12), IMM(4*32)) //amount to increment a* by each iteration
    VMOVAPS(ZMM(27), ZMM(0))
    VMOVAPS(ZMM(28), ZMM(0))
    VMOVAPS(ZMM(29), ZMM(0))

#ifdef MONITORS
    RDTSC
    MOV(VAR(midl), EAX)
    MOV(VAR(midh), EDX)
#endif

    JLE(CONSIDER_UNDER_40) //branches on the flags from the SUB above
    SUB(RSI, IMM(30+L2_PREFETCH_DIST)) //rsi = number of "plain" main-loop iterations

    //First 30 iterations
    LABEL(LOOPREFECHCL2)
    MAIN_LOOP_PC_L2(R(8))
    JNZ(LOOPREFECHCL2)
    MOV(RCX, VAR(c)) //reset the C-prefetch pointer for the L1 phase later

    //Main Loop.
    LABEL(LOOPMAIN)
    MAIN_LOOP(RSI)
    JNZ(LOOPMAIN)

    //Penultimate 22 iterations.
    //Break these off from the main loop to avoid prefetching past the end
    //of the current micropanels.
    MOV(R(14), VAR(a_next))
    MOV(R(13), VAR(b_next))
    SUB(R(14), R(15)) //redirect the L2 prefetch offsets so they now target
    SUB(R(13), RBX)   //the NEXT micropanels of a and b
    //Yes, I know 10-20 = -10
    MOV(RSI, IMM(10+L2_PREFETCH_DIST-20))

    LABEL(LOOPMAIN2)
    MAIN_LOOP(RSI)
    JNZ(LOOPMAIN2)

    //Last 10 iterations
    MOV(R(8), IMM(10))

    LABEL(LOOPREFETCHCL1)
    MAIN_LOOP_PC_L1(R(8))
    JNZ(LOOPREFETCHCL1)

    JMP(POSTACCUM)

    //Alternate main loop, with no prefetching of C
    //Used when <= 40 iterations
    LABEL(CONSIDER_UNDER_40)

    MOV(RSI, VAR(k))
    TEST(RSI, RSI)
    JZ(POSTACCUM) //k == 0: nothing to accumulate

    LABEL(LOOP_UNDER_40)
    MAIN_LOOP(RSI)
    JNZ(LOOP_UNDER_40)

    LABEL(POSTACCUM)

#ifdef MONITORS
    RDTSC
    MOV(VAR(mid2l), EAX)
    MOV(VAR(mid2h), EDX)
#endif

    MOV(R(9), VAR(c)) //load address of c for update
    MOV(R(12), VAR(alpha)) //load address of alpha

    // Check if C is row stride. If not, jump to the slow scattered update
    MOV(R(14), VAR(cs_c))
    DEC(R(14))
    JNZ(SCATTEREDUPDATE)

    MOV(R(14), VAR(beta))
    VBROADCASTSS(ZMM(31), MEM(R(14)))

    MOV(EBX, MEM(R(14))) //load beta's bit pattern to test for zero
    TEST(EBX, EBX)
    JZ(COLSTORBZ) //beta == +0.0f: take the path that never reads C

    UPDATE_C_4_ROWS( 0, 1, 2, 3)
    UPDATE_C_4_ROWS( 4, 5, 6, 7)
    UPDATE_C_4_ROWS( 8, 9,10,11)
    UPDATE_C_4_ROWS(12,13,14,15)
    UPDATE_C_4_ROWS(16,17,18,19)
    UPDATE_C_4_ROWS(20,21,22,23)
    UPDATE_C_4_ROWS(24,25,26,27)
    UPDATE_C_2_ROWS(28,29)

    JMP(END)

    LABEL(COLSTORBZ)

    UPDATE_C_BZ_4_ROWS( 0, 1, 2, 3)
    UPDATE_C_BZ_4_ROWS( 4, 5, 6, 7)
    UPDATE_C_BZ_4_ROWS( 8, 9,10,11)
    UPDATE_C_BZ_4_ROWS(12,13,14,15)
    UPDATE_C_BZ_4_ROWS(16,17,18,19)
    UPDATE_C_BZ_4_ROWS(20,21,22,23)
    UPDATE_C_BZ_4_ROWS(24,25,26,27)
    UPDATE_C_BZ_2_ROWS(28,29)

    JMP(END)

    LABEL(SCATTEREDUPDATE)

    MOV(R(13), VAR(beta))
    MOV(R(10), VAR(offsetPtr))
    VMOVAPS(ZMM(30), MEM(R(10))) //zmm30 = {0,1,...,15}
    MOV(EBX, MEM(R(13))) //load beta's bit pattern to test for zero
    /* Note that this ignores the upper 32 bits in cs_c */
    VPBROADCASTD(ZMM(31), VAR(cs_c))
    VPMULLD(ZMM(30), ZMM(31), ZMM(30)) //zmm30 = cs_c * {0,1,...,15}

    TEST(EBX, EBX)
    JZ(SCATTERBZ) //beta == +0.0f: take the path that never reads C

    UPDATE_C_ROW_SCATTERED( 0)
    UPDATE_C_ROW_SCATTERED( 1)
    UPDATE_C_ROW_SCATTERED( 2)
    UPDATE_C_ROW_SCATTERED( 3)
    UPDATE_C_ROW_SCATTERED( 4)
    UPDATE_C_ROW_SCATTERED( 5)
    UPDATE_C_ROW_SCATTERED( 6)
    UPDATE_C_ROW_SCATTERED( 7)
    UPDATE_C_ROW_SCATTERED( 8)
    UPDATE_C_ROW_SCATTERED( 9)
    UPDATE_C_ROW_SCATTERED(10)
    UPDATE_C_ROW_SCATTERED(11)
    UPDATE_C_ROW_SCATTERED(12)
    UPDATE_C_ROW_SCATTERED(13)
    UPDATE_C_ROW_SCATTERED(14)
    UPDATE_C_ROW_SCATTERED(15)
    UPDATE_C_ROW_SCATTERED(16)
    UPDATE_C_ROW_SCATTERED(17)
    UPDATE_C_ROW_SCATTERED(18)
    UPDATE_C_ROW_SCATTERED(19)
    UPDATE_C_ROW_SCATTERED(20)
    UPDATE_C_ROW_SCATTERED(21)
    UPDATE_C_ROW_SCATTERED(22)
    UPDATE_C_ROW_SCATTERED(23)
    UPDATE_C_ROW_SCATTERED(24)
    UPDATE_C_ROW_SCATTERED(25)
    UPDATE_C_ROW_SCATTERED(26)
    UPDATE_C_ROW_SCATTERED(27)
    UPDATE_C_ROW_SCATTERED(28)
    UPDATE_C_ROW_SCATTERED(29)

    JMP(END)

    LABEL(SCATTERBZ)

    UPDATE_C_BZ_ROW_SCATTERED( 0)
    UPDATE_C_BZ_ROW_SCATTERED( 1)
    UPDATE_C_BZ_ROW_SCATTERED( 2)
    UPDATE_C_BZ_ROW_SCATTERED( 3)
    UPDATE_C_BZ_ROW_SCATTERED( 4)
    UPDATE_C_BZ_ROW_SCATTERED( 5)
    UPDATE_C_BZ_ROW_SCATTERED( 6)
    UPDATE_C_BZ_ROW_SCATTERED( 7)
    UPDATE_C_BZ_ROW_SCATTERED( 8)
    UPDATE_C_BZ_ROW_SCATTERED( 9)
    UPDATE_C_BZ_ROW_SCATTERED(10)
    UPDATE_C_BZ_ROW_SCATTERED(11)
    UPDATE_C_BZ_ROW_SCATTERED(12)
    UPDATE_C_BZ_ROW_SCATTERED(13)
    UPDATE_C_BZ_ROW_SCATTERED(14)
    UPDATE_C_BZ_ROW_SCATTERED(15)
    UPDATE_C_BZ_ROW_SCATTERED(16)
    UPDATE_C_BZ_ROW_SCATTERED(17)
    UPDATE_C_BZ_ROW_SCATTERED(18)
    UPDATE_C_BZ_ROW_SCATTERED(19)
    UPDATE_C_BZ_ROW_SCATTERED(20)
    UPDATE_C_BZ_ROW_SCATTERED(21)
    UPDATE_C_BZ_ROW_SCATTERED(22)
    UPDATE_C_BZ_ROW_SCATTERED(23)
    UPDATE_C_BZ_ROW_SCATTERED(24)
    UPDATE_C_BZ_ROW_SCATTERED(25)
    UPDATE_C_BZ_ROW_SCATTERED(26)
    UPDATE_C_BZ_ROW_SCATTERED(27)
    UPDATE_C_BZ_ROW_SCATTERED(28)
    UPDATE_C_BZ_ROW_SCATTERED(29)

    LABEL(END)

#ifdef MONITORS
    RDTSC
    MOV(VAR(botl), EAX)
    MOV(VAR(both), EDX)
#endif
    : // output operands
#ifdef MONITORS
      [topl] "=m" (topl),
      [toph] "=m" (toph),
      [midl] "=m" (midl),
      [midh] "=m" (midh),
      [mid2l] "=m" (mid2l),
      [mid2h] "=m" (mid2h),
      [botl] "=m" (botl),
      [both] "=m" (both)
#endif
    : // input operands
      [k] "m" (k),
      [a] "m" (a),
      [b] "m" (b),
      [alpha] "m" (alpha),
      [beta] "m" (beta),
      [c] "m" (c),
      [rs_c] "m" (rs_c),
      [cs_c] "m" (cs_c),
      [a_next] "m" (a_next),
      [b_next] "m" (b_next),
      [offsetPtr] "m" (offsetPtr)
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
      "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
      "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
      "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
      "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
      "zmm30", "zmm31", "memory"
    );

    //NOTE(review): debug-only timing output; the %d/%u specifiers below are
    //paired with int and dim_t expressions -- confirm widths match dim_t on
    //this configuration before enabling MONITORS/LOOPMON.
#ifdef LOOPMON
    printf("looptime = \t%d\n", bloopl - tloopl);
#endif
#ifdef MONITORS
    dim_t top = ((dim_t)toph << 32) | topl;
    dim_t mid = ((dim_t)midh << 32) | midl;
    dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
    dim_t bot = ((dim_t)both << 32) | botl;
    printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top);
#endif
}
468