/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include <assert.h>

#include "bli_avx512_macros.h"

#define UNROLL_K 8

#define SCATTER_PREFETCH_AB 0
#define SCATTER_PREFETCH_C 1

#define PREFETCH_A_L2 0
#define PREFETCH_B_L2 0
#define L2_PREFETCH_DIST 64

#define A_L1_PREFETCH_DIST 32
#define B_L1_PREFETCH_DIST 12
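
// Prefetch distances are in units of k iterations: each iteration consumes
// 8 doubles (64 bytes, one cache line) of A and 24 doubles (192 bytes,
// three cache lines) of B.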

#define C_MIN_L2_ITERS 64 // C is not prefetched into L2 for k <= this
#define C_L1_ITERS 8      // number of iterations before the end at which C
                          // is prefetched into L1; make sure there is an
                          // unrolled MAIN_LOOP_X for this number

#define LOOP_ALIGN ALIGN16

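// Column-stored C update. On entry: ZMM(0) = alpha (broadcast), ZMM(1) =
// beta (broadcast), RCX points into C, RAX = cs_c in bytes, and RDI =
// 3*cs_c in bytes. Each macro scales four accumulators by alpha, adds
// beta*C (skipped in the _BZ, beta == 0, variant), stores the results at
// successive column strides, and advances RCX by four columns.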
#define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX      )) \
    VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \
    VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \
    VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \
    VMOVUPD(MEM(RCX      ), ZMM(R1)) \
    VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

#define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPD(MEM(RCX      ), ZMM(R1)) \
    VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

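// Scattered C update (general rs_c). On entry: ZMM(0) = alpha, ZMM(1) =
// beta, YMM(2) = the first eight entries of 'offsets' scaled by rs_c, and
// RAX = cs_c in bytes. KXNORW sets the k-masks to all ones; each macro then
// gathers eight elements of C at stride rs_c, computes alpha*AB + beta*C
// (no gather and no beta term in the _BZ variant), scatters the result
// back, and advances RCX by one column.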
#define UPDATE_C_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VGATHERDPD(ZMM(3) MASK_K(1), MEM(RCX,YMM(2),8)) \
    VFMADD231PD(ZMM(NUM), ZMM(3), ZMM(1)) \
    VSCATTERDPD(MEM(RCX,YMM(2),8) MASK_K(2), ZMM(NUM)) \
    ADD(RCX, RAX)

#define UPDATE_C_BZ_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    VMULPD(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VSCATTERDPD(MEM(RCX,YMM(2),8) MASK_K(1), ZMM(NUM)) \
    ADD(RCX, RAX)

#define PREFETCH_B_L1_1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*24*8))
#define PREFETCH_B_L1_2(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*24*8+64))
#define PREFETCH_B_L1_3(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*24*8+128))
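
// One k iteration of packed B is 24 doubles = 192 bytes = three cache
// lines, hence the three separate B prefetch macros above.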

#if PREFETCH_B_L2
#undef PREFETCH_B_L2

#define PREFETCH_B_L2(n) \
\
    PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*24*8)) \
    PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*24*8+64)) \
    PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*24*8+128))

#else
#undef PREFETCH_B_L2
#define PREFETCH_B_L2(...)
#endif

#define PREFETCH_A_L1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*8*8))

#if PREFETCH_A_L2
#undef PREFETCH_A_L2

#define PREFETCH_A_L2(n) PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*8*8))

#else
#undef PREFETCH_A_L2
#define PREFETCH_A_L2(...)
#endif

#if SCATTER_PREFETCH_AB
#undef SCATTER_PREFETCH_AB
#undef PREFETCH_B_L1_1
#undef PREFETCH_B_L1_2
#undef PREFETCH_B_L1_3
#undef PREFETCH_A_L1

#define SCATTER_PREFETCH_AB(n) \
\
    KXNORW(K(1), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RBX,ZMM(4),8,((3*n  )*16+3*B_L1_PREFETCH_DIST)*64) MASK_K(1)) \
    KXNORW(K(2), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RBX,ZMM(4),8,((3*n+1)*16+3*B_L1_PREFETCH_DIST)*64) MASK_K(2)) \
    KXNORW(K(3), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RBX,ZMM(4),8,((3*n+2)*16+3*B_L1_PREFETCH_DIST)*64) MASK_K(3)) \
    KXNORW(K(4), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RAX,ZMM(4),8,(   n   *16+  A_L1_PREFETCH_DIST)*64) MASK_K(4))

#define PREFETCH_B_L1_1(...)
#define PREFETCH_B_L1_2(...)
#define PREFETCH_B_L1_3(...)
#define PREFETCH_A_L1(...)

#else
#undef SCATTER_PREFETCH_AB

#define SCATTER_PREFETCH_AB(...)

#endif

//
// n:   index within the unrolled loop (used to compute prefetch and B
//      displacements)
//
// a:   ZMM register to load the next column of A into
// b:   ZMM register holding the current column of A
//
// ...: base/index addressing for B, without the displacement
//
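// For example, SUBITER(0,1,0,RBX) prefetches ahead in A and B, loads the
// next column of A into ZMM(1), and, using the current column in ZMM(0),
// performs the 24 FMAs ZMM(8+j) += ZMM(0) * B[j], each B element supplied
// via the {1to8} embedded-broadcast form (MEM_1TO8).
//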
#define SUBITER(n,a,b,...) \
\
        PREFETCH_B_L2(n) \
\
        VMOVAPD(ZMM(a), MEM(RAX,(n+1)*64)) \
        VFMADD231PD(ZMM( 8), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 0)*8)) \
        VFMADD231PD(ZMM( 9), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 1)*8)) \
        VFMADD231PD(ZMM(10), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 2)*8)) \
        VFMADD231PD(ZMM(11), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 3)*8)) \
        PREFETCH_B_L1_1(n) \
        VFMADD231PD(ZMM(12), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 4)*8)) \
        VFMADD231PD(ZMM(13), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 5)*8)) \
        VFMADD231PD(ZMM(14), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 6)*8)) \
        VFMADD231PD(ZMM(15), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 7)*8)) \
        PREFETCH_B_L1_2(n) \
        VFMADD231PD(ZMM(16), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 8)*8)) \
        VFMADD231PD(ZMM(17), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 9)*8)) \
        VFMADD231PD(ZMM(18), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+10)*8)) \
        VFMADD231PD(ZMM(19), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+11)*8)) \
        PREFETCH_B_L1_3(n) \
        VFMADD231PD(ZMM(20), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+12)*8)) \
        VFMADD231PD(ZMM(21), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+13)*8)) \
        VFMADD231PD(ZMM(22), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+14)*8)) \
        VFMADD231PD(ZMM(23), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+15)*8)) \
        PREFETCH_A_L1(n) \
        VFMADD231PD(ZMM(24), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+16)*8)) \
        VFMADD231PD(ZMM(25), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+17)*8)) \
        VFMADD231PD(ZMM(26), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+18)*8)) \
        VFMADD231PD(ZMM(27), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+19)*8)) \
        PREFETCH_A_L2(n) \
        VFMADD231PD(ZMM(28), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+20)*8)) \
        VFMADD231PD(ZMM(29), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+21)*8)) \
        VFMADD231PD(ZMM(30), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+22)*8)) \
        VFMADD231PD(ZMM(31), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+23)*8))

#define TAIL_LOOP(NAME) \
\
    LOOP_ALIGN \
    LABEL(NAME) \
\
        SUBITER(0,1,0,RBX) \
\
        VMOVAPD(ZMM(0), ZMM(1)) \
\
        LEA(RBX, MEM(RBX,24*8)) \
        LEA(RAX, MEM(RAX, 8*8)) \
\
        SUB(RDI, IMM(1)) \
\
    JNZ(NAME)

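// Each MAIN_LOOP_N runs the k loop unrolled by a factor of N: the quotient
// RSI/N drives the unrolled loop and the remainder in RDI falls through to
// a tail (for N >= 2). Consecutive SUBITERs ping-pong ZMM(0)/ZMM(1) as the
// current/next column of A.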
#define MAIN_LOOP_1(NAME) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
        SUBITER(0,1,0,RBX) \
\
        VMOVAPD(ZMM(0), ZMM(1)) \
\
        LEA(RBX, MEM(RBX,24*8)) \
        LEA(RAX, MEM(RAX, 8*8)) \
\
        SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP)

#define MAIN_LOOP_2(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(1)) \
    SAR1(RSI) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
        SUBITER(0,1,0,RBX) \
        SUBITER(1,0,1,RBX) \
\
        LEA(RBX, MEM(RBX,2*24*8)) \
        LEA(RAX, MEM(RAX,2* 8*8)) \
\
        SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    LABEL(NAME##_TAIL) \
\
    SUBITER(0,1,0,RBX) \
\
    VMOVAPD(ZMM(0), ZMM(1)) \
\
    LEA(RBX, MEM(RBX,24*8)) \
    LEA(RAX, MEM(RAX, 8*8)) \
\
    LABEL(NAME##_DONE)

#define MAIN_LOOP_4(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(3)) \
    SAR(RSI, IMM(2)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
        SUBITER(0,1,0,RBX) \
        SUBITER(1,0,1,RBX) \
        SUBITER(2,1,0,RBX) \
        SUBITER(3,0,1,RBX) \
\
        LEA(RBX, MEM(RBX,4*24*8)) \
        LEA(RAX, MEM(RAX,4* 8*8)) \
\
        SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

#define MAIN_LOOP_8(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(7)) \
    SAR(RSI, IMM(3)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
        SUBITER(0,1,0,RBX) \
        SUBITER(1,0,1,RBX) \
        SUBITER(2,1,0,RBX) \
        SUBITER(3,0,1,RBX) \
        SUBITER(4,1,0,RBX,R8,1) \
        SUBITER(5,0,1,RBX,R8,1) \
        SUBITER(6,1,0,RBX,R8,1) \
        SUBITER(7,0,1,RBX,R8,1) \
\
        LEA(RBX, MEM(RBX,8*24*8)) \
        LEA(RAX, MEM(RAX,8* 8*8)) \
\
        SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

#define MAIN_LOOP_16(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(15)) \
    SAR(RSI, IMM(4)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
        SCATTER_PREFETCH_AB(0) \
\
        SUBITER( 0,1,0,RBX) \
        SUBITER( 1,0,1,RBX) \
        SUBITER( 2,1,0,RBX) \
        SUBITER( 3,0,1,RBX) \
        SUBITER( 4,1,0,RBX,R8,1) \
        SUBITER( 5,0,1,RBX,R8,1) \
        SUBITER( 6,1,0,RBX,R8,1) \
        SUBITER( 7,0,1,RBX,R8,1) \
        SUBITER( 8,1,0,RBX,R8,2) \
        SUBITER( 9,0,1,RBX,R8,2) \
        SUBITER(10,1,0,RBX,R8,2) \
        SUBITER(11,0,1,RBX,R8,2) \
        SUBITER(12,1,0,RBX,R9,1) \
        SUBITER(13,0,1,RBX,R9,1) \
        SUBITER(14,1,0,RBX,R9,1) \
        SUBITER(15,0,1,RBX,R9,1) \
\
        LEA(RBX, MEM(RBX,16*24*8)) \
        LEA(RAX, MEM(RAX,16* 8*8)) \
\
        SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    SCATTER_PREFETCH_AB(0) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

#define MAIN_LOOP_32(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(31)) \
    SAR(RSI, IMM(5)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
        SCATTER_PREFETCH_AB(0) \
\
        SUBITER( 0,1,0,RBX) \
        SUBITER( 1,0,1,RBX) \
        SUBITER( 2,1,0,RBX) \
        SUBITER( 3,0,1,RBX) \
        SUBITER( 4,1,0,RBX,R8,1) \
        SUBITER( 5,0,1,RBX,R8,1) \
        SUBITER( 6,1,0,RBX,R8,1) \
        SUBITER( 7,0,1,RBX,R8,1) \
        SUBITER( 8,1,0,RBX,R8,2) \
        SUBITER( 9,0,1,RBX,R8,2) \
        SUBITER(10,1,0,RBX,R8,2) \
        SUBITER(11,0,1,RBX,R8,2) \
        SUBITER(12,1,0,RBX,R9,1) \
        SUBITER(13,0,1,RBX,R9,1) \
        SUBITER(14,1,0,RBX,R9,1) \
        SUBITER(15,0,1,RBX,R9,1) \
\
        SCATTER_PREFETCH_AB(1) \
\
        SUBITER(16,1,0,RBX,R8,4) \
        SUBITER(17,0,1,RBX,R8,4) \
        SUBITER(18,1,0,RBX,R8,4) \
        SUBITER(19,0,1,RBX,R8,4) \
        SUBITER(20,1,0,RBX,R10,1) \
        SUBITER(21,0,1,RBX,R10,1) \
        SUBITER(22,1,0,RBX,R10,1) \
        SUBITER(23,0,1,RBX,R10,1) \
        SUBITER(24,1,0,RBX,R9,2) \
        SUBITER(25,0,1,RBX,R9,2) \
        SUBITER(26,1,0,RBX,R9,2) \
        SUBITER(27,0,1,RBX,R9,2) \
        SUBITER(28,1,0,RBX,R11,1) \
        SUBITER(29,0,1,RBX,R11,1) \
        SUBITER(30,1,0,RBX,R11,1) \
        SUBITER(31,0,1,RBX,R11,1) \
\
        LEA(RBX, MEM(RBX,32*24*8)) \
        LEA(RAX, MEM(RAX,32* 8*8)) \
\
        SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    SCATTER_PREFETCH_AB(0) \
    SCATTER_PREFETCH_AB(1) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

#define LOOP_K_(M,K) M##K
#define LOOP_K(M,K,NAME) LOOP_K_(M,K)(NAME)

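// Paste the unroll factor onto MAIN_LOOP_ to select an unrolled loop: with
// UNROLL_K == 8 and C_L1_ITERS == 8, both of the following expand to
// MAIN_LOOP_8(NAME).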
#define MAIN_LOOP_L2 LOOP_K(MAIN_LOOP_,UNROLL_K,MAIN_LOOP_L2)
#define MAIN_LOOP_L1 LOOP_K(MAIN_LOOP_,C_L1_ITERS,MAIN_LOOP_L1)

// This is an array used for the scatter/gather instructions.
extern int32_t offsets[24];
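// (Presumably defined elsewhere as the consecutive indices { 0, 1, ..., 23 };
// the kernel scales them by rs_c or cs_c to build gather/scatter index
// vectors.)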

//#define MONITORS
//#define LOOPMON
void bli_dgemm_knl_asm_8x24
     (
       dim_t               k,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    const double * a_next = bli_auxinfo_next_a( data );
    const double * b_next = bli_auxinfo_next_b( data );

    const int32_t * offsetPtr = &offsets[0];

    uint64_t k64 = k;

#ifdef MONITORS
    uint32_t toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
#ifdef LOOPMON
    uint32_t tlooph, tloopl, blooph, bloopl;
#endif

    __asm__ volatile
    (
#ifdef MONITORS
    RDTSC
    MOV(VAR(topl), EAX)
    MOV(VAR(toph), EDX)
#endif

    VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers
    VMOVAPS(ZMM( 9), ZMM(8))
    VMOVAPS(ZMM(10), ZMM(8))   MOV(RSI, VAR(k)) //loop index
    VMOVAPS(ZMM(11), ZMM(8))   MOV(RAX, VAR(a)) //load address of a
    VMOVAPS(ZMM(12), ZMM(8))   MOV(RBX, VAR(b)) //load address of b
    VMOVAPS(ZMM(13), ZMM(8))   MOV(RCX, VAR(c)) //load address of c
    VMOVAPS(ZMM(14), ZMM(8))   VMOVAPD(ZMM(0), MEM(RAX)) //pre-load a
    VMOVAPS(ZMM(15), ZMM(8))   MOV(RDI, VAR(offsetPtr))
    VMOVAPS(ZMM(16), ZMM(8))   VMOVAPS(ZMM(4), MEM(RDI))
#if SCATTER_PREFETCH_C
    VMOVAPS(ZMM(17), ZMM(8))
    VMOVAPS(ZMM(18), ZMM(8))
    VMOVAPS(ZMM(19), ZMM(8))   VBROADCASTSS(ZMM(5), VAR(cs_c))
    VMOVAPS(ZMM(20), ZMM(8))
    VMOVAPS(ZMM(21), ZMM(8))   VPMULLD(ZMM(2), ZMM(4), ZMM(5))
    VMOVAPS(ZMM(22), ZMM(8))   VMOVAPS(YMM(3), MEM(RDI,64))
    VMOVAPS(ZMM(23), ZMM(8))   VPMULLD(YMM(3), YMM(3), YMM(5))
#else
    VMOVAPS(ZMM(17), ZMM(8))   MOV(R12, VAR(cs_c))
    VMOVAPS(ZMM(18), ZMM(8))   LEA(R13, MEM(R12,R12,2))
    VMOVAPS(ZMM(19), ZMM(8))   LEA(R14, MEM(R12,R12,4))
    VMOVAPS(ZMM(20), ZMM(8))   LEA(R15, MEM(R13,R12,4))
    VMOVAPS(ZMM(21), ZMM(8))   LEA(RDX, MEM(RCX,R12,8))
    VMOVAPS(ZMM(22), ZMM(8))   LEA(RDI, MEM(RDX,R12,8))
    VMOVAPS(ZMM(23), ZMM(8))
#endif
    VMOVAPS(ZMM(24), ZMM(8))   VPSLLD(ZMM(4), ZMM(4), IMM(3))
    VMOVAPS(ZMM(25), ZMM(8))   MOV(R8, IMM(4*24*8))     //offset for 4 iterations
    VMOVAPS(ZMM(26), ZMM(8))   LEA(R9, MEM(R8,R8,2))    //*3
    VMOVAPS(ZMM(27), ZMM(8))   LEA(R10, MEM(R8,R8,4))   //*5
    VMOVAPS(ZMM(28), ZMM(8))   LEA(R11, MEM(R9,R8,4))   //*7
    VMOVAPS(ZMM(29), ZMM(8))
    VMOVAPS(ZMM(30), ZMM(8))
    VMOVAPS(ZMM(31), ZMM(8))

#ifdef MONITORS
    RDTSC
    MOV(VAR(midl), EAX)
    MOV(VAR(midh), EDX)
#endif

    // need 0+... to satisfy the preprocessor
    CMP(RSI, IMM(0+C_MIN_L2_ITERS))
    JLE(PREFETCH_C_L1)

    SUB(RSI, IMM(0+C_L1_ITERS))

    // prefetch C into L2
#if SCATTER_PREFETCH_C
    KXNORW(K(1), K(0), K(0))
    KXNORW(K(2), K(0), K(0))
    VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
    VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
#else
    PREFETCH(1, MEM(RCX      ))
    PREFETCH(1, MEM(RCX,R12,1))
    PREFETCH(1, MEM(RCX,R12,2))
    PREFETCH(1, MEM(RCX,R13,1))
    PREFETCH(1, MEM(RCX,R12,4))
    PREFETCH(1, MEM(RCX,R14,1))
    PREFETCH(1, MEM(RCX,R13,2))
    PREFETCH(1, MEM(RCX,R15,1))
    PREFETCH(1, MEM(RDX      ))
    PREFETCH(1, MEM(RDX,R12,1))
    PREFETCH(1, MEM(RDX,R12,2))
    PREFETCH(1, MEM(RDX,R13,1))
    PREFETCH(1, MEM(RDX,R12,4))
    PREFETCH(1, MEM(RDX,R14,1))
    PREFETCH(1, MEM(RDX,R13,2))
    PREFETCH(1, MEM(RDX,R15,1))
    PREFETCH(1, MEM(RDI      ))
    PREFETCH(1, MEM(RDI,R12,1))
    PREFETCH(1, MEM(RDI,R12,2))
    PREFETCH(1, MEM(RDI,R13,1))
    PREFETCH(1, MEM(RDI,R12,4))
    PREFETCH(1, MEM(RDI,R14,1))
    PREFETCH(1, MEM(RDI,R13,2))
    PREFETCH(1, MEM(RDI,R15,1))
#endif

    MAIN_LOOP_L2

    MOV(RSI, IMM(0+C_L1_ITERS))

    LABEL(PREFETCH_C_L1)

    // prefetch C into L1
#if SCATTER_PREFETCH_C
    KXNORW(K(1), K(0), K(0))
    KXNORW(K(2), K(0), K(0))
    VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1))
    VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2))
#else
    PREFETCH(0, MEM(RCX      ))
    PREFETCH(0, MEM(RCX,R12,1))
    PREFETCH(0, MEM(RCX,R12,2))
    PREFETCH(0, MEM(RCX,R13,1))
    PREFETCH(0, MEM(RCX,R12,4))
    PREFETCH(0, MEM(RCX,R14,1))
    PREFETCH(0, MEM(RCX,R13,2))
    PREFETCH(0, MEM(RCX,R15,1))
    PREFETCH(0, MEM(RDX      ))
    PREFETCH(0, MEM(RDX,R12,1))
    PREFETCH(0, MEM(RDX,R12,2))
    PREFETCH(0, MEM(RDX,R13,1))
    PREFETCH(0, MEM(RDX,R12,4))
    PREFETCH(0, MEM(RDX,R14,1))
    PREFETCH(0, MEM(RDX,R13,2))
    PREFETCH(0, MEM(RDX,R15,1))
    PREFETCH(0, MEM(RDI      ))
    PREFETCH(0, MEM(RDI,R12,1))
    PREFETCH(0, MEM(RDI,R12,2))
    PREFETCH(0, MEM(RDI,R13,1))
    PREFETCH(0, MEM(RDI,R12,4))
    PREFETCH(0, MEM(RDI,R14,1))
    PREFETCH(0, MEM(RDI,R13,2))
    PREFETCH(0, MEM(RDI,R15,1))
#endif

    MAIN_LOOP_L1

    LABEL(POSTACCUM)

#ifdef MONITORS
    RDTSC
    MOV(VAR(mid2l), EAX)
    MOV(VAR(mid2h), EDX)
#endif

    MOV(RAX, VAR(alpha))
    MOV(RBX, VAR(beta))
    VBROADCASTSD(ZMM(0), MEM(RAX))
    VBROADCASTSD(ZMM(1), MEM(RBX))

    // Check whether C is column-stored (rs_c == 1); if not, jump to the
    // slow scattered update.
    MOV(RAX, VAR(cs_c))
    LEA(RAX, MEM(,RAX,8))
    MOV(RBX, VAR(rs_c))
    LEA(RDI, MEM(RAX,RAX,2))
    CMP(RBX, IMM(1))
    JNE(SCATTEREDUPDATE)

    VMOVQ(RDX, XMM(1))
    SAL1(RDX) //shift out the sign bit so both +0.0 and -0.0 test as zero
    JZ(COLSTORBZ)

    UPDATE_C_FOUR_ROWS( 8, 9,10,11)
    UPDATE_C_FOUR_ROWS(12,13,14,15)
    UPDATE_C_FOUR_ROWS(16,17,18,19)
    UPDATE_C_FOUR_ROWS(20,21,22,23)
    UPDATE_C_FOUR_ROWS(24,25,26,27)
    UPDATE_C_FOUR_ROWS(28,29,30,31)

    JMP(END)

    LABEL(COLSTORBZ)

    UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11)
    UPDATE_C_BZ_FOUR_ROWS(12,13,14,15)
    UPDATE_C_BZ_FOUR_ROWS(16,17,18,19)
    UPDATE_C_BZ_FOUR_ROWS(20,21,22,23)
    UPDATE_C_BZ_FOUR_ROWS(24,25,26,27)
    UPDATE_C_BZ_FOUR_ROWS(28,29,30,31)

    JMP(END)

    LABEL(SCATTEREDUPDATE)

    MOV(RDI, VAR(offsetPtr))
    VMOVAPS(ZMM(2), MEM(RDI))
    /* Note that this ignores the upper 32 bits of rs_c */
    VPBROADCASTD(ZMM(3), EBX)
    VPMULLD(ZMM(2), ZMM(3), ZMM(2))

    VMOVQ(RDX, XMM(1))
    SAL1(RDX) //shift out the sign bit so both +0.0 and -0.0 test as zero
    JZ(SCATTERBZ)

    UPDATE_C_ROW_SCATTERED( 8)
    UPDATE_C_ROW_SCATTERED( 9)
    UPDATE_C_ROW_SCATTERED(10)
    UPDATE_C_ROW_SCATTERED(11)
    UPDATE_C_ROW_SCATTERED(12)
    UPDATE_C_ROW_SCATTERED(13)
    UPDATE_C_ROW_SCATTERED(14)
    UPDATE_C_ROW_SCATTERED(15)
    UPDATE_C_ROW_SCATTERED(16)
    UPDATE_C_ROW_SCATTERED(17)
    UPDATE_C_ROW_SCATTERED(18)
    UPDATE_C_ROW_SCATTERED(19)
    UPDATE_C_ROW_SCATTERED(20)
    UPDATE_C_ROW_SCATTERED(21)
    UPDATE_C_ROW_SCATTERED(22)
    UPDATE_C_ROW_SCATTERED(23)
    UPDATE_C_ROW_SCATTERED(24)
    UPDATE_C_ROW_SCATTERED(25)
    UPDATE_C_ROW_SCATTERED(26)
    UPDATE_C_ROW_SCATTERED(27)
    UPDATE_C_ROW_SCATTERED(28)
    UPDATE_C_ROW_SCATTERED(29)
    UPDATE_C_ROW_SCATTERED(30)
    UPDATE_C_ROW_SCATTERED(31)

    JMP(END)

    LABEL(SCATTERBZ)

    UPDATE_C_BZ_ROW_SCATTERED( 8)
    UPDATE_C_BZ_ROW_SCATTERED( 9)
    UPDATE_C_BZ_ROW_SCATTERED(10)
    UPDATE_C_BZ_ROW_SCATTERED(11)
    UPDATE_C_BZ_ROW_SCATTERED(12)
    UPDATE_C_BZ_ROW_SCATTERED(13)
    UPDATE_C_BZ_ROW_SCATTERED(14)
    UPDATE_C_BZ_ROW_SCATTERED(15)
    UPDATE_C_BZ_ROW_SCATTERED(16)
    UPDATE_C_BZ_ROW_SCATTERED(17)
    UPDATE_C_BZ_ROW_SCATTERED(18)
    UPDATE_C_BZ_ROW_SCATTERED(19)
    UPDATE_C_BZ_ROW_SCATTERED(20)
    UPDATE_C_BZ_ROW_SCATTERED(21)
    UPDATE_C_BZ_ROW_SCATTERED(22)
    UPDATE_C_BZ_ROW_SCATTERED(23)
    UPDATE_C_BZ_ROW_SCATTERED(24)
    UPDATE_C_BZ_ROW_SCATTERED(25)
    UPDATE_C_BZ_ROW_SCATTERED(26)
    UPDATE_C_BZ_ROW_SCATTERED(27)
    UPDATE_C_BZ_ROW_SCATTERED(28)
    UPDATE_C_BZ_ROW_SCATTERED(29)
    UPDATE_C_BZ_ROW_SCATTERED(30)
    UPDATE_C_BZ_ROW_SCATTERED(31)

    LABEL(END)

#ifdef MONITORS
    RDTSC
    MOV(VAR(botl), EAX)
    MOV(VAR(both), EDX)
#endif
    : // output operands
#ifdef MONITORS
      [topl]  "=m" (topl),
      [toph]  "=m" (toph),
      [midl]  "=m" (midl),
      [midh]  "=m" (midh),
      [mid2l] "=m" (mid2l),
      [mid2h] "=m" (mid2h),
      [botl]  "=m" (botl),
      [both]  "=m" (both)
#endif
    : // input operands
      [k]         "m" (k64),
      [a]         "m" (a),
      [b]         "m" (b),
      [alpha]     "m" (alpha),
      [beta]      "m" (beta),
      [c]         "m" (c),
      [rs_c]      "m" (rs_c),
      [cs_c]      "m" (cs_c),
      [a_next]    "m" (a_next),
      [b_next]    "m" (b_next),
      [offsetPtr] "m" (offsetPtr)
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
      "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
      "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
      "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
      "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
      "zmm30", "zmm31", "memory"
    );

#ifdef LOOPMON
    printf("looptime = \t%u\n", bloopl - tloopl);
#endif
#ifdef MONITORS
    uint64_t top  = ((uint64_t)toph  << 32) | topl;
    uint64_t mid  = ((uint64_t)midh  << 32) | midl;
    uint64_t mid2 = ((uint64_t)mid2h << 32) | mid2l;
    uint64_t bot  = ((uint64_t)both  << 32) | botl;
    printf("setup =\t%llu\tmain loop =\t%llu\tcleanup=\t%llu\ttotal=\t%llu\n",
           (unsigned long long)(mid - top), (unsigned long long)(mid2 - mid),
           (unsigned long long)(bot - mid2), (unsigned long long)(bot - top));
#endif
}