/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
   OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
   OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"
#include <assert.h>

#define BLIS_ASM_SYNTAX_INTEL
#include "bli_x86_asm_macros.h"

#define UNROLL_K 32 //main-loop unrolling factor, in k iterations

#define SCATTER_PREFETCH_C 1 //use scatter-prefetch instructions to prefetch C

#define PREFETCH_A_L2 0 //prefetch A into L2
#define PREFETCH_B_L2 0 //prefetch B into L2
#define L2_PREFETCH_DIST 64 //L2 prefetch distance, in k iterations

//L1 prefetch distances, in k iterations
#define A_L1_PREFETCH_DIST 36
#define B_L1_PREFETCH_DIST 18

#define LOOP_ALIGN ALIGN16
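// Scale four rows of the accumulated product by alpha (ZMM0), accumulate
// beta (ZMM1) times the corresponding rows of C, and store the result. At
// the point these macros are used, RCX points at the current row of C, RAX
// holds rs_c in bytes, and RDI holds 3*rs_c in bytes. The _BZ variant skips
// the load of C for the beta == 0 case.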
#define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX      )) \
    VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \
    VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \
    VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \
    VMOVUPS(MEM(RCX      ), ZMM(R1)) \
    VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

#define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPS(MEM(RCX      ), ZMM(R1)) \
    VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

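// Update one row of C through gather/scatter, for the general-stride case.
// ZMM2 holds the 16 column offsets of the row scaled by cs_c (in elements;
// the 4-byte element size is applied by the addressing scale), ZMM0 holds
// alpha, ZMM1 holds beta, and RAX holds rs_c in bytes. The mask registers
// are reset to all-ones before each gather/scatter because those
// instructions clear their mask as they complete.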
#define UPDATE_C_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VGATHERDPS(ZMM(3) MASK_K(1), MEM(RCX,ZMM(2),4)) \
    VFMADD231PS(ZMM(NUM), ZMM(3), ZMM(1)) \
    VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(2), ZMM(NUM)) \
    ADD(RCX, RAX)

#define UPDATE_C_BZ_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(1), ZMM(NUM)) \
    ADD(RCX, RAX)

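// Prefetch macros for A and B. Each k iteration consumes one 24-float row
// of packed A (two cache lines) and one 16-float row of packed B (one cache
// line), so the prefetch distances above are expressed in k iterations.
// When PREFETCH_A_L2 or PREFETCH_B_L2 is 0, the corresponding L2 prefetch
// macro is redefined to expand to nothing.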
#define PREFETCH_A_L1_1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4))
#define PREFETCH_A_L1_2(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4+64))

#if PREFETCH_A_L2
#undef PREFETCH_A_L2

#define PREFETCH_A_L2(n) \
\
    PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4)) \
    PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4+64))

#else
#undef PREFETCH_A_L2
#define PREFETCH_A_L2(...)
#endif

#define PREFETCH_B_L1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*16*4))

#if PREFETCH_B_L2
#undef PREFETCH_B_L2

#define PREFETCH_B_L2(n) PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*16*4))

#else
#undef PREFETCH_B_L2
#define PREFETCH_B_L2(...)
#endif

#define PREFETCH_C_L1_1
#define PREFETCH_C_L1_2
#define PREFETCH_C_L1_3
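// PREFETCH_C_L1_{1,2,3} are empty placeholders here; they are redefined
// (via #undef/#define) just before the last unrolled iterations so that
// prefetches of C can be interleaved into the FMA stream of SUBITER.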

//
// One iteration of the k loop: a rank-1 update of the 24x16 block of C
// accumulated in ZMM8-ZMM31.
//
// n: index in the unrolled loop (selects the displacements for A and B)
//
// a: ZMM register to pre-load the next row of B into
// b: ZMM register holding the current row of B
//
// ...: addressing for A (base and optional index register), except for
//      the displacement
//
#define SUBITER(n,a,b,...) \
\
        PREFETCH_A_L2(n) \
\
        VMOVAPS(ZMM(a), MEM(RBX,(n+1)*64)) \
        VFMADD231PS(ZMM( 8), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 0)*4)) \
        VFMADD231PS(ZMM( 9), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 1)*4)) \
        VFMADD231PS(ZMM(10), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 2)*4)) \
        PREFETCH_A_L1_1(n) \
        VFMADD231PS(ZMM(11), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 3)*4)) \
        VFMADD231PS(ZMM(12), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 4)*4)) \
        VFMADD231PS(ZMM(13), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 5)*4)) \
        PREFETCH_C_L1_1 \
        VFMADD231PS(ZMM(14), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 6)*4)) \
        VFMADD231PS(ZMM(15), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 7)*4)) \
        VFMADD231PS(ZMM(16), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 8)*4)) \
        PREFETCH_A_L1_2(n) \
        VFMADD231PS(ZMM(17), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 9)*4)) \
        VFMADD231PS(ZMM(18), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+10)*4)) \
        VFMADD231PS(ZMM(19), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+11)*4)) \
        PREFETCH_C_L1_2 \
        VFMADD231PS(ZMM(20), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+12)*4)) \
        VFMADD231PS(ZMM(21), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+13)*4)) \
        VFMADD231PS(ZMM(22), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+14)*4)) \
        PREFETCH_C_L1_3 \
        VFMADD231PS(ZMM(23), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+15)*4)) \
        VFMADD231PS(ZMM(24), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+16)*4)) \
        VFMADD231PS(ZMM(25), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+17)*4)) \
        PREFETCH_B_L1(n) \
        VFMADD231PS(ZMM(26), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+18)*4)) \
        VFMADD231PS(ZMM(27), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+19)*4)) \
        VFMADD231PS(ZMM(28), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+20)*4)) \
        PREFETCH_B_L2(n) \
        VFMADD231PS(ZMM(29), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+21)*4)) \
        VFMADD231PS(ZMM(30), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+22)*4)) \
        VFMADD231PS(ZMM(31), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+23)*4))

// Offsets 0-31, used to build the index vectors for the scatter/gather
// instructions and for scatter-prefetching C.
static int32_t offsets[32] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};

//#define MONITORS
//#define LOOPMON
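
// This is the BLIS sgemm microkernel for Knights Landing. It computes
//
//     C := beta*C + alpha*A*B
//
// where C is a 24x16 block with row stride rs_c and column stride cs_c,
// A is a packed 24 x k micro-panel (24 contiguous floats per k iteration),
// and B is a packed k x 16 micro-panel (16 contiguous floats per k
// iteration, 64-byte aligned since it is loaded with VMOVAPS).
//
// A minimal calling sketch (illustrative only; the buffer names and the
// aligned allocations are assumptions, not part of this file):
//
//     dim_t  k    = ...;
//     inc_t  rs_c = ..., cs_c = ...;
//     float* a_packed = ...; // 24*k floats, 64-byte aligned
//     float* b_packed = ...; // 16*k floats, 64-byte aligned
//     float* c        = ...; // 24x16 block of C with strides rs_c/cs_c
//     float  alpha = 1.0f, beta = 1.0f;
//     auxinfo_t aux = ...;   // the fields read by bli_auxinfo_next_a()/
//                            // bli_auxinfo_next_b() must be initialized
//     bli_sgemm_knl_asm_24x16( k, &alpha, a_packed, b_packed, &beta,
//                              c, rs_c, cs_c, &aux, NULL /*cntx unused*/ );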
void bli_sgemm_knl_asm_24x16
     (
       dim_t               k_,
       float*     restrict alpha,
       float*     restrict a,
       float*     restrict b,
       float*     restrict beta,
       float*     restrict c, inc_t rs_c_, inc_t cs_c_,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    (void)cntx;

    const double * a_next = bli_auxinfo_next_a( data );
    const double * b_next = bli_auxinfo_next_b( data );

    const int32_t * offsetPtr = &offsets[0];
    const int64_t k = k_;
    const int64_t rs_c = rs_c_;
    const int64_t cs_c = cs_c_;

#ifdef MONITORS
    uint32_t toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
#ifdef LOOPMON
    uint32_t tlooph, tloopl, blooph, bloopl;
#endif

    BEGIN_ASM()

#ifdef MONITORS
    RDTSC
    MOV(VAR(topl), EAX)
    MOV(VAR(toph), EDX)
#endif

    VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers
    VMOVAPS(ZMM( 9), ZMM(8))   MOV(R12, VAR(rs_c)) //load row stride of c
    VMOVAPS(ZMM(10), ZMM(8))   MOV(RSI, VAR(k)) //loop index
    VMOVAPS(ZMM(11), ZMM(8))   MOV(RAX, VAR(a)) //load address of a
    VMOVAPS(ZMM(12), ZMM(8))   MOV(RBX, VAR(b)) //load address of b
    VMOVAPS(ZMM(13), ZMM(8))   MOV(RCX, VAR(c)) //load address of c
    VMOVAPS(ZMM(14), ZMM(8))   VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
    VMOVAPS(ZMM(15), ZMM(8))   MOV(RDI, VAR(offsetPtr))
    VMOVAPS(ZMM(16), ZMM(8))   VMOVAPS(ZMM(4), MEM(RDI)) //load offsets 0-15
#if SCATTER_PREFETCH_C
    VMOVAPS(ZMM(17), ZMM(8))
    VMOVAPS(ZMM(18), ZMM(8))
    VMOVAPS(ZMM(19), ZMM(8))   VBROADCASTSS(ZMM(5), VAR(rs_c))
    VMOVAPS(ZMM(20), ZMM(8))
    VMOVAPS(ZMM(21), ZMM(8))   VPMULLD(ZMM(2), ZMM(4), ZMM(5)) //row offsets 0-15 of c
    VMOVAPS(ZMM(22), ZMM(8))   VMOVAPS(YMM(3), MEM(RDI,64)) //load offsets 16-23
    VMOVAPS(ZMM(23), ZMM(8))   VPMULLD(YMM(3), YMM(3), YMM(5)) //row offsets 16-23 of c
#else
    VMOVAPS(ZMM(17), ZMM(8))
    VMOVAPS(ZMM(18), ZMM(8))   LEA(R13, MEM(R12,R12,2)) //rs_c*3
    VMOVAPS(ZMM(19), ZMM(8))   LEA(R14, MEM(R12,R12,4)) //rs_c*5
    VMOVAPS(ZMM(20), ZMM(8))   LEA(R15, MEM(R13,R12,4)) //rs_c*7
    VMOVAPS(ZMM(21), ZMM(8))
    VMOVAPS(ZMM(22), ZMM(8))
    VMOVAPS(ZMM(23), ZMM(8))
#endif
    VMOVAPS(ZMM(24), ZMM(8))   VPSLLD(ZMM(4), ZMM(4), IMM(2)) //scale offsets to bytes
    VMOVAPS(ZMM(25), ZMM(8))   MOV(R8, IMM(4*24*4))     //offset for 4 iterations
    VMOVAPS(ZMM(26), ZMM(8))   LEA(R9, MEM(R8,R8,2))    //*3
    VMOVAPS(ZMM(27), ZMM(8))   LEA(R10, MEM(R8,R8,4))   //*5
    VMOVAPS(ZMM(28), ZMM(8))   LEA(R11, MEM(R9,R8,4))   //*7
    VMOVAPS(ZMM(29), ZMM(8))
    VMOVAPS(ZMM(30), ZMM(8))
    VMOVAPS(ZMM(31), ZMM(8))
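
    // Register map after setup: RSI = k, RAX -> packed A, RBX -> packed B,
    // RCX -> C, ZMM0 = first row of B, ZMM8-ZMM31 = 24x16 accumulator
    // (zeroed), R8 = bytes of A per 4 k-iterations (4*24*4) with
    // R9/R10/R11 = 3x/5x/7x that, and (in the non-scatter path) R12 = rs_c
    // with R13/R14/R15 = 3x/5x/7x rs_c.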

#ifdef MONITORS
    RDTSC
    MOV(VAR(midl), EAX)
    MOV(VAR(midh), EDX)
#endif

    SUB(RSI, IMM(32))
    JLE(TAIL) //k <= 32: use the simple TAIL loop instead

    //prefetch C into L2; in the non-scatter path the first 24 k-iterations
    //are peeled off and interleaved with the prefetches (the remaining 8 of
    //the 32 subtracted above are issued at AFTER_LOOP)
#if SCATTER_PREFETCH_C
    ADD(RSI, IMM(24)) //no FMAs overlap the prefetches here, so give back 24
    KXNORW(K(1), K(0), K(0))
    KXNORW(K(2), K(0), K(0))
    VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
    VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
#else
    PREFETCHW1(MEM(RCX      ))
    SUBITER( 0,1,0,RAX      )
    PREFETCHW1(MEM(RCX,R12,1))
    SUBITER( 1,0,1,RAX      )
    PREFETCHW1(MEM(RCX,R12,2))
    SUBITER( 2,1,0,RAX      )
    PREFETCHW1(MEM(RCX,R13,1))
    SUBITER( 3,0,1,RAX      )
    PREFETCHW1(MEM(RCX,R12,4))
    SUBITER( 4,1,0,RAX,R8, 1)
    PREFETCHW1(MEM(RCX,R14,1))
    SUBITER( 5,0,1,RAX,R8, 1)
    PREFETCHW1(MEM(RCX,R13,2))
    SUBITER( 6,1,0,RAX,R8, 1)
    PREFETCHW1(MEM(RCX,R15,1))
    SUBITER( 7,0,1,RAX,R8, 1)

    LEA(RDX, MEM(RCX,R12,8))

    PREFETCHW1(MEM(RDX      ))
    SUBITER( 8,1,0,RAX,R8, 2)
    PREFETCHW1(MEM(RDX,R12,1))
    SUBITER( 9,0,1,RAX,R8, 2)
    PREFETCHW1(MEM(RDX,R12,2))
    SUBITER(10,1,0,RAX,R8, 2)
    PREFETCHW1(MEM(RDX,R13,1))
    SUBITER(11,0,1,RAX,R8, 2)
    PREFETCHW1(MEM(RDX,R12,4))
    SUBITER(12,1,0,RAX,R9, 1)
    PREFETCHW1(MEM(RDX,R14,1))
    SUBITER(13,0,1,RAX,R9, 1)
    PREFETCHW1(MEM(RDX,R13,2))
    SUBITER(14,1,0,RAX,R9, 1)
    PREFETCHW1(MEM(RDX,R15,1))
    SUBITER(15,0,1,RAX,R9, 1)

    LEA(RDI, MEM(RDX,R12,8))

    PREFETCHW1(MEM(RDI      ))
    SUBITER(16,1,0,RAX,R8, 4)
    PREFETCHW1(MEM(RDI,R12,1))
    SUBITER(17,0,1,RAX,R8, 4)
    PREFETCHW1(MEM(RDI,R12,2))
    SUBITER(18,1,0,RAX,R8, 4)
    PREFETCHW1(MEM(RDI,R13,1))
    SUBITER(19,0,1,RAX,R8, 4)
    PREFETCHW1(MEM(RDI,R12,4))
    SUBITER(20,1,0,RAX,R10,1)
    PREFETCHW1(MEM(RDI,R14,1))
    SUBITER(21,0,1,RAX,R10,1)
    PREFETCHW1(MEM(RDI,R13,2))
    SUBITER(22,1,0,RAX,R10,1)
    PREFETCHW1(MEM(RDI,R15,1))
    SUBITER(23,0,1,RAX,R10,1)

    ADD(RAX, IMM(24*24*4))
    ADD(RBX, IMM(24*16*4))
#endif

    MOV(RDI, RSI)
    AND(RDI, IMM(31)) //RDI = remaining k mod 32
    SAR(RSI, IMM(5))  //RSI = number of full 32x-unrolled passes
    JZ(REM_1)

    LOOP_ALIGN
    LABEL(MAIN_LOOP)

        SUBITER( 0,1,0,RAX      )
        SUBITER( 1,0,1,RAX      )
        SUBITER( 2,1,0,RAX      )
        SUBITER( 3,0,1,RAX      )
        SUBITER( 4,1,0,RAX,R8, 1)
        SUBITER( 5,0,1,RAX,R8, 1)
        SUBITER( 6,1,0,RAX,R8, 1)
        SUBITER( 7,0,1,RAX,R8, 1)
        SUBITER( 8,1,0,RAX,R8, 2)
        SUBITER( 9,0,1,RAX,R8, 2)
        SUBITER(10,1,0,RAX,R8, 2)
        SUBITER(11,0,1,RAX,R8, 2)
        SUBITER(12,1,0,RAX,R9, 1)
        SUBITER(13,0,1,RAX,R9, 1)
        SUBITER(14,1,0,RAX,R9, 1)
        SUBITER(15,0,1,RAX,R9, 1)
        SUBITER(16,1,0,RAX,R8, 4)
        SUBITER(17,0,1,RAX,R8, 4)
        SUBITER(18,1,0,RAX,R8, 4)
        SUBITER(19,0,1,RAX,R8, 4)
        SUBITER(20,1,0,RAX,R10,1)
        SUBITER(21,0,1,RAX,R10,1)
        SUBITER(22,1,0,RAX,R10,1)
        SUBITER(23,0,1,RAX,R10,1)
        SUBITER(24,1,0,RAX,R9, 2)
        SUBITER(25,0,1,RAX,R9, 2)
        SUBITER(26,1,0,RAX,R9, 2)
        SUBITER(27,0,1,RAX,R9, 2)
        SUBITER(28,1,0,RAX,R11,1)
        SUBITER(29,0,1,RAX,R11,1)
        SUBITER(30,1,0,RAX,R11,1)
        SUBITER(31,0,1,RAX,R11,1)

        ADD(RAX, IMM(32*24*4))
        ADD(RBX, IMM(32*16*4))

        SUB(RSI, IMM(1))

    JNZ(MAIN_LOOP)

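    // Remainder: handle the leftover k mod 32 iterations in blocks of 1, 2,
    // 4, 8, and 16. Each SAR(RDI) shifts the low bit of the remainder into
    // the carry flag, and JNC skips the corresponding block when that bit
    // is clear.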
    LABEL(REM_1)
    SAR(RDI)
    JNC(REM_2)

    SUBITER(0,1,0,RAX)
    VMOVAPD(ZMM(0), ZMM(1)) //put the pre-loaded row of B back in ZMM0
    ADD(RAX, IMM(24*4))
    ADD(RBX, IMM(16*4))

    LABEL(REM_2)
    SAR(RDI)
    JNC(REM_4)

    SUBITER(0,1,0,RAX)
    SUBITER(1,0,1,RAX)
    ADD(RAX, IMM(2*24*4))
    ADD(RBX, IMM(2*16*4))

    LABEL(REM_4)
    SAR(RDI)
    JNC(REM_8)

    SUBITER(0,1,0,RAX)
    SUBITER(1,0,1,RAX)
    SUBITER(2,1,0,RAX)
    SUBITER(3,0,1,RAX)
    ADD(RAX, IMM(4*24*4))
    ADD(RBX, IMM(4*16*4))

    LABEL(REM_8)
    SAR(RDI)
    JNC(REM_16)

    SUBITER(0,1,0,RAX     )
    SUBITER(1,0,1,RAX     )
    SUBITER(2,1,0,RAX     )
    SUBITER(3,0,1,RAX     )
    SUBITER(4,1,0,RAX,R8,1)
    SUBITER(5,0,1,RAX,R8,1)
    SUBITER(6,1,0,RAX,R8,1)
    SUBITER(7,0,1,RAX,R8,1)
    ADD(RAX, IMM(8*24*4))
    ADD(RBX, IMM(8*16*4))

    LABEL(REM_16)
    SAR(RDI)
    JNC(AFTER_LOOP)

    SUBITER( 0,1,0,RAX      )
    SUBITER( 1,0,1,RAX      )
    SUBITER( 2,1,0,RAX      )
    SUBITER( 3,0,1,RAX      )
    SUBITER( 4,1,0,RAX,R8, 1)
    SUBITER( 5,0,1,RAX,R8, 1)
    SUBITER( 6,1,0,RAX,R8, 1)
    SUBITER( 7,0,1,RAX,R8, 1)
    SUBITER( 8,1,0,RAX,R8, 2)
    SUBITER( 9,0,1,RAX,R8, 2)
    SUBITER(10,1,0,RAX,R8, 2)
    SUBITER(11,0,1,RAX,R8, 2)
    SUBITER(12,1,0,RAX,R9, 1)
    SUBITER(13,0,1,RAX,R9, 1)
    SUBITER(14,1,0,RAX,R9, 1)
    SUBITER(15,0,1,RAX,R9, 1)
    ADD(RAX, IMM(16*24*4))
    ADD(RBX, IMM(16*16*4))

    LABEL(AFTER_LOOP)

    //prefetch C into L1, interleaved with the last 8 of the k-iterations
    //reserved by the SUB(RSI, IMM(32)) above
#if SCATTER_PREFETCH_C
    KXNORW(K(1), K(0), K(0))
    KXNORW(K(2), K(0), K(0))
    VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1))
    VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2))

    SUBITER(0,1,0,RAX     )
    SUBITER(1,0,1,RAX     )
    SUBITER(2,1,0,RAX     )
    SUBITER(3,0,1,RAX     )
    SUBITER(4,1,0,RAX,R8,1)
    SUBITER(5,0,1,RAX,R8,1)
    SUBITER(6,1,0,RAX,R8,1)
    SUBITER(7,0,1,RAX,R8,1)
#else

    LEA(RDX, MEM(RCX,R12,8))
    LEA(RDI, MEM(RDX,R12,8))

#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX      ))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2))
    SUBITER(0,1,0,RAX     )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1))
    SUBITER(1,0,1,RAX     )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX      ))
    SUBITER(2,1,0,RAX     )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1))
    SUBITER(3,0,1,RAX     )
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2))
    SUBITER(4,1,0,RAX,R8,1)
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI      ))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1))
    SUBITER(5,0,1,RAX,R8,1)
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4))
    SUBITER(6,1,0,RAX,R8,1)
#undef PREFETCH_C_L1_1
#undef PREFETCH_C_L1_2
#undef PREFETCH_C_L1_3
#define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1))
#define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2))
#define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1))
    SUBITER(7,0,1,RAX,R8,1)
#endif

    JMP(POSTACCUM)

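    // TAIL: k <= 32. Run the whole k loop one iteration at a time, issuing
    // a PREFETCHW0 for one row of C per iteration as it goes.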
    LABEL(TAIL)

    MOV(RDX, RCX) //RDX walks the rows of C for prefetching
    ADD(RSI, IMM(32)) //restore k
    JZ(POSTACCUM)

    LABEL(TAIL_LOOP)

        PREFETCHW0(MEM(RDX))
        ADD(RDX, R12)

        SUBITER(0,1,0,RAX)
        VMOVAPD(ZMM(0), ZMM(1)) //put the pre-loaded row of B back in ZMM0
        ADD(RAX, IMM(24*4))
        ADD(RBX, IMM(16*4))

        SUB(RSI, IMM(1))

    JNZ(TAIL_LOOP)

    LABEL(POSTACCUM)

#ifdef MONITORS
    RDTSC
    MOV(VAR(mid2l), EAX)
    MOV(VAR(mid2h), EDX)
#endif

    MOV(RAX, VAR(alpha))
    MOV(RBX, VAR(beta))
    VBROADCASTSS(ZMM(0), MEM(RAX)) //ZMM0 = alpha
    VBROADCASTSS(ZMM(1), MEM(RBX)) //ZMM1 = beta

    // Check whether C is row-stored (cs_c == 1). If not, jump to the slow
    // scattered update.
    MOV(RAX, VAR(rs_c))
    LEA(RAX, MEM(,RAX,4)) //RAX = rs_c in bytes
    MOV(RBX, VAR(cs_c))
    LEA(RDI, MEM(RAX,RAX,2)) //RDI = 3*rs_c in bytes
    CMP(RBX, IMM(1))
    JNE(SCATTEREDUPDATE)

    VMOVD(EDX, XMM(1))
    SAL(EDX) //shift out sign bit to test for beta == 0
    JZ(COLSTORBZ)

    UPDATE_C_FOUR_ROWS( 8, 9,10,11)
    UPDATE_C_FOUR_ROWS(12,13,14,15)
    UPDATE_C_FOUR_ROWS(16,17,18,19)
    UPDATE_C_FOUR_ROWS(20,21,22,23)
    UPDATE_C_FOUR_ROWS(24,25,26,27)
    UPDATE_C_FOUR_ROWS(28,29,30,31)

    JMP(END)

    LABEL(COLSTORBZ)

    UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11)
    UPDATE_C_BZ_FOUR_ROWS(12,13,14,15)
    UPDATE_C_BZ_FOUR_ROWS(16,17,18,19)
    UPDATE_C_BZ_FOUR_ROWS(20,21,22,23)
    UPDATE_C_BZ_FOUR_ROWS(24,25,26,27)
    UPDATE_C_BZ_FOUR_ROWS(28,29,30,31)

    JMP(END)

    LABEL(SCATTEREDUPDATE)

    MOV(RDI, VAR(offsetPtr))
    VMOVAPS(ZMM(2), MEM(RDI))
    /* Note that this ignores the upper 32 bits in cs_c */
    VPBROADCASTD(ZMM(3), EBX)
    VPMULLD(ZMM(2), ZMM(3), ZMM(2)) //ZMM2 = column offsets scaled by cs_c

    VMOVD(EDX, XMM(1))
    SAL(EDX) //shift out sign bit to test for beta == 0
    JZ(SCATTERBZ)

    UPDATE_C_ROW_SCATTERED( 8)
    UPDATE_C_ROW_SCATTERED( 9)
    UPDATE_C_ROW_SCATTERED(10)
    UPDATE_C_ROW_SCATTERED(11)
    UPDATE_C_ROW_SCATTERED(12)
    UPDATE_C_ROW_SCATTERED(13)
    UPDATE_C_ROW_SCATTERED(14)
    UPDATE_C_ROW_SCATTERED(15)
    UPDATE_C_ROW_SCATTERED(16)
    UPDATE_C_ROW_SCATTERED(17)
    UPDATE_C_ROW_SCATTERED(18)
    UPDATE_C_ROW_SCATTERED(19)
    UPDATE_C_ROW_SCATTERED(20)
    UPDATE_C_ROW_SCATTERED(21)
    UPDATE_C_ROW_SCATTERED(22)
    UPDATE_C_ROW_SCATTERED(23)
    UPDATE_C_ROW_SCATTERED(24)
    UPDATE_C_ROW_SCATTERED(25)
    UPDATE_C_ROW_SCATTERED(26)
    UPDATE_C_ROW_SCATTERED(27)
    UPDATE_C_ROW_SCATTERED(28)
    UPDATE_C_ROW_SCATTERED(29)
    UPDATE_C_ROW_SCATTERED(30)
    UPDATE_C_ROW_SCATTERED(31)

    JMP(END)

    LABEL(SCATTERBZ)

    UPDATE_C_BZ_ROW_SCATTERED( 8)
    UPDATE_C_BZ_ROW_SCATTERED( 9)
    UPDATE_C_BZ_ROW_SCATTERED(10)
    UPDATE_C_BZ_ROW_SCATTERED(11)
    UPDATE_C_BZ_ROW_SCATTERED(12)
    UPDATE_C_BZ_ROW_SCATTERED(13)
    UPDATE_C_BZ_ROW_SCATTERED(14)
    UPDATE_C_BZ_ROW_SCATTERED(15)
    UPDATE_C_BZ_ROW_SCATTERED(16)
    UPDATE_C_BZ_ROW_SCATTERED(17)
    UPDATE_C_BZ_ROW_SCATTERED(18)
    UPDATE_C_BZ_ROW_SCATTERED(19)
    UPDATE_C_BZ_ROW_SCATTERED(20)
    UPDATE_C_BZ_ROW_SCATTERED(21)
    UPDATE_C_BZ_ROW_SCATTERED(22)
    UPDATE_C_BZ_ROW_SCATTERED(23)
    UPDATE_C_BZ_ROW_SCATTERED(24)
    UPDATE_C_BZ_ROW_SCATTERED(25)
    UPDATE_C_BZ_ROW_SCATTERED(26)
    UPDATE_C_BZ_ROW_SCATTERED(27)
    UPDATE_C_BZ_ROW_SCATTERED(28)
    UPDATE_C_BZ_ROW_SCATTERED(29)
    UPDATE_C_BZ_ROW_SCATTERED(30)
    UPDATE_C_BZ_ROW_SCATTERED(31)

    LABEL(END)

#ifdef MONITORS
    RDTSC
    MOV(VAR(botl), EAX)
    MOV(VAR(both), EDX)
#endif

    END_ASM(
    : // output operands
#ifdef MONITORS
      [topl]  "=m" (topl),
      [toph]  "=m" (toph),
      [midl]  "=m" (midl),
      [midh]  "=m" (midh),
      [mid2l] "=m" (mid2l),
      [mid2h] "=m" (mid2h),
      [botl]  "=m" (botl),
      [both]  "=m" (both)
#endif
    : // input operands
      [k]         "m" (k),
      [a]         "m" (a),
      [b]         "m" (b),
      [alpha]     "m" (alpha),
      [beta]      "m" (beta),
      [c]         "m" (c),
      [rs_c]      "m" (rs_c),
      [cs_c]      "m" (cs_c),
      [a_next]    "m" (a_next),
      [b_next]    "m" (b_next),
      [offsetPtr] "m" (offsetPtr)
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
      "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
      "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
      "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
      "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
      "zmm30", "zmm31", "memory"
    )

#ifdef LOOPMON
    // Note: the LOOPMON timers are never written by the asm above.
    printf("looptime = \t%u\n", bloopl - tloopl);
#endif
#ifdef MONITORS
    unsigned long long top  = ((unsigned long long)toph  << 32) | topl;
    unsigned long long mid  = ((unsigned long long)midh  << 32) | midl;
    unsigned long long mid2 = ((unsigned long long)mid2h << 32) | mid2l;
    unsigned long long bot  = ((unsigned long long)both  << 32) | botl;
    printf("setup =\t%llu\tmain loop =\t%llu\tcleanup=\t%llu\ttotal=\t%llu\n",
           mid - top, mid2 - mid, bot - mid2, bot - top);
#endif
}