1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name(s) of the copyright holder(s) nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
25 OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
29 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "blis.h"
36 #include <assert.h>
37
38 #define BLIS_ASM_SYNTAX_INTEL
39 #include "bli_x86_asm_macros.h"
40
41 #define UNROLL_K 32
42
43 #define SCATTER_PREFETCH_C 1
44
45 #define PREFETCH_A_L2 0
46 #define PREFETCH_B_L2 0
47 #define L2_PREFETCH_DIST 64
48
49 #define A_L1_PREFETCH_DIST 36
50 #define B_L1_PREFETCH_DIST 18
51
52 #define LOOP_ALIGN ALIGN16
53
54 #define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \
55 \
56 VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
57 VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
58 VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
59 VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
60 VFMADD231PS(ZMM(R1), ZMM(1), MEM(RCX )) \
61 VFMADD231PS(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \
62 VFMADD231PS(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \
63 VFMADD231PS(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \
64 VMOVUPS(MEM(RCX ), ZMM(R1)) \
65 VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
66 VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
67 VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
68 LEA(RCX, MEM(RCX,RAX,4))
69
70 #define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \
71 \
72 VMULPS(ZMM(R1), ZMM(R1), ZMM(0)) \
73 VMULPS(ZMM(R2), ZMM(R2), ZMM(0)) \
74 VMULPS(ZMM(R3), ZMM(R3), ZMM(0)) \
75 VMULPS(ZMM(R4), ZMM(R4), ZMM(0)) \
76 VMOVUPS(MEM(RCX ), ZMM(R1)) \
77 VMOVUPS(MEM(RCX,RAX,1), ZMM(R2)) \
78 VMOVUPS(MEM(RCX,RAX,2), ZMM(R3)) \
79 VMOVUPS(MEM(RCX,RDI,1), ZMM(R4)) \
80 LEA(RCX, MEM(RCX,RAX,4))
81
82 #define UPDATE_C_ROW_SCATTERED(NUM) \
83 \
84 KXNORW(K(1), K(0), K(0)) \
85 KXNORW(K(2), K(0), K(0)) \
86 VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
87 VGATHERDPS(ZMM(3) MASK_K(1), MEM(RCX,ZMM(2),4)) \
88 VFMADD231PS(ZMM(NUM), ZMM(3), ZMM(1)) \
89 VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(2), ZMM(NUM)) \
90 ADD(RCX, RAX)
91
92 #define UPDATE_C_BZ_ROW_SCATTERED(NUM) \
93 \
94 KXNORW(K(1), K(0), K(0)) \
95 VMULPS(ZMM(NUM), ZMM(NUM), ZMM(0)) \
96 VSCATTERDPS(MEM(RCX,ZMM(2),4) MASK_K(1), ZMM(NUM)) \
97 ADD(RCX, RAX)
98
99 #define PREFETCH_A_L1_1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4))
100 #define PREFETCH_A_L1_2(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*24*4+64))
101
102 #if PREFETCH_A_L2
103 #undef PREFETCH_A_L2
104
105 #define PREFETCH_A_L2(n) \
106 \
107 PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4)) \
108 PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*24*4+64))
109
110 #else
111 #undef PREFETCH_A_L2
112 #define PREFETCH_A_L2(...)
113 #endif
114
115 #define PREFETCH_B_L1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*16*4))
116
117 #if PREFETCH_B_L2
118 #undef PREFETCH_B_L2
119
120 #define PREFETCH_B_L2(n) PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*16*4))
121
122 #else
123 #undef PREFETCH_B_L2
124 #define PREFETCH_B_L2(...)
125 #endif
126
127 #define PREFETCH_C_L1_1
128 #define PREFETCH_C_L1_2
129 #define PREFETCH_C_L1_3
130
131 //
132 // n: index in unrolled loop
133 //
134 // a: ZMM register to load into
135 // b: ZMM register to read from
136 //
137 // ...: addressing for A, except for offset
138 //
139 #define SUBITER(n,a,b,...) \
140 \
141 PREFETCH_A_L2(n) \
142 \
143 VMOVAPS(ZMM(a), MEM(RBX,(n+1)*64)) \
144 VFMADD231PS(ZMM( 8), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 0)*4)) \
145 VFMADD231PS(ZMM( 9), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 1)*4)) \
146 VFMADD231PS(ZMM(10), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 2)*4)) \
147 PREFETCH_A_L1_1(n) \
148 VFMADD231PS(ZMM(11), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 3)*4)) \
149 VFMADD231PS(ZMM(12), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 4)*4)) \
150 VFMADD231PS(ZMM(13), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 5)*4)) \
151 PREFETCH_C_L1_1 \
152 VFMADD231PS(ZMM(14), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 6)*4)) \
153 VFMADD231PS(ZMM(15), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 7)*4)) \
154 VFMADD231PS(ZMM(16), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 8)*4)) \
155 PREFETCH_A_L1_2(n) \
156 VFMADD231PS(ZMM(17), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+ 9)*4)) \
157 VFMADD231PS(ZMM(18), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+10)*4)) \
158 VFMADD231PS(ZMM(19), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+11)*4)) \
159 PREFETCH_C_L1_2 \
160 VFMADD231PS(ZMM(20), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+12)*4)) \
161 VFMADD231PS(ZMM(21), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+13)*4)) \
162 VFMADD231PS(ZMM(22), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+14)*4)) \
163 PREFETCH_C_L1_3 \
164 VFMADD231PS(ZMM(23), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+15)*4)) \
165 VFMADD231PS(ZMM(24), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+16)*4)) \
166 VFMADD231PS(ZMM(25), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+17)*4)) \
167 PREFETCH_B_L1(n) \
168 VFMADD231PS(ZMM(26), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+18)*4)) \
169 VFMADD231PS(ZMM(27), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+19)*4)) \
170 VFMADD231PS(ZMM(28), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+20)*4)) \
171 PREFETCH_B_L2(n) \
172 VFMADD231PS(ZMM(29), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+21)*4)) \
173 VFMADD231PS(ZMM(30), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+22)*4)) \
174 VFMADD231PS(ZMM(31), ZMM(b), MEM_1TO16(__VA_ARGS__,((n%%4)*24+23)*4))
175
176 //This is an array used for the scatter/gather instructions.
177 static int32_t offsets[32] __attribute__((aligned(64))) =
178 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
179 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
180
181 //#define MONITORS
182 //#define LOOPMON
bli_sgemm_knl_asm_24x16(dim_t k_,float * restrict alpha,float * restrict a,float * restrict b,float * restrict beta,float * restrict c,inc_t rs_c_,inc_t cs_c_,auxinfo_t * restrict data,cntx_t * restrict cntx)183 void bli_sgemm_knl_asm_24x16
184 (
185 dim_t k_,
186 float* restrict alpha,
187 float* restrict a,
188 float* restrict b,
189 float* restrict beta,
190 float* restrict c, inc_t rs_c_, inc_t cs_c_,
191 auxinfo_t* restrict data,
192 cntx_t* restrict cntx
193 )
194 {
195 (void)data;
196 (void)cntx;
197
198 const double * a_next = bli_auxinfo_next_a( data );
199 const double * b_next = bli_auxinfo_next_b( data );
200
201 const int32_t * offsetPtr = &offsets[0];
202 const int64_t k = k_;
203 const int64_t rs_c = rs_c_;
204 const int64_t cs_c = cs_c_;
205
206 #ifdef MONITORS
207 int toph, topl, both, botl, midl, midh, mid2l, mid2h;
208 #endif
209 #ifdef LOOPMON
210 int tlooph, tloopl, blooph, bloopl;
211 #endif
212
213 BEGIN_ASM()
214
215 #ifdef MONITORS
216 RDTSC
217 MOV(VAR(topl), EAX)
218 MOV(VAR(toph), EDX)
219 #endif
220
221 VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers
222 VMOVAPS(ZMM( 9), ZMM(8)) MOV(R12, VAR(rs_c))
223 VMOVAPS(ZMM(10), ZMM(8)) MOV(RSI, VAR(k)) //loop index
224 VMOVAPS(ZMM(11), ZMM(8)) MOV(RAX, VAR(a)) //load address of a
225 VMOVAPS(ZMM(12), ZMM(8)) MOV(RBX, VAR(b)) //load address of b
226 VMOVAPS(ZMM(13), ZMM(8)) MOV(RCX, VAR(c)) //load address of c
227 VMOVAPS(ZMM(14), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX)) //pre-load b
228 VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr))
229 VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI))
230 #if SCATTER_PREFETCH_C
231 VMOVAPS(ZMM(17), ZMM(8))
232 VMOVAPS(ZMM(18), ZMM(8))
233 VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c))
234 VMOVAPS(ZMM(20), ZMM(8))
235 VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5))
236 VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64))
237 VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5))
238 #else
239 VMOVAPS(ZMM(17), ZMM(8))
240 VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2))
241 VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4))
242 VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4))
243 VMOVAPS(ZMM(21), ZMM(8))
244 VMOVAPS(ZMM(22), ZMM(8))
245 VMOVAPS(ZMM(23), ZMM(8))
246 #endif
247 VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(2))
248 VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*4)) //offset for 4 iterations
249 VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3
250 VMOVAPS(ZMM(27), ZMM(8)) LEA(R10, MEM(R8,R8,4)) //*5
251 VMOVAPS(ZMM(28), ZMM(8)) LEA(R11, MEM(R9,R8,4)) //*7
252 VMOVAPS(ZMM(29), ZMM(8))
253 VMOVAPS(ZMM(30), ZMM(8))
254 VMOVAPS(ZMM(31), ZMM(8))
255
256 #ifdef MONITORS
257 RDTSC
258 MOV(VAR(midl), EAX)
259 MOV(VAR(midh), EDX)
260 #endif
261
262 SUB(RSI, IMM(32))
263 JLE(TAIL)
264
265 //prefetch C into L2
266 #if SCATTER_PREFETCH_C
267 ADD(RSI, IMM(24))
268 KXNORW(K(1), K(0), K(0))
269 KXNORW(K(2), K(0), K(0))
270 VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
271 VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
272 #else
273 PREFETCHW1(MEM(RCX ))
274 SUBITER( 0,1,0,RAX )
275 PREFETCHW1(MEM(RCX,R12,1))
276 SUBITER( 1,0,1,RAX )
277 PREFETCHW1(MEM(RCX,R12,2))
278 SUBITER( 2,1,0,RAX )
279 PREFETCHW1(MEM(RCX,R13,1))
280 SUBITER( 3,0,1,RAX )
281 PREFETCHW1(MEM(RCX,R12,4))
282 SUBITER( 4,1,0,RAX,R8, 1)
283 PREFETCHW1(MEM(RCX,R14,1))
284 SUBITER( 5,0,1,RAX,R8, 1)
285 PREFETCHW1(MEM(RCX,R13,2))
286 SUBITER( 6,1,0,RAX,R8, 1)
287 PREFETCHW1(MEM(RCX,R15,1))
288 SUBITER( 7,0,1,RAX,R8, 1)
289
290 LEA(RDX, MEM(RCX,R12,8))
291
292 PREFETCHW1(MEM(RDX ))
293 SUBITER( 8,1,0,RAX,R8, 2)
294 PREFETCHW1(MEM(RDX,R12,1))
295 SUBITER( 9,0,1,RAX,R8, 2)
296 PREFETCHW1(MEM(RDX,R12,2))
297 SUBITER(10,1,0,RAX,R8, 2)
298 PREFETCHW1(MEM(RDX,R13,1))
299 SUBITER(11,0,1,RAX,R8, 2)
300 PREFETCHW1(MEM(RDX,R12,4))
301 SUBITER(12,1,0,RAX,R9, 1)
302 PREFETCHW1(MEM(RDX,R14,1))
303 SUBITER(13,0,1,RAX,R9, 1)
304 PREFETCHW1(MEM(RDX,R13,2))
305 SUBITER(14,1,0,RAX,R9, 1)
306 PREFETCHW1(MEM(RDX,R15,1))
307 SUBITER(15,0,1,RAX,R9, 1)
308
309 LEA(RDI, MEM(RDX,R12,8))
310
311 PREFETCHW1(MEM(RDI ))
312 SUBITER(16,1,0,RAX,R8, 4)
313 PREFETCHW1(MEM(RDI,R12,1))
314 SUBITER(17,0,1,RAX,R8, 4)
315 PREFETCHW1(MEM(RDI,R12,2))
316 SUBITER(18,1,0,RAX,R8, 4)
317 PREFETCHW1(MEM(RDI,R13,1))
318 SUBITER(19,0,1,RAX,R8, 4)
319 PREFETCHW1(MEM(RDI,R12,4))
320 SUBITER(20,1,0,RAX,R10,1)
321 PREFETCHW1(MEM(RDI,R14,1))
322 SUBITER(21,0,1,RAX,R10,1)
323 PREFETCHW1(MEM(RDI,R13,2))
324 SUBITER(22,1,0,RAX,R10,1)
325 PREFETCHW1(MEM(RDI,R15,1))
326 SUBITER(23,0,1,RAX,R10,1)
327
328 ADD(RAX, IMM(24*24*4))
329 ADD(RBX, IMM(24*16*4))
330 #endif
331
332 MOV(RDI, RSI)
333 AND(RDI, IMM(31))
334 SAR(RSI, IMM(5))
335 JZ(REM_1)
336
337 LOOP_ALIGN
338 LABEL(MAIN_LOOP)
339
340 SUBITER( 0,1,0,RAX )
341 SUBITER( 1,0,1,RAX )
342 SUBITER( 2,1,0,RAX )
343 SUBITER( 3,0,1,RAX )
344 SUBITER( 4,1,0,RAX,R8, 1)
345 SUBITER( 5,0,1,RAX,R8, 1)
346 SUBITER( 6,1,0,RAX,R8, 1)
347 SUBITER( 7,0,1,RAX,R8, 1)
348 SUBITER( 8,1,0,RAX,R8, 2)
349 SUBITER( 9,0,1,RAX,R8, 2)
350 SUBITER(10,1,0,RAX,R8, 2)
351 SUBITER(11,0,1,RAX,R8, 2)
352 SUBITER(12,1,0,RAX,R9, 1)
353 SUBITER(13,0,1,RAX,R9, 1)
354 SUBITER(14,1,0,RAX,R9, 1)
355 SUBITER(15,0,1,RAX,R9, 1)
356 SUBITER(16,1,0,RAX,R8, 4)
357 SUBITER(17,0,1,RAX,R8, 4)
358 SUBITER(18,1,0,RAX,R8, 4)
359 SUBITER(19,0,1,RAX,R8, 4)
360 SUBITER(20,1,0,RAX,R10,1)
361 SUBITER(21,0,1,RAX,R10,1)
362 SUBITER(22,1,0,RAX,R10,1)
363 SUBITER(23,0,1,RAX,R10,1)
364 SUBITER(24,1,0,RAX,R9, 2)
365 SUBITER(25,0,1,RAX,R9, 2)
366 SUBITER(26,1,0,RAX,R9, 2)
367 SUBITER(27,0,1,RAX,R9, 2)
368 SUBITER(28,1,0,RAX,R11,1)
369 SUBITER(29,0,1,RAX,R11,1)
370 SUBITER(30,1,0,RAX,R11,1)
371 SUBITER(31,0,1,RAX,R11,1)
372
373 ADD(RAX, IMM(32*24*4))
374 ADD(RBX, IMM(32*16*4))
375
376 SUB(RSI, IMM(1))
377
378 JNZ(MAIN_LOOP)
379
380 LABEL(REM_1)
381 SAR(RDI)
382 JNC(REM_2)
383
384 SUBITER(0,1,0,RAX)
385 VMOVAPD(ZMM(0), ZMM(1))
386 ADD(RAX, IMM(24*4))
387 ADD(RBX, IMM(16*4))
388
389 LABEL(REM_2)
390 SAR(RDI)
391 JNC(REM_4)
392
393 SUBITER(0,1,0,RAX)
394 SUBITER(1,0,1,RAX)
395 ADD(RAX, IMM(2*24*4))
396 ADD(RBX, IMM(2*16*4))
397
398 LABEL(REM_4)
399 SAR(RDI)
400 JNC(REM_8)
401
402 SUBITER(0,1,0,RAX)
403 SUBITER(1,0,1,RAX)
404 SUBITER(2,1,0,RAX)
405 SUBITER(3,0,1,RAX)
406 ADD(RAX, IMM(4*24*4))
407 ADD(RBX, IMM(4*16*4))
408
409 LABEL(REM_8)
410 SAR(RDI)
411 JNC(REM_16)
412
413 SUBITER(0,1,0,RAX )
414 SUBITER(1,0,1,RAX )
415 SUBITER(2,1,0,RAX )
416 SUBITER(3,0,1,RAX )
417 SUBITER(4,1,0,RAX,R8,1)
418 SUBITER(5,0,1,RAX,R8,1)
419 SUBITER(6,1,0,RAX,R8,1)
420 SUBITER(7,0,1,RAX,R8,1)
421 ADD(RAX, IMM(8*24*4))
422 ADD(RBX, IMM(8*16*4))
423
424 LABEL(REM_16)
425 SAR(RDI)
426 JNC(AFTER_LOOP)
427
428 SUBITER( 0,1,0,RAX )
429 SUBITER( 1,0,1,RAX )
430 SUBITER( 2,1,0,RAX )
431 SUBITER( 3,0,1,RAX )
432 SUBITER( 4,1,0,RAX,R8, 1)
433 SUBITER( 5,0,1,RAX,R8, 1)
434 SUBITER( 6,1,0,RAX,R8, 1)
435 SUBITER( 7,0,1,RAX,R8, 1)
436 SUBITER( 8,1,0,RAX,R8, 2)
437 SUBITER( 9,0,1,RAX,R8, 2)
438 SUBITER(10,1,0,RAX,R8, 2)
439 SUBITER(11,0,1,RAX,R8, 2)
440 SUBITER(12,1,0,RAX,R9, 1)
441 SUBITER(13,0,1,RAX,R9, 1)
442 SUBITER(14,1,0,RAX,R9, 1)
443 SUBITER(15,0,1,RAX,R9, 1)
444 ADD(RAX, IMM(16*24*4))
445 ADD(RBX, IMM(16*16*4))
446
447 LABEL(AFTER_LOOP)
448
449 //prefetch C into L1
450 #if SCATTER_PREFETCH_C
451 KXNORW(K(1), K(0), K(0))
452 KXNORW(K(2), K(0), K(0))
453 VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1))
454 VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2))
455
456 SUBITER(0,1,0,RAX )
457 SUBITER(1,0,1,RAX )
458 SUBITER(2,1,0,RAX )
459 SUBITER(3,0,1,RAX )
460 SUBITER(4,1,0,RAX,R8,1)
461 SUBITER(5,0,1,RAX,R8,1)
462 SUBITER(6,1,0,RAX,R8,1)
463 SUBITER(7,0,1,RAX,R8,1)
464 #else
465
466 LEA(RDX, MEM(RCX,R12,8))
467 LEA(RDI, MEM(RDX,R12,8))
468
469 #undef PREFETCH_C_L1_1
470 #undef PREFETCH_C_L1_2
471 #undef PREFETCH_C_L1_3
472 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX ))
473 #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,1))
474 #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R12,2))
475 SUBITER(0,1,0,RAX )
476 #undef PREFETCH_C_L1_1
477 #undef PREFETCH_C_L1_2
478 #undef PREFETCH_C_L1_3
479 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,1))
480 #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R12,4))
481 #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RCX,R14,1))
482 SUBITER(1,0,1,RAX )
483 #undef PREFETCH_C_L1_1
484 #undef PREFETCH_C_L1_2
485 #undef PREFETCH_C_L1_3
486 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RCX,R13,2))
487 #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RCX,R15,1))
488 #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX ))
489 SUBITER(2,1,0,RAX )
490 #undef PREFETCH_C_L1_1
491 #undef PREFETCH_C_L1_2
492 #undef PREFETCH_C_L1_3
493 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,1))
494 #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R12,2))
495 #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,1))
496 SUBITER(3,0,1,RAX )
497 #undef PREFETCH_C_L1_1
498 #undef PREFETCH_C_L1_2
499 #undef PREFETCH_C_L1_3
500 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R12,4))
501 #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDX,R14,1))
502 #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDX,R13,2))
503 SUBITER(4,1,0,RAX,R8,1)
504 #undef PREFETCH_C_L1_1
505 #undef PREFETCH_C_L1_2
506 #undef PREFETCH_C_L1_3
507 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDX,R15,1))
508 #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI ))
509 #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,1))
510 SUBITER(5,0,1,RAX,R8,1)
511 #undef PREFETCH_C_L1_1
512 #undef PREFETCH_C_L1_2
513 #undef PREFETCH_C_L1_3
514 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R12,2))
515 #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,1))
516 #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R12,4))
517 SUBITER(6,1,0,RAX,R8,1)
518 #undef PREFETCH_C_L1_1
519 #undef PREFETCH_C_L1_2
520 #undef PREFETCH_C_L1_3
521 #define PREFETCH_C_L1_1 PREFETCHW0(MEM(RDI,R14,1))
522 #define PREFETCH_C_L1_2 PREFETCHW0(MEM(RDI,R13,2))
523 #define PREFETCH_C_L1_3 PREFETCHW0(MEM(RDI,R15,1))
524 SUBITER(7,0,1,RAX,R8,1)
525 #endif
526
527 JMP(POSTACCUM)
528
529 LABEL(TAIL)
530
531 MOV(RDX, RCX)
532 ADD(RSI, IMM(32))
533 JZ(POSTACCUM)
534
535 LABEL(TAIL_LOOP)
536
537 PREFETCHW0(MEM(RDX))
538 ADD(RDX, R12)
539
540 SUBITER(0,1,0,RAX)
541 VMOVAPD(ZMM(0), ZMM(1))
542 ADD(RAX, IMM(24*4))
543 ADD(RBX, IMM(16*4))
544
545 SUB(RSI, IMM(1))
546
547 JNZ(TAIL_LOOP)
548
549 LABEL(POSTACCUM)
550
551 #ifdef MONITORS
552 RDTSC
553 MOV(VAR(mid2l), EAX)
554 MOV(VAR(mid2h), EDX)
555 #endif
556
557 MOV(RAX, VAR(alpha))
558 MOV(RBX, VAR(beta))
559 VBROADCASTSS(ZMM(0), MEM(RAX))
560 VBROADCASTSS(ZMM(1), MEM(RBX))
561
562 // Check if C is row stride. If not, jump to the slow scattered update
563 MOV(RAX, VAR(rs_c))
564 LEA(RAX, MEM(,RAX,4))
565 MOV(RBX, VAR(cs_c))
566 LEA(RDI, MEM(RAX,RAX,2))
567 CMP(RBX, IMM(1))
568 JNE(SCATTEREDUPDATE)
569
570 VMOVD(EDX, XMM(1))
571 SAL(EDX) //shift out sign bit
572 JZ(COLSTORBZ)
573
574 UPDATE_C_FOUR_ROWS( 8, 9,10,11)
575 UPDATE_C_FOUR_ROWS(12,13,14,15)
576 UPDATE_C_FOUR_ROWS(16,17,18,19)
577 UPDATE_C_FOUR_ROWS(20,21,22,23)
578 UPDATE_C_FOUR_ROWS(24,25,26,27)
579 UPDATE_C_FOUR_ROWS(28,29,30,31)
580
581 JMP(END)
582
583 LABEL(COLSTORBZ)
584
585 UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11)
586 UPDATE_C_BZ_FOUR_ROWS(12,13,14,15)
587 UPDATE_C_BZ_FOUR_ROWS(16,17,18,19)
588 UPDATE_C_BZ_FOUR_ROWS(20,21,22,23)
589 UPDATE_C_BZ_FOUR_ROWS(24,25,26,27)
590 UPDATE_C_BZ_FOUR_ROWS(28,29,30,31)
591
592 JMP(END)
593
594 LABEL(SCATTEREDUPDATE)
595
596 MOV(RDI, VAR(offsetPtr))
597 VMOVAPS(ZMM(2), MEM(RDI))
598 /* Note that this ignores the upper 32 bits in cs_c */
599 VPBROADCASTD(ZMM(3), EBX)
600 VPMULLD(ZMM(2), ZMM(3), ZMM(2))
601
602 VMOVD(EDX, XMM(1))
603 SAL(EDX) //shift out sign bit
604 JZ(SCATTERBZ)
605
606 UPDATE_C_ROW_SCATTERED( 8)
607 UPDATE_C_ROW_SCATTERED( 9)
608 UPDATE_C_ROW_SCATTERED(10)
609 UPDATE_C_ROW_SCATTERED(11)
610 UPDATE_C_ROW_SCATTERED(12)
611 UPDATE_C_ROW_SCATTERED(13)
612 UPDATE_C_ROW_SCATTERED(14)
613 UPDATE_C_ROW_SCATTERED(15)
614 UPDATE_C_ROW_SCATTERED(16)
615 UPDATE_C_ROW_SCATTERED(17)
616 UPDATE_C_ROW_SCATTERED(18)
617 UPDATE_C_ROW_SCATTERED(19)
618 UPDATE_C_ROW_SCATTERED(20)
619 UPDATE_C_ROW_SCATTERED(21)
620 UPDATE_C_ROW_SCATTERED(22)
621 UPDATE_C_ROW_SCATTERED(23)
622 UPDATE_C_ROW_SCATTERED(24)
623 UPDATE_C_ROW_SCATTERED(25)
624 UPDATE_C_ROW_SCATTERED(26)
625 UPDATE_C_ROW_SCATTERED(27)
626 UPDATE_C_ROW_SCATTERED(28)
627 UPDATE_C_ROW_SCATTERED(29)
628 UPDATE_C_ROW_SCATTERED(30)
629 UPDATE_C_ROW_SCATTERED(31)
630
631 JMP(END)
632
633 LABEL(SCATTERBZ)
634
635 UPDATE_C_BZ_ROW_SCATTERED( 8)
636 UPDATE_C_BZ_ROW_SCATTERED( 9)
637 UPDATE_C_BZ_ROW_SCATTERED(10)
638 UPDATE_C_BZ_ROW_SCATTERED(11)
639 UPDATE_C_BZ_ROW_SCATTERED(12)
640 UPDATE_C_BZ_ROW_SCATTERED(13)
641 UPDATE_C_BZ_ROW_SCATTERED(14)
642 UPDATE_C_BZ_ROW_SCATTERED(15)
643 UPDATE_C_BZ_ROW_SCATTERED(16)
644 UPDATE_C_BZ_ROW_SCATTERED(17)
645 UPDATE_C_BZ_ROW_SCATTERED(18)
646 UPDATE_C_BZ_ROW_SCATTERED(19)
647 UPDATE_C_BZ_ROW_SCATTERED(20)
648 UPDATE_C_BZ_ROW_SCATTERED(21)
649 UPDATE_C_BZ_ROW_SCATTERED(22)
650 UPDATE_C_BZ_ROW_SCATTERED(23)
651 UPDATE_C_BZ_ROW_SCATTERED(24)
652 UPDATE_C_BZ_ROW_SCATTERED(25)
653 UPDATE_C_BZ_ROW_SCATTERED(26)
654 UPDATE_C_BZ_ROW_SCATTERED(27)
655 UPDATE_C_BZ_ROW_SCATTERED(28)
656 UPDATE_C_BZ_ROW_SCATTERED(29)
657 UPDATE_C_BZ_ROW_SCATTERED(30)
658 UPDATE_C_BZ_ROW_SCATTERED(31)
659
660 LABEL(END)
661
662 #ifdef MONITORS
663 RDTSC
664 MOV(VAR(botl), EAX)
665 MOV(VAR(both), EDX)
666 #endif
667
668 END_ASM(
669 : // output operands
670 #ifdef MONITORS
671 [topl] "=m" (topl),
672 [toph] "=m" (toph),
673 [midl] "=m" (midl),
674 [midh] "=m" (midh),
675 [mid2l] "=m" (mid2l),
676 [mid2h] "=m" (mid2h),
677 [botl] "=m" (botl),
678 [both] "=m" (both)
679 #endif
680 : // input operands
681 [k] "m" (k),
682 [a] "m" (a),
683 [b] "m" (b),
684 [alpha] "m" (alpha),
685 [beta] "m" (beta),
686 [c] "m" (c),
687 [rs_c] "m" (rs_c),
688 [cs_c] "m" (cs_c),
689 [a_next] "m" (a_next),
690 [b_next] "m" (b_next),
691 [offsetPtr] "m" (offsetPtr)
692 : // register clobber list
693 "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
694 "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
695 "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
696 "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
697 "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
698 "zmm30", "zmm31", "memory"
699 )
700
701 #ifdef LOOPMON
702 printf("looptime = \t%d\n", bloopl - tloopl);
703 #endif
704 #ifdef MONITORS
705 dim_t top = ((dim_t)toph << 32) | topl;
706 dim_t mid = ((dim_t)midh << 32) | midl;
707 dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
708 dim_t bot = ((dim_t)both << 32) | botl;
709 printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top);
710 #endif
711 }
712