1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name(s) of the copyright holder(s) nor the names of its
18 contributors may be used to endorse or promote products derived
19 from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
25 OF TEXAS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
29 OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "blis.h"
36 #include <assert.h>
37
38 #include "bli_avx512_macros.h"
39
// ---------------------------------------------------------------------------
// Compile-time tuning knobs for the KNL 8x24 dgemm microkernel.
// ---------------------------------------------------------------------------

// k-loop unroll factor for the bulk iterations; a matching
// MAIN_LOOP_<UNROLL_K> macro must exist below.
#define UNROLL_K 8

// Nonzero selects the gather/scatter-style (VGATHERPF*/VSCATTERPF*) prefetch
// variants for the A/B panels and for C, respectively (see #if blocks below).
#define SCATTER_PREFETCH_AB 0
#define SCATTER_PREFETCH_C 1

// Enable L2 prefetching of the A and B panels, and how far ahead (in k
// iterations) those L2 prefetches reach.
#define PREFETCH_A_L2 0
#define PREFETCH_B_L2 0
#define L2_PREFETCH_DIST 64

// L1 prefetch distances (in k iterations) for the A and B panels.
#define A_L1_PREFETCH_DIST 32
#define B_L1_PREFETCH_DIST 12

#define C_MIN_L2_ITERS 64 //C is not prefetched into L2 for k <= this
#define C_L1_ITERS 8 //number of iterations before the end to prefetch C into L1
                     //make sure there is an unrolled MAIN_LOOP_X for this number

#define LOOP_ALIGN ALIGN16
57
// Update four consecutive columns of C on the unit-row-stride path:
//   C_col := beta*C_col + alpha*acc
// ZMM0 holds broadcast alpha, ZMM1 broadcast beta; RCX points at the current
// column of C, RAX = cs_c*8 bytes, RDI = 3*cs_c*8 bytes.  Each ZMM register
// holds 8 contiguous C elements (one column of the 8x24 tile on this
// rs_c == 1 path; the "ROWS" in the name is kept from the original).
// Advances RCX by four columns.
#define UPDATE_C_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VFMADD231PD(ZMM(R1), ZMM(1), MEM(RCX      )) \
    VFMADD231PD(ZMM(R2), ZMM(1), MEM(RCX,RAX,1)) \
    VFMADD231PD(ZMM(R3), ZMM(1), MEM(RCX,RAX,2)) \
    VFMADD231PD(ZMM(R4), ZMM(1), MEM(RCX,RDI,1)) \
    VMOVUPD(MEM(RCX      ), ZMM(R1)) \
    VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

// Beta-zero variant of the above: C is overwritten with alpha*acc and never
// read (so pre-existing contents, e.g. NaNs, cannot leak into the result).
#define UPDATE_C_BZ_FOUR_ROWS(R1,R2,R3,R4) \
\
    VMULPD(ZMM(R1), ZMM(R1), ZMM(0)) \
    VMULPD(ZMM(R2), ZMM(R2), ZMM(0)) \
    VMULPD(ZMM(R3), ZMM(R3), ZMM(0)) \
    VMULPD(ZMM(R4), ZMM(R4), ZMM(0)) \
    VMOVUPD(MEM(RCX      ), ZMM(R1)) \
    VMOVUPD(MEM(RCX,RAX,1), ZMM(R2)) \
    VMOVUPD(MEM(RCX,RAX,2), ZMM(R3)) \
    VMOVUPD(MEM(RCX,RDI,1), ZMM(R4)) \
    LEA(RCX, MEM(RCX,RAX,4))

// General-stride update of 8 elements of C:
//   gather C at RCX + index*8 with dword indices prebuilt in YMM2
//   (offsets * rs_c), compute acc := alpha*acc + beta*C, scatter back,
//   then advance RCX by one column stride (RAX = cs_c*8).
// K1/K2 are refreshed to all-ones before each gather/scatter because the
// instructions consume (clear) their mask as lanes complete.
#define UPDATE_C_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    KXNORW(K(2), K(0), K(0)) \
    VMULPD(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VGATHERDPD(ZMM(3) MASK_K(1), MEM(RCX,YMM(2),8)) \
    VFMADD231PD(ZMM(NUM), ZMM(3), ZMM(1)) \
    VSCATTERDPD(MEM(RCX,YMM(2),8) MASK_K(2), ZMM(NUM)) \
    ADD(RCX, RAX)

// Beta-zero variant of the scattered update: scatter alpha*acc without
// reading C first.
#define UPDATE_C_BZ_ROW_SCATTERED(NUM) \
\
    KXNORW(K(1), K(0), K(0)) \
    VMULPD(ZMM(NUM), ZMM(NUM), ZMM(0)) \
    VSCATTERDPD(MEM(RCX,YMM(2),8) MASK_K(1), ZMM(NUM)) \
    ADD(RCX, RAX)
102
// One k iteration of the packed B panel is 24 doubles = 3 cache lines;
// prefetch each of the three lines B_L1_PREFETCH_DIST iterations ahead
// into L1 (RBX tracks the current B position).
#define PREFETCH_B_L1_1(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*24*8))
#define PREFETCH_B_L1_2(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*24*8+64))
#define PREFETCH_B_L1_3(n) PREFETCH(0, MEM(RBX,(B_L1_PREFETCH_DIST+n)*24*8+128))

// PREFETCH_B_L2 / PREFETCH_A_L2 double as both the on/off switch (the 0/1
// value defined above) and the macro emitted inside SUBITER: when enabled
// they are redefined here to emit L2 prefetches L2_PREFETCH_DIST iterations
// ahead, otherwise they are redefined to expand to nothing.
#if PREFETCH_B_L2
#undef PREFETCH_B_L2

#define PREFETCH_B_L2(n) \
\
    PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*24*8)) \
    PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*24*8+64)) \
    PREFETCH(1, MEM(RBX,(L2_PREFETCH_DIST+n)*24*8+128))

#else
#undef PREFETCH_B_L2
#define PREFETCH_B_L2(...)
#endif

// One k iteration of the packed A panel is 8 doubles = 1 cache line.
#define PREFETCH_A_L1(n) PREFETCH(0, MEM(RAX,(A_L1_PREFETCH_DIST+n)*8*8))

#if PREFETCH_A_L2
#undef PREFETCH_A_L2

#define PREFETCH_A_L2(n) PREFETCH(1, MEM(RAX,(L2_PREFETCH_DIST+n)*8*8))

#else
#undef PREFETCH_A_L2
#define PREFETCH_A_L2(...)
#endif

// When SCATTER_PREFETCH_AB is enabled, the scalar A/B L1 prefetches above
// are replaced by gather-style prefetches covering 16 iterations at a time:
// ZMM4 holds offsets*8 as dword indices, so with scale 8 each of the 16
// lanes touches a distinct 64-byte line.  Each VGATHERPFDPS consumes its
// mask, hence the KXNORW refresh before every one.
#if SCATTER_PREFETCH_AB
#undef SCATTER_PREFETCH_AB
#undef PREFETCH_B_L1_1
#undef PREFETCH_B_L1_2
#undef PREFETCH_B_L1_3
#undef PREFETCH_A_L1

#define SCATTER_PREFETCH_AB(n) \
\
    KXNORW(K(1), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RBX,ZMM(4),8,((3*n  )*16+3*B_L1_PREFETCH_DIST)*64) MASK_K(1)) \
    KXNORW(K(2), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RBX,ZMM(4),8,((3*n+1)*16+3*B_L1_PREFETCH_DIST)*64) MASK_K(2)) \
    KXNORW(K(3), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RBX,ZMM(4),8,((3*n+2)*16+3*B_L1_PREFETCH_DIST)*64) MASK_K(3)) \
    KXNORW(K(4), K(0), K(0)) \
    VGATHERPFDPS(0, MEM(RAX,ZMM(4),8,(  n   *16+  A_L1_PREFETCH_DIST)*64) MASK_K(4))

#define PREFETCH_B_L1_1(...)
#define PREFETCH_B_L1_2(...)
#define PREFETCH_B_L1_3(...)
#define PREFETCH_A_L1(...)

#else
#undef SCATTER_PREFETCH_AB

#define SCATTER_PREFETCH_AB(...)

#endif
162
//
// SUBITER(n,a,b,...): one rank-1 update (one k iteration) of the 8x24 tile.
//
// n:   index within the unrolled loop; used for prefetch displacements and
//      for the B displacement (n%4)*24*8.  The "%%" survives macro
//      stringification as a literal "%", so the modulo is evaluated by the
//      assembler inside the asm template, not by the C preprocessor.
//
// a:   ZMM register to preload the NEXT 8-double column of A into
// b:   ZMM register holding the CURRENT column of A
//
// ...: addressing for B, except for the displacement — a base register and
//      optionally an index,scale pair; the caller supplies larger bases
//      (R8..R11 combinations) for groups of four iterations so the
//      displacement stays small at high unroll factors.
//
// Each of the 24 accumulators ZMM8..ZMM31 gains ZMM(b) times one broadcast
// element of the current B row; A/B prefetches are interleaved every four
// FMAs to spread them across the iteration.
//
#define SUBITER(n,a,b,...) \
\
    PREFETCH_B_L2(n) \
\
    VMOVAPD(ZMM(a), MEM(RAX,(n+1)*64)) \
    VFMADD231PD(ZMM( 8), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 0)*8)) \
    VFMADD231PD(ZMM( 9), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 1)*8)) \
    VFMADD231PD(ZMM(10), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 2)*8)) \
    VFMADD231PD(ZMM(11), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 3)*8)) \
    PREFETCH_B_L1_1(n) \
    VFMADD231PD(ZMM(12), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 4)*8)) \
    VFMADD231PD(ZMM(13), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 5)*8)) \
    VFMADD231PD(ZMM(14), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 6)*8)) \
    VFMADD231PD(ZMM(15), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 7)*8)) \
    PREFETCH_B_L1_2(n) \
    VFMADD231PD(ZMM(16), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 8)*8)) \
    VFMADD231PD(ZMM(17), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+ 9)*8)) \
    VFMADD231PD(ZMM(18), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+10)*8)) \
    VFMADD231PD(ZMM(19), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+11)*8)) \
    PREFETCH_B_L1_3(n) \
    VFMADD231PD(ZMM(20), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+12)*8)) \
    VFMADD231PD(ZMM(21), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+13)*8)) \
    VFMADD231PD(ZMM(22), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+14)*8)) \
    VFMADD231PD(ZMM(23), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+15)*8)) \
    PREFETCH_A_L1(n) \
    VFMADD231PD(ZMM(24), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+16)*8)) \
    VFMADD231PD(ZMM(25), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+17)*8)) \
    VFMADD231PD(ZMM(26), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+18)*8)) \
    VFMADD231PD(ZMM(27), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+19)*8)) \
    PREFETCH_A_L2(n) \
    VFMADD231PD(ZMM(28), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+20)*8)) \
    VFMADD231PD(ZMM(29), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+21)*8)) \
    VFMADD231PD(ZMM(30), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+22)*8)) \
    VFMADD231PD(ZMM(31), ZMM(b), MEM_1TO8(__VA_ARGS__,((n%%4)*24+23)*8))
205
// Remainder loop: one SUBITER per iteration, RDI times.  ZMM0/ZMM1 alternate
// as current/next A column inside the unrolled loops; with a single
// iteration per pass the freshly loaded column (ZMM1) is copied back to ZMM0
// so the invariant "ZMM0 = current column" holds on loop entry.
// RBX/RAX advance one k iteration of B (24*8 bytes) and A (8*8 bytes).
#define TAIL_LOOP(NAME) \
\
    LOOP_ALIGN \
    LABEL(NAME) \
\
    SUBITER(0,1,0,RBX) \
\
    VMOVAPD(ZMM(0), ZMM(1)) \
\
    LEA(RBX, MEM(RBX,24*8)) \
    LEA(RAX, MEM(RAX, 8*8)) \
\
    SUB(RDI, IMM(1)) \
\
    JNZ(NAME)

// Unroll-1 main loop: identical body to TAIL_LOOP but counted by RSI and
// with the label naming convention of the larger main loops.
#define MAIN_LOOP_1(NAME) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SUBITER(0,1,0,RBX) \
\
    VMOVAPD(ZMM(0), ZMM(1)) \
\
    LEA(RBX, MEM(RBX,24*8)) \
    LEA(RAX, MEM(RAX, 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP)

// Unroll-2 main loop.  RSI holds the iteration count on entry; the low bit
// is split off into RDI as the remainder, RSI becomes the number of
// two-iteration passes.  The A-column ping-pong (ZMM0 <-> ZMM1) is encoded
// directly in the alternating SUBITER(…,1,0,…)/SUBITER(…,0,1,…) arguments.
// The single remainder iteration is handled inline.
#define MAIN_LOOP_2(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(1)) \
    SAR1(RSI) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SUBITER(0,1,0,RBX) \
    SUBITER(1,0,1,RBX) \
\
    LEA(RBX, MEM(RBX,2*24*8)) \
    LEA(RAX, MEM(RAX,2* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    LABEL(NAME##_TAIL) \
\
    SUBITER(0,1,0,RBX) \
\
    VMOVAPD(ZMM(0), ZMM(1)) \
\
    LEA(RBX, MEM(RBX,24*8)) \
    LEA(RAX, MEM(RAX, 8*8)) \
\
    LABEL(NAME##_DONE)

// Unroll-4 main loop: as above with a 2-bit remainder, which is drained by
// TAIL_LOOP instead of inline code.
#define MAIN_LOOP_4(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(3)) \
    SAR(RSI, IMM(2)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SUBITER(0,1,0,RBX) \
    SUBITER(1,0,1,RBX) \
    SUBITER(2,1,0,RBX) \
    SUBITER(3,0,1,RBX) \
\
    LEA(RBX, MEM(RBX,4*24*8)) \
    LEA(RAX, MEM(RAX,4* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)
300
// Unroll-8 and larger loops address B beyond the first four iterations with
// base+index forms: R8 = 4*24*8 bytes, R9 = 3*R8, R10 = 5*R8, R11 = 7*R8
// (set up in the kernel prologue), combined with the (n%4)*24*8 displacement
// generated inside SUBITER.  Remainders are drained by TAIL_LOOP.
#define MAIN_LOOP_8(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(7)) \
    SAR(RSI, IMM(3)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SUBITER(0,1,0,RBX) \
    SUBITER(1,0,1,RBX) \
    SUBITER(2,1,0,RBX) \
    SUBITER(3,0,1,RBX) \
    SUBITER(4,1,0,RBX,R8,1) \
    SUBITER(5,0,1,RBX,R8,1) \
    SUBITER(6,1,0,RBX,R8,1) \
    SUBITER(7,0,1,RBX,R8,1) \
\
    LEA(RBX, MEM(RBX,8*24*8)) \
    LEA(RAX, MEM(RAX,8* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

// Unroll-16 variant; also the smallest unroll that emits the (optional)
// gather-style A/B prefetches, which cover 16 iterations per call.
#define MAIN_LOOP_16(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(15)) \
    SAR(RSI, IMM(4)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SCATTER_PREFETCH_AB(0) \
\
    SUBITER( 0,1,0,RBX) \
    SUBITER( 1,0,1,RBX) \
    SUBITER( 2,1,0,RBX) \
    SUBITER( 3,0,1,RBX) \
    SUBITER( 4,1,0,RBX,R8,1) \
    SUBITER( 5,0,1,RBX,R8,1) \
    SUBITER( 6,1,0,RBX,R8,1) \
    SUBITER( 7,0,1,RBX,R8,1) \
    SUBITER( 8,1,0,RBX,R8,2) \
    SUBITER( 9,0,1,RBX,R8,2) \
    SUBITER(10,1,0,RBX,R8,2) \
    SUBITER(11,0,1,RBX,R8,2) \
    SUBITER(12,1,0,RBX,R9,1) \
    SUBITER(13,0,1,RBX,R9,1) \
    SUBITER(14,1,0,RBX,R9,1) \
    SUBITER(15,0,1,RBX,R9,1) \
\
    LEA(RBX, MEM(RBX,16*24*8)) \
    LEA(RAX, MEM(RAX,16* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    SCATTER_PREFETCH_AB(0) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

// Unroll-32 variant: two 16-iteration halves, each preceded by its
// gather-prefetch call when SCATTER_PREFETCH_AB is enabled.
#define MAIN_LOOP_32(NAME) \
\
    MOV(RDI, RSI) \
    AND(RDI, IMM(31)) \
    SAR(RSI, IMM(5)) \
    JZ(NAME##_TAIL) \
\
    LOOP_ALIGN \
    LABEL(NAME##_LOOP) \
\
    SCATTER_PREFETCH_AB(0) \
\
    SUBITER( 0,1,0,RBX) \
    SUBITER( 1,0,1,RBX) \
    SUBITER( 2,1,0,RBX) \
    SUBITER( 3,0,1,RBX) \
    SUBITER( 4,1,0,RBX,R8,1) \
    SUBITER( 5,0,1,RBX,R8,1) \
    SUBITER( 6,1,0,RBX,R8,1) \
    SUBITER( 7,0,1,RBX,R8,1) \
    SUBITER( 8,1,0,RBX,R8,2) \
    SUBITER( 9,0,1,RBX,R8,2) \
    SUBITER(10,1,0,RBX,R8,2) \
    SUBITER(11,0,1,RBX,R8,2) \
    SUBITER(12,1,0,RBX,R9,1) \
    SUBITER(13,0,1,RBX,R9,1) \
    SUBITER(14,1,0,RBX,R9,1) \
    SUBITER(15,0,1,RBX,R9,1) \
\
    SCATTER_PREFETCH_AB(1) \
\
    SUBITER(16,1,0,RBX,R8,4) \
    SUBITER(17,0,1,RBX,R8,4) \
    SUBITER(18,1,0,RBX,R8,4) \
    SUBITER(19,0,1,RBX,R8,4) \
    SUBITER(20,1,0,RBX,R10,1) \
    SUBITER(21,0,1,RBX,R10,1) \
    SUBITER(22,1,0,RBX,R10,1) \
    SUBITER(23,0,1,RBX,R10,1) \
    SUBITER(24,1,0,RBX,R9,2) \
    SUBITER(25,0,1,RBX,R9,2) \
    SUBITER(26,1,0,RBX,R9,2) \
    SUBITER(27,0,1,RBX,R9,2) \
    SUBITER(28,1,0,RBX,R11,1) \
    SUBITER(29,0,1,RBX,R11,1) \
    SUBITER(30,1,0,RBX,R11,1) \
    SUBITER(31,0,1,RBX,R11,1) \
\
    LEA(RBX, MEM(RBX,32*24*8)) \
    LEA(RAX, MEM(RAX,32* 8*8)) \
\
    SUB(RSI, IMM(1)) \
\
    JNZ(NAME##_LOOP) \
\
    TEST(RDI, RDI) \
    JZ(NAME##_DONE) \
\
    SCATTER_PREFETCH_AB(0) \
    SCATTER_PREFETCH_AB(1) \
\
    TAIL_LOOP(NAME##_TAIL) \
\
    LABEL(NAME##_DONE)

// Token-pasting helpers: LOOP_K(MAIN_LOOP_,K,NAME) expands (after K itself
// expands) to MAIN_LOOP_<K>(NAME).
#define LOOP_K_(M,K) M##K
#define LOOP_K(M,K,NAME) LOOP_K_(M,K)(NAME)

// The two loop instantiations used in the kernel: the bulk loop run before C
// is prefetched into L1, and the loop for the final C_L1_ITERS iterations.
#define MAIN_LOOP_L2 LOOP_K(MAIN_LOOP_,UNROLL_K,MAIN_LOOP_L2)
#define MAIN_LOOP_L1 LOOP_K(MAIN_LOOP_,C_L1_ITERS,MAIN_LOOP_L1)
449
//This is an array used for the scatter/gather instructions.
// NOTE(review): defined elsewhere in BLIS; the index arithmetic here
// (offsets * stride, scale 8) presumes offsets[i] == i for i in 0..23 —
// confirm against the definition.
extern int32_t offsets[24];

// Optional RDTSC-based instrumentation of the kernel's phases / loops.
//#define MONITORS
//#define LOOPMON
//
// 8x24 double-precision GEMM microkernel for Intel Knights Landing
// (AVX-512), written as one large block of GCC extended asm assembled from
// the macros above.  Computes
//
//     c := beta*c + alpha * a*b
//
// where a and b are packed micropanels (strides of 8*8 and 24*8 bytes per k
// iteration, as used throughout the loop macros) and c is an 8x24 tile with
// element strides rs_c/cs_c.
//
// k      number of rank-1 updates to accumulate
// alpha  scalar applied to the accumulated product
// beta   scalar applied to the existing c; a bit pattern of +/-0.0 selects
//        the store-only ("BZ") paths that never read c
// data   auxinfo; the next-panel pointers are extracted and passed to the
//        asm as inputs, though the template below never references them
// cntx   unused by this kernel
//
void bli_dgemm_knl_asm_8x24
     (
       dim_t               k,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c, inc_t cs_c,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
    const double * a_next = bli_auxinfo_next_a( data );
    const double * b_next = bli_auxinfo_next_b( data );

    // Index table used to build gather/scatter index vectors.
    const int32_t * offsetPtr = &offsets[0];

    // The asm consumes the loop count as a full 64-bit value.
    uint64_t k64 = k;

#ifdef MONITORS
    int toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
#ifdef LOOPMON
    int tlooph, tloopl, blooph, bloopl;
#endif

    __asm__ volatile
    (
#ifdef MONITORS
    RDTSC
    MOV(VAR(topl), EAX)
    MOV(VAR(toph), EDX)
#endif

    // Prologue: zero the 24 accumulators ZMM8-ZMM31; loads of loop-invariant
    // state are interleaved with the clearing moves.
    VPXORD(ZMM(8), ZMM(8), ZMM(8)) //clear out registers
    VMOVAPS(ZMM( 9), ZMM(8))
    VMOVAPS(ZMM(10), ZMM(8))   MOV(RSI, VAR(k)) //loop index
    VMOVAPS(ZMM(11), ZMM(8))   MOV(RAX, VAR(a)) //load address of a
    VMOVAPS(ZMM(12), ZMM(8))   MOV(RBX, VAR(b)) //load address of b
    VMOVAPS(ZMM(13), ZMM(8))   MOV(RCX, VAR(c)) //load address of c
    VMOVAPS(ZMM(14), ZMM(8))   VMOVAPD(ZMM(0), MEM(RAX)) //pre-load a
    VMOVAPS(ZMM(15), ZMM(8))   MOV(RDI, VAR(offsetPtr))
    VMOVAPS(ZMM(16), ZMM(8))   VMOVAPS(ZMM(4), MEM(RDI))
#if SCATTER_PREFETCH_C
    // Build C prefetch index vectors: ZMM2 = offsets[0..15]*cs_c and
    // YMM3 = offsets[16..23]*cs_c as dword indices (scaled by 8 at use).
    VMOVAPS(ZMM(17), ZMM(8))
    VMOVAPS(ZMM(18), ZMM(8))
    VMOVAPS(ZMM(19), ZMM(8))   VBROADCASTSS(ZMM(5), VAR(cs_c))
    VMOVAPS(ZMM(20), ZMM(8))
    VMOVAPS(ZMM(21), ZMM(8))   VPMULLD(ZMM(2), ZMM(4), ZMM(5))
    VMOVAPS(ZMM(22), ZMM(8))   VMOVAPS(YMM(3), MEM(RDI,64))
    VMOVAPS(ZMM(23), ZMM(8))   VPMULLD(YMM(3), YMM(3), YMM(5))
#else
    // Scalar C prefetch setup: R12..R15 = 1x/3x/5x/7x cs_c, and RDX/RDI
    // point 8 and 16 columns into C.
    // NOTE(review): R12 holds cs_c in elements but the prefetch addresses
    // below use it as a byte offset; since these are only prefetches (and
    // this path is compiled out while SCATTER_PREFETCH_C == 1) this at most
    // prefetches the wrong lines — confirm before enabling.
    VMOVAPS(ZMM(17), ZMM(8))   MOV(R12, VAR(cs_c))
    VMOVAPS(ZMM(18), ZMM(8))   LEA(R13, MEM(R12,R12,2))
    VMOVAPS(ZMM(19), ZMM(8))   LEA(R14, MEM(R12,R12,4))
    VMOVAPS(ZMM(20), ZMM(8))   LEA(R15, MEM(R13,R12,4))
    VMOVAPS(ZMM(21), ZMM(8))   LEA(RDX, MEM(RCX,R12,8))
    VMOVAPS(ZMM(22), ZMM(8))   LEA(RDI, MEM(RDX,R12,8))
    VMOVAPS(ZMM(23), ZMM(8))
#endif
    // ZMM4 := offsets*8 (cache-line indices for SCATTER_PREFETCH_AB), and
    // R8..R11 become the B displacement bases used by the unrolled loops.
    VMOVAPS(ZMM(24), ZMM(8))   VPSLLD(ZMM(4), ZMM(4), IMM(3))
    VMOVAPS(ZMM(25), ZMM(8))   MOV(R8, IMM(4*24*8)) //offset for 4 iterations
    VMOVAPS(ZMM(26), ZMM(8))   LEA(R9, MEM(R8,R8,2)) //*3
    VMOVAPS(ZMM(27), ZMM(8))   LEA(R10, MEM(R8,R8,4)) //*5
    VMOVAPS(ZMM(28), ZMM(8))   LEA(R11, MEM(R9,R8,4)) //*7
    VMOVAPS(ZMM(29), ZMM(8))
    VMOVAPS(ZMM(30), ZMM(8))
    VMOVAPS(ZMM(31), ZMM(8))

#ifdef MONITORS
    RDTSC
    MOV(VAR(midl), EAX)
    MOV(VAR(midh), EDX)
#endif

    // Small k: skip the L2 C-prefetch phase and run all k iterations in the
    // L1 phase below.  Otherwise reserve the last C_L1_ITERS iterations for
    // the L1 phase.
    //need 0+... to satisfy preprocessor
    CMP(RSI, IMM(0+C_MIN_L2_ITERS))
    JLE(PREFETCH_C_L1)

    SUB(RSI, IMM(0+C_L1_ITERS))

    //prefetch C into L2
#if SCATTER_PREFETCH_C
    // Masks are consumed by the prefetch instructions, hence the KXNORW
    // refresh; ZMM2 covers columns 0..15, YMM3 columns 16..23.
    KXNORW(K(1), K(0), K(0))
    KXNORW(K(2), K(0), K(0))
    VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
    VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
#else
    PREFETCH(1, MEM(RCX      ))
    PREFETCH(1, MEM(RCX,R12,1))
    PREFETCH(1, MEM(RCX,R12,2))
    PREFETCH(1, MEM(RCX,R13,1))
    PREFETCH(1, MEM(RCX,R12,4))
    PREFETCH(1, MEM(RCX,R14,1))
    PREFETCH(1, MEM(RCX,R13,2))
    PREFETCH(1, MEM(RCX,R15,1))
    PREFETCH(1, MEM(RDX      ))
    PREFETCH(1, MEM(RDX,R12,1))
    PREFETCH(1, MEM(RDX,R12,2))
    PREFETCH(1, MEM(RDX,R13,1))
    PREFETCH(1, MEM(RDX,R12,4))
    PREFETCH(1, MEM(RDX,R14,1))
    PREFETCH(1, MEM(RDX,R13,2))
    PREFETCH(1, MEM(RDX,R15,1))
    PREFETCH(1, MEM(RDI      ))
    PREFETCH(1, MEM(RDI,R12,1))
    PREFETCH(1, MEM(RDI,R12,2))
    PREFETCH(1, MEM(RDI,R13,1))
    PREFETCH(1, MEM(RDI,R12,4))
    PREFETCH(1, MEM(RDI,R14,1))
    PREFETCH(1, MEM(RDI,R13,2))
    PREFETCH(1, MEM(RDI,R15,1))
#endif

    // Bulk of the k loop (unrolled by UNROLL_K).
    MAIN_LOOP_L2

    // Final C_L1_ITERS iterations, run after prefetching C into L1.
    MOV(RSI, IMM(0+C_L1_ITERS))

    LABEL(PREFETCH_C_L1)

    //prefetch C into L1
#if SCATTER_PREFETCH_C
    KXNORW(K(1), K(0), K(0))
    KXNORW(K(2), K(0), K(0))
    VSCATTERPFDPS(0, MEM(RCX,ZMM(2),8) MASK_K(1))
    VSCATTERPFDPD(0, MEM(RCX,YMM(3),8) MASK_K(2))
#else
    PREFETCH(0, MEM(RCX      ))
    PREFETCH(0, MEM(RCX,R12,1))
    PREFETCH(0, MEM(RCX,R12,2))
    PREFETCH(0, MEM(RCX,R13,1))
    PREFETCH(0, MEM(RCX,R12,4))
    PREFETCH(0, MEM(RCX,R14,1))
    PREFETCH(0, MEM(RCX,R13,2))
    PREFETCH(0, MEM(RCX,R15,1))
    PREFETCH(0, MEM(RDX      ))
    PREFETCH(0, MEM(RDX,R12,1))
    PREFETCH(0, MEM(RDX,R12,2))
    PREFETCH(0, MEM(RDX,R13,1))
    PREFETCH(0, MEM(RDX,R12,4))
    PREFETCH(0, MEM(RDX,R14,1))
    PREFETCH(0, MEM(RDX,R13,2))
    PREFETCH(0, MEM(RDX,R15,1))
    PREFETCH(0, MEM(RDI      ))
    PREFETCH(0, MEM(RDI,R12,1))
    PREFETCH(0, MEM(RDI,R12,2))
    PREFETCH(0, MEM(RDI,R13,1))
    PREFETCH(0, MEM(RDI,R12,4))
    PREFETCH(0, MEM(RDI,R14,1))
    PREFETCH(0, MEM(RDI,R13,2))
    PREFETCH(0, MEM(RDI,R15,1))
#endif

    MAIN_LOOP_L1

    LABEL(POSTACCUM)

#ifdef MONITORS
    RDTSC
    MOV(VAR(mid2l), EAX)
    MOV(VAR(mid2h), EDX)
#endif

    // C update: broadcast alpha into ZMM0 and beta into ZMM1, and compute
    // RAX = cs_c*8 bytes, RDI = 3*cs_c*8 bytes for column addressing.
    MOV(RAX, VAR(alpha))
    MOV(RBX, VAR(beta))
    VBROADCASTSD(ZMM(0), MEM(RAX))
    VBROADCASTSD(ZMM(1), MEM(RBX))

    // Check if C is column stride. If not, jump to the slow scattered update
    MOV(RAX, VAR(cs_c))
    LEA(RAX, MEM(,RAX,8))
    MOV(RBX, VAR(rs_c))
    LEA(RDI, MEM(RAX,RAX,2))
    CMP(RBX, IMM(1))
    JNE(SCATTEREDUPDATE)

    // beta == +/-0.0?  Shift the sign bit out of beta's raw bit pattern;
    // the remainder is zero exactly for a signed floating-point zero.
    VMOVQ(RDX, XMM(1))
    SAL1(RDX) //shift out sign bit
    JZ(COLSTORBZ)

    // General contiguous update, four columns per macro (24 total).
    UPDATE_C_FOUR_ROWS( 8, 9,10,11)
    UPDATE_C_FOUR_ROWS(12,13,14,15)
    UPDATE_C_FOUR_ROWS(16,17,18,19)
    UPDATE_C_FOUR_ROWS(20,21,22,23)
    UPDATE_C_FOUR_ROWS(24,25,26,27)
    UPDATE_C_FOUR_ROWS(28,29,30,31)

    JMP(END)

    LABEL(COLSTORBZ)

    // beta == 0: overwrite C without reading it.
    UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11)
    UPDATE_C_BZ_FOUR_ROWS(12,13,14,15)
    UPDATE_C_BZ_FOUR_ROWS(16,17,18,19)
    UPDATE_C_BZ_FOUR_ROWS(20,21,22,23)
    UPDATE_C_BZ_FOUR_ROWS(24,25,26,27)
    UPDATE_C_BZ_FOUR_ROWS(28,29,30,31)

    JMP(END)

    LABEL(SCATTEREDUPDATE)

    // General-stride path: build dword indices offsets * rs_c in ZMM2 for
    // the gather/scatter updates.
    MOV(RDI, VAR(offsetPtr))
    VMOVAPS(ZMM(2), MEM(RDI))
    /* Note that this ignores the upper 32 bits in rs_c */
    VPBROADCASTD(ZMM(3), EBX)
    VPMULLD(ZMM(2), ZMM(3), ZMM(2))

    // Same beta == +/-0.0 test as above.
    VMOVQ(RDX, XMM(1))
    SAL1(RDX) //shift out sign bit
    JZ(SCATTERBZ)

    UPDATE_C_ROW_SCATTERED( 8)
    UPDATE_C_ROW_SCATTERED( 9)
    UPDATE_C_ROW_SCATTERED(10)
    UPDATE_C_ROW_SCATTERED(11)
    UPDATE_C_ROW_SCATTERED(12)
    UPDATE_C_ROW_SCATTERED(13)
    UPDATE_C_ROW_SCATTERED(14)
    UPDATE_C_ROW_SCATTERED(15)
    UPDATE_C_ROW_SCATTERED(16)
    UPDATE_C_ROW_SCATTERED(17)
    UPDATE_C_ROW_SCATTERED(18)
    UPDATE_C_ROW_SCATTERED(19)
    UPDATE_C_ROW_SCATTERED(20)
    UPDATE_C_ROW_SCATTERED(21)
    UPDATE_C_ROW_SCATTERED(22)
    UPDATE_C_ROW_SCATTERED(23)
    UPDATE_C_ROW_SCATTERED(24)
    UPDATE_C_ROW_SCATTERED(25)
    UPDATE_C_ROW_SCATTERED(26)
    UPDATE_C_ROW_SCATTERED(27)
    UPDATE_C_ROW_SCATTERED(28)
    UPDATE_C_ROW_SCATTERED(29)
    UPDATE_C_ROW_SCATTERED(30)
    UPDATE_C_ROW_SCATTERED(31)

    JMP(END)

    LABEL(SCATTERBZ)

    UPDATE_C_BZ_ROW_SCATTERED( 8)
    UPDATE_C_BZ_ROW_SCATTERED( 9)
    UPDATE_C_BZ_ROW_SCATTERED(10)
    UPDATE_C_BZ_ROW_SCATTERED(11)
    UPDATE_C_BZ_ROW_SCATTERED(12)
    UPDATE_C_BZ_ROW_SCATTERED(13)
    UPDATE_C_BZ_ROW_SCATTERED(14)
    UPDATE_C_BZ_ROW_SCATTERED(15)
    UPDATE_C_BZ_ROW_SCATTERED(16)
    UPDATE_C_BZ_ROW_SCATTERED(17)
    UPDATE_C_BZ_ROW_SCATTERED(18)
    UPDATE_C_BZ_ROW_SCATTERED(19)
    UPDATE_C_BZ_ROW_SCATTERED(20)
    UPDATE_C_BZ_ROW_SCATTERED(21)
    UPDATE_C_BZ_ROW_SCATTERED(22)
    UPDATE_C_BZ_ROW_SCATTERED(23)
    UPDATE_C_BZ_ROW_SCATTERED(24)
    UPDATE_C_BZ_ROW_SCATTERED(25)
    UPDATE_C_BZ_ROW_SCATTERED(26)
    UPDATE_C_BZ_ROW_SCATTERED(27)
    UPDATE_C_BZ_ROW_SCATTERED(28)
    UPDATE_C_BZ_ROW_SCATTERED(29)
    UPDATE_C_BZ_ROW_SCATTERED(30)
    UPDATE_C_BZ_ROW_SCATTERED(31)

    LABEL(END)

#ifdef MONITORS
    RDTSC
    MOV(VAR(botl), EAX)
    MOV(VAR(both), EDX)
#endif
    : // output operands
#ifdef MONITORS
      [topl]  "=m" (topl),
      [toph]  "=m" (toph),
      [midl]  "=m" (midl),
      [midh]  "=m" (midh),
      [mid2l] "=m" (mid2l),
      [mid2h] "=m" (mid2h),
      [botl]  "=m" (botl),
      [both]  "=m" (both)
#endif
    : // input operands
      [k]         "m" (k64),
      [a]         "m" (a),
      [b]         "m" (b),
      [alpha]     "m" (alpha),
      [beta]      "m" (beta),
      [c]         "m" (c),
      [rs_c]      "m" (rs_c),
      [cs_c]      "m" (cs_c),
      [a_next]    "m" (a_next),
      [b_next]    "m" (b_next),
      [offsetPtr] "m" (offsetPtr)
    : // register clobber list
      "rax", "rbx", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12",
      "r13", "r14", "r15", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
      "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13",
      "zmm14", "zmm15", "zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21",
      "zmm22", "zmm23", "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
      "zmm30", "zmm31", "memory"
    );

#ifdef LOOPMON
    printf("looptime = \t%d\n", bloopl - tloopl);
#endif
#ifdef MONITORS
    // Recombine the RDTSC hi/lo halves captured in the asm.
    // NOTE(review): the halves are stored in signed ints, so a low half with
    // the top bit set would sign-extend through the OR; only compiled when
    // MONITORS is defined — confirm before relying on these timings.
    dim_t top = ((dim_t)toph << 32) | topl;
    dim_t mid = ((dim_t)midh << 32) | midl;
    dim_t mid2 = ((dim_t)mid2h << 32) | mid2l;
    dim_t bot = ((dim_t)both << 32) | botl;
    printf("setup =\t%u\tmain loop =\t%u\tcleanup=\t%u\ttotal=\t%u\n", mid - top, mid2 - mid, bot - mid2, bot - top);
#endif
}
771