1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "bli_avx512_macros.h"
36 #include "blis.h"
37
// Load eight zmm vectors (8 doubles each) from eight consecutive columns
// of the source panel and scale each by kappa, which the caller has
// already broadcast into zmm31.  'ptr' is the base address register,
// 'off' a byte displacement, and i1/i3/i5/i7 are registers holding the
// column stride times 1, 3, 5, and 7 (in bytes); the even multiples
// 2, 4, and 6 are formed with SIB scale factors on i1 and i3.
#define LOADMUL8x8(ptr,off,i1,i3,i5,i7, \
                   r0,r1,r2,r3,r4,r5,r6,r7) \
\
    VMULPD(ZMM(r0), ZMM(31), MEM(ptr,        off)) \
    VMULPD(ZMM(r1), ZMM(31), MEM(ptr,i1,1,off)) \
    VMULPD(ZMM(r2), ZMM(31), MEM(ptr,i1,2,off)) \
    VMULPD(ZMM(r3), ZMM(31), MEM(ptr,i3,1,off)) \
    VMULPD(ZMM(r4), ZMM(31), MEM(ptr,i1,4,off)) \
    VMULPD(ZMM(r5), ZMM(31), MEM(ptr,i5,1,off)) \
    VMULPD(ZMM(r6), ZMM(31), MEM(ptr,i3,2,off)) \
    VMULPD(ZMM(r7), ZMM(31), MEM(ptr,i7,1,off))
49
// Masked variant of LOADMUL8x8: identical addressing, but every load/scale
// is predicated on mask register k'msk' with zeroing (MASK_KZ), so lanes
// beyond the valid tail read as 0.0 instead of touching memory lanes the
// caller did not request.  Used for the n%8 fringe in the transpose paths.
#define LOADMUL8x8_MASK(ptr,off,i1,i3,i5,i7, \
                        r0,r1,r2,r3,r4,r5,r6,r7,msk) \
\
    VMULPD(ZMM(r0) MASK_KZ(msk), ZMM(31), MEM(ptr,        off)) \
    VMULPD(ZMM(r1) MASK_KZ(msk), ZMM(31), MEM(ptr,i1,1,off)) \
    VMULPD(ZMM(r2) MASK_KZ(msk), ZMM(31), MEM(ptr,i1,2,off)) \
    VMULPD(ZMM(r3) MASK_KZ(msk), ZMM(31), MEM(ptr,i3,1,off)) \
    VMULPD(ZMM(r4) MASK_KZ(msk), ZMM(31), MEM(ptr,i1,4,off)) \
    VMULPD(ZMM(r5) MASK_KZ(msk), ZMM(31), MEM(ptr,i5,1,off)) \
    VMULPD(ZMM(r6) MASK_KZ(msk), ZMM(31), MEM(ptr,i3,2,off)) \
    VMULPD(ZMM(r7) MASK_KZ(msk), ZMM(31), MEM(ptr,i7,1,off))
61
// Store eight zmm vectors to eight consecutive columns of the destination
// panel.  Addressing mirrors LOADMUL8x8: 'ptr' + 'off' plus stride
// registers i1/i3/i5/i7 (stride*1/3/5/7 in bytes), with SIB scaling for
// the even multiples.  Unaligned stores (VMOVUPD) — no alignment assumed.
#define STORE8x8(ptr,off,i1,i3,i5,i7, \
                 r0,r1,r2,r3,r4,r5,r6,r7) \
\
    VMOVUPD(MEM(ptr,        off), ZMM(r0)) \
    VMOVUPD(MEM(ptr,i1,1,off), ZMM(r1)) \
    VMOVUPD(MEM(ptr,i1,2,off), ZMM(r2)) \
    VMOVUPD(MEM(ptr,i3,1,off), ZMM(r3)) \
    VMOVUPD(MEM(ptr,i1,4,off), ZMM(r4)) \
    VMOVUPD(MEM(ptr,i5,1,off), ZMM(r5)) \
    VMOVUPD(MEM(ptr,i3,2,off), ZMM(r6)) \
    VMOVUPD(MEM(ptr,i7,1,off), ZMM(r7))
73
// In-register transpose of an 8x8 block of doubles.  Rows arrive in
// x0..x7 and the transposed rows are produced in y0..y7, in three stages:
//   1) VUNPCKL/HPD interleaves adjacent row pairs within 128-bit lanes;
//   2) VSHUFF64X2 regroups 128-bit lanes within register halves;
//   3) VSHUFF64X2 swaps 256-bit halves across register pairs.
// NOTE: the x registers are clobbered (reused as stage-2 temporaries).
#define TRANSPOSE8x8(x0,x1,x2,x3,x4,x5,x6,x7, \
                     y0,y1,y2,y3,y4,y5,y6,y7) \
\
    VUNPCKLPD(ZMM(y0), ZMM(x0), ZMM(x1)) \
    VUNPCKHPD(ZMM(y1), ZMM(x0), ZMM(x1)) \
    VUNPCKLPD(ZMM(y2), ZMM(x2), ZMM(x3)) \
    VUNPCKHPD(ZMM(y3), ZMM(x2), ZMM(x3)) \
    VUNPCKLPD(ZMM(y4), ZMM(x4), ZMM(x5)) \
    VUNPCKHPD(ZMM(y5), ZMM(x4), ZMM(x5)) \
    VUNPCKLPD(ZMM(y6), ZMM(x6), ZMM(x7)) \
    VUNPCKHPD(ZMM(y7), ZMM(x6), ZMM(x7)) \
    VSHUFF64X2(ZMM(x0), ZMM(y0), ZMM(y2), IMM(0x44)) \
    VSHUFF64X2(ZMM(x1), ZMM(y1), ZMM(y3), IMM(0x44)) \
    VSHUFF64X2(ZMM(x2), ZMM(y0), ZMM(y2), IMM(0xEE)) \
    VSHUFF64X2(ZMM(x3), ZMM(y1), ZMM(y3), IMM(0xEE)) \
    VSHUFF64X2(ZMM(x4), ZMM(y4), ZMM(y6), IMM(0x44)) \
    VSHUFF64X2(ZMM(x5), ZMM(y5), ZMM(y7), IMM(0x44)) \
    VSHUFF64X2(ZMM(x6), ZMM(y4), ZMM(y6), IMM(0xEE)) \
    VSHUFF64X2(ZMM(x7), ZMM(y5), ZMM(y7), IMM(0xEE)) \
    VSHUFF64X2(ZMM(y0), ZMM(x0), ZMM(x4), IMM(0x88)) \
    VSHUFF64X2(ZMM(y1), ZMM(x1), ZMM(x5), IMM(0x88)) \
    VSHUFF64X2(ZMM(y2), ZMM(x0), ZMM(x4), IMM(0xDD)) \
    VSHUFF64X2(ZMM(y3), ZMM(x1), ZMM(x5), IMM(0xDD)) \
    VSHUFF64X2(ZMM(y4), ZMM(x2), ZMM(x6), IMM(0x88)) \
    VSHUFF64X2(ZMM(y5), ZMM(x3), ZMM(x7), IMM(0x88)) \
    VSHUFF64X2(ZMM(y6), ZMM(x2), ZMM(x6), IMM(0xDD)) \
    VSHUFF64X2(ZMM(y7), ZMM(x3), ZMM(x7), IMM(0xDD))
101
102 //This is an array used for the scatter/gather instructions.
103 static int32_t offsets[32] __attribute__((aligned(64))) =
104 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
105 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
106
// Pack an 8 x n panel of doubles from a (strides inca_/lda_) into the
// contiguous packed buffer p (panel stride ldp_), scaling every element
// by *kappa_.  AVX-512 assembly kernel with three run-time paths:
//   PACK8_N: inca == 1 (unit row stride)    -> straight scaled copy;
//   PACK8_T: lda  == 1 (unit column stride) -> load 8x8 tiles, transpose
//            in registers, store;
//   PACK8_G: general strides                -> vgatherdpd one column at
//            a time using the 'offsets' table.
// conja is ignored: real-domain data, conjugation is a no-op.
// NOTE(review): the MEM/VAR/ZMM/MASK_* helpers come from
// bli_avx512_macros.h (not visible here); comments on addressing assume
// their conventional BLIS meanings — confirm against that header.
void bli_dpackm_8xk_opt
     (
       conj_t         conja,
       dim_t          n_,
       void* restrict kappa_,
       void* restrict a_, inc_t inca_, inc_t lda_,
       void* restrict p_, inc_t ldp_
     )
{
    (void)conja;

    const int32_t * offsetPtr = &offsets[0];
    double* a = (double*)a_;
    double* p = (double*)p_;
    double* kappa = (double*)kappa_;
    // Widen to int64_t so the asm can treat every operand as a full
    // 64-bit quantity regardless of how dim_t/inc_t are configured.
    const int64_t n = n_;
    const int64_t inca = inca_;
    const int64_t lda = lda_;
    const int64_t ldp = ldp_;

    __asm__ volatile
    (
        // Argument registers: rsi = n, rax = a, rbx = inca, rcx = lda,
        //                     r14 = p, rdi = ldp.
        MOV(RSI, VAR(n))
        MOV(RAX, VAR(a))
        MOV(RBX, VAR(inca))
        MOV(RCX, VAR(lda))
        MOV(R14, VAR(p))
        MOV(RDI, VAR(ldp))

        // Nothing to pack for n == 0.
        TEST(RSI, RSI)
        JZ(PACK8_DONE)

        // Convert strides from elements to bytes (x8 for double), and
        // precompute the odd multiples of ldp used by STORE8x8.
        LEA(RBX, MEM(,RBX,8)) //inca in bytes
        LEA(RCX, MEM(,RCX,8)) //lda in bytes
        LEA(RDI, MEM(,RDI,8)) //ldp in bytes
        LEA(R11, MEM(RDI,RDI,2)) //ldp*3
        LEA(R12, MEM(RDI,RDI,4)) //ldp*5
        LEA(R13, MEM(R11,RDI,4)) //ldp*7

        // kappa broadcast into all 8 lanes of zmm31 (used by LOADMUL*).
        VBROADCASTSD(ZMM(31), VAR(kappa))

        // inca == 1 double (8 bytes)?  Then take the no-transpose path.
        CMP(RBX, IMM(8))
        JNE(PACK8_T)

        LABEL(PACK8_N)

        // rdx = n % 8 (fringe columns), rsi = n / 8 (full 8-col blocks).
        MOV(RDX, RSI)
        AND(RDX, IMM(7))
        SAR(RSI, IMM(3))
        JZ(PACK8_N_TAIL)

        // Odd multiples of lda for LOADMUL8x8 addressing.
        LEA(R8, MEM(RCX,RCX,2)) //lda*3
        LEA(R9, MEM(RCX,RCX,4)) //lda*5
        LEA(R10, MEM(R8 ,RCX,4)) //lda*7

        LABEL(PACK8_N_LOOP)

        // Scale and copy 8 full columns of 8 doubles per iteration.
        LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)

        // Advance source/destination by 8 columns.
        LEA(RAX, MEM(RAX,RCX,8))
        LEA(R14, MEM(R14,RDI,8))

        SUB(RSI, IMM(1))

        JNZ(PACK8_N_LOOP)

        TEST(RDX, RDX)
        JZ(PACK8_DONE)

        LABEL(PACK8_N_TAIL)

        // One remaining column (8 contiguous doubles) per iteration.
        VMULPD(ZMM(0), ZMM(31), MEM(RAX))
        VMOVUPD(MEM(R14), ZMM(0))

        LEA(RAX, MEM(RAX,RCX,1))
        LEA(R14, MEM(R14,RDI,1))

        SUB(RDX, IMM(1))

        JNZ(PACK8_N_TAIL)

        JMP(PACK8_DONE)

        LABEL(PACK8_T)

        // lda == 1 double?  Then transpose path; otherwise general path.
        CMP(RCX, IMM(8))
        JNE(PACK8_G)

        // Odd multiples of inca for row addressing within a tile.
        LEA(R8, MEM(RBX,RBX,2)) //inca*3
        LEA(R9, MEM(RBX,RBX,4)) //inca*5
        LEA(R10, MEM(R8 ,RBX,4)) //inca*7

        MOV(RDX, RSI)
        AND(RDX, IMM(7))
        SAR(RSI, IMM(3))
        JZ(PACK8_T_TAIL)

        LABEL(PACK8_T_LOOP)

        // Load an 8x8 tile (rows strided by inca), transpose it in
        // registers (zmm16-23 receive the result), store 8 packed cols.
        LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
                     16,17,18,19,20,21,22,23)
        STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)

        // rcx == 8 bytes here, so rcx*8 advances 8 columns of the source.
        LEA(RAX, MEM(RAX,RCX,8))
        LEA(R14, MEM(R14,RDI,8))

        SUB(RSI, IMM(1))

        JNZ(PACK8_T_LOOP)

        TEST(RDX, RDX)
        JZ(PACK8_DONE)

        LABEL(PACK8_T_TAIL)

        // Build k1 = (1 << (n%8)) - 1: lane mask selecting the valid
        // tail columns for the masked tile load.
        MOV(RSI, IMM(1))
        SHLX(RSI, RSI, RDX)
        SUB(RSI, IMM(1))
        KMOV(K(1), ESI) //mask for n%8 elements

        LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1)
        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
                      8, 9,10,11,12,13,14,15)

        // Store only the n%8 valid transposed columns, dropping out as
        // rdx counts down (at most 7 stores -- zmm15 is never needed).
        VMOVUPD(MEM(R14      ), ZMM( 8))
        SUB(RDX, IMM(1))
        JZ(PACK8_DONE)
        VMOVUPD(MEM(R14,RDI,1), ZMM( 9))
        SUB(RDX, IMM(1))
        JZ(PACK8_DONE)
        VMOVUPD(MEM(R14,RDI,2), ZMM(10))
        SUB(RDX, IMM(1))
        JZ(PACK8_DONE)
        VMOVUPD(MEM(R14,R11,1), ZMM(11))
        SUB(RDX, IMM(1))
        JZ(PACK8_DONE)
        VMOVUPD(MEM(R14,RDI,4), ZMM(12))
        SUB(RDX, IMM(1))
        JZ(PACK8_DONE)
        VMOVUPD(MEM(R14,R12,1), ZMM(13))
        SUB(RDX, IMM(1))
        JZ(PACK8_DONE)
        VMOVUPD(MEM(R14,R11,2), ZMM(14))

        JMP(PACK8_DONE)

        LABEL(PACK8_G)

        // General strides: ymm0 = inca * {0..7} (32-bit indices), then
        // gather one 8-element column per loop iteration.
        VPBROADCASTD(ZMM(3), VAR(inca))
        MOV(RBX, VAR(offsetPtr))
        VPMULLD(YMM(0), YMM(3), MEM(RBX))

        LABEL(PACK8_G_LOOP)

        // Gather consumes/clears its mask, so reset k1 to all-ones first.
        KXNORW(K(1), K(0), K(0))
        VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
        VMULPD(ZMM(3), ZMM(3), ZMM(31))
        VMOVUPD(MEM(R14), ZMM(3))

        LEA(RAX, MEM(RAX,RCX,1))
        LEA(R14, MEM(R14,RDI,1))

        SUB(RSI, IMM(1))

        JNZ(PACK8_G_LOOP)

        LABEL(PACK8_DONE)

        : //output operands
        : //input operands
        [n] "m" (n),
        [kappa] "m" (*kappa),
        [a] "m" (a),
        [inca] "m" (inca),
        [lda] "m" (lda),
        [p] "m" (p),
        [ldp] "m" (ldp),
        [offsetPtr] "m" (offsetPtr)
        : //clobbers
        // NOTE(review): mask register k1 is modified but k registers are
        // not listed as clobbers -- confirm supported compilers tolerate
        // this (older GCCs could not name "k1" here).
        "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
        "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
        "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
        "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
        "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
        "zmm30", "zmm31",
        "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
    );
}
298
// Pack a 24 x n panel of doubles from a (strides inca_/lda_) into the
// contiguous packed buffer p (panel stride ldp_), scaling by *kappa_.
// Same three-path structure as bli_dpackm_8xk_opt, but each column is
// 24 doubles = three zmm vectors (byte offsets 0/64/128), and the
// transpose path processes three stacked 8x8 tiles per 8-column block.
// conja is ignored: real-domain data, conjugation is a no-op.
void bli_dpackm_24xk_opt
     (
       conj_t         conja,
       dim_t          n_,
       void* restrict kappa_,
       void* restrict a_, inc_t inca_, inc_t lda_,
       void* restrict p_, inc_t ldp_
     )
{
    (void)conja;

    const int32_t * offsetPtr = &offsets[0];
    double* a = (double*)a_;
    double* p = (double*)p_;
    double* kappa = (double*)kappa_;
    // Widen to int64_t for uniform 64-bit operands in the asm below.
    const int64_t n = n_;
    const int64_t inca = inca_;
    const int64_t lda = lda_;
    const int64_t ldp = ldp_;

    __asm__ volatile
    (
        // Argument registers: rsi = n, rax = a, rbx = inca, rcx = lda,
        //                     r15 = p, rdi = ldp.
        MOV(RSI, VAR(n))
        MOV(RAX, VAR(a))
        MOV(RBX, VAR(inca))
        MOV(RCX, VAR(lda))
        MOV(R15, VAR(p))
        MOV(RDI, VAR(ldp))

        // Strides from elements to bytes, plus odd multiples of ldp.
        LEA(RBX, MEM(,RBX,8)) //inca in bytes
        LEA(RCX, MEM(,RCX,8)) //lda in bytes
        LEA(RDI, MEM(,RDI,8)) //ldp in bytes
        LEA(R11, MEM(RDI,RDI,2)) //ldp*3
        LEA(R12, MEM(RDI,RDI,4)) //ldp*5
        LEA(R13, MEM(R11,RDI,4)) //ldp*7

        // kappa broadcast into all lanes of zmm31.
        VBROADCASTSD(ZMM(31), VAR(kappa))

        // Nothing to pack for n == 0.
        TEST(RSI, RSI)
        JZ(PACK24_DONE)

        // inca == 1 double?  Then the no-transpose path.
        CMP(RBX, IMM(8))
        JNE(PACK24_T)

        LABEL(PACK24_N)

        // rsi = n / 8 full blocks; the remainder is recomputed from
        // VAR(n) at PACK24_N_TAIL.
        SAR(RSI, IMM(3))
        JZ(PACK24_N_TAIL)

        LEA(R8, MEM(RCX,RCX,2)) //lda*3
        LEA(R9, MEM(RCX,RCX,4)) //lda*5
        LEA(R10, MEM(R8 ,RCX,4)) //lda*7

        LABEL(PACK24_N_LOOP)

        // 8 columns x 24 rows: three LOADMUL/STORE pairs at byte
        // offsets 0, 64, 128 within each column.
        LOADMUL8x8(RAX, 0,RCX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7)
        LOADMUL8x8(RAX, 64,RCX,R8,R9,R10, 8, 9,10,11,12,13,14,15)
        LOADMUL8x8(RAX,128,RCX,R8,R9,R10,16,17,18,19,20,21,22,23)
        STORE8x8(R15, 0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
        STORE8x8(R15, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15)
        STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)

        LEA(RAX, MEM(RAX,RCX,8))
        LEA(R15, MEM(R15,RDI,8))

        SUB(RSI, IMM(1))

        JNZ(PACK24_N_LOOP)

        LABEL(PACK24_N_TAIL)

        // rsi = n % 8 fringe columns.
        MOV(RSI, VAR(n))
        AND(RSI, IMM(7))
        TEST(RSI, RSI)
        JZ(PACK24_DONE)

        LABEL(PACK24_N_TAIL_LOOP)

        // One 24-double column (three vectors) per iteration.
        VMULPD(ZMM(0), ZMM(31), MEM(RAX, 0))
        VMULPD(ZMM(1), ZMM(31), MEM(RAX, 64))
        VMULPD(ZMM(2), ZMM(31), MEM(RAX,128))
        VMOVUPD(MEM(R15, 0), ZMM(0))
        VMOVUPD(MEM(R15, 64), ZMM(1))
        VMOVUPD(MEM(R15,128), ZMM(2))

        LEA(RAX, MEM(RAX,RCX,1))
        LEA(R15, MEM(R15,RDI,1))

        SUB(RSI, IMM(1))

        JNZ(PACK24_N_TAIL_LOOP)

        JMP(PACK24_DONE)

        LABEL(PACK24_T)

        // lda == 1 double?  Then transpose path; otherwise general path.
        CMP(RCX, IMM(8))
        JNE(PACK24_G)

        LEA(R8, MEM(RBX,RBX,2)) //inca*3
        LEA(R9, MEM(RBX,RBX,4)) //inca*5
        LEA(R10, MEM(R8 ,RBX,4)) //inca*7

        // r14 = a + 8*inca, rcx = a + 16*inca: bases of the second and
        // third 8-row tile stacks.  rcx (lda) is safe to reuse here
        // because lda is known to be 8 bytes on this path.
        LEA(R14, MEM(RAX,RBX,8))
        LEA(RCX, MEM(R14,RBX,8))

        SAR(RSI, IMM(3))
        JZ(PACK24_T_TAIL)

        LABEL(PACK24_T_LOOP)

        // Per 8-column block: load/transpose/store each of the three
        // 8x8 tiles, interleaved to reuse zmm16-23 as transpose output.
        LOADMUL8x8(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7)
        LOADMUL8x8(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15)
        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
                     16,17,18,19,20,21,22,23)
        STORE8x8(R15, 0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
        LOADMUL8x8(RCX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7)
        TRANSPOSE8x8( 8, 9,10,11,12,13,14,15,
                     16,17,18,19,20,21,22,23)
        STORE8x8(R15, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
                     16,17,18,19,20,21,22,23)
        STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)

        // Advance all three tile bases by 8 columns (64 bytes; lda == 8).
        LEA(RAX, MEM(RAX,64))
        LEA(R14, MEM(R14,64))
        LEA(RCX, MEM(RCX,64))
        LEA(R15, MEM(R15,RDI,8))

        SUB(RSI, IMM(1))

        JNZ(PACK24_T_LOOP)

        LABEL(PACK24_T_TAIL)

        // rsi = n % 8 fringe columns.
        MOV(RSI, VAR(n))
        AND(RSI, IMM(7))
        TEST(RSI, RSI)
        JZ(PACK24_DONE)

        // k1 = (1 << (n%8)) - 1.  r13 (ldp*7) is free to clobber here:
        // the tail stores at most 7 rows, which never needs ldp*7.
        MOV(R13, IMM(1))
        SHLX(R13, R13, RSI)
        SUB(R13, IMM(1))
        KMOV(K(1), R13D) //mask for n%8 elements

        // Masked-load all three tiles, then transpose bottom-up so each
        // transpose's destination registers are free when it runs.
        LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1)
        LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1)
        LOADMUL8x8_MASK(RCX,0,RBX,R8,R9,R10,16,17,18,19,20,21,22,23,1)
        TRANSPOSE8x8(16,17,18,19,20,21,22,23,
                     24,25,26,27,28,29,30,31)
        TRANSPOSE8x8( 8, 9,10,11,12,13,14,15,
                     16,17,18,19,20,21,22,23)
        TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
                      8, 9,10,11,12,13,14,15)

        // Store the n%8 valid packed columns (3 vectors each), dropping
        // out as rsi counts down.
        VMOVUPD(MEM(R15, 0), ZMM( 8))
        VMOVUPD(MEM(R15, 64), ZMM(16))
        VMOVUPD(MEM(R15, 128), ZMM(24))
        SUB(RSI, IMM(1))
        JZ(PACK24_DONE)
        VMOVUPD(MEM(R15,RDI,1, 0), ZMM( 9))
        VMOVUPD(MEM(R15,RDI,1, 64), ZMM(17))
        VMOVUPD(MEM(R15,RDI,1,128), ZMM(25))
        SUB(RSI, IMM(1))
        JZ(PACK24_DONE)
        VMOVUPD(MEM(R15,RDI,2, 0), ZMM(10))
        VMOVUPD(MEM(R15,RDI,2, 64), ZMM(18))
        VMOVUPD(MEM(R15,RDI,2,128), ZMM(26))
        SUB(RSI, IMM(1))
        JZ(PACK24_DONE)
        VMOVUPD(MEM(R15,R11,1, 0), ZMM(11))
        VMOVUPD(MEM(R15,R11,1, 64), ZMM(19))
        VMOVUPD(MEM(R15,R11,1,128), ZMM(27))
        SUB(RSI, IMM(1))
        JZ(PACK24_DONE)
        VMOVUPD(MEM(R15,RDI,4, 0), ZMM(12))
        VMOVUPD(MEM(R15,RDI,4, 64), ZMM(20))
        VMOVUPD(MEM(R15,RDI,4,128), ZMM(28))
        SUB(RSI, IMM(1))
        JZ(PACK24_DONE)
        VMOVUPD(MEM(R15,R12,1, 0), ZMM(13))
        VMOVUPD(MEM(R15,R12,1, 64), ZMM(21))
        VMOVUPD(MEM(R15,R12,1,128), ZMM(29))
        SUB(RSI, IMM(1))
        JZ(PACK24_DONE)
        VMOVUPD(MEM(R15,R11,2, 0), ZMM(14))
        VMOVUPD(MEM(R15,R11,2, 64), ZMM(22))
        VMOVUPD(MEM(R15,R11,2,128), ZMM(30))

        JMP(PACK24_DONE)

        LABEL(PACK24_G)

        // General strides: three index vectors covering rows 0-7, 8-15,
        // 16-23 (offsets table entries at byte offsets 0/32/64).
        VPBROADCASTD(ZMM(3), VAR(inca))
        MOV(RBX, VAR(offsetPtr))
        VPMULLD(YMM(0), YMM(3), MEM(RBX, 0))
        VPMULLD(YMM(1), YMM(3), MEM(RBX,32))
        VPMULLD(YMM(2), YMM(3), MEM(RBX,64))

        LABEL(PACK24_G_LOOP)

        // Gathers consume their masks; refill k1-k3 each iteration.
        KXNORW(K(1), K(0), K(0))
        KXNORW(K(2), K(0), K(0))
        KXNORW(K(3), K(0), K(0))
        VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
        VGATHERDPD(ZMM(4) MASK_K(2), MEM(RAX,YMM(1),8))
        VGATHERDPD(ZMM(5) MASK_K(3), MEM(RAX,YMM(2),8))
        VMULPD(ZMM(3), ZMM(3), ZMM(31))
        VMULPD(ZMM(4), ZMM(4), ZMM(31))
        VMULPD(ZMM(5), ZMM(5), ZMM(31))
        VMOVUPD(MEM(R15, 0), ZMM(3))
        VMOVUPD(MEM(R15, 64), ZMM(4))
        VMOVUPD(MEM(R15,128), ZMM(5))

        LEA(RAX, MEM(RAX,RCX,1))
        LEA(R15, MEM(R15,RDI,1))

        SUB(RSI, IMM(1))

        JNZ(PACK24_G_LOOP)

        LABEL(PACK24_DONE)

        : //output operands
        : //input operands
        [n] "m" (n),
        [kappa] "m" (*kappa),
        [a] "m" (a),
        [inca] "m" (inca),
        [lda] "m" (lda),
        [p] "m" (p),
        [ldp] "m" (ldp),
        [offsetPtr] "m" (offsetPtr)
        : //clobbers
        // NOTE(review): mask registers k1-k3 are modified but not listed
        // as clobbers -- confirm supported compilers tolerate this.
        "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
        "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
        "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
        "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
        "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
        "zmm30", "zmm31",
        "rax", "rbx", "rcx", "rdi", "rsi",
        "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"
    );
}
543