1 /*
2 
3    BLIS
4    An object-based framework for developing high-performance BLAS-like
5    libraries.
6 
7    Copyright (C) 2014, The University of Texas at Austin
8 
9    Redistribution and use in source and binary forms, with or without
10    modification, are permitted provided that the following conditions are
11    met:
12     - Redistributions of source code must retain the above copyright
13       notice, this list of conditions and the following disclaimer.
14     - Redistributions in binary form must reproduce the above copyright
15       notice, this list of conditions and the following disclaimer in the
16       documentation and/or other materials provided with the distribution.
17     - Neither the name of The University of Texas at Austin nor the names
18       of its contributors may be used to endorse or promote products
19       derived from this software without specific prior written permission.
20 
21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 
33 */
34 
35 #include "bli_avx512_macros.h"
36 #include "blis.h"
37 
// Load eight 8-double vectors from consecutive rows/columns of memory and
// scale each by kappa (pre-broadcast into zmm31), leaving the products in
// registers z0..z7.  The address of vector i is (a + o + i*stride): s1, s3,
// s5, s7 hold 1x/3x/5x/7x the byte stride, while the 2x, 4x, and 6x
// multiples are formed for free with the SIB scale factor (s1*2, s1*4, s3*2).
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
                   z0,z1,z2,z3,z4,z5,z6,z7) \
    \
    VMULPD(ZMM(z0), ZMM(31), MEM(a,     o)) \
    VMULPD(ZMM(z1), ZMM(31), MEM(a,s1,1,o)) \
    VMULPD(ZMM(z2), ZMM(31), MEM(a,s1,2,o)) \
    VMULPD(ZMM(z3), ZMM(31), MEM(a,s3,1,o)) \
    VMULPD(ZMM(z4), ZMM(31), MEM(a,s1,4,o)) \
    VMULPD(ZMM(z5), ZMM(31), MEM(a,s5,1,o)) \
    VMULPD(ZMM(z6), ZMM(31), MEM(a,s3,2,o)) \
    VMULPD(ZMM(z7), ZMM(31), MEM(a,s7,1,o))
49 
// Same as LOADMUL8x8, but each load is predicated with mask register k(k)
// using zero-masking, so lanes beyond the edge of a partial (n % 8) panel
// are read as zero instead of touching out-of-bounds memory.
#define LOADMUL8x8_MASK(a,o,s1,s3,s5,s7, \
                        z0,z1,z2,z3,z4,z5,z6,z7,k) \
    \
    VMULPD(ZMM(z0) MASK_KZ(k), ZMM(31), MEM(a,     o)) \
    VMULPD(ZMM(z1) MASK_KZ(k), ZMM(31), MEM(a,s1,1,o)) \
    VMULPD(ZMM(z2) MASK_KZ(k), ZMM(31), MEM(a,s1,2,o)) \
    VMULPD(ZMM(z3) MASK_KZ(k), ZMM(31), MEM(a,s3,1,o)) \
    VMULPD(ZMM(z4) MASK_KZ(k), ZMM(31), MEM(a,s1,4,o)) \
    VMULPD(ZMM(z5) MASK_KZ(k), ZMM(31), MEM(a,s5,1,o)) \
    VMULPD(ZMM(z6) MASK_KZ(k), ZMM(31), MEM(a,s3,2,o)) \
    VMULPD(ZMM(z7) MASK_KZ(k), ZMM(31), MEM(a,s7,1,o))
61 
// Store registers z0..z7 to eight rows/columns of memory, using the same
// (a + o + i*stride) addressing scheme as LOADMUL8x8: s1/s3/s5/s7 are the
// 1x/3x/5x/7x byte strides and the even multiples come from the SIB scale.
#define STORE8x8(a,o,s1,s3,s5,s7, \
                 z0,z1,z2,z3,z4,z5,z6,z7) \
    \
    VMOVUPD(MEM(a,     o), ZMM(z0)) \
    VMOVUPD(MEM(a,s1,1,o), ZMM(z1)) \
    VMOVUPD(MEM(a,s1,2,o), ZMM(z2)) \
    VMOVUPD(MEM(a,s3,1,o), ZMM(z3)) \
    VMOVUPD(MEM(a,s1,4,o), ZMM(z4)) \
    VMOVUPD(MEM(a,s5,1,o), ZMM(z5)) \
    VMOVUPD(MEM(a,s3,2,o), ZMM(z6)) \
    VMOVUPD(MEM(a,s7,1,o), ZMM(z7))
73 
// In-register transpose of an 8x8 block of doubles held in a0..a7 (one row
// per zmm register).  The transposed rows end up in b0..b7; a0..a7 are
// clobbered as intermediates.  Three stages:
//   1. VUNPCK{L,H}PD interleaves adjacent row pairs at 64-bit granularity.
//   2. First VSHUFF64X2 pass permutes 128-bit blocks within row quartets.
//   3. Second VSHUFF64X2 pass combines the upper and lower quartets,
//      completing the transpose.
#define TRANSPOSE8x8(a0,a1,a2,a3,a4,a5,a6,a7, \
                     b0,b1,b2,b3,b4,b5,b6,b7) \
    \
    VUNPCKLPD(ZMM(b0), ZMM(a0), ZMM(a1)) \
    VUNPCKHPD(ZMM(b1), ZMM(a0), ZMM(a1)) \
    VUNPCKLPD(ZMM(b2), ZMM(a2), ZMM(a3)) \
    VUNPCKHPD(ZMM(b3), ZMM(a2), ZMM(a3)) \
    VUNPCKLPD(ZMM(b4), ZMM(a4), ZMM(a5)) \
    VUNPCKHPD(ZMM(b5), ZMM(a4), ZMM(a5)) \
    VUNPCKLPD(ZMM(b6), ZMM(a6), ZMM(a7)) \
    VUNPCKHPD(ZMM(b7), ZMM(a6), ZMM(a7)) \
    VSHUFF64X2(ZMM(a0), ZMM(b0), ZMM(b2), IMM(0x44)) \
    VSHUFF64X2(ZMM(a1), ZMM(b1), ZMM(b3), IMM(0x44)) \
    VSHUFF64X2(ZMM(a2), ZMM(b0), ZMM(b2), IMM(0xEE)) \
    VSHUFF64X2(ZMM(a3), ZMM(b1), ZMM(b3), IMM(0xEE)) \
    VSHUFF64X2(ZMM(a4), ZMM(b4), ZMM(b6), IMM(0x44)) \
    VSHUFF64X2(ZMM(a5), ZMM(b5), ZMM(b7), IMM(0x44)) \
    VSHUFF64X2(ZMM(a6), ZMM(b4), ZMM(b6), IMM(0xEE)) \
    VSHUFF64X2(ZMM(a7), ZMM(b5), ZMM(b7), IMM(0xEE)) \
    VSHUFF64X2(ZMM(b0), ZMM(a0), ZMM(a4), IMM(0x88)) \
    VSHUFF64X2(ZMM(b1), ZMM(a1), ZMM(a5), IMM(0x88)) \
    VSHUFF64X2(ZMM(b2), ZMM(a0), ZMM(a4), IMM(0xDD)) \
    VSHUFF64X2(ZMM(b3), ZMM(a1), ZMM(a5), IMM(0xDD)) \
    VSHUFF64X2(ZMM(b4), ZMM(a2), ZMM(a6), IMM(0x88)) \
    VSHUFF64X2(ZMM(b5), ZMM(a3), ZMM(a7), IMM(0x88)) \
    VSHUFF64X2(ZMM(b6), ZMM(a2), ZMM(a6), IMM(0xDD)) \
    VSHUFF64X2(ZMM(b7), ZMM(a3), ZMM(a7), IMM(0xDD))
101 
// Index table 0..31 for the gather instructions: the general-stride paths
// multiply these by inca (VPMULLD) to build the per-lane element offsets
// fed to VGATHERDPD.  Aligned to 64 bytes so vector loads of it are cheap.
static int32_t offsets[32] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
106 
bli_dpackm_8xk_opt(conj_t conja,dim_t n_,void * restrict kappa_,void * restrict a_,inc_t inca_,inc_t lda_,void * restrict p_,inc_t ldp_)107 void bli_dpackm_8xk_opt
108      (
109        conj_t         conja,
110        dim_t          n_,
111        void* restrict kappa_,
112        void* restrict a_, inc_t inca_, inc_t lda_,
113        void* restrict p_,              inc_t ldp_
114      )
115 {
116     (void)conja;
117 
118     const int32_t * offsetPtr = &offsets[0];
119     double* a = (double*)a_;
120     double* p = (double*)p_;
121     double* kappa = (double*)kappa_;
122     const int64_t n = n_;
123     const int64_t inca = inca_;
124     const int64_t lda = lda_;
125     const int64_t ldp = ldp_;
126 
127     __asm__ volatile
128     (
129         MOV(RSI, VAR(n))
130         MOV(RAX, VAR(a))
131         MOV(RBX, VAR(inca))
132         MOV(RCX, VAR(lda))
133         MOV(R14, VAR(p))
134         MOV(RDI, VAR(ldp))
135 
136         TEST(RSI, RSI)
137         JZ(PACK8_DONE)
138 
139         LEA(RBX, MEM(,RBX,8))    //inca in bytes
140         LEA(RCX, MEM(,RCX,8))    //lda in bytes
141         LEA(RDI, MEM(,RDI,8))    //ldp in bytes
142         LEA(R11, MEM(RDI,RDI,2)) //ldp*3
143         LEA(R12, MEM(RDI,RDI,4)) //ldp*5
144         LEA(R13, MEM(R11,RDI,4)) //ldp*7
145 
146         VBROADCASTSD(ZMM(31), VAR(kappa))
147 
148         CMP(RBX, IMM(8))
149         JNE(PACK8_T)
150 
151         LABEL(PACK8_N)
152 
153             MOV(RDX, RSI)
154             AND(RDX, IMM(7))
155             SAR(RSI, IMM(3))
156             JZ(PACK8_N_TAIL)
157 
158             LEA(R8,  MEM(RCX,RCX,2)) //lda*3
159             LEA(R9,  MEM(RCX,RCX,4)) //lda*5
160             LEA(R10, MEM(R8 ,RCX,4)) //lda*7
161 
162             LABEL(PACK8_N_LOOP)
163 
164                 LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
165                 STORE8x8(R14,0,RDI,R11,R12,R13,0,1,2,3,4,5,6,7)
166 
167                 LEA(RAX, MEM(RAX,RCX,8))
168                 LEA(R14, MEM(R14,RDI,8))
169 
170                 SUB(RSI, IMM(1))
171 
172             JNZ(PACK8_N_LOOP)
173 
174             TEST(RDX, RDX)
175             JZ(PACK8_DONE)
176 
177             LABEL(PACK8_N_TAIL)
178 
179                 VMULPD(ZMM(0), ZMM(31), MEM(RAX))
180                 VMOVUPD(MEM(R14), ZMM(0))
181 
182                 LEA(RAX, MEM(RAX,RCX,1))
183                 LEA(R14, MEM(R14,RDI,1))
184 
185                 SUB(RDX, IMM(1))
186 
187             JNZ(PACK8_N_TAIL)
188 
189             JMP(PACK8_DONE)
190 
191         LABEL(PACK8_T)
192 
193             CMP(RCX, IMM(8))
194             JNE(PACK8_G)
195 
196             LEA(R8,  MEM(RBX,RBX,2)) //inca*3
197             LEA(R9,  MEM(RBX,RBX,4)) //inca*5
198             LEA(R10, MEM(R8 ,RBX,4)) //inca*7
199 
200             MOV(RDX, RSI)
201             AND(RDX, IMM(7))
202             SAR(RSI, IMM(3))
203             JZ(PACK8_T_TAIL)
204 
205             LABEL(PACK8_T_LOOP)
206 
207                 LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
208                 TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
209                              16,17,18,19,20,21,22,23)
210                 STORE8x8(R14,0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
211 
212                 LEA(RAX, MEM(RAX,RCX,8))
213                 LEA(R14, MEM(R14,RDI,8))
214 
215                 SUB(RSI, IMM(1))
216 
217             JNZ(PACK8_T_LOOP)
218 
219             TEST(RDX, RDX)
220             JZ(PACK8_DONE)
221 
222             LABEL(PACK8_T_TAIL)
223 
224             MOV(RSI, IMM(1))
225             SHLX(RSI, RSI, RDX)
226             SUB(RSI, IMM(1))
227             KMOV(K(1), ESI)  //mask for n%8 elements
228 
229             LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7,1)
230             TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
231                           8, 9,10,11,12,13,14,15)
232 
233             VMOVUPD(MEM(R14      ), ZMM( 8))
234             SUB(RDX, IMM(1))
235             JZ(PACK8_DONE)
236             VMOVUPD(MEM(R14,RDI,1), ZMM( 9))
237             SUB(RDX, IMM(1))
238             JZ(PACK8_DONE)
239             VMOVUPD(MEM(R14,RDI,2), ZMM(10))
240             SUB(RDX, IMM(1))
241             JZ(PACK8_DONE)
242             VMOVUPD(MEM(R14,R11,1), ZMM(11))
243             SUB(RDX, IMM(1))
244             JZ(PACK8_DONE)
245             VMOVUPD(MEM(R14,RDI,4), ZMM(12))
246             SUB(RDX, IMM(1))
247             JZ(PACK8_DONE)
248             VMOVUPD(MEM(R14,R12,1), ZMM(13))
249             SUB(RDX, IMM(1))
250             JZ(PACK8_DONE)
251             VMOVUPD(MEM(R14,R11,2), ZMM(14))
252 
253             JMP(PACK8_DONE)
254 
255         LABEL(PACK8_G)
256 
257             VPBROADCASTD(ZMM(3), VAR(inca))
258             MOV(RBX, VAR(offsetPtr))
259             VPMULLD(YMM(0), YMM(3), MEM(RBX))
260 
261             LABEL(PACK8_G_LOOP)
262 
263                 KXNORW(K(1), K(0), K(0))
264                 VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
265                 VMULPD(ZMM(3), ZMM(3), ZMM(31))
266                 VMOVUPD(MEM(R14), ZMM(3))
267 
268                 LEA(RAX, MEM(RAX,RCX,1))
269                 LEA(R14, MEM(R14,RDI,1))
270 
271                 SUB(RSI, IMM(1))
272 
273             JNZ(PACK8_G_LOOP)
274 
275         LABEL(PACK8_DONE)
276 
277         : //output operands
278         : //input operands
279           [n]         "m" (n),
280           [kappa]     "m" (*kappa),
281           [a]         "m" (a),
282           [inca]      "m" (inca),
283           [lda]       "m" (lda),
284           [p]         "m" (p),
285           [ldp]       "m" (ldp),
286           [offsetPtr] "m" (offsetPtr)
287         : //clobbers
288           "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
289           "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
290           "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
291           "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
292           "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
293           "zmm30", "zmm31",
294           "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
295           "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
296     );
297 }
298 
bli_dpackm_24xk_opt(conj_t conja,dim_t n_,void * restrict kappa_,void * restrict a_,inc_t inca_,inc_t lda_,void * restrict p_,inc_t ldp_)299 void bli_dpackm_24xk_opt
300      (
301        conj_t         conja,
302        dim_t          n_,
303        void* restrict kappa_,
304        void* restrict a_, inc_t inca_, inc_t lda_,
305        void* restrict p_,              inc_t ldp_
306      )
307 {
308     (void)conja;
309 
310     const int32_t * offsetPtr = &offsets[0];
311     double* a = (double*)a_;
312     double* p = (double*)p_;
313     double* kappa = (double*)kappa_;
314     const int64_t n = n_;
315     const int64_t inca = inca_;
316     const int64_t lda = lda_;
317     const int64_t ldp = ldp_;
318 
319     __asm__ volatile
320     (
321         MOV(RSI, VAR(n))
322         MOV(RAX, VAR(a))
323         MOV(RBX, VAR(inca))
324         MOV(RCX, VAR(lda))
325         MOV(R15, VAR(p))
326         MOV(RDI, VAR(ldp))
327 
328         LEA(RBX, MEM(,RBX,8))    //inca in bytes
329         LEA(RCX, MEM(,RCX,8))    //lda in bytes
330         LEA(RDI, MEM(,RDI,8))    //ldp in bytes
331         LEA(R11, MEM(RDI,RDI,2)) //ldp*3
332         LEA(R12, MEM(RDI,RDI,4)) //ldp*5
333         LEA(R13, MEM(R11,RDI,4)) //ldp*7
334 
335         VBROADCASTSD(ZMM(31), VAR(kappa))
336 
337         TEST(RSI, RSI)
338         JZ(PACK24_DONE)
339 
340         CMP(RBX, IMM(8))
341         JNE(PACK24_T)
342 
343         LABEL(PACK24_N)
344 
345             SAR(RSI, IMM(3))
346             JZ(PACK24_N_TAIL)
347 
348             LEA(R8,  MEM(RCX,RCX,2)) //lda*3
349             LEA(R9,  MEM(RCX,RCX,4)) //lda*5
350             LEA(R10, MEM(R8 ,RCX,4)) //lda*7
351 
352             LABEL(PACK24_N_LOOP)
353 
354                 LOADMUL8x8(RAX,  0,RCX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7)
355                 LOADMUL8x8(RAX, 64,RCX,R8,R9,R10, 8, 9,10,11,12,13,14,15)
356                 LOADMUL8x8(RAX,128,RCX,R8,R9,R10,16,17,18,19,20,21,22,23)
357                 STORE8x8(R15,  0,RDI,R11,R12,R13, 0, 1, 2, 3, 4, 5, 6, 7)
358                 STORE8x8(R15, 64,RDI,R11,R12,R13, 8, 9,10,11,12,13,14,15)
359                 STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
360 
361                 LEA(RAX, MEM(RAX,RCX,8))
362                 LEA(R15, MEM(R15,RDI,8))
363 
364                 SUB(RSI, IMM(1))
365 
366             JNZ(PACK24_N_LOOP)
367 
368             LABEL(PACK24_N_TAIL)
369 
370             MOV(RSI, VAR(n))
371             AND(RSI, IMM(7))
372             TEST(RSI, RSI)
373             JZ(PACK24_DONE)
374 
375             LABEL(PACK24_N_TAIL_LOOP)
376 
377                 VMULPD(ZMM(0), ZMM(31), MEM(RAX,  0))
378                 VMULPD(ZMM(1), ZMM(31), MEM(RAX, 64))
379                 VMULPD(ZMM(2), ZMM(31), MEM(RAX,128))
380                 VMOVUPD(MEM(R15,  0), ZMM(0))
381                 VMOVUPD(MEM(R15, 64), ZMM(1))
382                 VMOVUPD(MEM(R15,128), ZMM(2))
383 
384                 LEA(RAX, MEM(RAX,RCX,1))
385                 LEA(R15, MEM(R15,RDI,1))
386 
387                 SUB(RSI, IMM(1))
388 
389             JNZ(PACK24_N_TAIL_LOOP)
390 
391             JMP(PACK24_DONE)
392 
393         LABEL(PACK24_T)
394 
395             CMP(RCX, IMM(8))
396             JNE(PACK24_G)
397 
398             LEA(R8,  MEM(RBX,RBX,2)) //inca*3
399             LEA(R9,  MEM(RBX,RBX,4)) //inca*5
400             LEA(R10, MEM(R8 ,RBX,4)) //inca*7
401 
402             LEA(R14, MEM(RAX,RBX,8))
403             LEA(RCX, MEM(R14,RBX,8))
404 
405             SAR(RSI, IMM(3))
406             JZ(PACK24_T_TAIL)
407 
408             LABEL(PACK24_T_LOOP)
409 
410                 LOADMUL8x8(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7)
411                 LOADMUL8x8(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15)
412                 TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
413                              16,17,18,19,20,21,22,23)
414                 STORE8x8(R15,  0,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
415                 LOADMUL8x8(RCX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7)
416                 TRANSPOSE8x8( 8, 9,10,11,12,13,14,15,
417                              16,17,18,19,20,21,22,23)
418                 STORE8x8(R15, 64,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
419                 TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
420                              16,17,18,19,20,21,22,23)
421                 STORE8x8(R15,128,RDI,R11,R12,R13,16,17,18,19,20,21,22,23)
422 
423                 LEA(RAX, MEM(RAX,64))
424                 LEA(R14, MEM(R14,64))
425                 LEA(RCX, MEM(RCX,64))
426                 LEA(R15, MEM(R15,RDI,8))
427 
428                 SUB(RSI, IMM(1))
429 
430             JNZ(PACK24_T_LOOP)
431 
432             LABEL(PACK24_T_TAIL)
433 
434             MOV(RSI, VAR(n))
435             AND(RSI, IMM(7))
436             TEST(RSI, RSI)
437             JZ(PACK24_DONE)
438 
439             MOV(R13, IMM(1))
440             SHLX(R13, R13, RSI)
441             SUB(R13, IMM(1))
442             KMOV(K(1), R13D)  //mask for n%8 elements
443 
444             LOADMUL8x8_MASK(RAX,0,RBX,R8,R9,R10, 0, 1, 2, 3, 4, 5, 6, 7,1)
445             LOADMUL8x8_MASK(R14,0,RBX,R8,R9,R10, 8, 9,10,11,12,13,14,15,1)
446             LOADMUL8x8_MASK(RCX,0,RBX,R8,R9,R10,16,17,18,19,20,21,22,23,1)
447             TRANSPOSE8x8(16,17,18,19,20,21,22,23,
448                          24,25,26,27,28,29,30,31)
449             TRANSPOSE8x8( 8, 9,10,11,12,13,14,15,
450                          16,17,18,19,20,21,22,23)
451             TRANSPOSE8x8( 0, 1, 2, 3, 4, 5, 6, 7,
452                           8, 9,10,11,12,13,14,15)
453 
454             VMOVUPD(MEM(R15,        0), ZMM( 8))
455             VMOVUPD(MEM(R15,       64), ZMM(16))
456             VMOVUPD(MEM(R15,      128), ZMM(24))
457             SUB(RSI, IMM(1))
458             JZ(PACK24_DONE)
459             VMOVUPD(MEM(R15,RDI,1,  0), ZMM( 9))
460             VMOVUPD(MEM(R15,RDI,1, 64), ZMM(17))
461             VMOVUPD(MEM(R15,RDI,1,128), ZMM(25))
462             SUB(RSI, IMM(1))
463             JZ(PACK24_DONE)
464             VMOVUPD(MEM(R15,RDI,2,  0), ZMM(10))
465             VMOVUPD(MEM(R15,RDI,2, 64), ZMM(18))
466             VMOVUPD(MEM(R15,RDI,2,128), ZMM(26))
467             SUB(RSI, IMM(1))
468             JZ(PACK24_DONE)
469             VMOVUPD(MEM(R15,R11,1,  0), ZMM(11))
470             VMOVUPD(MEM(R15,R11,1, 64), ZMM(19))
471             VMOVUPD(MEM(R15,R11,1,128), ZMM(27))
472             SUB(RSI, IMM(1))
473             JZ(PACK24_DONE)
474             VMOVUPD(MEM(R15,RDI,4,  0), ZMM(12))
475             VMOVUPD(MEM(R15,RDI,4, 64), ZMM(20))
476             VMOVUPD(MEM(R15,RDI,4,128), ZMM(28))
477             SUB(RSI, IMM(1))
478             JZ(PACK24_DONE)
479             VMOVUPD(MEM(R15,R12,1,  0), ZMM(13))
480             VMOVUPD(MEM(R15,R12,1, 64), ZMM(21))
481             VMOVUPD(MEM(R15,R12,1,128), ZMM(29))
482             SUB(RSI, IMM(1))
483             JZ(PACK24_DONE)
484             VMOVUPD(MEM(R15,R11,2,  0), ZMM(14))
485             VMOVUPD(MEM(R15,R11,2, 64), ZMM(22))
486             VMOVUPD(MEM(R15,R11,2,128), ZMM(30))
487 
488             JMP(PACK24_DONE)
489 
490         LABEL(PACK24_G)
491 
492             VPBROADCASTD(ZMM(3), VAR(inca))
493             MOV(RBX, VAR(offsetPtr))
494             VPMULLD(YMM(0), YMM(3), MEM(RBX, 0))
495             VPMULLD(YMM(1), YMM(3), MEM(RBX,32))
496             VPMULLD(YMM(2), YMM(3), MEM(RBX,64))
497 
498             LABEL(PACK24_G_LOOP)
499 
500                 KXNORW(K(1), K(0), K(0))
501                 KXNORW(K(2), K(0), K(0))
502                 KXNORW(K(3), K(0), K(0))
503                 VGATHERDPD(ZMM(3) MASK_K(1), MEM(RAX,YMM(0),8))
504                 VGATHERDPD(ZMM(4) MASK_K(2), MEM(RAX,YMM(1),8))
505                 VGATHERDPD(ZMM(5) MASK_K(3), MEM(RAX,YMM(2),8))
506                 VMULPD(ZMM(3), ZMM(3), ZMM(31))
507                 VMULPD(ZMM(4), ZMM(4), ZMM(31))
508                 VMULPD(ZMM(5), ZMM(5), ZMM(31))
509                 VMOVUPD(MEM(R15,  0), ZMM(3))
510                 VMOVUPD(MEM(R15, 64), ZMM(4))
511                 VMOVUPD(MEM(R15,128), ZMM(5))
512 
513                 LEA(RAX, MEM(RAX,RCX,1))
514                 LEA(R15, MEM(R15,RDI,1))
515 
516                 SUB(RSI, IMM(1))
517 
518             JNZ(PACK24_G_LOOP)
519 
520         LABEL(PACK24_DONE)
521 
522         : //output operands
523         : //input operands
524           [n]         "m" (n),
525           [kappa]     "m" (*kappa),
526           [a]         "m" (a),
527           [inca]      "m" (inca),
528           [lda]       "m" (lda),
529           [p]         "m" (p),
530           [ldp]       "m" (ldp),
531           [offsetPtr] "m" (offsetPtr)
532         : //clobbers
533           "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
534           "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
535           "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
536           "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
537           "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
538           "zmm30", "zmm31",
539           "rax", "rbx", "rcx", "rdi", "rsi",
540           "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "memory"
541     );
542 }
543