/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name of The University of Texas at Austin nor the names
      of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "bli_avx512_macros.h"
#include "blis.h"

#include <stdio.h>

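// LOADMUL8x8: load eight 8-float vectors from a+o at byte strides
// 0, s1, 2*s1, s3, 4*s1, s5, 2*s3, s7 (s3/s5/s7 are expected to hold
// 3x/5x/7x the base stride), scale each by the kappa value broadcast
// in ymm15, and leave the products in ymm registers z0..z7.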
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
                   z0,z1,z2,z3,z4,z5,z6,z7) \
    \
    VMULPS(YMM(z0), YMM(15), MEM(a,     o)) \
    VMULPS(YMM(z1), YMM(15), MEM(a,s1,1,o)) \
    VMULPS(YMM(z2), YMM(15), MEM(a,s1,2,o)) \
    VMULPS(YMM(z3), YMM(15), MEM(a,s3,1,o)) \
    VMULPS(YMM(z4), YMM(15), MEM(a,s1,4,o)) \
    VMULPS(YMM(z5), YMM(15), MEM(a,s5,1,o)) \
    VMULPS(YMM(z6), YMM(15), MEM(a,s3,2,o)) \
    VMULPS(YMM(z7), YMM(15), MEM(a,s7,1,o))

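// STORE8x8: store ymm z0..z7 to eight consecutive rows of the packed
// buffer, i.e. to a+o, a+o+s, ..., a+o+7*s (o and s in bytes).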
#define STORE8x8(a,o,s, \
                 z0,z1,z2,z3,z4,z5,z6,z7) \
    \
    VMOVUPS(MEM(a,(o)+0*(s)), YMM(z0)) \
    VMOVUPS(MEM(a,(o)+1*(s)), YMM(z1)) \
    VMOVUPS(MEM(a,(o)+2*(s)), YMM(z2)) \
    VMOVUPS(MEM(a,(o)+3*(s)), YMM(z3)) \
    VMOVUPS(MEM(a,(o)+4*(s)), YMM(z4)) \
    VMOVUPS(MEM(a,(o)+5*(s)), YMM(z5)) \
    VMOVUPS(MEM(a,(o)+6*(s)), YMM(z6)) \
    VMOVUPS(MEM(a,(o)+7*(s)), YMM(z7))

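// STORETRANS8x8: transpose the 8x8 block held in ymm a0..a7 (using
// t0..t5 as temporaries) and store it to the packed buffer at a+o with
// row stride s; each transposed row is written as two 128-bit halves,
// at byte offsets o and o+16.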
#define STORETRANS8x8(a,o,s, \
                      a0,a1,a2,a3,a4,a5,a6,a7, \
                      t0,t1,t2,t3,t4,t5) \
    \
    VUNPCKLPS(YMM(t0), YMM(a0), YMM(a1)) \
    VUNPCKLPS(YMM(t2), YMM(a2), YMM(a3)) \
    VUNPCKLPS(YMM(t1), YMM(a4), YMM(a5)) \
    VUNPCKLPS(YMM(t3), YMM(a6), YMM(a7)) \
    \
    VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \
    VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \
    VMOVUPS(MEM(a,(o   )+0*(s)), XMM(t4)) \
    VMOVUPS(MEM(a,(o+16)+0*(s)), XMM(t5)) \
    VEXTRACTF128(MEM(a,(o   )+4*(s)), YMM(t4), IMM(1)) \
    VEXTRACTF128(MEM(a,(o+16)+4*(s)), YMM(t5), IMM(1)) \
    \
    VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \
    VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \
    VMOVUPS(MEM(a,(o   )+1*(s)), XMM(t4)) \
    VMOVUPS(MEM(a,(o+16)+1*(s)), XMM(t5)) \
    VEXTRACTF128(MEM(a,(o   )+5*(s)), YMM(t4), IMM(1)) \
    VEXTRACTF128(MEM(a,(o+16)+5*(s)), YMM(t5), IMM(1)) \
    \
    VUNPCKHPS(YMM(t0), YMM(a0), YMM(a1)) \
    VUNPCKHPS(YMM(t2), YMM(a2), YMM(a3)) \
    VUNPCKHPS(YMM(t1), YMM(a4), YMM(a5)) \
    VUNPCKHPS(YMM(t3), YMM(a6), YMM(a7)) \
    \
    VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \
    VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \
    VMOVUPS(MEM(a,(o   )+2*(s)), XMM(t4)) \
    VMOVUPS(MEM(a,(o+16)+2*(s)), XMM(t5)) \
    VEXTRACTF128(MEM(a,(o   )+6*(s)), YMM(t4), IMM(1)) \
    VEXTRACTF128(MEM(a,(o+16)+6*(s)), YMM(t5), IMM(1)) \
    \
    VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \
    VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \
    VMOVUPS(MEM(a,(o   )+3*(s)), XMM(t4)) \
    VMOVUPS(MEM(a,(o+16)+3*(s)), XMM(t5)) \
    VEXTRACTF128(MEM(a,(o   )+7*(s)), YMM(t4), IMM(1)) \
    VEXTRACTF128(MEM(a,(o+16)+7*(s)), YMM(t5), IMM(1))

// Element offsets 0..31, used to build the index vectors for the
// vgatherdps instructions in the generic-stride (PACK*_G) paths below.
static int32_t offsets[32] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};

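// Pack a 16 x n panel of A into P, scaling every element by kappa:
//
//     p[i + j*16] = kappa * a[i*inca + j*lda],  0 <= i < 16,  0 <= j < n.
//
// Three code paths are used, selected by the strides of A: a direct column
// copy (PACK16_N), an in-register 8x8 transpose (PACK16_T), and a
// gather-based path for general strides (PACK16_G).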
void bli_spackm_16xk_opt
     (
       conj_t         conja,
       dim_t          n_,
       void* restrict kappa_,
       void* restrict a_, inc_t inca_, inc_t lda_,
       void* restrict p_,              inc_t ldp_
     )
{
    (void)conja;

    const int32_t * offsetPtr = &offsets[0];
    float* a = (float*)a_;
    float* p = (float*)p_;
    float* kappa = (float*)kappa_;
    const int64_t n = n_;
    const int64_t inca = inca_;
    const int64_t lda = lda_;
    const int64_t ldp = ldp_;

    __asm__ volatile
    (
        MOV(RSI, VAR(n))
        MOV(RAX, VAR(a))
        MOV(RBX, VAR(inca))
        MOV(RCX, VAR(lda))
        MOV(R14, VAR(p))

        TEST(RSI, RSI)
        JZ(PACK16_DONE)

        LEA(RBX, MEM(,RBX,4))    //inca in bytes
        LEA(RCX, MEM(,RCX,4))    //lda in bytes

        // Broadcast kappa into all of zmm15: the generic-stride path below
        // multiplies full 16-float vectors by zmm15, and writing only ymm15
        // would leave its upper 256 bits zeroed.
        VBROADCASTSS(ZMM(15), VAR(kappa))

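        // Dispatch on the storage of A: if inca == 1 (4 bytes), each
        // 16-element column of A is contiguous and is copied directly
        // (PACK16_N); if lda == 1, rows are contiguous and 8x8 blocks are
        // transposed in registers (PACK16_T); any other stride combination
        // uses the gather-based path (PACK16_G).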
        CMP(RBX, IMM(4))
        JNE(PACK16_T)

        LABEL(PACK16_N)

            MOV(RDX, RSI)
            AND(RDX, IMM(7))
            SAR(RSI, IMM(3))
            JZ(PACK16_N_TAIL)

            LEA(R8,  MEM(RCX,RCX,2)) //lda*3
            LEA(R9,  MEM(RCX,RCX,4)) //lda*5
            LEA(R10, MEM(R8 ,RCX,4)) //lda*7

            LABEL(PACK16_N_LOOP)

                LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORE8x8(R14,0,16*4,0,1,2,3,4,5,6,7)

                LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORE8x8(R14,32,16*4,0,1,2,3,4,5,6,7)

                LEA(RAX, MEM(RAX,RCX,8))
                LEA(R14, MEM(R14,16*8*4))

                SUB(RSI, IMM(1))

            JNZ(PACK16_N_LOOP)

            TEST(RDX, RDX)
            JZ(PACK16_DONE)

            LABEL(PACK16_N_TAIL)

                VMULPS(YMM(0), YMM(15), MEM(RAX   ))
                VMULPS(YMM(1), YMM(15), MEM(RAX,32))
                VMOVUPS(MEM(R14   ), YMM(0))
                VMOVUPS(MEM(R14,32), YMM(1))

                LEA(RAX, MEM(RAX,RCX,1))
                LEA(R14, MEM(R14, 16*4))

                SUB(RDX, IMM(1))

            JNZ(PACK16_N_TAIL)

            JMP(PACK16_DONE)

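        // lda == 1: rows of A are contiguous. Load 8x8 blocks row-wise at
        // row stride inca, scale by kappa, and transpose them into the
        // packed buffer; rows 0-7 come from RAX and rows 8-15 from R11.
        // The tail loop handles the remaining n % 8 columns one element
        // at a time.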
        LABEL(PACK16_T)

            CMP(RCX, IMM(4))
            JNE(PACK16_G)

            LEA(R8,  MEM(RBX,RBX,2)) //inca*3
            LEA(R9,  MEM(RBX,RBX,4)) //inca*5
            LEA(R10, MEM(R8 ,RBX,4)) //inca*7
            LEA(R11, MEM(RAX,RBX,8))

            MOV(RDX, RSI)
            AND(RDX, IMM(7))
            SAR(RSI, IMM(3))
            JZ(PACK16_T_TAIL)

            LABEL(PACK16_T_LOOP)

                LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORETRANS8x8(R14,0,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

                LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORETRANS8x8(R14,32,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

                LEA(RAX, MEM(RAX,   8*4))
                LEA(R11, MEM(R11,   8*4))
                LEA(R14, MEM(R14,16*8*4))

                SUB(RSI, IMM(1))

            JNZ(PACK16_T_LOOP)

            TEST(RDX, RDX)
            JZ(PACK16_DONE)

            LABEL(PACK16_T_TAIL)

                VMULSS(XMM(0), XMM(15), MEM(RAX      ))
                VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
                VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
                VMULSS(XMM(3), XMM(15), MEM(RAX,R8 ,1))
                VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
                VMULSS(XMM(5), XMM(15), MEM(RAX,R9 ,1))
                VMULSS(XMM(6), XMM(15), MEM(RAX,R8 ,2))
                VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
                VMOVSS(MEM(R14,0*4), XMM(0))
                VMOVSS(MEM(R14,1*4), XMM(1))
                VMOVSS(MEM(R14,2*4), XMM(2))
                VMOVSS(MEM(R14,3*4), XMM(3))
                VMOVSS(MEM(R14,4*4), XMM(4))
                VMOVSS(MEM(R14,5*4), XMM(5))
                VMOVSS(MEM(R14,6*4), XMM(6))
                VMOVSS(MEM(R14,7*4), XMM(7))

                VMULSS(XMM(0), XMM(15), MEM(R11      ))
                VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
                VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
                VMULSS(XMM(3), XMM(15), MEM(R11,R8 ,1))
                VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
                VMULSS(XMM(5), XMM(15), MEM(R11,R9 ,1))
                VMULSS(XMM(6), XMM(15), MEM(R11,R8 ,2))
                VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
                VMOVSS(MEM(R14, 8*4), XMM(0))
                VMOVSS(MEM(R14, 9*4), XMM(1))
                VMOVSS(MEM(R14,10*4), XMM(2))
                VMOVSS(MEM(R14,11*4), XMM(3))
                VMOVSS(MEM(R14,12*4), XMM(4))
                VMOVSS(MEM(R14,13*4), XMM(5))
                VMOVSS(MEM(R14,14*4), XMM(6))
                VMOVSS(MEM(R14,15*4), XMM(7))

                LEA(RAX, MEM(RAX,   4))
                LEA(R11, MEM(R11,   4))
                LEA(R14, MEM(R14,16*4))

                SUB(RDX, IMM(1))

            JNZ(PACK16_T_TAIL)

            JMP(PACK16_DONE)

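        // General strides: build a vector of element offsets i*inca for
        // i = 0..15, then gather, scale by kappa, and store one packed
        // column per iteration.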
        LABEL(PACK16_G)

            VPBROADCASTD(ZMM(3), VAR(inca))
            MOV(RBX, VAR(offsetPtr))
            VPMULLD(ZMM(0), ZMM(3), MEM(RBX))

            LABEL(PACK16_G_LOOP)

                KXNORW(K(1), K(0), K(0))
                VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),4)) // element indices scaled by sizeof(float)
                VMULPS(ZMM(3), ZMM(3), ZMM(15))
                VMOVUPS(MEM(R14), ZMM(3))

                LEA(RAX, MEM(RAX,RCX,1))
                LEA(R14, MEM(R14, 16*4))

                SUB(RSI, IMM(1))

            JNZ(PACK16_G_LOOP)

        LABEL(PACK16_DONE)

        : //output operands
        : //input operands
          [n]         "m" (n),
          [kappa]     "m" (*kappa),
          [a]         "m" (a),
          [inca]      "m" (inca),
          [lda]       "m" (lda),
          [p]         "m" (p),
          [ldp]       "m" (ldp),
          [offsetPtr] "m" (offsetPtr)
        : //clobbers
          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31",
          "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
    );
}

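// Pack a 24 x n panel of A into P with column stride ldp, scaling every
// element by kappa:
//
//     p[i + j*ldp] = kappa * a[i*inca + j*lda],  0 <= i < 24,  0 <= j < n.
//
// The structure mirrors bli_spackm_16xk_opt: a direct column copy
// (PACK24_N), an in-register 8x8 transpose (PACK24_T), and a gather-based
// path for general strides (PACK24_G).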
void bli_spackm_24xk_opt
     (
       conj_t         conja,
       dim_t          n_,
       void* restrict kappa_,
       void* restrict a_, inc_t inca_, inc_t lda_,
       void* restrict p_,              inc_t ldp_
     )
{
    (void)conja;

    const int32_t * offsetPtr = &offsets[0];
    float* a = (float*)a_;
    float* p = (float*)p_;
    float* kappa = (float*)kappa_;
    const int64_t n = n_;
    const int64_t inca = inca_;
    const int64_t lda = lda_;
    const int64_t ldp = ldp_;

    __asm__ volatile
    (
        MOV(RSI, VAR(n))
        MOV(RAX, VAR(a))
        MOV(RBX, VAR(inca))
        MOV(RCX, VAR(lda))
        MOV(R14, VAR(p))
        MOV(RDI, VAR(ldp))

        TEST(RSI, RSI)
        JZ(PACK24_DONE)

        LEA(RBX, MEM(,RBX,4))    //inca in bytes
        LEA(RCX, MEM(,RCX,4))    //lda in bytes
        LEA(RDI, MEM(,RDI,4))    //ldp in bytes

        VBROADCASTSS(ZMM(15), VAR(kappa))

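        // Dispatch on the storage of A, as in the 16xk kernel: inca == 1
        // copies contiguous 24-element columns (PACK24_N), lda == 1
        // transposes 8x8 blocks (PACK24_T), and general strides use
        // gathers (PACK24_G).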
        CMP(RBX, IMM(4))
        JNE(PACK24_T)

        LABEL(PACK24_N)

            MOV(RDX, RSI)
            AND(RDX, IMM(7))
            SAR(RSI, IMM(3))
            JZ(PACK24_N_TAIL)

            LEA(R8,  MEM(RCX,RCX,2)) //lda*3
            LEA(R9,  MEM(RCX,RCX,4)) //lda*5
            LEA(R10, MEM(R8 ,RCX,4)) //lda*7

            LABEL(PACK24_N_LOOP)

                LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORE8x8(R14,0,24*4,0,1,2,3,4,5,6,7)

                LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORE8x8(R14,32,24*4,0,1,2,3,4,5,6,7)

                LOADMUL8x8(RAX,64,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORE8x8(R14,64,24*4,0,1,2,3,4,5,6,7)

                LEA(RAX, MEM(RAX,RCX,8))
                LEA(R14, MEM(R14,RDI,8))

                SUB(RSI, IMM(1))

            JNZ(PACK24_N_LOOP)

            TEST(RDX, RDX)
            JZ(PACK24_DONE)

            LABEL(PACK24_N_TAIL)

                VMULPS(ZMM(0), ZMM(15), MEM(RAX))
                VMOVUPS(MEM(R14), ZMM(0))

                VMULPS(YMM(1), YMM(15), MEM(RAX,64))
                VMOVUPS(MEM(R14,64), YMM(1))

                LEA(RAX, MEM(RAX,RCX,1))
                LEA(R14, MEM(R14,RDI,1))

                SUB(RDX, IMM(1))

            JNZ(PACK24_N_TAIL)

            JMP(PACK24_DONE)

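        // lda == 1: per group of eight columns, three 8x8 blocks (rows 0-7
        // from RAX, 8-15 from R11, 16-23 from R12) are loaded at row stride
        // inca, scaled by kappa, transposed, and stored; the tail handles
        // one column at a time.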
        LABEL(PACK24_T)

            CMP(RCX, IMM(4))
            JNE(PACK24_G)

            LEA(R8,  MEM(RBX,RBX,2)) //inca*3
            LEA(R9,  MEM(RBX,RBX,4)) //inca*5
            LEA(R10, MEM(R8 ,RBX,4)) //inca*7
            LEA(R11, MEM(RAX,RBX,8))
            LEA(R12, MEM(R11,RBX,8))

            MOV(RDX, RSI)
            AND(RDX, IMM(7))
            SAR(RSI, IMM(3))
            JZ(PACK24_T_TAIL)

            LABEL(PACK24_T_LOOP)

                LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORETRANS8x8(R14,0,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

                LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORETRANS8x8(R14,32,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

                LOADMUL8x8(R12,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
                STORETRANS8x8(R14,64,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

                LEA(RAX, MEM(RAX,RCX,8))
                LEA(R11, MEM(R11,RCX,8))
                LEA(R12, MEM(R12,RCX,8))
                LEA(R14, MEM(R14,RDI,8))

                SUB(RSI, IMM(1))

            JNZ(PACK24_T_LOOP)

            TEST(RDX, RDX)
            JZ(PACK24_DONE)

            LABEL(PACK24_T_TAIL)

                VMULSS(XMM(0), XMM(15), MEM(RAX))
                VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
                VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
                VMULSS(XMM(3), XMM(15), MEM(RAX,R8,1))
                VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
                VMULSS(XMM(5), XMM(15), MEM(RAX,R9,1))
                VMULSS(XMM(6), XMM(15), MEM(RAX,R8,2))
                VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
                VMOVSS(MEM(R14,0*4), XMM(0))
                VMOVSS(MEM(R14,1*4), XMM(1))
                VMOVSS(MEM(R14,2*4), XMM(2))
                VMOVSS(MEM(R14,3*4), XMM(3))
                VMOVSS(MEM(R14,4*4), XMM(4))
                VMOVSS(MEM(R14,5*4), XMM(5))
                VMOVSS(MEM(R14,6*4), XMM(6))
                VMOVSS(MEM(R14,7*4), XMM(7))

                VMULSS(XMM(0), XMM(15), MEM(R11))
                VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
                VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
                VMULSS(XMM(3), XMM(15), MEM(R11,R8,1))
                VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
                VMULSS(XMM(5), XMM(15), MEM(R11,R9,1))
                VMULSS(XMM(6), XMM(15), MEM(R11,R8,2))
                VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
                VMOVSS(MEM(R14, 8*4), XMM(0))
                VMOVSS(MEM(R14, 9*4), XMM(1))
                VMOVSS(MEM(R14,10*4), XMM(2))
                VMOVSS(MEM(R14,11*4), XMM(3))
                VMOVSS(MEM(R14,12*4), XMM(4))
                VMOVSS(MEM(R14,13*4), XMM(5))
                VMOVSS(MEM(R14,14*4), XMM(6))
                VMOVSS(MEM(R14,15*4), XMM(7))

                VMULSS(XMM(0), XMM(15), MEM(R12))
                VMULSS(XMM(1), XMM(15), MEM(R12,RBX,1))
                VMULSS(XMM(2), XMM(15), MEM(R12,RBX,2))
                VMULSS(XMM(3), XMM(15), MEM(R12,R8,1))
                VMULSS(XMM(4), XMM(15), MEM(R12,RBX,4))
                VMULSS(XMM(5), XMM(15), MEM(R12,R9,1))
                VMULSS(XMM(6), XMM(15), MEM(R12,R8,2))
                VMULSS(XMM(7), XMM(15), MEM(R12,R10,1))
                VMOVSS(MEM(R14,16*4), XMM(0))
                VMOVSS(MEM(R14,17*4), XMM(1))
                VMOVSS(MEM(R14,18*4), XMM(2))
                VMOVSS(MEM(R14,19*4), XMM(3))
                VMOVSS(MEM(R14,20*4), XMM(4))
                VMOVSS(MEM(R14,21*4), XMM(5))
                VMOVSS(MEM(R14,22*4), XMM(6))
                VMOVSS(MEM(R14,23*4), XMM(7))

                LEA(RAX, MEM(RAX,RCX,1))
                LEA(R11, MEM(R11,RCX,1))
                LEA(R12, MEM(R12,RCX,1))
                LEA(R14, MEM(R14,RDI,1))

                SUB(RDX, IMM(1))

            JNZ(PACK24_T_TAIL)

            JMP(PACK24_DONE)

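        // General strides: rows 0-15 are gathered as one 16-element vector
        // and rows 16-23 as a second 8-element gather (mask k2 keeps the
        // low eight lanes), scaled by kappa, and stored as one packed
        // column per iteration.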
        LABEL(PACK24_G)

            // R11 = a + 16*inca elements; compute it while RBX still holds
            // inca in bytes, before RBX is repurposed as the offset pointer.
            LEA(R11, MEM(RAX,RBX,8))
            LEA(R11, MEM(R11,RBX,8))

            VPBROADCASTD(ZMM(3), VAR(inca))
            MOV(RBX, VAR(offsetPtr))
            VPMULLD(ZMM(0), ZMM(3), MEM(RBX))

            LABEL(PACK24_G_LOOP)

                KXNORW(K(1), K(0), K(0))
                KSHIFTRW(K(2), K(1), IMM(8))
                VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),4)) // element indices scaled by sizeof(float)
                VGATHERDPS(ZMM(4) MASK_K(2), MEM(R11,ZMM(0),4))
                VMULPS(ZMM(3), ZMM(3), ZMM(15))
                VMULPS(YMM(4), YMM(4), YMM(15))
                VMOVUPS(MEM(R14), ZMM(3))
                VMOVUPS(MEM(R14,64), YMM(4))

                LEA(RAX, MEM(RAX,RCX,1))
                LEA(R11, MEM(R11,RCX,1)) // advance the rows-16..23 base pointer as well
                LEA(R14, MEM(R14,RDI,1))

                SUB(RSI, IMM(1))

            JNZ(PACK24_G_LOOP)

        LABEL(PACK24_DONE)

        : //output operands
        : //input operands
          [n]         "m" (n),
          [kappa]     "m" (*kappa),
          [a]         "m" (a),
          [inca]      "m" (inca),
          [lda]       "m" (lda),
          [p]         "m" (p),
          [ldp]       "m" (ldp),
          [offsetPtr] "m" (offsetPtr)
        : //clobbers
          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31",
          "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
    );
}