1 /*
2
3 BLIS
4 An object-based framework for developing high-performance BLAS-like
5 libraries.
6
7 Copyright (C) 2014, The University of Texas at Austin
8
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions are
11 met:
12 - Redistributions of source code must retain the above copyright
13 notice, this list of conditions and the following disclaimer.
14 - Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17 - Neither the name of The University of Texas at Austin nor the names
18 of its contributors may be used to endorse or promote products
19 derived from this software without specific prior written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33 */
34
35 #include "bli_avx512_macros.h"
36 #include "blis.h"
37
38 #include <stdio.h>
39
// Load eight runs of eight contiguous floats from matrix memory and scale
// each by kappa, which the caller has broadcast into register 15 beforehand.
//   a            -- base address register
//   o            -- constant byte offset added to every address
//   s1,s3,s5,s7  -- byte strides equal to 1x, 3x, 5x and 7x the row stride;
//                   with these, rows 0..7 are all reachable through x86
//                   base+index*scale addressing (row 2 = s1*2, row 4 = s1*4,
//                   row 6 = s3*2, row 7 = s7*1, ...)
//   z0..z7       -- destination YMM register numbers, one per row
#define LOADMUL8x8(a,o,s1,s3,s5,s7, \
                   z0,z1,z2,z3,z4,z5,z6,z7) \
\
VMULPS(YMM(z0), YMM(15), MEM(a, o)) \
VMULPS(YMM(z1), YMM(15), MEM(a,s1,1,o)) \
VMULPS(YMM(z2), YMM(15), MEM(a,s1,2,o)) \
VMULPS(YMM(z3), YMM(15), MEM(a,s3,1,o)) \
VMULPS(YMM(z4), YMM(15), MEM(a,s1,4,o)) \
VMULPS(YMM(z5), YMM(15), MEM(a,s5,1,o)) \
VMULPS(YMM(z6), YMM(15), MEM(a,s3,2,o)) \
VMULPS(YMM(z7), YMM(15), MEM(a,s7,1,o))
51
// Store eight YMM registers (z0..z7) to eight consecutive rows of the packed
// buffer: register zk is written (unaligned) to address a + o + k*s, where s
// is the packed row stride in bytes.
#define STORE8x8(a,o,s, \
                 z0,z1,z2,z3,z4,z5,z6,z7) \
\
VMOVUPS(MEM(a,(o)+0*(s)), YMM(z0)) \
VMOVUPS(MEM(a,(o)+1*(s)), YMM(z1)) \
VMOVUPS(MEM(a,(o)+2*(s)), YMM(z2)) \
VMOVUPS(MEM(a,(o)+3*(s)), YMM(z3)) \
VMOVUPS(MEM(a,(o)+4*(s)), YMM(z4)) \
VMOVUPS(MEM(a,(o)+5*(s)), YMM(z5)) \
VMOVUPS(MEM(a,(o)+6*(s)), YMM(z6)) \
VMOVUPS(MEM(a,(o)+7*(s)), YMM(z7))
63
// Transpose an 8x8 block of floats held in YMM registers a0..a7 (one source
// row per register) and store the result to the packed buffer: transposed
// row j lands at a + o + j*s, written as two 16-byte halves at byte offsets
// o and o+16 within the row.  The transpose is the classic unpack/shuffle
// network: VUNPCK{L,H}PS interleave neighboring rows, VSHUFPS with imm 0x44
// (low pairs) or 0xEE (high pairs) completes each 4x4 tile, the low 128-bit
// lanes supply rows 0..3 and VEXTRACTF128 of the high lanes supplies rows
// 4..7.  t0..t5 name scratch YMM register numbers; t4/t5 are reused as the
// per-row store staging registers.
#define STORETRANS8x8(a,o,s, \
                      a0,a1,a2,a3,a4,a5,a6,a7, \
                      t0,t1,t2,t3,t4,t5) \
\
VUNPCKLPS(YMM(t0), YMM(a0), YMM(a1)) \
VUNPCKLPS(YMM(t2), YMM(a2), YMM(a3)) \
VUNPCKLPS(YMM(t1), YMM(a4), YMM(a5)) \
VUNPCKLPS(YMM(t3), YMM(a6), YMM(a7)) \
\
VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \
VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \
VMOVUPS(MEM(a,(o   )+0*(s)), XMM(t4)) \
VMOVUPS(MEM(a,(o+16)+0*(s)), XMM(t5)) \
VEXTRACTF128(MEM(a,(o   )+4*(s)), YMM(t4), IMM(1)) \
VEXTRACTF128(MEM(a,(o+16)+4*(s)), YMM(t5), IMM(1)) \
\
VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \
VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \
VMOVUPS(MEM(a,(o   )+1*(s)), XMM(t4)) \
VMOVUPS(MEM(a,(o+16)+1*(s)), XMM(t5)) \
VEXTRACTF128(MEM(a,(o   )+5*(s)), YMM(t4), IMM(1)) \
VEXTRACTF128(MEM(a,(o+16)+5*(s)), YMM(t5), IMM(1)) \
\
VUNPCKHPS(YMM(t0), YMM(a0), YMM(a1)) \
VUNPCKHPS(YMM(t2), YMM(a2), YMM(a3)) \
VUNPCKHPS(YMM(t1), YMM(a4), YMM(a5)) \
VUNPCKHPS(YMM(t3), YMM(a6), YMM(a7)) \
\
VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0x44)) \
VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0x44)) \
VMOVUPS(MEM(a,(o   )+2*(s)), XMM(t4)) \
VMOVUPS(MEM(a,(o+16)+2*(s)), XMM(t5)) \
VEXTRACTF128(MEM(a,(o   )+6*(s)), YMM(t4), IMM(1)) \
VEXTRACTF128(MEM(a,(o+16)+6*(s)), YMM(t5), IMM(1)) \
\
VSHUFPS(YMM(t4), YMM(t0), YMM(t2), IMM(0xEE)) \
VSHUFPS(YMM(t5), YMM(t1), YMM(t3), IMM(0xEE)) \
VMOVUPS(MEM(a,(o   )+3*(s)), XMM(t4)) \
VMOVUPS(MEM(a,(o+16)+3*(s)), XMM(t5)) \
VEXTRACTF128(MEM(a,(o   )+7*(s)), YMM(t4), IMM(1)) \
VEXTRACTF128(MEM(a,(o+16)+7*(s)), YMM(t5), IMM(1))
105
// Constant index table 0..31, aligned for a full 64-byte (ZMM) load.  The
// general-stride ("G") paths below multiply these with inca via VPMULLD to
// build the per-lane element offsets consumed by VGATHERDPS.
static int32_t offsets[32] __attribute__((aligned(64))) =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
110
// Pack a 16 x n panel of single-precision values from A into contiguous
// storage at p, scaling every element by *kappa:
//
//     p[i + j*16] = kappa * a[i*inca + j*lda],   0 <= i < 16, 0 <= j < n.
//
// Three run-time paths:
//   PACK16_N: inca == 1 (column-stored source) -- straight vector copies.
//   PACK16_T: lda  == 1 (row-stored source)    -- 8x8 in-register transposes.
//   PACK16_G: general strides                  -- masked gathers.
//
// NOTE(review): the packed row stride is hard-coded as 16 floats (16*4
// bytes); ldp_ is captured but unused by the assembly, so callers must pass
// ldp == 16 -- confirm against the packm control tree.  conja is ignored
// (real domain).  Mask register k1 is written but not listed as a clobber;
// presumably no caller keeps live mask state across this asm -- verify.
void bli_spackm_16xk_opt
     (
       conj_t         conja,
       dim_t          n_,
       void* restrict kappa_,
       void* restrict a_, inc_t inca_, inc_t lda_,
       void* restrict p_, inc_t ldp_
     )
{
    (void)conja;

    const int32_t * offsetPtr = &offsets[0];
    float* a = (float*)a_;
    float* p = (float*)p_;
    float* kappa = (float*)kappa_;
    const int64_t n = n_;
    const int64_t inca = inca_;
    const int64_t lda = lda_;
    const int64_t ldp = ldp_;

    __asm__ volatile
    (
        MOV(RSI, VAR(n))
        MOV(RAX, VAR(a))
        MOV(RBX, VAR(inca))
        MOV(RCX, VAR(lda))
        MOV(R14, VAR(p))

        TEST(RSI, RSI)
        JZ(PACK16_DONE)                  // n == 0: nothing to pack

        LEA(RBX, MEM(,RBX,4))            // inca in bytes
        LEA(RCX, MEM(,RCX,4))            // lda in bytes

        // Broadcast kappa into all 16 lanes.  A full ZMM broadcast is
        // required: the gather path multiplies by ZMM(15), and a YMM-only
        // broadcast would leave its upper eight lanes zero.
        VBROADCASTSS(ZMM(15), VAR(kappa))

        CMP(RBX, IMM(4))                 // inca == 1 element?
        JNE(PACK16_T)

    LABEL(PACK16_N)                      // column-stored: rows contiguous

        MOV(RDX, RSI)
        AND(RDX, IMM(7))                 // rdx = n % 8 (tail columns)
        SAR(RSI, IMM(3))                 // rsi = n / 8 (unrolled iterations)
        JZ(PACK16_N_TAIL)

        LEA(R8,  MEM(RCX,RCX,2))         // lda*3
        LEA(R9,  MEM(RCX,RCX,4))         // lda*5
        LEA(R10, MEM(R8 ,RCX,4))         // lda*7

    LABEL(PACK16_N_LOOP)

        // Eight columns per iteration; rows 0-7 then rows 8-15 (offset 32B).
        LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORE8x8(R14,0,16*4,0,1,2,3,4,5,6,7)

        LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORE8x8(R14,32,16*4,0,1,2,3,4,5,6,7)

        LEA(RAX, MEM(RAX,RCX,8))         // a += 8*lda
        LEA(R14, MEM(R14,16*8*4))        // p += 8 packed columns

        SUB(RSI, IMM(1))
        JNZ(PACK16_N_LOOP)

        TEST(RDX, RDX)
        JZ(PACK16_DONE)

    LABEL(PACK16_N_TAIL)                 // one column per iteration

        VMULPS(YMM(0), YMM(15), MEM(RAX   ))
        VMULPS(YMM(1), YMM(15), MEM(RAX,32))
        VMOVUPS(MEM(R14   ), YMM(0))
        VMOVUPS(MEM(R14,32), YMM(1))

        LEA(RAX, MEM(RAX,RCX,1))
        LEA(R14, MEM(R14, 16*4))

        SUB(RDX, IMM(1))
        JNZ(PACK16_N_TAIL)

        JMP(PACK16_DONE)

    LABEL(PACK16_T)

        CMP(RCX, IMM(4))                 // lda == 1 element?
        JNE(PACK16_G)

        LEA(R8,  MEM(RBX,RBX,2))         // inca*3
        LEA(R9,  MEM(RBX,RBX,4))         // inca*5
        LEA(R10, MEM(R8 ,RBX,4))         // inca*7
        LEA(R11, MEM(RAX,RBX,8))         // r11 = a + 8*inca (rows 8..15)

        MOV(RDX, RSI)
        AND(RDX, IMM(7))                 // rdx = n % 8
        SAR(RSI, IMM(3))                 // rsi = n / 8
        JZ(PACK16_T_TAIL)

    LABEL(PACK16_T_LOOP)

        // Load 8 rows x 8 columns twice (rows 0-7, rows 8-15) and store
        // each transposed into the packed panel.
        LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORETRANS8x8(R14,0,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

        LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORETRANS8x8(R14,32,16*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

        LEA(RAX, MEM(RAX, 8*4))          // advance 8 columns (lda = 4B)
        LEA(R11, MEM(R11, 8*4))
        LEA(R14, MEM(R14,16*8*4))

        SUB(RSI, IMM(1))
        JNZ(PACK16_T_LOOP)

        TEST(RDX, RDX)
        JZ(PACK16_DONE)

    LABEL(PACK16_T_TAIL)                 // one column (16 scalars) per pass

        VMULSS(XMM(0), XMM(15), MEM(RAX      ))
        VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
        VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
        VMULSS(XMM(3), XMM(15), MEM(RAX,R8 ,1))
        VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
        VMULSS(XMM(5), XMM(15), MEM(RAX,R9 ,1))
        VMULSS(XMM(6), XMM(15), MEM(RAX,R8 ,2))
        VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
        VMOVSS(MEM(R14,0*4), XMM(0))
        VMOVSS(MEM(R14,1*4), XMM(1))
        VMOVSS(MEM(R14,2*4), XMM(2))
        VMOVSS(MEM(R14,3*4), XMM(3))
        VMOVSS(MEM(R14,4*4), XMM(4))
        VMOVSS(MEM(R14,5*4), XMM(5))
        VMOVSS(MEM(R14,6*4), XMM(6))
        VMOVSS(MEM(R14,7*4), XMM(7))

        VMULSS(XMM(0), XMM(15), MEM(R11      ))
        VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
        VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
        VMULSS(XMM(3), XMM(15), MEM(R11,R8 ,1))
        VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
        VMULSS(XMM(5), XMM(15), MEM(R11,R9 ,1))
        VMULSS(XMM(6), XMM(15), MEM(R11,R8 ,2))
        VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
        VMOVSS(MEM(R14, 8*4), XMM(0))
        VMOVSS(MEM(R14, 9*4), XMM(1))
        VMOVSS(MEM(R14,10*4), XMM(2))
        VMOVSS(MEM(R14,11*4), XMM(3))
        VMOVSS(MEM(R14,12*4), XMM(4))
        VMOVSS(MEM(R14,13*4), XMM(5))
        VMOVSS(MEM(R14,14*4), XMM(6))
        VMOVSS(MEM(R14,15*4), XMM(7))

        LEA(RAX, MEM(RAX, 4))
        LEA(R11, MEM(R11, 4))
        LEA(R14, MEM(R14,16*4))

        SUB(RDX, IMM(1))
        JNZ(PACK16_T_TAIL)

        JMP(PACK16_DONE)

    LABEL(PACK16_G)                      // general strides: gather path

        VPBROADCASTD(ZMM(3), VAR(inca))
        MOV(RBX, VAR(offsetPtr))
        VPMULLD(ZMM(0), ZMM(3), MEM(RBX))  // zmm0 = inca * {0..15}

    LABEL(PACK16_G_LOOP)

        KXNORW(K(1), K(0), K(0))           // k1 = all ones (16 lanes)
        // Indices in ZMM0 are element offsets, so the gather scale is
        // sizeof(float) == 4.
        VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),4))
        VMULPS(ZMM(3), ZMM(3), ZMM(15))
        VMOVUPS(MEM(R14), ZMM(3))

        LEA(RAX, MEM(RAX,RCX,1))           // next column
        LEA(R14, MEM(R14, 16*4))

        SUB(RSI, IMM(1))
        JNZ(PACK16_G_LOOP)

    LABEL(PACK16_DONE)

        : //output operands
        : //input operands
          [n]         "m" (n),
          [kappa]     "m" (*kappa),
          [a]         "m" (a),
          [inca]      "m" (inca),
          [lda]       "m" (lda),
          [p]         "m" (p),
          [ldp]       "m" (ldp),
          [offsetPtr] "m" (offsetPtr)
        : //clobbers
          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31",
          "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
    );
}
318
// Pack a 24 x n panel of single-precision values from A into storage at p
// with packed row stride ldp, scaling every element by *kappa:
//
//     p[i + j*ldp] = kappa * a[i*inca + j*lda],  0 <= i < 24, 0 <= j < n.
//
// Three run-time paths:
//   PACK24_N: inca == 1 (column-stored source) -- straight vector copies.
//   PACK24_T: lda  == 1 (row-stored source)    -- 8x8 in-register transposes.
//   PACK24_G: general strides                  -- masked gathers.
//
// NOTE(review): conja is ignored (real domain).  Mask registers k1/k2 are
// written but not listed as clobbers; presumably no caller keeps live mask
// state across this asm -- verify.
void bli_spackm_24xk_opt
     (
       conj_t         conja,
       dim_t          n_,
       void* restrict kappa_,
       void* restrict a_, inc_t inca_, inc_t lda_,
       void* restrict p_, inc_t ldp_
     )
{
    (void)conja;

    const int32_t * offsetPtr = &offsets[0];
    float* a = (float*)a_;
    float* p = (float*)p_;
    float* kappa = (float*)kappa_;
    const int64_t n = n_;
    const int64_t inca = inca_;
    const int64_t lda = lda_;
    const int64_t ldp = ldp_;

    __asm__ volatile
    (
        MOV(RSI, VAR(n))
        MOV(RAX, VAR(a))
        MOV(RBX, VAR(inca))
        MOV(RCX, VAR(lda))
        MOV(R14, VAR(p))
        MOV(RDI, VAR(ldp))

        TEST(RSI, RSI)
        JZ(PACK24_DONE)                  // n == 0: nothing to pack

        LEA(RBX, MEM(,RBX,4))            // inca in bytes
        LEA(RCX, MEM(,RCX,4))            // lda in bytes
        LEA(RDI, MEM(,RDI,4))            // ldp in bytes

        VBROADCASTSS(ZMM(15), VAR(kappa))  // kappa in all 16 lanes

        CMP(RBX, IMM(4))                 // inca == 1 element?
        JNE(PACK24_T)

    LABEL(PACK24_N)                      // column-stored: rows contiguous

        MOV(RDX, RSI)
        AND(RDX, IMM(7))                 // rdx = n % 8 (tail columns)
        SAR(RSI, IMM(3))                 // rsi = n / 8 (unrolled iterations)
        JZ(PACK24_N_TAIL)

        LEA(R8,  MEM(RCX,RCX,2))         // lda*3
        LEA(R9,  MEM(RCX,RCX,4))         // lda*5
        LEA(R10, MEM(R8 ,RCX,4))         // lda*7

    LABEL(PACK24_N_LOOP)

        // Eight columns per iteration; rows 0-7, 8-15, 16-23 (32B apart).
        LOADMUL8x8(RAX,0,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORE8x8(R14,0,24*4,0,1,2,3,4,5,6,7)

        LOADMUL8x8(RAX,32,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORE8x8(R14,32,24*4,0,1,2,3,4,5,6,7)

        LOADMUL8x8(RAX,64,RCX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORE8x8(R14,64,24*4,0,1,2,3,4,5,6,7)

        LEA(RAX, MEM(RAX,RCX,8))         // a += 8*lda
        LEA(R14, MEM(R14,RDI,8))         // p += 8*ldp

        SUB(RSI, IMM(1))
        JNZ(PACK24_N_LOOP)

        TEST(RDX, RDX)
        JZ(PACK24_DONE)

    LABEL(PACK24_N_TAIL)                 // one column per iteration

        VMULPS(ZMM(0), ZMM(15), MEM(RAX))     // rows 0..15
        VMOVUPS(MEM(R14), ZMM(0))

        VMULPS(YMM(1), YMM(15), MEM(RAX,64))  // rows 16..23
        VMOVUPS(MEM(R14,64), YMM(1))

        LEA(RAX, MEM(RAX,RCX,1))
        LEA(R14, MEM(R14,RDI,1))

        SUB(RDX, IMM(1))
        JNZ(PACK24_N_TAIL)

        JMP(PACK24_DONE)

    LABEL(PACK24_T)

        CMP(RCX, IMM(4))                 // lda == 1 element?
        JNE(PACK24_G)

        LEA(R8,  MEM(RBX,RBX,2))         // inca*3
        LEA(R9,  MEM(RBX,RBX,4))         // inca*5
        LEA(R10, MEM(R8 ,RBX,4))         // inca*7
        LEA(R11, MEM(RAX,RBX,8))         // r11 = a +  8*inca (rows  8..15)
        LEA(R12, MEM(R11,RBX,8))         // r12 = a + 16*inca (rows 16..23)

        MOV(RDX, RSI)
        AND(RDX, IMM(7))                 // rdx = n % 8
        SAR(RSI, IMM(3))                 // rsi = n / 8
        JZ(PACK24_T_TAIL)

    LABEL(PACK24_T_LOOP)

        // Load 8 rows x 8 columns three times (rows 0-7, 8-15, 16-23) and
        // store each transposed into the packed panel.
        LOADMUL8x8(RAX,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORETRANS8x8(R14,0,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

        LOADMUL8x8(R11,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORETRANS8x8(R14,32,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

        LOADMUL8x8(R12,0,RBX,R8,R9,R10,0,1,2,3,4,5,6,7)
        STORETRANS8x8(R14,64,24*4,0,1,2,3,4,5,6,7,8,9,10,11,12,13)

        LEA(RAX, MEM(RAX,RCX,8))         // advance 8 columns (lda = 4B)
        LEA(R11, MEM(R11,RCX,8))
        LEA(R12, MEM(R12,RCX,8))
        LEA(R14, MEM(R14,RDI,8))

        SUB(RSI, IMM(1))
        JNZ(PACK24_T_LOOP)

        TEST(RDX, RDX)
        JZ(PACK24_DONE)

    LABEL(PACK24_T_TAIL)                 // one column (24 scalars) per pass

        VMULSS(XMM(0), XMM(15), MEM(RAX))
        VMULSS(XMM(1), XMM(15), MEM(RAX,RBX,1))
        VMULSS(XMM(2), XMM(15), MEM(RAX,RBX,2))
        VMULSS(XMM(3), XMM(15), MEM(RAX,R8,1))
        VMULSS(XMM(4), XMM(15), MEM(RAX,RBX,4))
        VMULSS(XMM(5), XMM(15), MEM(RAX,R9,1))
        VMULSS(XMM(6), XMM(15), MEM(RAX,R8,2))
        VMULSS(XMM(7), XMM(15), MEM(RAX,R10,1))
        VMOVSS(MEM(R14,0*4), XMM(0))
        VMOVSS(MEM(R14,1*4), XMM(1))
        VMOVSS(MEM(R14,2*4), XMM(2))
        VMOVSS(MEM(R14,3*4), XMM(3))
        VMOVSS(MEM(R14,4*4), XMM(4))
        VMOVSS(MEM(R14,5*4), XMM(5))
        VMOVSS(MEM(R14,6*4), XMM(6))
        VMOVSS(MEM(R14,7*4), XMM(7))

        VMULSS(XMM(0), XMM(15), MEM(R11))
        VMULSS(XMM(1), XMM(15), MEM(R11,RBX,1))
        VMULSS(XMM(2), XMM(15), MEM(R11,RBX,2))
        VMULSS(XMM(3), XMM(15), MEM(R11,R8,1))
        VMULSS(XMM(4), XMM(15), MEM(R11,RBX,4))
        VMULSS(XMM(5), XMM(15), MEM(R11,R9,1))
        VMULSS(XMM(6), XMM(15), MEM(R11,R8,2))
        VMULSS(XMM(7), XMM(15), MEM(R11,R10,1))
        VMOVSS(MEM(R14, 8*4), XMM(0))
        VMOVSS(MEM(R14, 9*4), XMM(1))
        VMOVSS(MEM(R14,10*4), XMM(2))
        VMOVSS(MEM(R14,11*4), XMM(3))
        VMOVSS(MEM(R14,12*4), XMM(4))
        VMOVSS(MEM(R14,13*4), XMM(5))
        VMOVSS(MEM(R14,14*4), XMM(6))
        VMOVSS(MEM(R14,15*4), XMM(7))

        VMULSS(XMM(0), XMM(15), MEM(R12))
        VMULSS(XMM(1), XMM(15), MEM(R12,RBX,1))
        VMULSS(XMM(2), XMM(15), MEM(R12,RBX,2))
        VMULSS(XMM(3), XMM(15), MEM(R12,R8,1))
        VMULSS(XMM(4), XMM(15), MEM(R12,RBX,4))
        VMULSS(XMM(5), XMM(15), MEM(R12,R9,1))
        VMULSS(XMM(6), XMM(15), MEM(R12,R8,2))
        VMULSS(XMM(7), XMM(15), MEM(R12,R10,1))
        VMOVSS(MEM(R14,16*4), XMM(0))
        VMOVSS(MEM(R14,17*4), XMM(1))
        VMOVSS(MEM(R14,18*4), XMM(2))
        VMOVSS(MEM(R14,19*4), XMM(3))
        VMOVSS(MEM(R14,20*4), XMM(4))
        VMOVSS(MEM(R14,21*4), XMM(5))
        VMOVSS(MEM(R14,22*4), XMM(6))
        VMOVSS(MEM(R14,23*4), XMM(7))

        LEA(RAX, MEM(RAX,RCX,1))
        LEA(R11, MEM(R11,RCX,1))
        LEA(R12, MEM(R12,RCX,1))
        LEA(R14, MEM(R14,RDI,1))

        SUB(RDX, IMM(1))
        JNZ(PACK24_T_TAIL)

        JMP(PACK24_DONE)

    LABEL(PACK24_G)                      // general strides: gather path

        // r11 = a + 16*inca (base of rows 16..23).  This must be computed
        // while RBX still holds inca in bytes, i.e. before RBX is
        // repurposed below to hold offsetPtr.
        LEA(R11, MEM(RAX,RBX,8))
        LEA(R11, MEM(R11,RBX,8))

        VPBROADCASTD(ZMM(3), VAR(inca))
        MOV(RBX, VAR(offsetPtr))
        VPMULLD(ZMM(0), ZMM(3), MEM(RBX))  // zmm0 = inca * {0..15}

    LABEL(PACK24_G_LOOP)

        KXNORW(K(1), K(0), K(0))           // k1 = 0xFFFF (16 lanes)
        KSHIFTRW(K(2), K(1), IMM(8))       // k2 = 0x00FF (8 lanes)
        // Element offsets in ZMM0, so the gather scale is sizeof(float) == 4.
        VGATHERDPS(ZMM(3) MASK_K(1), MEM(RAX,ZMM(0),4))  // rows  0..15
        VGATHERDPS(ZMM(4) MASK_K(2), MEM(R11,ZMM(0),4))  // rows 16..23
        VMULPS(ZMM(3), ZMM(3), ZMM(15))
        VMULPS(YMM(4), YMM(4), YMM(15))
        VMOVUPS(MEM(R14), ZMM(3))
        VMOVUPS(MEM(R14,64), YMM(4))

        LEA(RAX, MEM(RAX,RCX,1))           // next column for rows  0..15
        LEA(R11, MEM(R11,RCX,1))           // next column for rows 16..23
        LEA(R14, MEM(R14,RDI,1))

        SUB(RSI, IMM(1))
        JNZ(PACK24_G_LOOP)

    LABEL(PACK24_DONE)

        : //output operands
        : //input operands
          [n]         "m" (n),
          [kappa]     "m" (*kappa),
          [a]         "m" (a),
          [inca]      "m" (inca),
          [lda]       "m" (lda),
          [p]         "m" (p),
          [ldp]       "m" (ldp),
          [offsetPtr] "m" (offsetPtr)
        : //clobbers
          "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5",
          "zmm6", "zmm7", "zmm8", "zmm9", "zmm10", "zmm11",
          "zmm12", "zmm13", "zmm14", "zmm15", "zmm16", "zmm17",
          "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
          "zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29",
          "zmm30", "zmm31",
          "rax", "rbx", "rcx", "rdx", "rdi", "rsi",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "memory"
    );
}
562