1/*********************************************************************************
2Copyright (c) 2013, The OpenBLAS Project
3All rights reserved.
4Redistribution and use in source and binary forms, with or without
5modification, are permitted provided that the following conditions are
6met:
71. Redistributions of source code must retain the above copyright
8notice, this list of conditions and the following disclaimer.
92. Redistributions in binary form must reproduce the above copyright
10notice, this list of conditions and the following disclaimer in
11the documentation and/or other materials provided with the
12distribution.
133. Neither the name of the OpenBLAS project nor the names of
14its contributors may be used to endorse or promote products
15derived from this software without specific prior written permission.
16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26**********************************************************************************/
27
28/*********************************************************************
29* 2014/07/29 Saar
30*        BLASTEST               : OK
31*        CTEST                  : OK
32*        TEST                   : OK
33*
34* 2013/10/28 Saar
35* Parameter:
36*       CGEMM_DEFAULT_UNROLL_N  2
37*       CGEMM_DEFAULT_UNROLL_M  8
38*       CGEMM_DEFAULT_P         384
39*       CGEMM_DEFAULT_Q         192
40*	A_PR1			512
41*	B_PR1			512
42*
43* 2014/07/29 Saar
44* Performance at 6912x6912x6912:
45*       1 thread:      107 GFLOPS       (SANDYBRIDGE:  60)      (MKL:   86)
46*       2 threads:     208 GFLOPS       (SANDYBRIDGE: 114)      (MKL:  155)
47*       3 threads:     289 GFLOPS       (SANDYBRIDGE: 162)      (MKL:  222)
48*       4 threads:     377 GFLOPS       (SANDYBRIDGE: 223)      (MKL:  279)
49*
50*
51*********************************************************************/
52
53
54
55#define ASSEMBLER
56#include "common.h"
57
58#define OLD_M	%rdi
59#define OLD_N	%rsi
60#define M	%r13
61#define J	%r14
62#define OLD_K	%rdx
63
64#define A	%rcx
65#define B	%r8
66#define C	%r9
67#define LDC	%r10
68
69#define I	%r11
70#define AO	%rdi
71#define BO	%rsi
72#define	CO1	%r15
73#define K	%r12
74#define BI	%rbp
75#define	SP	%rbx
76
77#define BO1	%rdi
78#define BO2	%rbp
79
80#ifndef WINDOWS_ABI
81
82#define STACKSIZE 96
83
84#else
85
86#define STACKSIZE 320
87
88#define OLD_ALPHA_I     40 + STACKSIZE(%rsp)
89#define OLD_A           48 + STACKSIZE(%rsp)
90#define OLD_B           56 + STACKSIZE(%rsp)
91#define OLD_C           64 + STACKSIZE(%rsp)
92#define OLD_LDC         72 + STACKSIZE(%rsp)
93#define OLD_OFFSET      80 + STACKSIZE(%rsp)
94
95#endif
96
97#define L_BUFFER_SIZE 8192
98
99#define Ndiv6	 24(%rsp)
100#define Nmod6	 32(%rsp)
101#define N	 40(%rsp)
102#define ALPHA_R  48(%rsp)
103#define ALPHA_I  56(%rsp)
104#define OFFSET   64(%rsp)
105#define KK       72(%rsp)
106#define KKK      80(%rsp)
107#define BUFFER1	           128(%rsp)
108
109#if defined(OS_WINDOWS)
110#if   L_BUFFER_SIZE > 16384
111#define STACK_TOUCH \
112        movl    $ 0,  4096 * 4(%rsp);\
113        movl    $ 0,  4096 * 3(%rsp);\
114        movl    $ 0,  4096 * 2(%rsp);\
115        movl    $ 0,  4096 * 1(%rsp);
116#elif L_BUFFER_SIZE > 12288
117#define STACK_TOUCH \
118        movl    $ 0,  4096 * 3(%rsp);\
119        movl    $ 0,  4096 * 2(%rsp);\
120        movl    $ 0,  4096 * 1(%rsp);
121#elif L_BUFFER_SIZE > 8192
122#define STACK_TOUCH \
123        movl    $ 0,  4096 * 2(%rsp);\
124        movl    $ 0,  4096 * 1(%rsp);
125#elif L_BUFFER_SIZE > 4096
126#define STACK_TOUCH \
127        movl    $ 0,  4096 * 1(%rsp);
128#else
129#define STACK_TOUCH
130#endif
131#else
132#define STACK_TOUCH
133#endif
134
135
136#if defined(BULLDOZER)
137
138#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
139
140#define	VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
141
142#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
143
144#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
145
146#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
147
148#define	VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
149
150#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
151
152#define	VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0
153
154#define	VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
155
156#else
157
158#define	VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
159
160#define	VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0
161
162#endif
163
164#else
165
166#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
167
168#define	VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
169
170#define	VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
171
172#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
173
174#define	VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
175
176#define	VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0
177
178#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
179
180#define	VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0
181
182#define	VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
183
184#else
185
186#define	VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
187
188#define	VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0
189
190#endif
191
192#endif
193
194
195#define	A_PR1	512
196#define	B_PR1	512
197
198
199
200/***************************************************************************************************************************/
201
202.macro KERNEL8x3_SUB
203
204        vmovups         -16 * SIZE(AO), %ymm0
205        vmovups          -8 * SIZE(AO), %ymm1
206        vbroadcastss     -8 * SIZE(BO), %ymm2
207        vbroadcastss     -7 * SIZE(BO), %ymm3
208	prefetcht0	A_PR1(AO)
209
210        VFMADDPS_R(        %ymm8 ,%ymm2,%ymm0 )
211        VFMADDPS_R(        %ymm12,%ymm2,%ymm1 )
212        VFMADDPS_I(        %ymm9 ,%ymm3,%ymm0 )
213        VFMADDPS_I(        %ymm13,%ymm3,%ymm1 )
214
215        vbroadcastss     -6 * SIZE(BO), %ymm2
216        vbroadcastss     -5 * SIZE(BO), %ymm3
217        VFMADDPS_R(        %ymm10,%ymm2,%ymm0 )
218        VFMADDPS_R(        %ymm14,%ymm2,%ymm1 )
219        VFMADDPS_I(        %ymm11,%ymm3,%ymm0 )
220        VFMADDPS_I(        %ymm15,%ymm3,%ymm1 )
221
222        vbroadcastss     -4 * SIZE(BO), %ymm2
223        vbroadcastss     -3 * SIZE(BO), %ymm3
224        VFMADDPS_R(        %ymm4 ,%ymm2,%ymm0 )
225        VFMADDPS_R(        %ymm6 ,%ymm2,%ymm1 )
226        VFMADDPS_I(        %ymm5 ,%ymm3,%ymm0 )
227        VFMADDPS_I(        %ymm7 ,%ymm3,%ymm1 )
228
229
230        addq    $ 6*SIZE, BO
231        addq    $ 16*SIZE, AO
232	decq	%rax
233.endm
234
235.macro SAVE8x3
236
237	vbroadcastss	ALPHA_R, %ymm0
238	vbroadcastss	ALPHA_I, %ymm1
239
240	// swap high and low 64 bytes
241        vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
242        vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
243        vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
244        vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
245        vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
246        vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7
247
248#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
249    defined(NR) || defined(NC) || defined(TR) || defined(TC)
250
251        vaddsubps %ymm9, %ymm8 , %ymm8
252        vaddsubps %ymm11,%ymm10, %ymm10
253        vaddsubps %ymm13,%ymm12, %ymm12
254        vaddsubps %ymm15,%ymm14, %ymm14
255        vaddsubps %ymm5, %ymm4 , %ymm4
256        vaddsubps %ymm7, %ymm6 , %ymm6
257
258        vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9
259        vshufps $ 0xb1, %ymm10, %ymm10, %ymm11
260        vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
261        vshufps $ 0xb1, %ymm14, %ymm14, %ymm15
262        vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5
263        vshufps $ 0xb1, %ymm6 , %ymm6 , %ymm7
264
265#else
266        vaddsubps %ymm8,  %ymm9 ,%ymm9
267        vaddsubps %ymm10, %ymm11,%ymm11
268        vaddsubps %ymm12, %ymm13,%ymm13
269        vaddsubps %ymm14, %ymm15,%ymm15
270        vaddsubps %ymm4,  %ymm5 ,%ymm5
271        vaddsubps %ymm6,  %ymm7 ,%ymm7
272
273        vmovaps   %ymm9,  %ymm8
274        vmovaps   %ymm11, %ymm10
275        vmovaps   %ymm13, %ymm12
276        vmovaps   %ymm15, %ymm14
277        vmovaps   %ymm5,  %ymm4
278        vmovaps   %ymm7,  %ymm6
279
280	// swap high and low 64 bytes
281        vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
282        vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
283        vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
284        vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
285        vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
286        vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7
287
288#endif
289
290	// multiply with ALPHA_R
291        vmulps  %ymm8 , %ymm0, %ymm8
292        vmulps  %ymm10, %ymm0, %ymm10
293        vmulps  %ymm12, %ymm0, %ymm12
294        vmulps  %ymm14, %ymm0, %ymm14
295        vmulps  %ymm4 , %ymm0, %ymm4
296        vmulps  %ymm6 , %ymm0, %ymm6
297
298	// multiply with ALPHA_I
299        vmulps  %ymm9 , %ymm1, %ymm9
300        vmulps  %ymm11, %ymm1, %ymm11
301        vmulps  %ymm13, %ymm1, %ymm13
302        vmulps  %ymm15, %ymm1, %ymm15
303        vmulps  %ymm5 , %ymm1, %ymm5
304        vmulps  %ymm7 , %ymm1, %ymm7
305
306	vaddsubps %ymm9, %ymm8 , %ymm8
307        vaddsubps %ymm11,%ymm10, %ymm10
308        vaddsubps %ymm13,%ymm12, %ymm12
309        vaddsubps %ymm15,%ymm14, %ymm14
310	vaddsubps %ymm5, %ymm4 , %ymm4
311	vaddsubps %ymm7, %ymm6 , %ymm6
312
313#if !defined(TRMMKERNEL)
314
315	vaddps 	 	(CO1), %ymm8 , %ymm8
316	vaddps  8 * SIZE(CO1), %ymm12, %ymm12
317
318	vaddps 	 	(CO1, LDC), %ymm10, %ymm10
319	vaddps  8 * SIZE(CO1, LDC), %ymm14, %ymm14
320
321	vaddps 	 	(CO1, LDC,2), %ymm4, %ymm4
322	vaddps  8 * SIZE(CO1, LDC,2), %ymm6, %ymm6
323
324#endif
325
326	vmovups	%ymm8 ,  	 (CO1)
327	vmovups	%ymm12 , 8 * SIZE(CO1)
328
329	vmovups	%ymm10 ,  	 (CO1, LDC)
330	vmovups	%ymm14 , 8 * SIZE(CO1, LDC)
331
332	vmovups	%ymm4  ,  	 (CO1, LDC,2)
333	vmovups	%ymm6  , 8 * SIZE(CO1, LDC,2)
334
335.endm
336
337
338/***************************************************************************************************************************/
339
340.macro KERNEL4x3_SUB
341
342        vmovups         -16 * SIZE(AO), %ymm0
343        vbroadcastss     -8 * SIZE(BO), %ymm2
344        vbroadcastss     -7 * SIZE(BO), %ymm3
345
346        VFMADDPS_R(        %ymm8 ,%ymm2,%ymm0 )
347        VFMADDPS_I(        %ymm9 ,%ymm3,%ymm0 )
348
349        vbroadcastss     -6 * SIZE(BO), %ymm2
350        vbroadcastss     -5 * SIZE(BO), %ymm3
351        VFMADDPS_R(        %ymm12,%ymm2,%ymm0 )
352        VFMADDPS_I(        %ymm13,%ymm3,%ymm0 )
353
354        vbroadcastss     -4 * SIZE(BO), %ymm2
355        vbroadcastss     -3 * SIZE(BO), %ymm3
356        VFMADDPS_R(        %ymm4 ,%ymm2,%ymm0 )
357        VFMADDPS_I(        %ymm5 ,%ymm3,%ymm0 )
358
359        addq    $ 6*SIZE, BO
360        addq    $ 8*SIZE, AO
361	decq	%rax
362.endm
363
364.macro SAVE4x3
365
366	vbroadcastss	ALPHA_R, %ymm0
367	vbroadcastss	ALPHA_I, %ymm1
368
369	// swap high and low 64 bytes
370        vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
371        vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
372        vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
373
374#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
375    defined(NR) || defined(NC) || defined(TR) || defined(TC)
376
377        vaddsubps %ymm9, %ymm8 , %ymm8
378        vaddsubps %ymm13,%ymm12, %ymm12
379        vaddsubps %ymm5, %ymm4 , %ymm4
380
381        vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9
382        vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
383        vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5
384
385#else
386        vaddsubps %ymm8,  %ymm9 ,%ymm9
387        vaddsubps %ymm12, %ymm13,%ymm13
388        vaddsubps %ymm4,  %ymm5 ,%ymm5
389
390        vmovaps   %ymm9,  %ymm8
391        vmovaps   %ymm13, %ymm12
392        vmovaps   %ymm5,  %ymm4
393
394	// swap high and low 64 bytes
395        vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9
396        vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
397        vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5
398
399#endif
400
401	// multiply with ALPHA_R
402        vmulps  %ymm8 , %ymm0, %ymm8
403        vmulps  %ymm12, %ymm0, %ymm12
404        vmulps  %ymm4 , %ymm0, %ymm4
405
406	// multiply with ALPHA_I
407        vmulps  %ymm9 , %ymm1, %ymm9
408        vmulps  %ymm13, %ymm1, %ymm13
409        vmulps  %ymm5 , %ymm1, %ymm5
410
411	vaddsubps %ymm9, %ymm8 , %ymm8
412        vaddsubps %ymm13,%ymm12, %ymm12
413	vaddsubps %ymm5, %ymm4 , %ymm4
414
415#if !defined(TRMMKERNEL)
416
417	vaddps 	 	(CO1), %ymm8 , %ymm8
418	vaddps 	 	(CO1, LDC), %ymm12, %ymm12
419	vaddps 	 	(CO1, LDC,2), %ymm4, %ymm4
420
421#endif
422
423	vmovups	%ymm8 ,  	 (CO1)
424	vmovups	%ymm12 ,  	 (CO1, LDC)
425	vmovups	%ymm4  ,  	 (CO1, LDC,2)
426
427.endm
428
429/***************************************************************************************************************************/
430
431.macro KERNEL2x3_SUB
432
433        vmovups         -16 * SIZE(AO), %xmm0
434        vbroadcastss     -8 * SIZE(BO), %xmm2
435        vbroadcastss     -7 * SIZE(BO), %xmm3
436
437        VFMADDPS_R(        %xmm8 ,%xmm2,%xmm0 )
438        VFMADDPS_I(        %xmm9 ,%xmm3,%xmm0 )
439
440        vbroadcastss     -6 * SIZE(BO), %xmm2
441        vbroadcastss     -5 * SIZE(BO), %xmm3
442        VFMADDPS_R(        %xmm12,%xmm2,%xmm0 )
443        VFMADDPS_I(        %xmm13,%xmm3,%xmm0 )
444
445        vbroadcastss     -4 * SIZE(BO), %xmm2
446        vbroadcastss     -3 * SIZE(BO), %xmm3
447        VFMADDPS_R(        %xmm4 ,%xmm2,%xmm0 )
448        VFMADDPS_I(        %xmm5 ,%xmm3,%xmm0 )
449
450        addq    $ 6*SIZE, BO
451        addq    $ 4*SIZE, AO
452	decq	%rax
453
454.endm
455
456.macro SAVE2x3
457
458	vbroadcastss	ALPHA_R, %xmm0
459	vbroadcastss	ALPHA_I, %xmm1
460
461	// swap high and low 64 bytes
462        vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
463        vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
464        vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
465
466#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
467    defined(NR) || defined(NC) || defined(TR) || defined(TC)
468
469        vaddsubps %xmm9, %xmm8 , %xmm8
470        vaddsubps %xmm13,%xmm12, %xmm12
471        vaddsubps %xmm5, %xmm4 , %xmm4
472
473        vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9
474        vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
475        vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5
476
477#else
478        vaddsubps %xmm8,  %xmm9 ,%xmm9
479        vaddsubps %xmm12, %xmm13,%xmm13
480        vaddsubps %xmm4,  %xmm5 ,%xmm5
481
482        vmovaps   %xmm9,  %xmm8
483        vmovaps   %xmm13, %xmm12
484        vmovaps   %xmm5,  %xmm4
485
486	// swap high and low 64 bytes
487        vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
488        vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
489        vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
490
491#endif
492
493	// multiply with ALPHA_R
494        vmulps  %xmm8 , %xmm0, %xmm8
495        vmulps  %xmm12, %xmm0, %xmm12
496        vmulps  %xmm4 , %xmm0, %xmm4
497
498	// multiply with ALPHA_I
499        vmulps  %xmm9 , %xmm1, %xmm9
500        vmulps  %xmm13, %xmm1, %xmm13
501        vmulps  %xmm5 , %xmm1, %xmm5
502
503	vaddsubps %xmm9, %xmm8 , %xmm8
504        vaddsubps %xmm13,%xmm12, %xmm12
505	vaddsubps %xmm5, %xmm4 , %xmm4
506
507#if !defined(TRMMKERNEL)
508
509	vaddps 	 	(CO1), %xmm8 , %xmm8
510	vaddps 	 	(CO1, LDC), %xmm12, %xmm12
511	vaddps 	 	(CO1, LDC,2), %xmm4, %xmm4
512
513#endif
514
515	vmovups	%xmm8 ,  	 (CO1)
516	vmovups	%xmm12 ,  	 (CO1, LDC)
517	vmovups	%xmm4  ,  	 (CO1, LDC,2)
518
519.endm
520
521
522/***************************************************************************************************************************/
523
524.macro KERNEL1x3_SUB
525
526        vmovsd          -16 * SIZE(AO), %xmm0
527        vbroadcastss     -8 * SIZE(BO), %xmm2
528        vbroadcastss     -7 * SIZE(BO), %xmm3
529
530        VFMADDPS_R(        %xmm8 ,%xmm2,%xmm0 )
531        VFMADDPS_I(        %xmm9 ,%xmm3,%xmm0 )
532
533        vbroadcastss     -6 * SIZE(BO), %xmm2
534        vbroadcastss     -5 * SIZE(BO), %xmm3
535        VFMADDPS_R(        %xmm12,%xmm2,%xmm0 )
536        VFMADDPS_I(        %xmm13,%xmm3,%xmm0 )
537
538        vbroadcastss     -4 * SIZE(BO), %xmm2
539        vbroadcastss     -3 * SIZE(BO), %xmm3
540        VFMADDPS_R(        %xmm4 ,%xmm2,%xmm0 )
541        VFMADDPS_I(        %xmm5 ,%xmm3,%xmm0 )
542
543        addq    $ 6*SIZE, BO
544        addq    $ 2*SIZE, AO
545	decq	%rax
546
547.endm
548
549.macro SAVE1x3
550
551	vbroadcastss	ALPHA_R, %xmm0
552	vbroadcastss	ALPHA_I, %xmm1
553
554	// swap high and low 64 bytes
555        vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
556        vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
557        vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
558
559#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
560    defined(NR) || defined(NC) || defined(TR) || defined(TC)
561
562        vaddsubps %xmm9, %xmm8 , %xmm8
563        vaddsubps %xmm13,%xmm12, %xmm12
564        vaddsubps %xmm5, %xmm4 , %xmm4
565
566        vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9
567        vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
568        vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5
569
570#else
571        vaddsubps %xmm8,  %xmm9 ,%xmm9
572        vaddsubps %xmm12, %xmm13,%xmm13
573        vaddsubps %xmm4,  %xmm5 ,%xmm5
574
575        vmovaps   %xmm9,  %xmm8
576        vmovaps   %xmm13, %xmm12
577        vmovaps   %xmm5,  %xmm4
578
579	// swap high and low 64 bytes
580        vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9
581        vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
582        vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5
583
584#endif
585
586	// multiply with ALPHA_R
587        vmulps  %xmm8 , %xmm0, %xmm8
588        vmulps  %xmm12, %xmm0, %xmm12
589        vmulps  %xmm4 , %xmm0, %xmm4
590
591	// multiply with ALPHA_I
592        vmulps  %xmm9 , %xmm1, %xmm9
593        vmulps  %xmm13, %xmm1, %xmm13
594        vmulps  %xmm5 , %xmm1, %xmm5
595
596	vaddsubps %xmm9, %xmm8 , %xmm8
597        vaddsubps %xmm13,%xmm12, %xmm12
598	vaddsubps %xmm5, %xmm4 , %xmm4
599
600#if !defined(TRMMKERNEL)
601
602	vmovsd		(CO1)      , %xmm9
603	vmovsd		(CO1,LDC)  , %xmm13
604	vmovsd		(CO1,LDC,2), %xmm5
605	vaddps 	 	%xmm9 , %xmm8 , %xmm8
606	vaddps 	 	%xmm13, %xmm12, %xmm12
607	vaddps 	 	%xmm5 , %xmm4, %xmm4
608
609#endif
610
611	vmovsd	%xmm8 ,  	 (CO1)
612	vmovsd	%xmm12 ,  	 (CO1, LDC)
613	vmovsd	%xmm4  ,  	 (CO1, LDC,2)
614
615.endm
616
617
618/***************************************************************************************************************************/
619
620.macro KERNEL8x2_SUB
621
622        vmovups         -16 * SIZE(AO, %rax, SIZE), %ymm0
623        vbroadcastss         -8 * SIZE(BO, BI, SIZE), %ymm4
624        VFMADDPS_R(        %ymm8,%ymm4,%ymm0  )
625        vmovups          -8 * SIZE(AO, %rax, SIZE), %ymm1
626        VFMADDPS_R(        %ymm12,%ymm4,%ymm1 )
627        vbroadcastss         -7 * SIZE(BO, BI, SIZE), %ymm5
628        VFMADDPS_I(        %ymm9,%ymm5,%ymm0  )
629        VFMADDPS_I(        %ymm13,%ymm5,%ymm1 )
630        vbroadcastss         -6 * SIZE(BO, BI, SIZE), %ymm6
631        VFMADDPS_R(        %ymm10,%ymm6,%ymm0 )
632        VFMADDPS_R(        %ymm14,%ymm6,%ymm1 )
633        vbroadcastss         -5 * SIZE(BO, BI, SIZE), %ymm7
634        VFMADDPS_I(        %ymm11,%ymm7,%ymm0 )
635        VFMADDPS_I(        %ymm15,%ymm7,%ymm1 )
636        addq    $ 4 , BI
637        addq    $ 16, %rax
638.endm
639
640.macro SAVE8x2
641
642	vbroadcastss	ALPHA_R, %ymm0
643	vbroadcastss	ALPHA_I, %ymm1
644
645	// swap high and low 64 bytes
646        vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
647        vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
648        vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
649        vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
650
651#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
652    defined(NR) || defined(NC) || defined(TR) || defined(TC)
653
654        vaddsubps %ymm9, %ymm8 , %ymm8
655        vaddsubps %ymm11,%ymm10, %ymm10
656        vaddsubps %ymm13,%ymm12, %ymm12
657        vaddsubps %ymm15,%ymm14, %ymm14
658
659        vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9
660        vshufps $ 0xb1, %ymm10, %ymm10, %ymm11
661        vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
662        vshufps $ 0xb1, %ymm14, %ymm14, %ymm15
663
664#else
665        vaddsubps %ymm8,  %ymm9 ,%ymm9
666        vaddsubps %ymm10, %ymm11,%ymm11
667        vaddsubps %ymm12, %ymm13,%ymm13
668        vaddsubps %ymm14, %ymm15,%ymm15
669
670        vmovaps   %ymm9,  %ymm8
671        vmovaps   %ymm11, %ymm10
672        vmovaps   %ymm13, %ymm12
673        vmovaps   %ymm15, %ymm14
674
675	// swap high and low 64 bytes
676        vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
677        vshufps $ 0xb1, %ymm11, %ymm11, %ymm11
678        vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
679        vshufps $ 0xb1, %ymm15, %ymm15, %ymm15
680
681#endif
682
683	// multiply with ALPHA_R
684        vmulps  %ymm8 , %ymm0, %ymm8
685        vmulps  %ymm10, %ymm0, %ymm10
686        vmulps  %ymm12, %ymm0, %ymm12
687        vmulps  %ymm14, %ymm0, %ymm14
688
689	// multiply with ALPHA_I
690        vmulps  %ymm9 , %ymm1, %ymm9
691        vmulps  %ymm11, %ymm1, %ymm11
692        vmulps  %ymm13, %ymm1, %ymm13
693        vmulps  %ymm15, %ymm1, %ymm15
694
695	vaddsubps %ymm9, %ymm8 , %ymm8
696        vaddsubps %ymm11,%ymm10, %ymm10
697        vaddsubps %ymm13,%ymm12, %ymm12
698        vaddsubps %ymm15,%ymm14, %ymm14
699
700
701
702#if !defined(TRMMKERNEL)
703
704	vaddps 	 	(CO1), %ymm8 , %ymm8
705	vaddps  8 * SIZE(CO1), %ymm12, %ymm12
706
707	vaddps 	 	(CO1, LDC), %ymm10, %ymm10
708	vaddps  8 * SIZE(CO1, LDC), %ymm14, %ymm14
709
710#endif
711
712	vmovups	%ymm8 ,  	(CO1)
713	vmovups	%ymm12 , 8 * SIZE(CO1)
714
715	vmovups	%ymm10 ,  	(CO1, LDC)
716	vmovups	%ymm14 , 8 * SIZE(CO1, LDC)
717
718	prefetcht0	64(CO1)
719	prefetcht0	64(CO1, LDC)
720
721.endm
722
723/***************************************************************************************************************************/
724
725.macro KERNEL4x2_SUB
726        vmovups         -16 * SIZE(AO, %rax, SIZE), %xmm0
727        vbroadcastss         -8 * SIZE(BO, BI, SIZE), %xmm4
728        VFMADDPS_R(        %xmm8,%xmm4,%xmm0  )
729        vmovups         -12 * SIZE(AO, %rax, SIZE), %xmm1
730        VFMADDPS_R(        %xmm12,%xmm4,%xmm1 )
731        vbroadcastss         -7 * SIZE(BO, BI, SIZE), %xmm5
732        VFMADDPS_I(        %xmm9,%xmm5,%xmm0  )
733        VFMADDPS_I(        %xmm13,%xmm5,%xmm1 )
734        vbroadcastss         -6 * SIZE(BO, BI, SIZE), %xmm6
735        VFMADDPS_R(        %xmm10,%xmm6,%xmm0 )
736        VFMADDPS_R(        %xmm14,%xmm6,%xmm1 )
737        vbroadcastss         -5 * SIZE(BO, BI, SIZE), %xmm7
738        VFMADDPS_I(        %xmm11,%xmm7,%xmm0 )
739        VFMADDPS_I(        %xmm15,%xmm7,%xmm1 )
740        addq    $ 4, BI
741        addq    $ 8, %rax
742.endm
743
744.macro SAVE4x2
745
746	vbroadcastss	ALPHA_R, %xmm0
747	vbroadcastss	ALPHA_I, %xmm1
748
749	// swap high and low 64 bytes
750        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
751        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
752        vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
753        vshufps $ 0xb1, %xmm15, %xmm15, %xmm15
754
755#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
756    defined(NR) || defined(NC) || defined(TR) || defined(TC)
757
758        vaddsubps %xmm9, %xmm8 , %xmm8
759        vaddsubps %xmm11,%xmm10, %xmm10
760        vaddsubps %xmm13,%xmm12, %xmm12
761        vaddsubps %xmm15,%xmm14, %xmm14
762
763        vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
764        vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
765        vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
766        vshufps $ 0xb1, %xmm14, %xmm14, %xmm15
767
768#else
769        vaddsubps %xmm8,  %xmm9 ,%xmm9
770        vaddsubps %xmm10, %xmm11,%xmm11
771        vaddsubps %xmm12, %xmm13,%xmm13
772        vaddsubps %xmm14, %xmm15,%xmm15
773
774        vmovaps   %xmm9,  %xmm8
775        vmovaps   %xmm11, %xmm10
776        vmovaps   %xmm13, %xmm12
777        vmovaps   %xmm15, %xmm14
778
779	// swap high and low 64 bytes
780        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
781        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
782        vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
783        vshufps $ 0xb1, %xmm15, %xmm15, %xmm15
784
785#endif
786
787	// multiply with ALPHA_R
788        vmulps  %xmm8 , %xmm0, %xmm8
789        vmulps  %xmm10, %xmm0, %xmm10
790        vmulps  %xmm12, %xmm0, %xmm12
791        vmulps  %xmm14, %xmm0, %xmm14
792
793	// multiply with ALPHA_I
794        vmulps  %xmm9 , %xmm1, %xmm9
795        vmulps  %xmm11, %xmm1, %xmm11
796        vmulps  %xmm13, %xmm1, %xmm13
797        vmulps  %xmm15, %xmm1, %xmm15
798
799	vaddsubps %xmm9, %xmm8 , %xmm8
800        vaddsubps %xmm11,%xmm10, %xmm10
801        vaddsubps %xmm13,%xmm12, %xmm12
802        vaddsubps %xmm15,%xmm14, %xmm14
803
804#if !defined(TRMMKERNEL)
805
806	vaddps 	 	(CO1), %xmm8 , %xmm8
807	vaddps  4 * SIZE(CO1), %xmm12, %xmm12
808
809	vaddps 	 	(CO1, LDC), %xmm10, %xmm10
810	vaddps  4 * SIZE(CO1, LDC), %xmm14, %xmm14
811
812#endif
813
814	vmovups	%xmm8 ,  	(CO1)
815	vmovups	%xmm12 , 4 * SIZE(CO1)
816
817	vmovups	%xmm10 ,  	(CO1, LDC)
818	vmovups	%xmm14 , 4 * SIZE(CO1, LDC)
819
820.endm
821
822/************************************************************************************************/
823
824.macro KERNEL2x2_SUB
825        vmovups         -16 * SIZE(AO, %rax, SIZE), %xmm0
826        vbroadcastss         -8 * SIZE(BO, BI, SIZE), %xmm4
827        VFMADDPS_R(        %xmm8,%xmm4,%xmm0  )
828        vbroadcastss         -7 * SIZE(BO, BI, SIZE), %xmm5
829        VFMADDPS_I(        %xmm9,%xmm5,%xmm0  )
830        vbroadcastss         -6 * SIZE(BO, BI, SIZE), %xmm6
831        VFMADDPS_R(        %xmm10,%xmm6,%xmm0 )
832        vbroadcastss         -5 * SIZE(BO, BI, SIZE), %xmm7
833        VFMADDPS_I(        %xmm11,%xmm7,%xmm0 )
834        addq    $ 4, BI
835        addq    $ 4, %rax
836.endm
837
838.macro SAVE2x2
839
840	vbroadcastss	ALPHA_R, %xmm0
841	vbroadcastss	ALPHA_I, %xmm1
842
843	// swap high and low 4 bytes
844        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
845        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
846
847#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
848    defined(NR) || defined(NC) || defined(TR) || defined(TC)
849
850        vaddsubps %xmm9, %xmm8 , %xmm8
851        vaddsubps %xmm11,%xmm10, %xmm10
852
853        vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
854        vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
855
856#else
857        vaddsubps %xmm8,  %xmm9 ,%xmm9
858        vaddsubps %xmm10, %xmm11,%xmm11
859
860        vmovaps   %xmm9,  %xmm8
861        vmovaps   %xmm11, %xmm10
862
863	// swap high and low 4 bytes
864        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
865        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
866
867#endif
868
869	// multiply with ALPHA_R
870        vmulps  %xmm8 , %xmm0, %xmm8
871        vmulps  %xmm10, %xmm0, %xmm10
872
873	// multiply with ALPHA_I
874        vmulps  %xmm9 , %xmm1, %xmm9
875        vmulps  %xmm11, %xmm1, %xmm11
876
877	vaddsubps %xmm9, %xmm8 , %xmm8
878        vaddsubps %xmm11,%xmm10, %xmm10
879
880#if !defined(TRMMKERNEL)
881
882	vaddps 	 	(CO1), %xmm8 , %xmm8
883
884	vaddps 	 	(CO1, LDC), %xmm10, %xmm10
885
886#endif
887
888	vmovups	%xmm8 ,  	(CO1)
889
890	vmovups	%xmm10 ,  	(CO1, LDC)
891
892.endm
893
894/************************************************************************************************/
895
896.macro KERNEL1x2_SUB
897        vmovsd         -16 * SIZE(AO, %rax, SIZE), %xmm0
898        vbroadcastss         -8 * SIZE(BO, BI, SIZE), %xmm4
899        VFMADDPS_R(        %xmm8,%xmm4,%xmm0  )
900        vbroadcastss         -7 * SIZE(BO, BI, SIZE), %xmm5
901        VFMADDPS_I(        %xmm9,%xmm5,%xmm0  )
902        vbroadcastss         -6 * SIZE(BO, BI, SIZE), %xmm6
903        VFMADDPS_R(        %xmm10,%xmm6,%xmm0 )
904        vbroadcastss         -5 * SIZE(BO, BI, SIZE), %xmm7
905        VFMADDPS_I(        %xmm11,%xmm7,%xmm0 )
906        addq    $ 4, BI
907        addq    $ 2, %rax
908.endm
909
910.macro SAVE1x2
911
912	vbroadcastss	ALPHA_R, %xmm0
913	vbroadcastss	ALPHA_I, %xmm1
914
915	// swap high and low 64 bytes
916        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
917        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
918
919#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
920    defined(NR) || defined(NC) || defined(TR) || defined(TC)
921
922        vaddsubps %xmm9, %xmm8 , %xmm8
923        vaddsubps %xmm11,%xmm10, %xmm10
924
925        vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
926        vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
927
928#else
929        vaddsubps %xmm8,  %xmm9 ,%xmm9
930        vaddsubps %xmm10, %xmm11,%xmm11
931
932        vmovaps   %xmm9,  %xmm8
933        vmovaps   %xmm11, %xmm10
934
935	// swap high and low 64 bytes
936        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
937        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
938
939#endif
940
941	// multiply with ALPHA_R
942        vmulps  %xmm8 , %xmm0, %xmm8
943        vmulps  %xmm10, %xmm0, %xmm10
944
945	// multiply with ALPHA_I
946        vmulps  %xmm9 , %xmm1, %xmm9
947        vmulps  %xmm11, %xmm1, %xmm11
948
949	vaddsubps %xmm9, %xmm8 , %xmm8
950        vaddsubps %xmm11,%xmm10, %xmm10
951
952#if !defined(TRMMKERNEL)
953
954	vmovsd		(CO1), %xmm14
955	vaddps 	 	%xmm14, %xmm8 , %xmm8
956
957	vmovsd		(CO1, LDC), %xmm15
958	vaddps 	 	%xmm15, %xmm10, %xmm10
959
960#endif
961
962	vmovsd	%xmm8 ,  	(CO1)
963	vmovsd	%xmm10 ,  	(CO1, LDC)
964
965.endm
966
967/************************************************************************************************/
968
969.macro KERNEL8x1_SUB
970        vmovups         -16 * SIZE(AO, %rax, SIZE), %ymm0
971        vmovups          -8 * SIZE(AO, %rax, SIZE), %ymm1
972        vbroadcastss         -4 * SIZE(BO, BI, SIZE), %ymm4
973        VFMADDPS_R(        %ymm8,%ymm4,%ymm0  )
974        VFMADDPS_R(        %ymm12,%ymm4,%ymm1 )
975        vbroadcastss         -3 * SIZE(BO, BI, SIZE), %ymm5
976        VFMADDPS_I(        %ymm9,%ymm5,%ymm0  )
977        VFMADDPS_I(        %ymm13,%ymm5,%ymm1 )
978        addq    $ 2 , BI
979        addq    $ 16, %rax
980.endm
981
982.macro SAVE8x1
983
984	vbroadcastss	ALPHA_R, %ymm0
985	vbroadcastss	ALPHA_I, %ymm1
986
987	// swap high and low 64 bytes
988        vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
989        vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
990
991#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
992    defined(NR) || defined(NC) || defined(TR) || defined(TC)
993
994        vaddsubps %ymm9, %ymm8 , %ymm8
995        vaddsubps %ymm13,%ymm12, %ymm12
996
997        vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9
998        vshufps $ 0xb1, %ymm12, %ymm12, %ymm13
999
1000#else
1001        vaddsubps %ymm8,  %ymm9 ,%ymm9
1002        vaddsubps %ymm12, %ymm13,%ymm13
1003
1004        vmovaps   %ymm9,  %ymm8
1005        vmovaps   %ymm13, %ymm12
1006
1007	// swap high and low 64 bytes
1008        vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9
1009        vshufps $ 0xb1, %ymm13, %ymm13, %ymm13
1010
1011#endif
1012
1013	// multiply with ALPHA_R
1014        vmulps  %ymm8 , %ymm0, %ymm8
1015        vmulps  %ymm12, %ymm0, %ymm12
1016
1017	// multiply with ALPHA_I
1018        vmulps  %ymm9 , %ymm1, %ymm9
1019        vmulps  %ymm13, %ymm1, %ymm13
1020
1021	vaddsubps %ymm9, %ymm8 , %ymm8
1022        vaddsubps %ymm13,%ymm12, %ymm12
1023
1024
1025
1026#if !defined(TRMMKERNEL)
1027
1028	vaddps 	 	(CO1), %ymm8 , %ymm8
1029	vaddps  8 * SIZE(CO1), %ymm12, %ymm12
1030
1031#endif
1032
1033	vmovups	%ymm8 ,  	(CO1)
1034	vmovups	%ymm12 , 8 * SIZE(CO1)
1035
1036.endm
1037
1038
1039/************************************************************************************************/
1040
1041.macro KERNEL4x1_SUB
1042        vmovups         -16 * SIZE(AO, %rax, SIZE), %xmm0
1043        vbroadcastss         -4 * SIZE(BO, BI, SIZE), %xmm4
1044        VFMADDPS_R(        %xmm8,%xmm4,%xmm0  )
1045        vmovups         -12 * SIZE(AO, %rax, SIZE), %xmm1
1046        VFMADDPS_R(        %xmm12,%xmm4,%xmm1 )
1047        vbroadcastss         -3 * SIZE(BO, BI, SIZE), %xmm5
1048        VFMADDPS_I(        %xmm9,%xmm5,%xmm0  )
1049        VFMADDPS_I(        %xmm13,%xmm5,%xmm1 )
1050        addq    $ 2, BI
1051        addq    $ 8, %rax
1052.endm
1053
1054.macro SAVE4x1
1055
1056	vbroadcastss	ALPHA_R, %xmm0
1057	vbroadcastss	ALPHA_I, %xmm1
1058
1059	// swap high and low 4 bytes
1060        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
1061        vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
1062
1063#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
1064    defined(NR) || defined(NC) || defined(TR) || defined(TC)
1065
1066        vaddsubps %xmm9, %xmm8 , %xmm8
1067        vaddsubps %xmm13,%xmm12, %xmm12
1068
1069        vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
1070        vshufps $ 0xb1, %xmm12, %xmm12, %xmm13
1071
1072#else
1073        vaddsubps %xmm8,  %xmm9 ,%xmm9
1074        vaddsubps %xmm12, %xmm13,%xmm13
1075
1076        vmovaps   %xmm9,  %xmm8
1077        vmovaps   %xmm13, %xmm12
1078
1079	// swap high and low 4 bytes
1080        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
1081        vshufps $ 0xb1, %xmm13, %xmm13, %xmm13
1082
1083#endif
1084
1085	// multiply with ALPHA_R
1086        vmulps  %xmm8 , %xmm0, %xmm8
1087        vmulps  %xmm12, %xmm0, %xmm12
1088
1089	// multiply with ALPHA_I
1090        vmulps  %xmm9 , %xmm1, %xmm9
1091        vmulps  %xmm13, %xmm1, %xmm13
1092
1093	vaddsubps %xmm9, %xmm8 , %xmm8
1094        vaddsubps %xmm13,%xmm12, %xmm12
1095
1096#ifndef TRMMKERNEL
1097
1098	vaddps 	 	(CO1), %xmm8 , %xmm8
1099	vaddps  4 * SIZE(CO1), %xmm12, %xmm12
1100
1101#endif
1102
1103	vmovups	%xmm8 ,  	(CO1)
1104	vmovups	%xmm12 , 4 * SIZE(CO1)
1105
1106.endm
1107
1108/************************************************************************************************/
1109
1110.macro KERNEL2x1_SUB
1111        vmovups         -16 * SIZE(AO, %rax, SIZE), %xmm0
1112        vbroadcastss         -4 * SIZE(BO, BI, SIZE), %xmm4
1113        VFMADDPS_R(        %xmm8,%xmm4,%xmm0  )
1114        vbroadcastss         -3 * SIZE(BO, BI, SIZE), %xmm5
1115        VFMADDPS_I(        %xmm9,%xmm5,%xmm0  )
1116        addq    $ 2, BI
1117        addq    $ 4, %rax
1118.endm
1119
1120.macro SAVE2x1
1121
1122	vbroadcastss	ALPHA_R, %xmm0
1123	vbroadcastss	ALPHA_I, %xmm1
1124
1125	// swap high and low 64 bytes
1126        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
1127
1128#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
1129    defined(NR) || defined(NC) || defined(TR) || defined(TC)
1130
1131        vaddsubps %xmm9, %xmm8 , %xmm8
1132
1133        vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
1134
1135#else
1136        vaddsubps %xmm8,  %xmm9 ,%xmm9
1137
1138        vmovaps   %xmm9,  %xmm8
1139
1140	// swap high and low 64 bytes
1141        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
1142
1143#endif
1144
1145	// multiply with ALPHA_R
1146        vmulps  %xmm8 , %xmm0, %xmm8
1147
1148	// multiply with ALPHA_I
1149        vmulps  %xmm9 , %xmm1, %xmm9
1150
1151	vaddsubps %xmm9, %xmm8 , %xmm8
1152
1153#if !defined(TRMMKERNEL)
1154
1155	vaddps 	 	(CO1), %xmm8 , %xmm8
1156
1157#endif
1158
1159	vmovups	%xmm8 ,  	(CO1)
1160
1161.endm
1162
1163/************************************************************************************************/
1164
1165.macro KERNEL1x1_SUB
1166        vmovsd         -16 * SIZE(AO, %rax, SIZE), %xmm0
1167        vbroadcastss         -4 * SIZE(BO, BI, SIZE), %xmm4
1168        VFMADDPS_R(        %xmm8,%xmm4,%xmm0 )
1169        vbroadcastss         -3 * SIZE(BO, BI, SIZE), %xmm5
1170        VFMADDPS_I(        %xmm9,%xmm5,%xmm0 )
1171        addq    $ 2, BI
1172        addq    $ 2, %rax
1173.endm
1174
1175.macro SAVE1x1
1176
1177	vbroadcastss	ALPHA_R, %xmm0
1178	vbroadcastss	ALPHA_I, %xmm1
1179
1180	// swap high and low 64 bytes
1181        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
1182
1183#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
1184    defined(NR) || defined(NC) || defined(TR) || defined(TC)
1185
1186        vaddsubps %xmm9, %xmm8 , %xmm8
1187
1188        vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
1189
1190#else
1191        vaddsubps %xmm8,  %xmm9 ,%xmm9
1192
1193        vmovaps   %xmm9,  %xmm8
1194
1195	// swap high and low 64 bytes
1196        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
1197
1198#endif
1199
1200	// multiply with ALPHA_R
1201        vmulps  %xmm8 , %xmm0, %xmm8
1202
1203	// multiply with ALPHA_I
1204        vmulps  %xmm9 , %xmm1, %xmm9
1205
1206	vaddsubps %xmm9, %xmm8 , %xmm8
1207
1208#if !defined(TRMMKERNEL)
1209
1210	vmovsd		(CO1), %xmm14
1211	vaddps 	 	%xmm14, %xmm8 , %xmm8
1212
1213#endif
1214
1215	vmovsd	%xmm8 ,  	(CO1)
1216
1217.endm
1218
1219
1220#if !defined(TRMMKERNEL)
1221
1222	PROLOGUE
1223	PROFCODE
1224
1225	subq	$STACKSIZE, %rsp
1226	movq	%rbx,   (%rsp)
1227	movq	%rbp,  8(%rsp)
1228	movq	%r12, 16(%rsp)
1229	movq	%r13, 24(%rsp)
1230	movq	%r14, 32(%rsp)
1231	movq	%r15, 40(%rsp)
1232
1233	vzeroupper
1234
1235#ifdef WINDOWS_ABI
1236	movq	%rdi,    48(%rsp)
1237	movq	%rsi,    56(%rsp)
1238	vmovups	%xmm6,   64(%rsp)
1239	vmovups	%xmm7,   80(%rsp)
1240	vmovups	%xmm8,   96(%rsp)
1241	vmovups	%xmm9,  112(%rsp)
1242	vmovups	%xmm10, 128(%rsp)
1243	vmovups	%xmm11, 144(%rsp)
1244	vmovups	%xmm12, 160(%rsp)
1245	vmovups	%xmm13, 176(%rsp)
1246	vmovups	%xmm14, 192(%rsp)
1247	vmovups	%xmm15, 208(%rsp)
1248
1249	movq	ARG1,      OLD_M
1250	movq	ARG2,      OLD_N
1251	movq	ARG3,      OLD_K
1252	movq	OLD_A,     A
1253	movq	OLD_B,     B
1254	movq	OLD_C,     C
1255	movq	OLD_LDC,   LDC
1256	vmovaps	%xmm3, %xmm0
1257	vmovsd   OLD_ALPHA_I, %xmm1
1258
1259#else
1260	movq	STACKSIZE +  8(%rsp), LDC
1261
1262#endif
1263
1264	movq    %rsp, SP      # save old stack
1265        subq    $ 128 + L_BUFFER_SIZE, %rsp
1266        andq    $ -4096, %rsp    # align stack
1267
1268        STACK_TOUCH
1269
1270	cmpq	$ 0, OLD_M
1271	je	.L999
1272
1273	cmpq	$ 0, OLD_N
1274	je	.L999
1275
1276	cmpq	$ 0, OLD_K
1277	je	.L999
1278
1279	movq	OLD_M, M
1280	movq	OLD_N, N
1281	movq	OLD_K, K
1282
1283	vmovss	 %xmm0, ALPHA_R
1284	vmovss	 %xmm1, ALPHA_I
1285
1286	salq	$ ZBASE_SHIFT, LDC
1287
1288	movq    N, %rax
1289        xorq    %rdx, %rdx
1290        movq    $ 6,  %rdi
1291        divq    %rdi                    //    N / 6
1292        movq    %rax, Ndiv6             //    N / 6
1293        movq    %rdx, Nmod6             //    N % 6
1294
1295/************************************************************************************************/
1296
1297.L6_0:
1298
1299	movq	Ndiv6,  J
1300	cmpq	$ 0, J
1301	je	.L2_00
1302	ALIGN_4
1303
1304
1305
1306.L6_01:
1307	// copy to sub buffer
1308	movq	B, BO1
1309	leaq    BUFFER1, BO		// first buffer to BO
1310	movq    K, %rax
1311        salq    $2, %rax                // 2 * COMPSIZE
1312        leaq    (B, %rax,4), BO2
1313        movq    BO2, B                  // next offset of B
1314        movq    K, %rax
1315
1316	ALIGN_4
1317
1318.L6_02b:
1319
1320	vmovups	(BO1), %xmm0
1321	vmovsd	(BO2), %xmm1
1322	vmovups	%xmm0,       (BO)
1323	vmovsd  %xmm1, 4*SIZE(BO)
1324	addq	$ 4*SIZE,BO1
1325	addq	$ 4*SIZE,BO2
1326	addq	$ 6*SIZE,BO
1327	decq	%rax
1328	jnz	.L6_02b
1329
1330
1331.L6_10:
1332	movq	C, CO1
1333	leaq	(C, LDC, 2), C		// c += 2 * ldc
1334	leaq	(C, LDC, 1), C		// c += 1 * ldc
1335
1336	movq	A, AO		 	// aoffset = a
1337	addq	$ 16 * SIZE, AO
1338
1339	movq	M,  I
1340	sarq	$ 3, I			// i = (m >> 3)
1341	je	.L6_4_10
1342
1343	ALIGN_4
1344/**********************************************************************************************************/
1345
1346.L6_8_11:
1347
1348	leaq	BUFFER1, BO		// first buffer to BO
1349	addq	$ 8 * SIZE, BO
1350
1351	vzeroall
1352
1353        movq    K, %rax
1354
1355	andq	$ -8, %rax			//  K = K - ( K % 8 )
1356	je	.L6_8_16
1357
1358	ALIGN_4
1359
1360.L6_8_12:
1361
1362	KERNEL8x3_SUB
1363	KERNEL8x3_SUB
1364	KERNEL8x3_SUB
1365	KERNEL8x3_SUB
1366
1367	KERNEL8x3_SUB
1368	KERNEL8x3_SUB
1369	KERNEL8x3_SUB
1370	KERNEL8x3_SUB
1371
1372	je	.L6_8_16
1373
1374	KERNEL8x3_SUB
1375	KERNEL8x3_SUB
1376	KERNEL8x3_SUB
1377	KERNEL8x3_SUB
1378
1379	KERNEL8x3_SUB
1380	KERNEL8x3_SUB
1381	KERNEL8x3_SUB
1382	KERNEL8x3_SUB
1383
1384	je	.L6_8_16
1385
1386	jmp	.L6_8_12
1387	ALIGN_4
1388
1389.L6_8_16:
1390        movq    K, %rax
1391
1392	andq	$ 7, %rax		# if (k & 1)
1393	je .L6_8_19
1394
1395	ALIGN_4
1396
1397.L6_8_17:
1398
1399	KERNEL8x3_SUB
1400
1401	jnz	.L6_8_17
1402	ALIGN_4
1403
1404
1405.L6_8_19:
1406
1407	SAVE8x3
1408
1409	addq	$ 16 * SIZE, CO1		# coffset += 16
1410	decq	I			# i --
1411	jg	.L6_8_11
1412	ALIGN_4
1413
1414
1415/**********************************************************************************************************/
1416
1417
1418.L6_4_10:
1419	testq	$ 7, M
1420	jz	.L6_4_60		// to next 2 lines of N
1421
1422	testq	$ 4, M
1423	jz	.L6_4_20
1424	ALIGN_4
1425
1426
1427.L6_4_11:
1428
1429	leaq	BUFFER1, BO		// first buffer to BO
1430	addq	$ 8 * SIZE, BO
1431
1432	vzeroall
1433
1434        movq    K, %rax
1435
1436	andq	$ -8, %rax			//  K = K - ( K % 8 )
1437	je	.L6_4_16
1438
1439	ALIGN_4
1440
1441.L6_4_12:
1442
1443	prefetcht0	A_PR1(AO)
1444	KERNEL4x3_SUB
1445	KERNEL4x3_SUB
1446	prefetcht0	A_PR1(AO)
1447	KERNEL4x3_SUB
1448	KERNEL4x3_SUB
1449
1450	prefetcht0	A_PR1(AO)
1451	KERNEL4x3_SUB
1452	KERNEL4x3_SUB
1453	prefetcht0	A_PR1(AO)
1454	KERNEL4x3_SUB
1455	KERNEL4x3_SUB
1456
1457	je	.L6_4_16
1458
1459	prefetcht0	A_PR1(AO)
1460	KERNEL4x3_SUB
1461	KERNEL4x3_SUB
1462	prefetcht0	A_PR1(AO)
1463	KERNEL4x3_SUB
1464	KERNEL4x3_SUB
1465
1466	prefetcht0	A_PR1(AO)
1467	KERNEL4x3_SUB
1468	KERNEL4x3_SUB
1469	prefetcht0	A_PR1(AO)
1470	KERNEL4x3_SUB
1471	KERNEL4x3_SUB
1472
1473	je	.L6_4_16
1474
1475	jmp	.L6_4_12
1476	ALIGN_4
1477
1478.L6_4_16:
1479        movq    K, %rax
1480
1481	andq	$ 7, %rax		# if (k & 1)
1482	je .L6_4_19
1483
1484	ALIGN_4
1485
1486.L6_4_17:
1487
1488	KERNEL4x3_SUB
1489
1490	jnz	.L6_4_17
1491	ALIGN_4
1492
1493
1494.L6_4_19:
1495
1496	SAVE4x3
1497
1498	addq	$ 8 * SIZE, CO1		# coffset += 8
1499	ALIGN_4
1500
1501
1502
1503/**************************************************************************
1504* Rest of M
1505***************************************************************************/
1506
1507.L6_4_20:
1508
1509	testq	$ 2, M
1510	jz	.L6_4_40
1511	ALIGN_4
1512
1513.L6_4_21:
1514
1515	leaq	BUFFER1, BO		// first buffer to BO
1516	addq	$ 8 * SIZE, BO
1517
1518	vzeroall
1519
1520        movq    K, %rax
1521
1522	andq	$ -8, %rax			//  K = K - ( K % 8 )
1523	je	.L6_4_26
1524
1525	ALIGN_4
1526
1527.L6_4_22:
1528
1529	prefetcht0	A_PR1(AO)
1530	KERNEL2x3_SUB
1531	KERNEL2x3_SUB
1532	KERNEL2x3_SUB
1533	KERNEL2x3_SUB
1534
1535	prefetcht0	A_PR1(AO)
1536	KERNEL2x3_SUB
1537	KERNEL2x3_SUB
1538	KERNEL2x3_SUB
1539	KERNEL2x3_SUB
1540
1541	je	.L6_4_26
1542
1543	prefetcht0	A_PR1(AO)
1544	KERNEL2x3_SUB
1545	KERNEL2x3_SUB
1546	KERNEL2x3_SUB
1547	KERNEL2x3_SUB
1548
1549	prefetcht0	A_PR1(AO)
1550	KERNEL2x3_SUB
1551	KERNEL2x3_SUB
1552	KERNEL2x3_SUB
1553	KERNEL2x3_SUB
1554
1555	je	.L6_4_26
1556
1557	jmp	.L6_4_22
1558	ALIGN_4
1559
1560.L6_4_26:
1561        movq    K, %rax
1562
1563	andq	$ 7, %rax		# if (k & 1)
1564	je .L6_4_29
1565
1566	ALIGN_4
1567
1568.L6_4_27:
1569
1570	KERNEL2x3_SUB
1571
1572	jnz	.L6_4_27
1573	ALIGN_4
1574
1575
1576.L6_4_29:
1577
1578	SAVE2x3
1579
1580	addq	$ 4 * SIZE, CO1		# coffset += 4
1581	decq	I			# i --
1582	jg	.L6_4_21
1583	ALIGN_4
1584
1585
1586
1587/**************************************************************************/
1588.L6_4_40:
1589	testq	$ 1, M
1590	jz	.L6_4_60		// to next 2 lines of N
1591
1592	ALIGN_4
1593
1594.L6_4_41:
1595
1596	leaq	BUFFER1, BO		// first buffer to BO
1597	addq	$ 8 * SIZE, BO
1598
1599	vzeroall
1600
1601        movq    K, %rax
1602
1603	andq	$ -8, %rax			//  K = K - ( K % 8 )
1604	je	.L6_4_46
1605
1606	ALIGN_4
1607
1608.L6_4_42:
1609
1610	prefetcht0	A_PR1(AO)
1611	KERNEL1x3_SUB
1612	KERNEL1x3_SUB
1613	KERNEL1x3_SUB
1614	KERNEL1x3_SUB
1615
1616	KERNEL1x3_SUB
1617	KERNEL1x3_SUB
1618	KERNEL1x3_SUB
1619	KERNEL1x3_SUB
1620
1621	je	.L6_4_46
1622
1623	prefetcht0	A_PR1(AO)
1624	KERNEL1x3_SUB
1625	KERNEL1x3_SUB
1626	KERNEL1x3_SUB
1627	KERNEL1x3_SUB
1628
1629	KERNEL1x3_SUB
1630	KERNEL1x3_SUB
1631	KERNEL1x3_SUB
1632	KERNEL1x3_SUB
1633
1634	je	.L6_4_46
1635
1636	jmp	.L6_4_42
1637	ALIGN_4
1638
1639.L6_4_46:
1640        movq    K, %rax
1641
1642	andq	$ 7, %rax		# if (k & 1)
1643	je .L6_4_49
1644	ALIGN_4
1645
1646.L6_4_47:
1647
1648	KERNEL1x3_SUB
1649
1650	jnz	.L6_4_47
1651	ALIGN_4
1652
1653
1654.L6_4_49:
1655
1656	SAVE1x3
1657
1658	addq	$ 2 * SIZE, CO1		# coffset += 2
1659	decq	I			# i --
1660	jg	.L6_4_41
1661	ALIGN_4
1662
1663
1664
1665
1666.L6_4_60:
1667
1668
1669/*******************************************************************************************/
1670
1671.L7_01:
1672	// copy to sub buffer
1673	movq	B, BO1
1674	leaq    BUFFER1, BO		// first buffer to BO
1675	movq    K, %rax
1676        salq    $2, %rax                // 2 * COMPSIZE
1677        leaq    (B, %rax,4), BO2
1678        movq    K, %rax
1679
1680	ALIGN_4
1681
1682.L7_02b:
1683
1684	vmovsd	2*SIZE(BO1), %xmm0
1685	vmovups	      (BO2), %xmm1
1686	vmovsd	 %xmm0,       (BO)
1687	vmovups  %xmm1, 2*SIZE(BO)
1688	addq	$ 4*SIZE,BO1
1689	addq	$ 4*SIZE,BO2
1690	addq	$ 6*SIZE,BO
1691	decq	%rax
1692	jnz	.L7_02b
1693
1694        movq    BO2, B                  // next offset of B
1695
1696.L7_10:
1697	movq	C, CO1
1698	leaq	(C, LDC, 2), C		// c += 2 * ldc
1699	leaq	(C, LDC, 1), C		// c += 1 * ldc
1700
1701	movq	A, AO		 	// aoffset = a
1702	addq	$ 16 * SIZE, AO
1703
1704	movq	M,  I
1705	sarq	$ 3, I			// i = (m >> 3)
1706	je	.L7_4_10
1707
1708	ALIGN_4
1709/**********************************************************************************************************/
1710
1711.L7_8_11:
1712
1713	leaq	BUFFER1, BO		// first buffer to BO
1714	addq	$ 8 * SIZE, BO
1715
1716	vzeroall
1717
1718        movq    K, %rax
1719
1720	andq	$ -8, %rax			//  K = K - ( K % 8 )
1721	je	.L7_8_16
1722
1723	ALIGN_4
1724
1725.L7_8_12:
1726
1727	KERNEL8x3_SUB
1728	KERNEL8x3_SUB
1729	KERNEL8x3_SUB
1730	KERNEL8x3_SUB
1731
1732	KERNEL8x3_SUB
1733	KERNEL8x3_SUB
1734	KERNEL8x3_SUB
1735	KERNEL8x3_SUB
1736
1737	je	.L7_8_16
1738
1739	KERNEL8x3_SUB
1740	KERNEL8x3_SUB
1741	KERNEL8x3_SUB
1742	KERNEL8x3_SUB
1743
1744	KERNEL8x3_SUB
1745	KERNEL8x3_SUB
1746	KERNEL8x3_SUB
1747	KERNEL8x3_SUB
1748
1749	je	.L7_8_16
1750
1751	jmp	.L7_8_12
1752	ALIGN_4
1753
1754.L7_8_16:
1755        movq    K, %rax
1756
1757	andq	$ 7, %rax		# if (k & 1)
1758	je .L7_8_19
1759
1760	ALIGN_4
1761
1762.L7_8_17:
1763
1764	KERNEL8x3_SUB
1765
1766	jnz	.L7_8_17
1767	ALIGN_4
1768
1769
1770.L7_8_19:
1771
1772	SAVE8x3
1773
1774	addq	$ 16 * SIZE, CO1		# coffset += 16
1775	decq	I			# i --
1776	jg	.L7_8_11
1777	ALIGN_4
1778
1779
1780/**********************************************************************************************************/
1781
1782
1783.L7_4_10:
1784	testq	$ 7, M
1785	jz	.L7_4_60		// to next 2 lines of N
1786
1787	testq	$ 4, M
1788	jz	.L7_4_20
1789	ALIGN_4
1790
1791
1792.L7_4_11:
1793
1794	leaq	BUFFER1, BO		// first buffer to BO
1795	addq	$ 8 * SIZE, BO
1796
1797	vzeroall
1798
1799        movq    K, %rax
1800
1801	andq	$ -8, %rax			//  K = K - ( K % 8 )
1802	je	.L7_4_16
1803
1804	ALIGN_4
1805
1806.L7_4_12:
1807
1808	prefetcht0	A_PR1(AO)
1809	KERNEL4x3_SUB
1810	KERNEL4x3_SUB
1811	prefetcht0	A_PR1(AO)
1812	KERNEL4x3_SUB
1813	KERNEL4x3_SUB
1814
1815	prefetcht0	A_PR1(AO)
1816	KERNEL4x3_SUB
1817	KERNEL4x3_SUB
1818	prefetcht0	A_PR1(AO)
1819	KERNEL4x3_SUB
1820	KERNEL4x3_SUB
1821
1822	je	.L7_4_16
1823
1824	prefetcht0	A_PR1(AO)
1825	KERNEL4x3_SUB
1826	KERNEL4x3_SUB
1827	prefetcht0	A_PR1(AO)
1828	KERNEL4x3_SUB
1829	KERNEL4x3_SUB
1830
1831	prefetcht0	A_PR1(AO)
1832	KERNEL4x3_SUB
1833	KERNEL4x3_SUB
1834	prefetcht0	A_PR1(AO)
1835	KERNEL4x3_SUB
1836	KERNEL4x3_SUB
1837
1838	je	.L7_4_16
1839
1840	jmp	.L7_4_12
1841	ALIGN_4
1842
1843.L7_4_16:
1844        movq    K, %rax
1845
1846	andq	$ 7, %rax		# if (k & 1)
1847	je .L7_4_19
1848
1849	ALIGN_4
1850
1851.L7_4_17:
1852
1853	KERNEL4x3_SUB
1854
1855	jnz	.L7_4_17
1856	ALIGN_4
1857
1858
1859.L7_4_19:
1860
1861	SAVE4x3
1862
1863	addq	$ 8 * SIZE, CO1		# coffset += 8
1864	ALIGN_4
1865
1866
1867
1868/**************************************************************************
1869* Rest of M
1870***************************************************************************/
1871
1872.L7_4_20:
1873
1874	testq	$ 2, M
1875	jz	.L7_4_40
1876	ALIGN_4
1877
1878.L7_4_21:
1879
1880	leaq	BUFFER1, BO		// first buffer to BO
1881	addq	$ 8 * SIZE, BO
1882
1883	vzeroall
1884
1885        movq    K, %rax
1886
1887	andq	$ -8, %rax			//  K = K - ( K % 8 )
1888	je	.L7_4_26
1889
1890	ALIGN_4
1891
1892.L7_4_22:
1893
1894	prefetcht0	A_PR1(AO)
1895	KERNEL2x3_SUB
1896	KERNEL2x3_SUB
1897	KERNEL2x3_SUB
1898	KERNEL2x3_SUB
1899
1900	prefetcht0	A_PR1(AO)
1901	KERNEL2x3_SUB
1902	KERNEL2x3_SUB
1903	KERNEL2x3_SUB
1904	KERNEL2x3_SUB
1905
1906	je	.L7_4_26
1907
1908	prefetcht0	A_PR1(AO)
1909	KERNEL2x3_SUB
1910	KERNEL2x3_SUB
1911	KERNEL2x3_SUB
1912	KERNEL2x3_SUB
1913
1914	prefetcht0	A_PR1(AO)
1915	KERNEL2x3_SUB
1916	KERNEL2x3_SUB
1917	KERNEL2x3_SUB
1918	KERNEL2x3_SUB
1919
1920	je	.L7_4_26
1921
1922	jmp	.L7_4_22
1923	ALIGN_4
1924
1925.L7_4_26:
1926        movq    K, %rax
1927
1928	andq	$ 7, %rax		# if (k & 1)
1929	je .L7_4_29
1930
1931	ALIGN_4
1932
1933.L7_4_27:
1934
1935	KERNEL2x3_SUB
1936
1937	jnz	.L7_4_27
1938	ALIGN_4
1939
1940
1941.L7_4_29:
1942
1943	SAVE2x3
1944
1945	addq	$ 4 * SIZE, CO1		# coffset += 4
1946	decq	I			# i --
1947	jg	.L7_4_21
1948	ALIGN_4
1949
1950
1951
1952/**************************************************************************/
1953.L7_4_40:
1954	testq	$ 1, M
1955	jz	.L7_4_60		// to next 2 lines of N
1956
1957	ALIGN_4
1958
1959.L7_4_41:
1960
1961	leaq	BUFFER1, BO		// first buffer to BO
1962	addq	$ 8 * SIZE, BO
1963
1964	vzeroall
1965
1966        movq    K, %rax
1967
1968	andq	$ -8, %rax			//  K = K - ( K % 8 )
1969	je	.L7_4_46
1970
1971	ALIGN_4
1972
1973.L7_4_42:
1974
1975	prefetcht0	A_PR1(AO)
1976	KERNEL1x3_SUB
1977	KERNEL1x3_SUB
1978	KERNEL1x3_SUB
1979	KERNEL1x3_SUB
1980
1981	KERNEL1x3_SUB
1982	KERNEL1x3_SUB
1983	KERNEL1x3_SUB
1984	KERNEL1x3_SUB
1985
1986	je	.L7_4_46
1987
1988	prefetcht0	A_PR1(AO)
1989	KERNEL1x3_SUB
1990	KERNEL1x3_SUB
1991	KERNEL1x3_SUB
1992	KERNEL1x3_SUB
1993
1994	KERNEL1x3_SUB
1995	KERNEL1x3_SUB
1996	KERNEL1x3_SUB
1997	KERNEL1x3_SUB
1998
1999	je	.L7_4_46
2000
2001	jmp	.L7_4_42
2002	ALIGN_4
2003
2004.L7_4_46:
2005        movq    K, %rax
2006
2007	andq	$ 7, %rax		# if (k & 1)
2008	je .L7_4_49
2009	ALIGN_4
2010
2011.L7_4_47:
2012
2013	KERNEL1x3_SUB
2014
2015	jnz	.L7_4_47
2016	ALIGN_4
2017
2018
2019.L7_4_49:
2020
2021	SAVE1x3
2022
2023	addq	$ 2 * SIZE, CO1		# coffset += 2
2024	decq	I			# i --
2025	jg	.L7_4_41
2026	ALIGN_4
2027
2028
2029
2030
2031.L7_4_60:
2032
2033	decq	J			// j --
2034	jg	.L6_01			// next 6 lines of N
2035
2036
2037
2038/************************************************************************************************/
2039
2040.L2_00:
2041
2042	movq	Nmod6,  J
2043	sarq    $1, J           // j = j / 2
2044	cmpq	$ 0, J
2045	je	.L1_0
2046	ALIGN_4
2047
2048
2049
2050.L2_01:
2051	// copy to sub buffer
2052	movq	B, BO1
2053	leaq    BUFFER1, BO		// first buffer to BO
2054	movq	K, %rax
2055	ALIGN_4
2056
2057.L2_02b:
2058
2059	vmovups	(BO1), %xmm0
2060	vmovups	%xmm0,       (BO)
2061	addq	$ 4*SIZE,BO1
2062	addq	$ 4*SIZE,BO
2063	decq	%rax
2064	jnz	.L2_02b
2065
2066.L2_02c:
2067
2068	movq	BO1, B			// next offset of B
2069
2070.L2_10:
2071	movq	C, CO1
2072	leaq	(C, LDC, 2), C		// c += 2 * ldc
2073
2074#if defined(TRMMKERNEL) && defined(LEFT)
2075        movq    OFFSET, %rax
2076        movq    %rax, KK
2077#endif
2078
2079	movq	A, AO		 	// aoffset = a
2080	addq	$ 16 * SIZE, AO
2081
2082	movq	M,  I
2083	sarq	$ 3, I			// i = (m >> 3)
2084	je	.L2_4_10
2085
2086	ALIGN_4
2087/**********************************************************************************************************/
2088
2089.L2_8_11:
2090
2091#if !defined(TRMMKERNEL) || \
2092        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2093        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2094	leaq	BUFFER1, BO		// first buffer to BO
2095	addq	$ 8 * SIZE, BO
2096#else
2097        movq    KK, %rax
2098	leaq	BUFFER1, BO			// first buffer to BO
2099	addq	$ 8 * SIZE, BO
2100	movq    %rax, BI                        //  Index for BO
2101        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
2102        leaq    (BO, BI, SIZE), BO
2103	salq	$ 4, %rax			// rax = rax *16 ; number of values
2104        leaq    (AO, %rax, SIZE), AO
2105#endif
2106
2107	vzeroall
2108
2109#ifndef TRMMKERNEL
2110        movq    K, %rax
2111#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2112        movq    K, %rax
2113        subq    KK, %rax
2114        movq    %rax, KKK
2115#else
2116        movq    KK, %rax
2117#ifdef LEFT
2118        addq    $ 8, %rax        // number of values in AO
2119#else
2120        addq    $ 2, %rax        // number of values in BO
2121#endif
2122        movq    %rax, KKK
2123#endif
2124
2125
2126	andq	$ -8, %rax			//  K = K - ( K % 8 )
2127	je	.L2_8_16
2128	movq    %rax, BI                        //  Index for BO
2129        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2130
2131	salq	$ 4, %rax			// rax = rax *16 ; number of values
2132	leaq	(AO, %rax, SIZE), AO
2133	leaq	(BO, BI, SIZE), BO
2134	negq	BI
2135	negq	%rax
2136	ALIGN_4
2137
2138.L2_8_12:
2139
2140	prefetcht0	A_PR1(AO,%rax,SIZE)
2141	prefetcht0	B_PR1(BO,BI,SIZE)
2142	KERNEL8x2_SUB
2143	prefetcht0	A_PR1(AO,%rax,SIZE)
2144	KERNEL8x2_SUB
2145	prefetcht0	A_PR1(AO,%rax,SIZE)
2146	KERNEL8x2_SUB
2147	prefetcht0	A_PR1(AO,%rax,SIZE)
2148	KERNEL8x2_SUB
2149
2150	prefetcht0	A_PR1(AO,%rax,SIZE)
2151	prefetcht0	B_PR1(BO,BI,SIZE)
2152	KERNEL8x2_SUB
2153	prefetcht0	A_PR1(AO,%rax,SIZE)
2154	KERNEL8x2_SUB
2155	prefetcht0	A_PR1(AO,%rax,SIZE)
2156	KERNEL8x2_SUB
2157	prefetcht0	A_PR1(AO,%rax,SIZE)
2158	KERNEL8x2_SUB
2159
2160	je	.L2_8_16
2161
2162	prefetcht0	A_PR1(AO,%rax,SIZE)
2163	prefetcht0	B_PR1(BO,BI,SIZE)
2164	KERNEL8x2_SUB
2165	prefetcht0	A_PR1(AO,%rax,SIZE)
2166	KERNEL8x2_SUB
2167	prefetcht0	A_PR1(AO,%rax,SIZE)
2168	KERNEL8x2_SUB
2169	prefetcht0	A_PR1(AO,%rax,SIZE)
2170	KERNEL8x2_SUB
2171
2172	prefetcht0	A_PR1(AO,%rax,SIZE)
2173	prefetcht0	B_PR1(BO,BI,SIZE)
2174	KERNEL8x2_SUB
2175	prefetcht0	A_PR1(AO,%rax,SIZE)
2176	KERNEL8x2_SUB
2177	prefetcht0	A_PR1(AO,%rax,SIZE)
2178	KERNEL8x2_SUB
2179	prefetcht0	A_PR1(AO,%rax,SIZE)
2180	KERNEL8x2_SUB
2181
2182	je	.L2_8_16
2183
2184	jmp	.L2_8_12
2185	ALIGN_4
2186
2187.L2_8_16:
2188#ifndef TRMMKERNEL
2189        movq    K, %rax
2190#else
2191        movq    KKK, %rax
2192#endif
2193
2194	andq	$ 7, %rax		# if (k & 1)
2195	je .L2_8_19
2196
2197	movq    %rax, BI                        //  Index for BO
2198        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2199
2200	salq	$ 4, %rax			// rax = rax *16 ; number of values
2201	leaq	(AO, %rax, SIZE), AO
2202	leaq	(BO, BI, SIZE), BO
2203	negq	BI
2204	negq	%rax
2205	ALIGN_4
2206
2207.L2_8_17:
2208
2209	KERNEL8x2_SUB
2210
2211	jl	.L2_8_17
2212	ALIGN_4
2213
2214
2215.L2_8_19:
2216
2217	SAVE8x2
2218
2219
2220#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2221    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2222        movq    K, %rax
2223        subq    KKK, %rax
2224	movq    %rax, BI                        //  Index for BO
2225        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2226        leaq    (BO, BI, SIZE), BO
2227	salq	$ 4, %rax			// rax = rax *16 ; number of values
2228        leaq    (AO, %rax, SIZE), AO
2229#endif
2230
2231
2232#if defined(TRMMKERNEL) && defined(LEFT)
2233        addq    $ 8, KK
2234#endif
2235
2236	addq	$ 16 * SIZE, CO1		# coffset += 16
2237	decq	I			# i --
2238	jg	.L2_8_11
2239	ALIGN_4
2240
2241
2242/**********************************************************************************************************/
2243
2244
2245
2246
2247.L2_4_10:
2248	testq	$ 7, M
2249	jz	.L2_4_60		// to next 2 lines of N
2250
2251	testq	$ 4, M
2252	jz	.L2_4_20
2253	ALIGN_4
2254
2255
2256.L2_4_11:
2257
2258#if !defined(TRMMKERNEL) || \
2259        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2260        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2261	leaq	BUFFER1, BO		// first buffer to BO
2262	addq	$ 8 * SIZE, BO
2263#else
2264        movq    KK, %rax
2265	leaq	BUFFER1, BO			// first buffer to BO
2266	addq	$ 8 * SIZE, BO
2267	movq    %rax, BI                        //  Index for BO
2268        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
2269        leaq    (BO, BI, SIZE), BO
2270	salq	$ 3, %rax			// rax = rax * 8 ; number of values
2271        leaq    (AO, %rax, SIZE), AO
2272#endif
2273
2274	vzeroall
2275
2276#ifndef TRMMKERNEL
2277        movq    K, %rax
2278#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2279        movq    K, %rax
2280        subq    KK, %rax
2281        movq    %rax, KKK
2282#else
2283        movq    KK, %rax
2284#ifdef LEFT
2285        addq    $ 4, %rax        // number of values in AO
2286#else
2287        addq    $ 2, %rax        // number of values in BO
2288#endif
2289        movq    %rax, KKK
2290#endif
2291
2292
2293	andq	$ -8, %rax			//  K = K - ( K % 8 )
2294	je	.L2_4_16
2295	movq    %rax, BI                        //  Index for BO
2296        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2297
2298	salq	$ 3, %rax			// rax = rax * 8 ; number of values
2299	leaq	(AO, %rax, SIZE), AO
2300	leaq	(BO, BI, SIZE), BO
2301	negq	BI
2302	negq	%rax
2303	ALIGN_4
2304
2305.L2_4_12:
2306
2307	prefetcht0	A_PR1(AO,%rax,SIZE)
2308	prefetcht0	B_PR1(BO,BI,SIZE)
2309	KERNEL4x2_SUB
2310	KERNEL4x2_SUB
2311	prefetcht0	A_PR1(AO,%rax,SIZE)
2312	KERNEL4x2_SUB
2313	KERNEL4x2_SUB
2314
2315	prefetcht0	A_PR1(AO,%rax,SIZE)
2316	prefetcht0	B_PR1(BO,BI,SIZE)
2317	KERNEL4x2_SUB
2318	KERNEL4x2_SUB
2319	prefetcht0	A_PR1(AO,%rax,SIZE)
2320	KERNEL4x2_SUB
2321	KERNEL4x2_SUB
2322
2323	je	.L2_4_16
2324
2325	prefetcht0	A_PR1(AO,%rax,SIZE)
2326	prefetcht0	B_PR1(BO,BI,SIZE)
2327	KERNEL4x2_SUB
2328	KERNEL4x2_SUB
2329	prefetcht0	A_PR1(AO,%rax,SIZE)
2330	KERNEL4x2_SUB
2331	KERNEL4x2_SUB
2332
2333	prefetcht0	A_PR1(AO,%rax,SIZE)
2334	prefetcht0	B_PR1(BO,BI,SIZE)
2335	KERNEL4x2_SUB
2336	KERNEL4x2_SUB
2337	prefetcht0	A_PR1(AO,%rax,SIZE)
2338	KERNEL4x2_SUB
2339	KERNEL4x2_SUB
2340
2341	je	.L2_4_16
2342
2343	jmp	.L2_4_12
2344	ALIGN_4
2345
2346.L2_4_16:
2347#ifndef TRMMKERNEL
2348        movq    K, %rax
2349#else
2350        movq    KKK, %rax
2351#endif
2352
2353	andq	$ 7, %rax		# if (k & 1)
2354	je .L2_4_19
2355
2356	movq    %rax, BI                        //  Index for BO
2357        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2358
2359	salq	$ 3, %rax			// rax = rax * 8 ; number of values
2360	leaq	(AO, %rax, SIZE), AO
2361	leaq	(BO, BI, SIZE), BO
2362	negq	BI
2363	negq	%rax
2364	ALIGN_4
2365
2366.L2_4_17:
2367
2368	KERNEL4x2_SUB
2369
2370	jl	.L2_4_17
2371	ALIGN_4
2372
2373
2374.L2_4_19:
2375
2376	SAVE4x2
2377
2378#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2379    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2380        movq    K, %rax
2381        subq    KKK, %rax
2382	movq    %rax, BI                        //  Index for BO
2383        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2384        leaq    (BO, BI, SIZE), BO
2385	salq	$ 3, %rax			// rax = rax * 8 ; number of values
2386        leaq    (AO, %rax, SIZE), AO
2387#endif
2388
2389
2390#if defined(TRMMKERNEL) && defined(LEFT)
2391        addq    $ 4, KK
2392#endif
2393
2394	addq	$ 8 * SIZE, CO1		# coffset += 8
2395	ALIGN_4
2396
2397
2398
2399/**************************************************************************
2400* Rest of M
2401***************************************************************************/
2402
2403.L2_4_20:
2404
2405	testq	$ 2, M
2406	jz	.L2_4_40
2407	ALIGN_4
2408
2409.L2_4_21:
2410
2411#if !defined(TRMMKERNEL) || \
2412        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2413        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2414	leaq	BUFFER1, BO		// first buffer to BO
2415	addq	$ 8 * SIZE, BO
2416#else
2417        movq    KK, %rax
2418	leaq	BUFFER1, BO			// first buffer to BO
2419	addq	$ 8 * SIZE, BO
2420	movq    %rax, BI                        //  Index for BO
2421        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
2422        leaq    (BO, BI, SIZE), BO
2423	salq	$ 2, %rax			// rax = rax * 4 ; number of values
2424        leaq    (AO, %rax, SIZE), AO
2425#endif
2426
2427	vzeroall
2428
2429#ifndef TRMMKERNEL
2430        movq    K, %rax
2431#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2432        movq    K, %rax
2433        subq    KK, %rax
2434        movq    %rax, KKK
2435#else
2436        movq    KK, %rax
2437#ifdef LEFT
2438        addq    $ 2, %rax        // number of values in AO
2439#else
2440        addq    $ 2, %rax        // number of values in BO
2441#endif
2442        movq    %rax, KKK
2443#endif
2444
2445
2446	andq	$ -8, %rax			//  K = K - ( K % 8 )
2447	je	.L2_4_26
2448	movq    %rax, BI                        //  Index for BO
2449        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2450
2451	salq	$ 2, %rax			// rax = rax * 4 ; number of values
2452	leaq	(AO, %rax, SIZE), AO
2453	leaq	(BO, BI, SIZE), BO
2454	negq	BI
2455	negq	%rax
2456	ALIGN_4
2457
2458.L2_4_22:
2459
2460	prefetcht0	A_PR1(AO,%rax,SIZE)
2461	prefetcht0	B_PR1(BO,BI,SIZE)
2462	KERNEL2x2_SUB
2463	KERNEL2x2_SUB
2464	KERNEL2x2_SUB
2465	KERNEL2x2_SUB
2466
2467	prefetcht0	A_PR1(AO,%rax,SIZE)
2468	prefetcht0	B_PR1(BO,BI,SIZE)
2469	KERNEL2x2_SUB
2470	KERNEL2x2_SUB
2471	KERNEL2x2_SUB
2472	KERNEL2x2_SUB
2473
2474	je	.L2_4_26
2475
2476	prefetcht0	A_PR1(AO,%rax,SIZE)
2477	prefetcht0	B_PR1(BO,BI,SIZE)
2478	KERNEL2x2_SUB
2479	KERNEL2x2_SUB
2480	KERNEL2x2_SUB
2481	KERNEL2x2_SUB
2482
2483	prefetcht0	A_PR1(AO,%rax,SIZE)
2484	prefetcht0	B_PR1(BO,BI,SIZE)
2485	KERNEL2x2_SUB
2486	KERNEL2x2_SUB
2487	KERNEL2x2_SUB
2488	KERNEL2x2_SUB
2489
2490	je	.L2_4_26
2491
2492	jmp	.L2_4_22
2493	ALIGN_4
2494
2495.L2_4_26:
2496#ifndef TRMMKERNEL
2497        movq    K, %rax
2498#else
2499        movq    KKK, %rax
2500#endif
2501
2502	andq	$ 7, %rax		# if (k & 1)
2503	je .L2_4_29
2504
2505	movq    %rax, BI                        //  Index for BO
2506        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2507
2508	salq	$ 2, %rax			// rax = rax * 4 ; number of values
2509	leaq	(AO, %rax, SIZE), AO
2510	leaq	(BO, BI, SIZE), BO
2511	negq	BI
2512	negq	%rax
2513	ALIGN_4
2514
2515.L2_4_27:
2516
2517	KERNEL2x2_SUB
2518
2519	jl	.L2_4_27
2520	ALIGN_4
2521
2522
2523.L2_4_29:
2524
2525	vbroadcastss	ALPHA_R, %xmm0
2526	vbroadcastss	ALPHA_I, %xmm1
2527
2528	// swap high and low 64 bytes
2529        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
2530        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
2531
2532#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
2533    defined(NR) || defined(NC) || defined(TR) || defined(TC)
2534
2535        vaddsubps %xmm9, %xmm8 , %xmm8
2536        vaddsubps %xmm11,%xmm10, %xmm10
2537
2538        vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
2539        vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
2540
2541#else
2542        vaddsubps %xmm8,  %xmm9 ,%xmm9
2543        vaddsubps %xmm10, %xmm11,%xmm11
2544
2545        vmovaps   %xmm9,  %xmm8
2546        vmovaps   %xmm11, %xmm10
2547
2548	// swap high and low 64 bytes
2549        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
2550        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
2551
2552#endif
2553
2554	// multiply with ALPHA_R
2555        vmulps  %xmm8 , %xmm0, %xmm8
2556        vmulps  %xmm10, %xmm0, %xmm10
2557
2558	// multiply with ALPHA_I
2559        vmulps  %xmm9 , %xmm1, %xmm9
2560        vmulps  %xmm11, %xmm1, %xmm11
2561
2562	vaddsubps %xmm9, %xmm8 , %xmm8
2563        vaddsubps %xmm11,%xmm10, %xmm10
2564
2565
2566
2567#ifndef TRMMKERNEL
2568
2569	vaddps 	 	(CO1), %xmm8 , %xmm8
2570
2571	vaddps 	 	(CO1, LDC), %xmm10, %xmm10
2572
2573#endif
2574
2575	vmovups	%xmm8 ,  	(CO1)
2576
2577	vmovups	%xmm10 ,  	(CO1, LDC)
2578
2579
2580
2581#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2582    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2583        movq    K, %rax
2584        subq    KKK, %rax
2585	movq    %rax, BI                        //  Index for BO
2586        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2587        leaq    (BO, BI, SIZE), BO
2588	salq	$ 2, %rax			// rax = rax * 4 ; number of values
2589        leaq    (AO, %rax, SIZE), AO
2590#endif
2591
2592
2593#if defined(TRMMKERNEL) && defined(LEFT)
2594        addq    $ 2, KK
2595#endif
2596
2597	addq	$ 4 * SIZE, CO1		# coffset += 4
2598	decq	I			# i --
2599	jg	.L2_4_21
2600	ALIGN_4
2601
2602
2603
2604/**************************************************************************/
2605.L2_4_40:
2606	testq	$ 1, M
2607	jz	.L2_4_60		// to next 2 lines of N
2608
2609	ALIGN_4
2610
2611.L2_4_41:
2612
2613#if !defined(TRMMKERNEL) || \
2614        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2615        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2616	leaq	BUFFER1, BO		// first buffer to BO
2617	addq	$ 8 * SIZE, BO
2618#else
2619        movq    KK, %rax
2620	leaq	BUFFER1, BO			// first buffer to BO
2621	addq	$ 8 * SIZE, BO
2622	movq    %rax, BI                        //  Index for BO
2623        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
2624        leaq    (BO, BI, SIZE), BO
2625	salq	$ 1, %rax			// rax = rax * 2 ; number of values
2626        leaq    (AO, %rax, SIZE), AO
2627#endif
2628
2629	vzeroall
2630
2631#ifndef TRMMKERNEL
2632        movq    K, %rax
2633#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2634        movq    K, %rax
2635        subq    KK, %rax
2636        movq    %rax, KKK
2637#else
2638        movq    KK, %rax
2639#ifdef LEFT
2640        addq    $ 1, %rax        // number of values in AO
2641#else
2642        addq    $ 2, %rax        // number of values in BO
2643#endif
2644        movq    %rax, KKK
2645#endif
2646
2647
2648	andq	$ -8, %rax			//  K = K - ( K % 8 )
2649	je	.L2_4_46
2650	movq    %rax, BI                        //  Index for BO
2651        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2652
2653	salq	$ 1, %rax			// rax = rax * 2 ; number of values
2654	leaq	(AO, %rax, SIZE), AO
2655	leaq	(BO, BI, SIZE), BO
2656	negq	BI
2657	negq	%rax
2658	ALIGN_4
2659
2660.L2_4_42:
2661
2662	prefetcht0	A_PR1(AO,%rax,SIZE)
2663	prefetcht0	B_PR1(BO,BI,SIZE)
2664	KERNEL1x2_SUB
2665	KERNEL1x2_SUB
2666	KERNEL1x2_SUB
2667	KERNEL1x2_SUB
2668
2669	prefetcht0	B_PR1(BO,BI,SIZE)
2670	KERNEL1x2_SUB
2671	KERNEL1x2_SUB
2672	KERNEL1x2_SUB
2673	KERNEL1x2_SUB
2674
2675	je	.L2_4_46
2676
2677	prefetcht0	A_PR1(AO,%rax,SIZE)
2678	prefetcht0	B_PR1(BO,BI,SIZE)
2679	KERNEL1x2_SUB
2680	KERNEL1x2_SUB
2681	KERNEL1x2_SUB
2682	KERNEL1x2_SUB
2683
2684	prefetcht0	B_PR1(BO,BI,SIZE)
2685	KERNEL1x2_SUB
2686	KERNEL1x2_SUB
2687	KERNEL1x2_SUB
2688	KERNEL1x2_SUB
2689
2690	je	.L2_4_46
2691
2692	jmp	.L2_4_42
2693	ALIGN_4
2694
2695.L2_4_46:
2696#ifndef TRMMKERNEL
2697        movq    K, %rax
2698#else
2699        movq    KKK, %rax
2700#endif
2701
2702	andq	$ 7, %rax		# if (k & 1)
2703	je .L2_4_49
2704
2705	movq    %rax, BI                        //  Index for BO
2706        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2707
2708	salq	$ 1, %rax			// rax = rax * 2 ; number of values
2709	leaq	(AO, %rax, SIZE), AO
2710	leaq	(BO, BI, SIZE), BO
2711	negq	BI
2712	negq	%rax
2713	ALIGN_4
2714
2715.L2_4_47:
2716
2717	KERNEL1x2_SUB
2718
2719	jl	.L2_4_47
2720	ALIGN_4
2721
2722
2723.L2_4_49:
2724
2725	SAVE1x2
2726
2727#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2728    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2729        movq    K, %rax
2730        subq    KKK, %rax
2731	movq    %rax, BI                        //  Index for BO
2732        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
2733        leaq    (BO, BI, SIZE), BO
2734	salq	$ 1, %rax			// rax = rax * 2 ; number of values
2735        leaq    (AO, %rax, SIZE), AO
2736#endif
2737
2738
2739#if defined(TRMMKERNEL) && defined(LEFT)
2740        addq    $ 1, KK
2741#endif
2742
2743	addq	$ 2 * SIZE, CO1		# coffset += 2
2744	decq	I			# i --
2745	jg	.L2_4_41
2746	ALIGN_4
2747
2748
2749
2750
2751.L2_4_60:
2752#if defined(TRMMKERNEL) && !defined(LEFT)
2753        addq    $ 2, KK
2754#endif
2755
2756	decq	J			// j --
2757	jg	.L2_01			// next 2 lines of N
2758
2759
2760
2761.L1_0:
2762
2763/************************************************************************************************
2764* Loop for Nmod6 % 2 > 0
2765*************************************************************************************************/
2766
2767	movq	Nmod6, J
2768	andq	$ 1, J			// j % 2
2769	je	.L999
2770	ALIGN_4
2771
2772.L1_01:
2773	// copy to sub buffer
2774	movq	B, BO1
2775	leaq    BUFFER1, BO		// first buffer to BO
2776	movq	K, %rax
2777	ALIGN_4
2778
2779.L1_02b:
2780
2781	vmovsd		(BO1), %xmm0
2782	vmovsd	%xmm0,       (BO)
2783	addq	$ 2*SIZE,BO1
2784	addq	$ 2*SIZE,BO
2785	decq	%rax
2786	jnz	.L1_02b
2787
2788.L1_02c:
2789
2790	movq	BO1, B			// next offset of B
2791
2792.L1_10:
2793	movq	C, CO1
2794	leaq	(C, LDC, 1), C		// c += 1 * ldc
2795
2796#if defined(TRMMKERNEL) && defined(LEFT)
2797        movq    OFFSET, %rax
2798        movq    %rax, KK
2799#endif
2800
2801	movq	A, AO		 	// aoffset = a
2802	addq	$ 16 * SIZE, AO
2803
2804	movq	M,  I
2805	sarq	$ 3, I			// i = (m >> 3)
2806	je	.L1_4_10
2807
2808	ALIGN_4
2809
2810/**************************************************************************************************/
2811
2812.L1_8_11:
2813
2814#if !defined(TRMMKERNEL) || \
2815        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2816        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2817	leaq	BUFFER1, BO		// first buffer to BO
2818	addq	$ 4 * SIZE, BO
2819#else
2820        movq    KK, %rax
2821	leaq	BUFFER1, BO			// first buffer to BO
2822	addq	$ 4 * SIZE, BO
2823	movq    %rax, BI                        //  Index for BO
2824        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
2825        leaq    (BO, BI, SIZE), BO
2826	salq	$ 4, %rax			// rax = rax *16 ; number of values
2827        leaq    (AO, %rax, SIZE), AO
2828#endif
2829
2830	vzeroall
2831
2832#ifndef TRMMKERNEL
2833        movq    K, %rax
2834#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2835        movq    K, %rax
2836        subq    KK, %rax
2837        movq    %rax, KKK
2838#else
2839        movq    KK, %rax
2840#ifdef LEFT
2841        addq    $ 8, %rax        // number of values in AO
2842#else
2843        addq    $ 1, %rax        // number of values in BO
2844#endif
2845        movq    %rax, KKK
2846#endif
2847
2848
2849	andq	$ -8, %rax			//  K = K - ( K % 8 )
2850	je	.L1_8_16
2851	movq    %rax, BI                        //  Index for BO
2852        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
2853
2854	salq	$ 4, %rax			// rax = rax *16 ; number of values
2855	leaq	(AO, %rax, SIZE), AO
2856	leaq	(BO, BI, SIZE), BO
2857	negq	BI
2858	negq	%rax
2859	ALIGN_4
2860
2861.L1_8_12:
2862
2863	prefetcht0	A_PR1(AO,%rax,SIZE)
2864	prefetcht0	B_PR1(BO,BI,SIZE)
2865	KERNEL8x1_SUB
2866	prefetcht0	A_PR1(AO,%rax,SIZE)
2867	KERNEL8x1_SUB
2868	prefetcht0	A_PR1(AO,%rax,SIZE)
2869	KERNEL8x1_SUB
2870	prefetcht0	A_PR1(AO,%rax,SIZE)
2871	KERNEL8x1_SUB
2872
2873	prefetcht0	A_PR1(AO,%rax,SIZE)
2874	KERNEL8x1_SUB
2875	prefetcht0	A_PR1(AO,%rax,SIZE)
2876	KERNEL8x1_SUB
2877	prefetcht0	A_PR1(AO,%rax,SIZE)
2878	KERNEL8x1_SUB
2879	prefetcht0	A_PR1(AO,%rax,SIZE)
2880	KERNEL8x1_SUB
2881
2882	je	.L1_8_16
2883
2884	prefetcht0	A_PR1(AO,%rax,SIZE)
2885	prefetcht0	B_PR1(BO,BI,SIZE)
2886	KERNEL8x1_SUB
2887	prefetcht0	A_PR1(AO,%rax,SIZE)
2888	KERNEL8x1_SUB
2889	prefetcht0	A_PR1(AO,%rax,SIZE)
2890	KERNEL8x1_SUB
2891	prefetcht0	A_PR1(AO,%rax,SIZE)
2892	KERNEL8x1_SUB
2893
2894	prefetcht0	A_PR1(AO,%rax,SIZE)
2895	KERNEL8x1_SUB
2896	prefetcht0	A_PR1(AO,%rax,SIZE)
2897	KERNEL8x1_SUB
2898	prefetcht0	A_PR1(AO,%rax,SIZE)
2899	KERNEL8x1_SUB
2900	prefetcht0	A_PR1(AO,%rax,SIZE)
2901	KERNEL8x1_SUB
2902
2903	je	.L1_8_16
2904
2905	jmp	.L1_8_12
2906	ALIGN_4
2907
2908.L1_8_16:
2909#ifndef TRMMKERNEL
2910        movq    K, %rax
2911#else
2912        movq    KKK, %rax
2913#endif
2914
2915	andq	$ 7, %rax		# if (k & 1)
2916	je .L1_8_19
2917
2918	movq    %rax, BI                        //  Index for BO
2919        leaq    ( ,BI,2), BI                    //  BI = BI * 4 ; number of values
2920
2921	salq	$ 4, %rax			// rax = rax *16 ; number of values
2922	leaq	(AO, %rax, SIZE), AO
2923	leaq	(BO, BI, SIZE), BO
2924	negq	BI
2925	negq	%rax
2926	ALIGN_4
2927
2928.L1_8_17:
2929
2930	KERNEL8x1_SUB
2931
2932	jl	.L1_8_17
2933	ALIGN_4
2934
2935
2936.L1_8_19:
2937
2938	SAVE8x1
2939
2940
2941#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2942    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2943        movq    K, %rax
2944        subq    KKK, %rax
2945	movq    %rax, BI                        //  Index for BO
2946        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
2947        leaq    (BO, BI, SIZE), BO
2948	salq	$ 4, %rax			// rax = rax *16 ; number of values
2949        leaq    (AO, %rax, SIZE), AO
2950#endif
2951
2952
2953#if defined(TRMMKERNEL) && defined(LEFT)
2954        addq    $ 8, KK
2955#endif
2956
2957	addq	$ 16 * SIZE, CO1		# coffset += 16
2958	decq	I			# i --
2959	jg	.L1_8_11
2960	ALIGN_4
2961
2962
2963
2964/**************************************************************************************************/
2965.L1_4_10:
2966
2967	testq	$ 7, M
2968	jz	.L999
2969
2970	testq	$ 4, M
2971	jz	.L1_4_20
2972
2973
2974.L1_4_11:
2975
2976#if !defined(TRMMKERNEL) || \
2977        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2978        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2979	leaq	BUFFER1, BO		// first buffer to BO
2980	addq	$ 4 * SIZE, BO
2981#else
2982        movq    KK, %rax
2983	leaq	BUFFER1, BO			// first buffer to BO
2984	addq	$ 4 * SIZE, BO
2985	movq    %rax, BI                        //  Index for BO
2986        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
2987        leaq    (BO, BI, SIZE), BO
2988	salq	$ 3, %rax			// rax = rax * 8 ; number of values
2989        leaq    (AO, %rax, SIZE), AO
2990#endif
2991
2992	vzeroall
2993
2994#ifndef TRMMKERNEL
2995        movq    K, %rax
2996#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2997        movq    K, %rax
2998        subq    KK, %rax
2999        movq    %rax, KKK
3000#else
3001        movq    KK, %rax
3002#ifdef LEFT
3003        addq    $ 4, %rax        // number of values in AO
3004#else
3005        addq    $ 1, %rax        // number of values in BO
3006#endif
3007        movq    %rax, KKK
3008#endif
3009
3010
3011	andq	$ -8, %rax			//  K = K - ( K % 8 )
3012	je	.L1_4_16
3013	movq    %rax, BI                        //  Index for BO
3014        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
3015
3016	salq	$ 3, %rax			// rax = rax * 8 ; number of values
3017	leaq	(AO, %rax, SIZE), AO
3018	leaq	(BO, BI, SIZE), BO
3019	negq	BI
3020	negq	%rax
3021	ALIGN_4
3022
3023.L1_4_12:
3024
3025	prefetcht0	A_PR1(AO,%rax,SIZE)
3026	prefetcht0	B_PR1(BO,BI,SIZE)
3027	KERNEL4x1_SUB
3028	KERNEL4x1_SUB
3029	prefetcht0	A_PR1(AO,%rax,SIZE)
3030	KERNEL4x1_SUB
3031	KERNEL4x1_SUB
3032
3033	prefetcht0	A_PR1(AO,%rax,SIZE)
3034	KERNEL4x1_SUB
3035	KERNEL4x1_SUB
3036	prefetcht0	A_PR1(AO,%rax,SIZE)
3037	KERNEL4x1_SUB
3038	KERNEL4x1_SUB
3039
3040	je	.L1_4_16
3041
3042	prefetcht0	A_PR1(AO,%rax,SIZE)
3043	prefetcht0	B_PR1(BO,BI,SIZE)
3044	KERNEL4x1_SUB
3045	KERNEL4x1_SUB
3046	prefetcht0	A_PR1(AO,%rax,SIZE)
3047	KERNEL4x1_SUB
3048	KERNEL4x1_SUB
3049
3050	prefetcht0	A_PR1(AO,%rax,SIZE)
3051	KERNEL4x1_SUB
3052	KERNEL4x1_SUB
3053	prefetcht0	A_PR1(AO,%rax,SIZE)
3054	KERNEL4x1_SUB
3055	KERNEL4x1_SUB
3056
3057	je	.L1_4_16
3058
3059	jmp	.L1_4_12
3060	ALIGN_4
3061
3062.L1_4_16:
3063#ifndef TRMMKERNEL
3064        movq    K, %rax
3065#else
3066        movq    KKK, %rax
3067#endif
3068
3069	andq	$ 7, %rax		# if (k & 1)
3070	je .L1_4_19
3071
3072	movq    %rax, BI                        //  Index for BO
3073        leaq    ( ,BI,2), BI                    //  BI = BI * 4 ; number of values
3074
3075	salq	$ 3, %rax			// rax = rax * 8 ; number of values
3076	leaq	(AO, %rax, SIZE), AO
3077	leaq	(BO, BI, SIZE), BO
3078	negq	BI
3079	negq	%rax
3080	ALIGN_4
3081
3082.L1_4_17:
3083
3084	KERNEL4x1_SUB
3085
3086	jl	.L1_4_17
3087	ALIGN_4
3088
3089
3090.L1_4_19:
3091
3092	SAVE4x1
3093
3094#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3095    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3096        movq    K, %rax
3097        subq    KKK, %rax
3098	movq    %rax, BI                        //  Index for BO
3099        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
3100        leaq    (BO, BI, SIZE), BO
3101	salq	$ 3, %rax			// rax = rax * 8 ; number of values
3102        leaq    (AO, %rax, SIZE), AO
3103#endif
3104
3105
3106#if defined(TRMMKERNEL) && defined(LEFT)
3107        addq    $ 4, KK
3108#endif
3109
3110	addq	$ 8 * SIZE, CO1		# coffset += 8
3111	ALIGN_4
3112
3113
3114
3115/**************************************************************************
3116* Rest of M
3117***************************************************************************/
3118
3119.L1_4_20:
3120
3121	testq	$ 2, M
3122	jz	.L1_4_40
3123	ALIGN_4
3124
3125.L1_4_21:
3126
3127#if !defined(TRMMKERNEL) || \
3128        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3129        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3130	leaq	BUFFER1, BO		// first buffer to BO
3131	addq	$ 4 * SIZE, BO
3132#else
3133        movq    KK, %rax
3134	leaq	BUFFER1, BO			// first buffer to BO
3135	addq	$ 4 * SIZE, BO
3136	movq    %rax, BI                        //  Index for BO
3137        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
3138        leaq    (BO, BI, SIZE), BO
3139	salq	$ 2, %rax			// rax = rax * 4 ; number of values
3140        leaq    (AO, %rax, SIZE), AO
3141#endif
3142
3143	vzeroall
3144
3145#ifndef TRMMKERNEL
3146        movq    K, %rax
3147#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3148        movq    K, %rax
3149        subq    KK, %rax
3150        movq    %rax, KKK
3151#else
3152        movq    KK, %rax
3153#ifdef LEFT
3154        addq    $ 2, %rax        // number of values in AO
3155#else
3156        addq    $ 1, %rax        // number of values in BO
3157#endif
3158        movq    %rax, KKK
3159#endif
3160
3161
3162	andq	$ -8, %rax			//  K = K - ( K % 8 )
3163	je	.L1_4_26
3164	movq    %rax, BI                        //  Index for BO
3165        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
3166
3167	salq	$ 2, %rax			// rax = rax * 4 ; number of values
3168	leaq	(AO, %rax, SIZE), AO
3169	leaq	(BO, BI, SIZE), BO
3170	negq	BI
3171	negq	%rax
3172	ALIGN_4
3173
3174.L1_4_22:
3175
3176	prefetcht0	A_PR1(AO,%rax,SIZE)
3177	prefetcht0	B_PR1(BO,BI,SIZE)
3178	KERNEL2x1_SUB
3179	KERNEL2x1_SUB
3180	KERNEL2x1_SUB
3181	KERNEL2x1_SUB
3182
3183	prefetcht0	A_PR1(AO,%rax,SIZE)
3184	KERNEL2x1_SUB
3185	KERNEL2x1_SUB
3186	KERNEL2x1_SUB
3187	KERNEL2x1_SUB
3188
3189	je	.L1_4_26
3190
3191	prefetcht0	A_PR1(AO,%rax,SIZE)
3192	prefetcht0	B_PR1(BO,BI,SIZE)
3193	KERNEL2x1_SUB
3194	KERNEL2x1_SUB
3195	KERNEL2x1_SUB
3196	KERNEL2x1_SUB
3197
3198	prefetcht0	A_PR1(AO,%rax,SIZE)
3199	KERNEL2x1_SUB
3200	KERNEL2x1_SUB
3201	KERNEL2x1_SUB
3202	KERNEL2x1_SUB
3203
3204	je	.L1_4_26
3205
3206	jmp	.L1_4_22
3207	ALIGN_4
3208
3209.L1_4_26:
3210#ifndef TRMMKERNEL
3211        movq    K, %rax
3212#else
3213        movq    KKK, %rax
3214#endif
3215
3216	andq	$ 7, %rax		# if (k & 1)
3217	je .L1_4_29
3218
3219	movq    %rax, BI                        //  Index for BO
3220        leaq    ( ,BI,2), BI                    //  BI = BI * 2; number of values
3221
3222	salq	$ 2, %rax			// rax = rax * 4 ; number of values
3223	leaq	(AO, %rax, SIZE), AO
3224	leaq	(BO, BI, SIZE), BO
3225	negq	BI
3226	negq	%rax
3227	ALIGN_4
3228
3229.L1_4_27:
3230
3231	KERNEL2x1_SUB
3232
3233	jl	.L1_4_27
3234	ALIGN_4
3235
3236
3237.L1_4_29:
3238
3239	SAVE2x1
3240
3241#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3242    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3243        movq    K, %rax
3244        subq    KKK, %rax
3245	movq    %rax, BI                        //  Index for BO
3246        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
3247        leaq    (BO, BI, SIZE), BO
3248	salq	$ 2, %rax			// rax = rax * 4 ; number of values
3249        leaq    (AO, %rax, SIZE), AO
3250#endif
3251
3252
3253#if defined(TRMMKERNEL) && defined(LEFT)
3254        addq    $ 2, KK
3255#endif
3256
3257	addq	$ 4 * SIZE, CO1		# coffset += 4
3258	ALIGN_4
3259
3260
3261
3262/**************************************************************************/
3263.L1_4_40:
3264	testq	$ 1, M
3265	jz	.L999		// to next 2 lines of N
3266
3267	ALIGN_4
3268
3269.L1_4_41:
3270
3271#if !defined(TRMMKERNEL) || \
3272        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3273        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3274	leaq	BUFFER1, BO		// first buffer to BO
3275	addq	$ 4 * SIZE, BO
3276#else
3277        movq    KK, %rax
3278	leaq	BUFFER1, BO			// first buffer to BO
3279	addq	$ 4 * SIZE, BO
3280	movq    %rax, BI                        //  Index for BO
3281        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
3282        leaq    (BO, BI, SIZE), BO
3283	salq	$ 1, %rax			// rax = rax * 2 ; number of values
3284        leaq    (AO, %rax, SIZE), AO
3285#endif
3286
3287	vzeroall
3288
3289#ifndef TRMMKERNEL
3290        movq    K, %rax
3291#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3292        movq    K, %rax
3293        subq    KK, %rax
3294        movq    %rax, KKK
3295#else
3296        movq    KK, %rax
3297#ifdef LEFT
3298        addq    $ 1, %rax        // number of values in AO
3299#else
3300        addq    $ 1, %rax        // number of values in BO
3301#endif
3302        movq    %rax, KKK
3303#endif
3304
3305
3306	andq	$ -8, %rax			//  K = K - ( K % 8 )
3307	je	.L1_4_46
3308	movq    %rax, BI                        //  Index for BO
3309        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
3310
3311	salq	$ 1, %rax			// rax = rax * 2 ; number of values
3312	leaq	(AO, %rax, SIZE), AO
3313	leaq	(BO, BI, SIZE), BO
3314	negq	BI
3315	negq	%rax
3316	ALIGN_4
3317
3318.L1_4_42:
3319
3320	prefetcht0	A_PR1(AO,%rax,SIZE)
3321	prefetcht0	B_PR1(BO,BI,SIZE)
3322	KERNEL1x1_SUB
3323	KERNEL1x1_SUB
3324	KERNEL1x1_SUB
3325	KERNEL1x1_SUB
3326
3327	KERNEL1x1_SUB
3328	KERNEL1x1_SUB
3329	KERNEL1x1_SUB
3330	KERNEL1x1_SUB
3331
3332	je	.L1_4_46
3333
3334	prefetcht0	A_PR1(AO,%rax,SIZE)
3335	prefetcht0	B_PR1(BO,BI,SIZE)
3336	KERNEL1x1_SUB
3337	KERNEL1x1_SUB
3338	KERNEL1x1_SUB
3339	KERNEL1x1_SUB
3340
3341	KERNEL1x1_SUB
3342	KERNEL1x1_SUB
3343	KERNEL1x1_SUB
3344	KERNEL1x1_SUB
3345
3346	je	.L1_4_46
3347
3348	jmp	.L1_4_42
3349	ALIGN_4
3350
3351.L1_4_46:
3352#ifndef TRMMKERNEL
3353        movq    K, %rax
3354#else
3355        movq    KKK, %rax
3356#endif
3357
3358	andq	$ 7, %rax		# if (k & 1)
3359	je .L1_4_49
3360
3361	movq    %rax, BI                        //  Index for BO
3362        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
3363
3364	salq	$ 1, %rax			// rax = rax * 2 ; number of values
3365	leaq	(AO, %rax, SIZE), AO
3366	leaq	(BO, BI, SIZE), BO
3367	negq	BI
3368	negq	%rax
3369	ALIGN_4
3370
3371.L1_4_47:
3372
3373	KERNEL1x1_SUB
3374
3375	jl	.L1_4_47
3376	ALIGN_4
3377
3378
3379.L1_4_49:
3380
3381	SAVE1x1
3382
3383
3384
3385#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3386    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3387        movq    K, %rax
3388        subq    KKK, %rax
3389	movq    %rax, BI                        //  Index for BO
3390        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
3391        leaq    (BO, BI, SIZE), BO
3392	salq	$ 1, %rax			// rax = rax * 2 ; number of values
3393        leaq    (AO, %rax, SIZE), AO
3394#endif
3395
3396
3397#if defined(TRMMKERNEL) && defined(LEFT)
3398        addq    $ 1, KK
3399#endif
3400
3401	addq	$ 2 * SIZE, CO1		# coffset += 2
3402	ALIGN_4
3403
3404
3405.L999:
3406	vzeroupper
3407
3408	movq   		SP, %rsp
3409	movq	   (%rsp), %rbx
3410	movq	  8(%rsp), %rbp
3411	movq	 16(%rsp), %r12
3412	movq	 24(%rsp), %r13
3413	movq	 32(%rsp), %r14
3414	movq	 40(%rsp), %r15
3415
3416#ifdef WINDOWS_ABI
3417	movq	 48(%rsp), %rdi
3418	movq	 56(%rsp), %rsi
3419	vmovups	 64(%rsp), %xmm6
3420	vmovups	 80(%rsp), %xmm7
3421	vmovups	 96(%rsp), %xmm8
3422	vmovups	112(%rsp), %xmm9
3423	vmovups	128(%rsp), %xmm10
3424	vmovups	144(%rsp), %xmm11
3425	vmovups	160(%rsp), %xmm12
3426	vmovups	176(%rsp), %xmm13
3427	vmovups	192(%rsp), %xmm14
3428	vmovups	208(%rsp), %xmm15
3429#endif
3430
3431	addq	$ STACKSIZE, %rsp
3432	ret
3433
3434	EPILOGUE
3435
3436#else
3437
3438/************************************************************************************************/
3439
3440
3441	PROLOGUE
3442	PROFCODE
3443
3444	subq	$ STACKSIZE, %rsp
3445	movq	%rbx,   (%rsp)
3446	movq	%rbp,  8(%rsp)
3447	movq	%r12, 16(%rsp)
3448	movq	%r13, 24(%rsp)
3449	movq	%r14, 32(%rsp)
3450	movq	%r15, 40(%rsp)
3451
3452	vzeroupper
3453
3454#ifdef WINDOWS_ABI
3455	movq	%rdi,    48(%rsp)
3456	movq	%rsi,    56(%rsp)
3457	vmovups	%xmm6,   64(%rsp)
3458	vmovups	%xmm7,   80(%rsp)
3459	vmovups	%xmm8,   96(%rsp)
3460	vmovups	%xmm9,  112(%rsp)
3461	vmovups	%xmm10, 128(%rsp)
3462	vmovups	%xmm11, 144(%rsp)
3463	vmovups	%xmm12, 160(%rsp)
3464	vmovups	%xmm13, 176(%rsp)
3465	vmovups	%xmm14, 192(%rsp)
3466	vmovups	%xmm15, 208(%rsp)
3467
3468	movq	ARG1,      OLD_M
3469	movq	ARG2,      OLD_N
3470	movq	ARG3,      OLD_K
3471	movq	OLD_A,     A
3472	movq	OLD_B,     B
3473	movq	OLD_C,     C
3474	movq	OLD_LDC,   LDC
3475#ifdef TRMMKERNEL
3476	movsd	OLD_OFFSET, %xmm12
3477#endif
3478	vmovaps	%xmm3, %xmm0
3479	vmovsd   OLD_ALPHA_I, %xmm1
3480
3481#else
3482	movq	STACKSIZE +  8(%rsp), LDC
3483#ifdef TRMMKERNEL
3484	movsd	STACKSIZE + 16(%rsp), %xmm12
3485#endif
3486
3487#endif
3488
3489	movq    %rsp, SP      # save old stack
3490        subq    $ 128 + L_BUFFER_SIZE, %rsp
3491        andq    $ -4096, %rsp    # align stack
3492
3493        STACK_TOUCH
3494
3495	cmpq	$ 0, OLD_M
3496	je	.L999
3497
3498	cmpq	$ 0, OLD_N
3499	je	.L999
3500
3501	cmpq	$ 0, OLD_K
3502	je	.L999
3503
3504	movq	OLD_M, M
3505	movq	OLD_N, N
3506	movq	OLD_K, K
3507
3508	vmovss	 %xmm0, ALPHA_R
3509	vmovss	 %xmm1, ALPHA_I
3510
3511	salq	$ ZBASE_SHIFT, LDC
3512
3513	movq    N, %rax
3514        xorq    %rdx, %rdx
3515        movq    $ 2,  %rdi
3516        divq    %rdi                    //    N / 2
3517        movq    %rax, Ndiv6             //    N / 2
3518        movq    %rdx, Nmod6             //    N % 2
3519
3520
3521
3522#ifdef TRMMKERNEL
3523	vmovsd	%xmm12, OFFSET
3524	vmovsd	%xmm12, KK
3525#ifndef LEFT
3526	negq	KK
3527#endif
3528#endif
3529
3530.L2_0:
3531
3532	movq	Ndiv6,  J
3533	cmpq	$ 0, J
3534	je	.L1_0
3535	ALIGN_4
3536
3537
3538
3539.L2_01:
3540	// copy to sub buffer
3541	movq	B, BO1
3542	leaq    BUFFER1, BO		// first buffer to BO
3543	movq	K, %rax
3544	ALIGN_4
3545
3546.L2_02b:
3547
3548	vmovups	(BO1), %xmm0
3549	vmovups	%xmm0,       (BO)
3550	addq	$ 4*SIZE,BO1
3551	addq	$ 4*SIZE,BO
3552	decq	%rax
3553	jnz	.L2_02b
3554
3555.L2_02c:
3556
3557	movq	BO1, B			// next offset of B
3558
3559.L2_10:
3560	movq	C, CO1
3561	leaq	(C, LDC, 2), C		// c += 2 * ldc
3562
3563#if defined(TRMMKERNEL) && defined(LEFT)
3564        movq    OFFSET, %rax
3565        movq    %rax, KK
3566#endif
3567
3568	movq	A, AO		 	// aoffset = a
3569	addq	$ 16 * SIZE, AO
3570
3571	movq	M,  I
3572	sarq	$ 3, I			// i = (m >> 3)
3573	je	.L2_4_10
3574
3575	ALIGN_4
3576/**********************************************************************************************************/
3577
3578.L2_8_11:
3579
3580#if !defined(TRMMKERNEL) || \
3581        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3582        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3583	leaq	BUFFER1, BO		// first buffer to BO
3584	addq	$ 8 * SIZE, BO
3585#else
3586        movq    KK, %rax
3587	leaq	BUFFER1, BO			// first buffer to BO
3588	addq	$ 8 * SIZE, BO
3589	movq    %rax, BI                        //  Index for BO
3590        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
3591        leaq    (BO, BI, SIZE), BO
3592	salq	$ 4, %rax			// rax = rax *16 ; number of values
3593        leaq    (AO, %rax, SIZE), AO
3594#endif
3595
3596	vzeroall
3597
3598#ifndef TRMMKERNEL
3599        movq    K, %rax
3600#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3601        movq    K, %rax
3602        subq    KK, %rax
3603        movq    %rax, KKK
3604#else
3605        movq    KK, %rax
3606#ifdef LEFT
3607        addq    $ 8, %rax        // number of values in AO
3608#else
3609        addq    $ 2, %rax        // number of values in BO
3610#endif
3611        movq    %rax, KKK
3612#endif
3613
3614
3615	andq	$ -8, %rax			//  K = K - ( K % 8 )
3616	je	.L2_8_16
3617	movq    %rax, BI                        //  Index for BO
3618        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
3619
3620	salq	$ 4, %rax			// rax = rax *16 ; number of values
3621	leaq	(AO, %rax, SIZE), AO
3622	leaq	(BO, BI, SIZE), BO
3623	negq	BI
3624	negq	%rax
3625	ALIGN_4
3626
3627.L2_8_12:
3628
3629	prefetcht0	A_PR1(AO,%rax,SIZE)
3630	prefetcht0	B_PR1(BO,BI,SIZE)
3631	KERNEL8x2_SUB
3632	prefetcht0	A_PR1(AO,%rax,SIZE)
3633	KERNEL8x2_SUB
3634	prefetcht0	A_PR1(AO,%rax,SIZE)
3635	KERNEL8x2_SUB
3636	prefetcht0	A_PR1(AO,%rax,SIZE)
3637	KERNEL8x2_SUB
3638
3639	prefetcht0	A_PR1(AO,%rax,SIZE)
3640	prefetcht0	B_PR1(BO,BI,SIZE)
3641	KERNEL8x2_SUB
3642	prefetcht0	A_PR1(AO,%rax,SIZE)
3643	KERNEL8x2_SUB
3644	prefetcht0	A_PR1(AO,%rax,SIZE)
3645	KERNEL8x2_SUB
3646	prefetcht0	A_PR1(AO,%rax,SIZE)
3647	KERNEL8x2_SUB
3648
3649	je	.L2_8_16
3650
3651	prefetcht0	A_PR1(AO,%rax,SIZE)
3652	prefetcht0	B_PR1(BO,BI,SIZE)
3653	KERNEL8x2_SUB
3654	prefetcht0	A_PR1(AO,%rax,SIZE)
3655	KERNEL8x2_SUB
3656	prefetcht0	A_PR1(AO,%rax,SIZE)
3657	KERNEL8x2_SUB
3658	prefetcht0	A_PR1(AO,%rax,SIZE)
3659	KERNEL8x2_SUB
3660
3661	prefetcht0	A_PR1(AO,%rax,SIZE)
3662	prefetcht0	B_PR1(BO,BI,SIZE)
3663	KERNEL8x2_SUB
3664	prefetcht0	A_PR1(AO,%rax,SIZE)
3665	KERNEL8x2_SUB
3666	prefetcht0	A_PR1(AO,%rax,SIZE)
3667	KERNEL8x2_SUB
3668	prefetcht0	A_PR1(AO,%rax,SIZE)
3669	KERNEL8x2_SUB
3670
3671	je	.L2_8_16
3672
3673	jmp	.L2_8_12
3674	ALIGN_4
3675
3676.L2_8_16:
3677#ifndef TRMMKERNEL
3678        movq    K, %rax
3679#else
3680        movq    KKK, %rax
3681#endif
3682
3683	andq	$ 7, %rax		# if (k & 1)
3684	je .L2_8_19
3685
3686	movq    %rax, BI                        //  Index for BO
3687        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
3688
3689	salq	$ 4, %rax			// rax = rax *16 ; number of values
3690	leaq	(AO, %rax, SIZE), AO
3691	leaq	(BO, BI, SIZE), BO
3692	negq	BI
3693	negq	%rax
3694	ALIGN_4
3695
3696.L2_8_17:
3697
3698	KERNEL8x2_SUB
3699
3700	jl	.L2_8_17
3701	ALIGN_4
3702
3703
3704.L2_8_19:
3705
3706	SAVE8x2
3707
3708
3709#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3710    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3711        movq    K, %rax
3712        subq    KKK, %rax
3713	movq    %rax, BI                        //  Index for BO
3714        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
3715        leaq    (BO, BI, SIZE), BO
3716	salq	$ 4, %rax			// rax = rax *16 ; number of values
3717        leaq    (AO, %rax, SIZE), AO
3718#endif
3719
3720
3721#if defined(TRMMKERNEL) && defined(LEFT)
3722        addq    $ 8, KK
3723#endif
3724
3725	addq	$ 16 * SIZE, CO1		# coffset += 16
3726	decq	I			# i --
3727	jg	.L2_8_11
3728	ALIGN_4
3729
3730
3731/**********************************************************************************************************/
3732
3733
3734
3735
3736.L2_4_10:
3737	testq	$ 7, M
3738	jz	.L2_4_60		// to next 2 lines of N
3739
3740	testq	$ 4, M
3741	jz	.L2_4_20
3742	ALIGN_4
3743
3744
3745.L2_4_11:
3746
3747#if !defined(TRMMKERNEL) || \
3748        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3749        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3750	leaq	BUFFER1, BO		// first buffer to BO
3751	addq	$ 8 * SIZE, BO
3752#else
3753        movq    KK, %rax
3754	leaq	BUFFER1, BO			// first buffer to BO
3755	addq	$ 8 * SIZE, BO
3756	movq    %rax, BI                        //  Index for BO
3757        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
3758        leaq    (BO, BI, SIZE), BO
3759	salq	$ 3, %rax			// rax = rax * 8 ; number of values
3760        leaq    (AO, %rax, SIZE), AO
3761#endif
3762
3763	vzeroall
3764
3765#ifndef TRMMKERNEL
3766        movq    K, %rax
3767#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3768        movq    K, %rax
3769        subq    KK, %rax
3770        movq    %rax, KKK
3771#else
3772        movq    KK, %rax
3773#ifdef LEFT
3774        addq    $ 4, %rax        // number of values in AO
3775#else
3776        addq    $ 2, %rax        // number of values in BO
3777#endif
3778        movq    %rax, KKK
3779#endif
3780
3781
3782	andq	$ -8, %rax			//  K = K - ( K % 8 )
3783	je	.L2_4_16
3784	movq    %rax, BI                        //  Index for BO
3785        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
3786
3787	salq	$ 3, %rax			// rax = rax * 8 ; number of values
3788	leaq	(AO, %rax, SIZE), AO
3789	leaq	(BO, BI, SIZE), BO
3790	negq	BI
3791	negq	%rax
3792	ALIGN_4
3793
3794.L2_4_12:
3795
3796	prefetcht0	A_PR1(AO,%rax,SIZE)
3797	prefetcht0	B_PR1(BO,BI,SIZE)
3798	KERNEL4x2_SUB
3799	KERNEL4x2_SUB
3800	prefetcht0	A_PR1(AO,%rax,SIZE)
3801	KERNEL4x2_SUB
3802	KERNEL4x2_SUB
3803
3804	prefetcht0	A_PR1(AO,%rax,SIZE)
3805	prefetcht0	B_PR1(BO,BI,SIZE)
3806	KERNEL4x2_SUB
3807	KERNEL4x2_SUB
3808	prefetcht0	A_PR1(AO,%rax,SIZE)
3809	KERNEL4x2_SUB
3810	KERNEL4x2_SUB
3811
3812	je	.L2_4_16
3813
3814	prefetcht0	A_PR1(AO,%rax,SIZE)
3815	prefetcht0	B_PR1(BO,BI,SIZE)
3816	KERNEL4x2_SUB
3817	KERNEL4x2_SUB
3818	prefetcht0	A_PR1(AO,%rax,SIZE)
3819	KERNEL4x2_SUB
3820	KERNEL4x2_SUB
3821
3822	prefetcht0	A_PR1(AO,%rax,SIZE)
3823	prefetcht0	B_PR1(BO,BI,SIZE)
3824	KERNEL4x2_SUB
3825	KERNEL4x2_SUB
3826	prefetcht0	A_PR1(AO,%rax,SIZE)
3827	KERNEL4x2_SUB
3828	KERNEL4x2_SUB
3829
3830	je	.L2_4_16
3831
3832	jmp	.L2_4_12
3833	ALIGN_4
3834
3835.L2_4_16:
3836#ifndef TRMMKERNEL
3837        movq    K, %rax
3838#else
3839        movq    KKK, %rax
3840#endif
3841
3842	andq	$ 7, %rax		# if (k & 1)
3843	je .L2_4_19
3844
3845	movq    %rax, BI                        //  Index for BO
3846        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
3847
3848	salq	$ 3, %rax			// rax = rax * 8 ; number of values
3849	leaq	(AO, %rax, SIZE), AO
3850	leaq	(BO, BI, SIZE), BO
3851	negq	BI
3852	negq	%rax
3853	ALIGN_4
3854
3855.L2_4_17:
3856
3857	KERNEL4x2_SUB
3858
3859	jl	.L2_4_17
3860	ALIGN_4
3861
3862
3863.L2_4_19:
3864
3865	SAVE4x2
3866
3867#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3868    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3869        movq    K, %rax
3870        subq    KKK, %rax
3871	movq    %rax, BI                        //  Index for BO
3872        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
3873        leaq    (BO, BI, SIZE), BO
3874	salq	$ 3, %rax			// rax = rax * 8 ; number of values
3875        leaq    (AO, %rax, SIZE), AO
3876#endif
3877
3878
3879#if defined(TRMMKERNEL) && defined(LEFT)
3880        addq    $ 4, KK
3881#endif
3882
3883	addq	$ 8 * SIZE, CO1		# coffset += 8
3884	ALIGN_4
3885
3886
3887
3888/**************************************************************************
3889* Rest of M
3890***************************************************************************/
3891
3892.L2_4_20:
3893
3894	testq	$ 2, M
3895	jz	.L2_4_40
3896	ALIGN_4
3897
3898.L2_4_21:
3899
3900#if !defined(TRMMKERNEL) || \
3901        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3902        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3903	leaq	BUFFER1, BO		// first buffer to BO
3904	addq	$ 8 * SIZE, BO
3905#else
3906        movq    KK, %rax
3907	leaq	BUFFER1, BO			// first buffer to BO
3908	addq	$ 8 * SIZE, BO
3909	movq    %rax, BI                        //  Index for BO
3910        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
3911        leaq    (BO, BI, SIZE), BO
3912	salq	$ 2, %rax			// rax = rax * 4 ; number of values
3913        leaq    (AO, %rax, SIZE), AO
3914#endif
3915
3916	vzeroall
3917
3918#ifndef TRMMKERNEL
3919        movq    K, %rax
3920#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3921        movq    K, %rax
3922        subq    KK, %rax
3923        movq    %rax, KKK
3924#else
3925        movq    KK, %rax
3926#ifdef LEFT
3927        addq    $ 2, %rax        // number of values in AO
3928#else
3929        addq    $ 2, %rax        // number of values in BO
3930#endif
3931        movq    %rax, KKK
3932#endif
3933
3934
3935	andq	$ -8, %rax			//  K = K - ( K % 8 )
3936	je	.L2_4_26
3937	movq    %rax, BI                        //  Index for BO
3938        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
3939
3940	salq	$ 2, %rax			// rax = rax * 4 ; number of values
3941	leaq	(AO, %rax, SIZE), AO
3942	leaq	(BO, BI, SIZE), BO
3943	negq	BI
3944	negq	%rax
3945	ALIGN_4
3946
3947.L2_4_22:
3948
3949	prefetcht0	A_PR1(AO,%rax,SIZE)
3950	prefetcht0	B_PR1(BO,BI,SIZE)
3951	KERNEL2x2_SUB
3952	KERNEL2x2_SUB
3953	KERNEL2x2_SUB
3954	KERNEL2x2_SUB
3955
3956	prefetcht0	A_PR1(AO,%rax,SIZE)
3957	prefetcht0	B_PR1(BO,BI,SIZE)
3958	KERNEL2x2_SUB
3959	KERNEL2x2_SUB
3960	KERNEL2x2_SUB
3961	KERNEL2x2_SUB
3962
3963	je	.L2_4_26
3964
3965	prefetcht0	A_PR1(AO,%rax,SIZE)
3966	prefetcht0	B_PR1(BO,BI,SIZE)
3967	KERNEL2x2_SUB
3968	KERNEL2x2_SUB
3969	KERNEL2x2_SUB
3970	KERNEL2x2_SUB
3971
3972	prefetcht0	A_PR1(AO,%rax,SIZE)
3973	prefetcht0	B_PR1(BO,BI,SIZE)
3974	KERNEL2x2_SUB
3975	KERNEL2x2_SUB
3976	KERNEL2x2_SUB
3977	KERNEL2x2_SUB
3978
3979	je	.L2_4_26
3980
3981	jmp	.L2_4_22
3982	ALIGN_4
3983
3984.L2_4_26:
3985#ifndef TRMMKERNEL
3986        movq    K, %rax
3987#else
3988        movq    KKK, %rax
3989#endif
3990
3991	andq	$ 7, %rax		# if (k & 1)
3992	je .L2_4_29
3993
3994	movq    %rax, BI                        //  Index for BO
3995        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
3996
3997	salq	$ 2, %rax			// rax = rax * 4 ; number of values
3998	leaq	(AO, %rax, SIZE), AO
3999	leaq	(BO, BI, SIZE), BO
4000	negq	BI
4001	negq	%rax
4002	ALIGN_4
4003
4004.L2_4_27:
4005
4006	KERNEL2x2_SUB
4007
4008	jl	.L2_4_27
4009	ALIGN_4
4010
4011
4012.L2_4_29:
4013
4014	vbroadcastss	ALPHA_R, %xmm0
4015	vbroadcastss	ALPHA_I, %xmm1
4016
4017	// swap high and low 64 bytes
4018        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
4019        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
4020
4021#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
4022    defined(NR) || defined(NC) || defined(TR) || defined(TC)
4023
4024        vaddsubps %xmm9, %xmm8 , %xmm8
4025        vaddsubps %xmm11,%xmm10, %xmm10
4026
4027        vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9
4028        vshufps $ 0xb1, %xmm10, %xmm10, %xmm11
4029
4030#else
4031        vaddsubps %xmm8,  %xmm9 ,%xmm9
4032        vaddsubps %xmm10, %xmm11,%xmm11
4033
4034        vmovaps   %xmm9,  %xmm8
4035        vmovaps   %xmm11, %xmm10
4036
4037	// swap high and low 64 bytes
4038        vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9
4039        vshufps $ 0xb1, %xmm11, %xmm11, %xmm11
4040
4041#endif
4042
4043	// multiply with ALPHA_R
4044        vmulps  %xmm8 , %xmm0, %xmm8
4045        vmulps  %xmm10, %xmm0, %xmm10
4046
4047	// multiply with ALPHA_I
4048        vmulps  %xmm9 , %xmm1, %xmm9
4049        vmulps  %xmm11, %xmm1, %xmm11
4050
4051	vaddsubps %xmm9, %xmm8 , %xmm8
4052        vaddsubps %xmm11,%xmm10, %xmm10
4053
4054
4055
4056#ifndef TRMMKERNEL
4057
4058	vaddps 	 	(CO1), %xmm8 , %xmm8
4059
4060	vaddps 	 	(CO1, LDC), %xmm10, %xmm10
4061
4062#endif
4063
4064	vmovups	%xmm8 ,  	(CO1)
4065
4066	vmovups	%xmm10 ,  	(CO1, LDC)
4067
4068
4069
4070#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4071    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4072        movq    K, %rax
4073        subq    KKK, %rax
4074	movq    %rax, BI                        //  Index for BO
4075        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
4076        leaq    (BO, BI, SIZE), BO
4077	salq	$ 2, %rax			// rax = rax * 4 ; number of values
4078        leaq    (AO, %rax, SIZE), AO
4079#endif
4080
4081
4082#if defined(TRMMKERNEL) && defined(LEFT)
4083        addq    $ 2, KK
4084#endif
4085
4086	addq	$ 4 * SIZE, CO1		# coffset += 4
4087	decq	I			# i --
4088	jg	.L2_4_21
4089	ALIGN_4
4090
4091
4092
4093/**************************************************************************/
4094.L2_4_40:
4095	testq	$ 1, M
4096	jz	.L2_4_60		// to next 2 lines of N
4097
4098	ALIGN_4
4099
4100.L2_4_41:
4101
4102#if !defined(TRMMKERNEL) || \
4103        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4104        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4105	leaq	BUFFER1, BO		// first buffer to BO
4106	addq	$ 8 * SIZE, BO
4107#else
4108        movq    KK, %rax
4109	leaq	BUFFER1, BO			// first buffer to BO
4110	addq	$ 8 * SIZE, BO
4111	movq    %rax, BI                        //  Index for BO
4112        leaq    (,BI,4), BI                     //  BI = BI * 4 ; number of values
4113        leaq    (BO, BI, SIZE), BO
4114	salq	$ 1, %rax			// rax = rax * 2 ; number of values
4115        leaq    (AO, %rax, SIZE), AO
4116#endif
4117
4118	vzeroall
4119
4120#ifndef TRMMKERNEL
4121        movq    K, %rax
4122#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4123        movq    K, %rax
4124        subq    KK, %rax
4125        movq    %rax, KKK
4126#else
4127        movq    KK, %rax
4128#ifdef LEFT
4129        addq    $ 1, %rax        // number of values in AO
4130#else
4131        addq    $ 2, %rax        // number of values in BO
4132#endif
4133        movq    %rax, KKK
4134#endif
4135
4136
4137	andq	$ -8, %rax			//  K = K - ( K % 8 )
4138	je	.L2_4_46
4139	movq    %rax, BI                        //  Index for BO
4140        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
4141
4142	salq	$ 1, %rax			// rax = rax * 2 ; number of values
4143	leaq	(AO, %rax, SIZE), AO
4144	leaq	(BO, BI, SIZE), BO
4145	negq	BI
4146	negq	%rax
4147	ALIGN_4
4148
4149.L2_4_42:
4150
4151	prefetcht0	A_PR1(AO,%rax,SIZE)
4152	prefetcht0	B_PR1(BO,BI,SIZE)
4153	KERNEL1x2_SUB
4154	KERNEL1x2_SUB
4155	KERNEL1x2_SUB
4156	KERNEL1x2_SUB
4157
4158	prefetcht0	B_PR1(BO,BI,SIZE)
4159	KERNEL1x2_SUB
4160	KERNEL1x2_SUB
4161	KERNEL1x2_SUB
4162	KERNEL1x2_SUB
4163
4164	je	.L2_4_46
4165
4166	prefetcht0	A_PR1(AO,%rax,SIZE)
4167	prefetcht0	B_PR1(BO,BI,SIZE)
4168	KERNEL1x2_SUB
4169	KERNEL1x2_SUB
4170	KERNEL1x2_SUB
4171	KERNEL1x2_SUB
4172
4173	prefetcht0	B_PR1(BO,BI,SIZE)
4174	KERNEL1x2_SUB
4175	KERNEL1x2_SUB
4176	KERNEL1x2_SUB
4177	KERNEL1x2_SUB
4178
4179	je	.L2_4_46
4180
4181	jmp	.L2_4_42
4182	ALIGN_4
4183
4184.L2_4_46:
4185#ifndef TRMMKERNEL
4186        movq    K, %rax
4187#else
4188        movq    KKK, %rax
4189#endif
4190
4191	andq	$ 7, %rax		# if (k & 1)
4192	je .L2_4_49
4193
4194	movq    %rax, BI                        //  Index for BO
4195        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
4196
4197	salq	$ 1, %rax			// rax = rax * 2 ; number of values
4198	leaq	(AO, %rax, SIZE), AO
4199	leaq	(BO, BI, SIZE), BO
4200	negq	BI
4201	negq	%rax
4202	ALIGN_4
4203
4204.L2_4_47:
4205
4206	KERNEL1x2_SUB
4207
4208	jl	.L2_4_47
4209	ALIGN_4
4210
4211
4212.L2_4_49:
4213
4214	SAVE1x2
4215
4216#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4217    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4218        movq    K, %rax
4219        subq    KKK, %rax
4220	movq    %rax, BI                        //  Index for BO
4221        leaq    ( ,BI,4), BI                    //  BI = BI * 4 ; number of values
4222        leaq    (BO, BI, SIZE), BO
4223	salq	$ 1, %rax			// rax = rax * 2 ; number of values
4224        leaq    (AO, %rax, SIZE), AO
4225#endif
4226
4227
4228#if defined(TRMMKERNEL) && defined(LEFT)
4229        addq    $ 1, KK
4230#endif
4231
4232	addq	$ 2 * SIZE, CO1		# coffset += 2
4233	decq	I			# i --
4234	jg	.L2_4_41
4235	ALIGN_4
4236
4237
4238
4239
4240.L2_4_60:
4241#if defined(TRMMKERNEL) && !defined(LEFT)
4242        addq    $ 2, KK
4243#endif
4244
4245	decq	J			// j --
4246	jg	.L2_01			// next 2 lines of N
4247
4248
4249
4250.L1_0:
4251
4252/************************************************************************************************
4253* Loop for Nmod6 % 2 > 0
4254*************************************************************************************************/
4255
4256	movq	Nmod6, J
4257	andq	$ 1, J			// j % 2
4258	je	.L999
4259	ALIGN_4
4260
4261.L1_01:
4262	// copy to sub buffer
4263	movq	B, BO1
4264	leaq    BUFFER1, BO		// first buffer to BO
4265	movq	K, %rax
4266	ALIGN_4
4267
4268.L1_02b:
4269
4270	vmovsd		(BO1), %xmm0
4271	vmovsd	%xmm0,       (BO)
4272	addq	$ 2*SIZE,BO1
4273	addq	$ 2*SIZE,BO
4274	decq	%rax
4275	jnz	.L1_02b
4276
4277.L1_02c:
4278
4279	movq	BO1, B			// next offset of B
4280
4281.L1_10:
4282	movq	C, CO1
4283	leaq	(C, LDC, 1), C		// c += 1 * ldc
4284
4285#if defined(TRMMKERNEL) && defined(LEFT)
4286        movq    OFFSET, %rax
4287        movq    %rax, KK
4288#endif
4289
4290	movq	A, AO		 	// aoffset = a
4291	addq	$ 16 * SIZE, AO
4292
4293	movq	M,  I
4294	sarq	$ 3, I			// i = (m >> 3)
4295	je	.L1_4_10
4296
4297	ALIGN_4
4298
4299/**************************************************************************************************/
4300
4301.L1_8_11:
4302
4303#if !defined(TRMMKERNEL) || \
4304        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4305        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4306	leaq	BUFFER1, BO		// first buffer to BO
4307	addq	$ 4 * SIZE, BO
4308#else
4309        movq    KK, %rax
4310	leaq	BUFFER1, BO			// first buffer to BO
4311	addq	$ 4 * SIZE, BO
4312	movq    %rax, BI                        //  Index for BO
4313        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
4314        leaq    (BO, BI, SIZE), BO
4315	salq	$ 4, %rax			// rax = rax *16 ; number of values
4316        leaq    (AO, %rax, SIZE), AO
4317#endif
4318
4319	vzeroall
4320
4321#ifndef TRMMKERNEL
4322        movq    K, %rax
4323#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4324        movq    K, %rax
4325        subq    KK, %rax
4326        movq    %rax, KKK
4327#else
4328        movq    KK, %rax
4329#ifdef LEFT
4330        addq    $ 8, %rax        // number of values in AO
4331#else
4332        addq    $ 1, %rax        // number of values in BO
4333#endif
4334        movq    %rax, KKK
4335#endif
4336
4337
4338	andq	$ -8, %rax			//  K = K - ( K % 8 )
4339	je	.L1_8_16
4340	movq    %rax, BI                        //  Index for BO
4341        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
4342
4343	salq	$ 4, %rax			// rax = rax *16 ; number of values
4344	leaq	(AO, %rax, SIZE), AO
4345	leaq	(BO, BI, SIZE), BO
4346	negq	BI
4347	negq	%rax
4348	ALIGN_4
4349
4350.L1_8_12:
4351
4352	prefetcht0	A_PR1(AO,%rax,SIZE)
4353	prefetcht0	B_PR1(BO,BI,SIZE)
4354	KERNEL8x1_SUB
4355	prefetcht0	A_PR1(AO,%rax,SIZE)
4356	KERNEL8x1_SUB
4357	prefetcht0	A_PR1(AO,%rax,SIZE)
4358	KERNEL8x1_SUB
4359	prefetcht0	A_PR1(AO,%rax,SIZE)
4360	KERNEL8x1_SUB
4361
4362	prefetcht0	A_PR1(AO,%rax,SIZE)
4363	KERNEL8x1_SUB
4364	prefetcht0	A_PR1(AO,%rax,SIZE)
4365	KERNEL8x1_SUB
4366	prefetcht0	A_PR1(AO,%rax,SIZE)
4367	KERNEL8x1_SUB
4368	prefetcht0	A_PR1(AO,%rax,SIZE)
4369	KERNEL8x1_SUB
4370
4371	je	.L1_8_16
4372
4373	prefetcht0	A_PR1(AO,%rax,SIZE)
4374	prefetcht0	B_PR1(BO,BI,SIZE)
4375	KERNEL8x1_SUB
4376	prefetcht0	A_PR1(AO,%rax,SIZE)
4377	KERNEL8x1_SUB
4378	prefetcht0	A_PR1(AO,%rax,SIZE)
4379	KERNEL8x1_SUB
4380	prefetcht0	A_PR1(AO,%rax,SIZE)
4381	KERNEL8x1_SUB
4382
4383	prefetcht0	A_PR1(AO,%rax,SIZE)
4384	KERNEL8x1_SUB
4385	prefetcht0	A_PR1(AO,%rax,SIZE)
4386	KERNEL8x1_SUB
4387	prefetcht0	A_PR1(AO,%rax,SIZE)
4388	KERNEL8x1_SUB
4389	prefetcht0	A_PR1(AO,%rax,SIZE)
4390	KERNEL8x1_SUB
4391
4392	je	.L1_8_16
4393
4394	jmp	.L1_8_12
4395	ALIGN_4
4396
4397.L1_8_16:
4398#ifndef TRMMKERNEL
4399        movq    K, %rax
4400#else
4401        movq    KKK, %rax
4402#endif
4403
4404	andq	$ 7, %rax		# if (k & 1)
4405	je .L1_8_19
4406
4407	movq    %rax, BI                        //  Index for BO
4408        leaq    ( ,BI,2), BI                    //  BI = BI * 4 ; number of values
4409
4410	salq	$ 4, %rax			// rax = rax *16 ; number of values
4411	leaq	(AO, %rax, SIZE), AO
4412	leaq	(BO, BI, SIZE), BO
4413	negq	BI
4414	negq	%rax
4415	ALIGN_4
4416
4417.L1_8_17:
4418
4419	KERNEL8x1_SUB
4420
4421	jl	.L1_8_17
4422	ALIGN_4
4423
4424
4425.L1_8_19:
4426
4427	SAVE8x1
4428
4429
4430#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4431    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4432        movq    K, %rax
4433        subq    KKK, %rax
4434	movq    %rax, BI                        //  Index for BO
4435        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
4436        leaq    (BO, BI, SIZE), BO
4437	salq	$ 4, %rax			// rax = rax *16 ; number of values
4438        leaq    (AO, %rax, SIZE), AO
4439#endif
4440
4441
4442#if defined(TRMMKERNEL) && defined(LEFT)
4443        addq    $ 8, KK
4444#endif
4445
4446	addq	$ 16 * SIZE, CO1		# coffset += 16
4447	decq	I			# i --
4448	jg	.L1_8_11
4449	ALIGN_4
4450
4451
4452
4453/**************************************************************************************************/
4454.L1_4_10:
4455
4456	testq	$ 7, M
4457	jz	.L999
4458
4459	testq	$ 4, M
4460	jz	.L1_4_20
4461
4462
4463.L1_4_11:
4464
4465#if !defined(TRMMKERNEL) || \
4466        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4467        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4468	leaq	BUFFER1, BO		// first buffer to BO
4469	addq	$ 4 * SIZE, BO
4470#else
4471        movq    KK, %rax
4472	leaq	BUFFER1, BO			// first buffer to BO
4473	addq	$ 4 * SIZE, BO
4474	movq    %rax, BI                        //  Index for BO
4475        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
4476        leaq    (BO, BI, SIZE), BO
4477	salq	$ 3, %rax			// rax = rax * 8 ; number of values
4478        leaq    (AO, %rax, SIZE), AO
4479#endif
4480
4481	vzeroall
4482
4483#ifndef TRMMKERNEL
4484        movq    K, %rax
4485#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4486        movq    K, %rax
4487        subq    KK, %rax
4488        movq    %rax, KKK
4489#else
4490        movq    KK, %rax
4491#ifdef LEFT
4492        addq    $ 4, %rax        // number of values in AO
4493#else
4494        addq    $ 1, %rax        // number of values in BO
4495#endif
4496        movq    %rax, KKK
4497#endif
4498
4499
4500	andq	$ -8, %rax			//  K = K - ( K % 8 )
4501	je	.L1_4_16
4502	movq    %rax, BI                        //  Index for BO
4503        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
4504
4505	salq	$ 3, %rax			// rax = rax * 8 ; number of values
4506	leaq	(AO, %rax, SIZE), AO
4507	leaq	(BO, BI, SIZE), BO
4508	negq	BI
4509	negq	%rax
4510	ALIGN_4
4511
4512.L1_4_12:
4513
4514	prefetcht0	A_PR1(AO,%rax,SIZE)
4515	prefetcht0	B_PR1(BO,BI,SIZE)
4516	KERNEL4x1_SUB
4517	KERNEL4x1_SUB
4518	prefetcht0	A_PR1(AO,%rax,SIZE)
4519	KERNEL4x1_SUB
4520	KERNEL4x1_SUB
4521
4522	prefetcht0	A_PR1(AO,%rax,SIZE)
4523	KERNEL4x1_SUB
4524	KERNEL4x1_SUB
4525	prefetcht0	A_PR1(AO,%rax,SIZE)
4526	KERNEL4x1_SUB
4527	KERNEL4x1_SUB
4528
4529	je	.L1_4_16
4530
4531	prefetcht0	A_PR1(AO,%rax,SIZE)
4532	prefetcht0	B_PR1(BO,BI,SIZE)
4533	KERNEL4x1_SUB
4534	KERNEL4x1_SUB
4535	prefetcht0	A_PR1(AO,%rax,SIZE)
4536	KERNEL4x1_SUB
4537	KERNEL4x1_SUB
4538
4539	prefetcht0	A_PR1(AO,%rax,SIZE)
4540	KERNEL4x1_SUB
4541	KERNEL4x1_SUB
4542	prefetcht0	A_PR1(AO,%rax,SIZE)
4543	KERNEL4x1_SUB
4544	KERNEL4x1_SUB
4545
4546	je	.L1_4_16
4547
4548	jmp	.L1_4_12
4549	ALIGN_4
4550
4551.L1_4_16:
4552#ifndef TRMMKERNEL
4553        movq    K, %rax
4554#else
4555        movq    KKK, %rax
4556#endif
4557
4558	andq	$ 7, %rax		# if (k & 1)
4559	je .L1_4_19
4560
4561	movq    %rax, BI                        //  Index for BO
4562        leaq    ( ,BI,2), BI                    //  BI = BI * 4 ; number of values
4563
4564	salq	$ 3, %rax			// rax = rax * 8 ; number of values
4565	leaq	(AO, %rax, SIZE), AO
4566	leaq	(BO, BI, SIZE), BO
4567	negq	BI
4568	negq	%rax
4569	ALIGN_4
4570
4571.L1_4_17:
4572
4573	KERNEL4x1_SUB
4574
4575	jl	.L1_4_17
4576	ALIGN_4
4577
4578
4579.L1_4_19:
4580
4581	SAVE4x1
4582
4583#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4584    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4585        movq    K, %rax
4586        subq    KKK, %rax
4587	movq    %rax, BI                        //  Index for BO
4588        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
4589        leaq    (BO, BI, SIZE), BO
4590	salq	$ 3, %rax			// rax = rax * 8 ; number of values
4591        leaq    (AO, %rax, SIZE), AO
4592#endif
4593
4594
4595#if defined(TRMMKERNEL) && defined(LEFT)
4596        addq    $ 4, KK
4597#endif
4598
4599	addq	$ 8 * SIZE, CO1		# coffset += 8
4600	ALIGN_4
4601
4602
4603
4604/**************************************************************************
4605* Rest of M
4606***************************************************************************/
4607
4608.L1_4_20:
4609
4610	testq	$ 2, M
4611	jz	.L1_4_40
4612	ALIGN_4
4613
4614.L1_4_21:
4615
4616#if !defined(TRMMKERNEL) || \
4617        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4618        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4619	leaq	BUFFER1, BO		// first buffer to BO
4620	addq	$ 4 * SIZE, BO
4621#else
4622        movq    KK, %rax
4623	leaq	BUFFER1, BO			// first buffer to BO
4624	addq	$ 4 * SIZE, BO
4625	movq    %rax, BI                        //  Index for BO
4626        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
4627        leaq    (BO, BI, SIZE), BO
4628	salq	$ 2, %rax			// rax = rax * 4 ; number of values
4629        leaq    (AO, %rax, SIZE), AO
4630#endif
4631
4632	vzeroall
4633
4634#ifndef TRMMKERNEL
4635        movq    K, %rax
4636#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4637        movq    K, %rax
4638        subq    KK, %rax
4639        movq    %rax, KKK
4640#else
4641        movq    KK, %rax
4642#ifdef LEFT
4643        addq    $ 2, %rax        // number of values in AO
4644#else
4645        addq    $ 1, %rax        // number of values in BO
4646#endif
4647        movq    %rax, KKK
4648#endif
4649
4650
4651	andq	$ -8, %rax			//  K = K - ( K % 8 )
4652	je	.L1_4_26
4653	movq    %rax, BI                        //  Index for BO
4654        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
4655
4656	salq	$ 2, %rax			// rax = rax * 4 ; number of values
4657	leaq	(AO, %rax, SIZE), AO
4658	leaq	(BO, BI, SIZE), BO
4659	negq	BI
4660	negq	%rax
4661	ALIGN_4
4662
4663.L1_4_22:
4664
4665	prefetcht0	A_PR1(AO,%rax,SIZE)
4666	prefetcht0	B_PR1(BO,BI,SIZE)
4667	KERNEL2x1_SUB
4668	KERNEL2x1_SUB
4669	KERNEL2x1_SUB
4670	KERNEL2x1_SUB
4671
4672	prefetcht0	A_PR1(AO,%rax,SIZE)
4673	KERNEL2x1_SUB
4674	KERNEL2x1_SUB
4675	KERNEL2x1_SUB
4676	KERNEL2x1_SUB
4677
4678	je	.L1_4_26
4679
4680	prefetcht0	A_PR1(AO,%rax,SIZE)
4681	prefetcht0	B_PR1(BO,BI,SIZE)
4682	KERNEL2x1_SUB
4683	KERNEL2x1_SUB
4684	KERNEL2x1_SUB
4685	KERNEL2x1_SUB
4686
4687	prefetcht0	A_PR1(AO,%rax,SIZE)
4688	KERNEL2x1_SUB
4689	KERNEL2x1_SUB
4690	KERNEL2x1_SUB
4691	KERNEL2x1_SUB
4692
4693	je	.L1_4_26
4694
4695	jmp	.L1_4_22
4696	ALIGN_4
4697
4698.L1_4_26:
4699#ifndef TRMMKERNEL
4700        movq    K, %rax
4701#else
4702        movq    KKK, %rax
4703#endif
4704
4705	andq	$ 7, %rax		# if (k & 1)
4706	je .L1_4_29
4707
4708	movq    %rax, BI                        //  Index for BO
4709        leaq    ( ,BI,2), BI                    //  BI = BI * 2; number of values
4710
4711	salq	$ 2, %rax			// rax = rax * 4 ; number of values
4712	leaq	(AO, %rax, SIZE), AO
4713	leaq	(BO, BI, SIZE), BO
4714	negq	BI
4715	negq	%rax
4716	ALIGN_4
4717
4718.L1_4_27:
4719
4720	KERNEL2x1_SUB
4721
4722	jl	.L1_4_27
4723	ALIGN_4
4724
4725
4726.L1_4_29:
4727
4728	SAVE2x1
4729
4730#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4731    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4732        movq    K, %rax
4733        subq    KKK, %rax
4734	movq    %rax, BI                        //  Index for BO
4735        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
4736        leaq    (BO, BI, SIZE), BO
4737	salq	$ 2, %rax			// rax = rax * 4 ; number of values
4738        leaq    (AO, %rax, SIZE), AO
4739#endif
4740
4741
4742#if defined(TRMMKERNEL) && defined(LEFT)
4743        addq    $ 2, KK
4744#endif
4745
4746	addq	$ 4 * SIZE, CO1		# coffset += 4
4747	ALIGN_4
4748
4749
4750
4751/**************************************************************************/
4752.L1_4_40:
4753	testq	$ 1, M
4754	jz	.L999		// to next 2 lines of N
4755
4756	ALIGN_4
4757
4758.L1_4_41:
4759
4760#if !defined(TRMMKERNEL) || \
4761        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4762        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4763	leaq	BUFFER1, BO		// first buffer to BO
4764	addq	$ 4 * SIZE, BO
4765#else
4766        movq    KK, %rax
4767	leaq	BUFFER1, BO			// first buffer to BO
4768	addq	$ 4 * SIZE, BO
4769	movq    %rax, BI                        //  Index for BO
4770        leaq    (,BI,2), BI                     //  BI = BI * 2 ; number of values
4771        leaq    (BO, BI, SIZE), BO
4772	salq	$ 1, %rax			// rax = rax * 2 ; number of values
4773        leaq    (AO, %rax, SIZE), AO
4774#endif
4775
4776	vzeroall
4777
4778#ifndef TRMMKERNEL
4779        movq    K, %rax
4780#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4781        movq    K, %rax
4782        subq    KK, %rax
4783        movq    %rax, KKK
4784#else
4785        movq    KK, %rax
4786#ifdef LEFT
4787        addq    $ 1, %rax        // number of values in AO
4788#else
4789        addq    $ 1, %rax        // number of values in BO
4790#endif
4791        movq    %rax, KKK
4792#endif
4793
4794
4795	andq	$ -8, %rax			//  K = K - ( K % 8 )
4796	je	.L1_4_46
4797	movq    %rax, BI                        //  Index for BO
4798        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
4799
4800	salq	$ 1, %rax			// rax = rax * 2 ; number of values
4801	leaq	(AO, %rax, SIZE), AO
4802	leaq	(BO, BI, SIZE), BO
4803	negq	BI
4804	negq	%rax
4805	ALIGN_4
4806
4807.L1_4_42:
4808
4809	prefetcht0	A_PR1(AO,%rax,SIZE)
4810	prefetcht0	B_PR1(BO,BI,SIZE)
4811	KERNEL1x1_SUB
4812	KERNEL1x1_SUB
4813	KERNEL1x1_SUB
4814	KERNEL1x1_SUB
4815
4816	KERNEL1x1_SUB
4817	KERNEL1x1_SUB
4818	KERNEL1x1_SUB
4819	KERNEL1x1_SUB
4820
4821	je	.L1_4_46
4822
4823	prefetcht0	A_PR1(AO,%rax,SIZE)
4824	prefetcht0	B_PR1(BO,BI,SIZE)
4825	KERNEL1x1_SUB
4826	KERNEL1x1_SUB
4827	KERNEL1x1_SUB
4828	KERNEL1x1_SUB
4829
4830	KERNEL1x1_SUB
4831	KERNEL1x1_SUB
4832	KERNEL1x1_SUB
4833	KERNEL1x1_SUB
4834
4835	je	.L1_4_46
4836
4837	jmp	.L1_4_42
4838	ALIGN_4
4839
4840.L1_4_46:
4841#ifndef TRMMKERNEL
4842        movq    K, %rax
4843#else
4844        movq    KKK, %rax
4845#endif
4846
4847	andq	$ 7, %rax		# if (k & 1)
4848	je .L1_4_49
4849
4850	movq    %rax, BI                        //  Index for BO
4851        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
4852
4853	salq	$ 1, %rax			// rax = rax * 2 ; number of values
4854	leaq	(AO, %rax, SIZE), AO
4855	leaq	(BO, BI, SIZE), BO
4856	negq	BI
4857	negq	%rax
4858	ALIGN_4
4859
4860.L1_4_47:
4861
4862	KERNEL1x1_SUB
4863
4864	jl	.L1_4_47
4865	ALIGN_4
4866
4867
4868.L1_4_49:
4869
4870	SAVE1x1
4871
4872
4873
4874#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4875    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4876        movq    K, %rax
4877        subq    KKK, %rax
4878	movq    %rax, BI                        //  Index for BO
4879        leaq    ( ,BI,2), BI                    //  BI = BI * 2 ; number of values
4880        leaq    (BO, BI, SIZE), BO
4881	salq	$ 1, %rax			// rax = rax * 2 ; number of values
4882        leaq    (AO, %rax, SIZE), AO
4883#endif
4884
4885
4886#if defined(TRMMKERNEL) && defined(LEFT)
4887        addq    $ 1, KK
4888#endif
4889
4890	addq	$ 2 * SIZE, CO1		# coffset += 2
4891	ALIGN_4
4892
4893
4894.L999:
4895	vzeroupper
4896
4897	movq   		SP, %rsp
4898	movq	   (%rsp), %rbx
4899	movq	  8(%rsp), %rbp
4900	movq	 16(%rsp), %r12
4901	movq	 24(%rsp), %r13
4902	movq	 32(%rsp), %r14
4903	movq	 40(%rsp), %r15
4904
4905#ifdef WINDOWS_ABI
4906	movq	 48(%rsp), %rdi
4907	movq	 56(%rsp), %rsi
4908	vmovups	 64(%rsp), %xmm6
4909	vmovups	 80(%rsp), %xmm7
4910	vmovups	 96(%rsp), %xmm8
4911	vmovups	112(%rsp), %xmm9
4912	vmovups	128(%rsp), %xmm10
4913	vmovups	144(%rsp), %xmm11
4914	vmovups	160(%rsp), %xmm12
4915	vmovups	176(%rsp), %xmm13
4916	vmovups	192(%rsp), %xmm14
4917	vmovups	208(%rsp), %xmm15
4918#endif
4919
4920	addq	$ STACKSIZE, %rsp
4921	ret
4922
4923	EPILOGUE
4924
4925
4926#endif
4927
4928