1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39/*********************************************************************
40* 2013/10/20 Saar
41*        BLASTEST               : OK
42*        CTEST                  : OK
43*        TEST                   : OK
44
45*
46*
47* 2013/10/20 Saar
48* Parameter:
49*       DGEMM_DEFAULT_UNROLL_N  2
50*       DGEMM_DEFAULT_UNROLL_M  16
51*       DGEMM_DEFAULT_P         192
52*       DGEMM_DEFAULT_Q         128
53*	A_PR1			512
54*
55*
56* Performance without prefetch of B:
57*       1 thread:       45.8 GFLOPS (MKL:  45)
58*       2 threads:      80.0 GFLOPS (MKL:  91)
59*       4 threads:     135.0 GFLOPS (MKL: 135)
60*********************************************************************/
61
62
63#define ASSEMBLER
64#include "common.h"
65
66#define OLD_M	%rdi
67#define OLD_N	%rsi
68#define M	%r13
69#define J	%r14
70#define OLD_K	%rdx
71
72#define A	%rcx
73#define B	%r8
74#define C	%r9
75#define LDC	%r10
76
77#define I	%r11
78#define AO	%rdi
79#define BO	%rsi
80#define	CO1	%r15
81#define K	%r12
82#define BI	%rbp
83#define	SP	%rbx
84
85#define BO1	%rdi
86#define BO2	%r15
87
88#ifndef WINDOWS_ABI
89
90#define STACKSIZE 96
91
92#else
93
94#define STACKSIZE 256
95
96#define OLD_A		40 + STACKSIZE(%rsp)
97#define OLD_B		48 + STACKSIZE(%rsp)
98#define OLD_C		56 + STACKSIZE(%rsp)
99#define OLD_LDC		64 + STACKSIZE(%rsp)
100#define OLD_OFFSET	72 + STACKSIZE(%rsp)
101
102#endif
103
104#define L_BUFFER_SIZE 512*8*4
105#define LB2_OFFSET    512*8*2
106
107#define Ndiv6	 24(%rsp)
108#define Nmod6	 32(%rsp)
109#define N	 40(%rsp)
110#define ALPHA	 48(%rsp)
111#define OFFSET	 56(%rsp)
112#define KK	 64(%rsp)
113#define KKK	 72(%rsp)
114#define BUFFER1	           128(%rsp)
115#define BUFFER2	LB2_OFFSET+128(%rsp)
116
117#if defined(OS_WINDOWS)
118#if   L_BUFFER_SIZE > 16384
119#define STACK_TOUCH \
120        movl    $0,  4096 * 4(%rsp);\
121        movl    $0,  4096 * 3(%rsp);\
122        movl    $0,  4096 * 2(%rsp);\
123        movl    $0,  4096 * 1(%rsp);
124#elif L_BUFFER_SIZE > 12288
125#define STACK_TOUCH \
126        movl    $0,  4096 * 3(%rsp);\
127        movl    $0,  4096 * 2(%rsp);\
128        movl    $0,  4096 * 1(%rsp);
129#elif L_BUFFER_SIZE > 8192
130#define STACK_TOUCH \
131        movl    $0,  4096 * 2(%rsp);\
132        movl    $0,  4096 * 1(%rsp);
133#elif L_BUFFER_SIZE > 4096
134#define STACK_TOUCH \
135        movl    $0,  4096 * 1(%rsp);
136#else
137#define STACK_TOUCH
138#endif
139#else
140#define STACK_TOUCH
141#endif
142
143#if defined(BULLDOZER)
144
145.macro VFMADD231PD_ y0,y1,y2
146	vfmaddpd \y0,\y1,\y2,\y0
147.endm
148
149.macro VFMADD231SD_ x0,x1,x2
150	vfmaddsd \x0,\x1,\x2,\x0
151.endm
152
153#else
154
155.macro VFMADD231PD_ y0,y1,y2
156	vfmadd231pd \y2,\y1,\y0
157.endm
158
159.macro VFMADD231SD_ x0,x1,x2
160	vfmadd231sd \x2,\x1,\x0
161.endm
162
163#endif
164
165
166#define	A_PR1	1024
167#define	B_PR1	256
168
169/*******************************************************************************************
170* 3 lines of N
171*******************************************************************************************/
172
173.macro KERNEL16x3_SUBN
174	vbroadcastsd	-12 * SIZE(BO), %zmm1
175	vbroadcastsd	-11 * SIZE(BO), %zmm2
176	vbroadcastsd	-10 * SIZE(BO), %zmm3
177
178	vmovaps 	-16 * SIZE(AO), %zmm0
179	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
180	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
181	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
182
183	vmovaps 	 -8 * SIZE(AO), %zmm9
184	VFMADD231PD_  	%zmm10,%zmm1,%zmm9
185	VFMADD231PD_  	%zmm11,%zmm2,%zmm9
186	VFMADD231PD_  	%zmm12,%zmm3,%zmm9
187	addq	$ 3*SIZE , BO
188	addq	$ 16*SIZE, AO
189.endm
190
191
192.macro KERNEL8x3_SUBN
193	vbroadcastsd	-12 * SIZE(BO), %ymm1
194	vmovaps 	-16 * SIZE(AO), %ymm0
195	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
196	vbroadcastsd	-11 * SIZE(BO), %ymm2
197	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
198	vbroadcastsd	-10 * SIZE(BO), %ymm3
199	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
200	vmovaps 	-12 * SIZE(AO), %ymm0
201	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
202	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
203	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
204	prefetcht0	B_PR1(BO)
205	addq	$ 3*SIZE , BO
206	addq	$ 8*SIZE, AO
207.endm
208
209.macro KERNEL4x3_SUBN
210	vbroadcastsd	-12 * SIZE(BO), %ymm1
211	vmovaps 	-16 * SIZE(AO), %ymm0
212	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
213	vbroadcastsd	-11 * SIZE(BO), %ymm2
214	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
215	vbroadcastsd	-10 * SIZE(BO), %ymm3
216	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
217	addq	$ 3*SIZE , BO
218	addq	$ 4*SIZE, AO
219.endm
220
221.macro KERNEL2x3_SUBN
222	vmovsd	-12 * SIZE(BO), %xmm1
223	vmovsd 	-16 * SIZE(AO), %xmm0
224	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
225	vmovsd	-11 * SIZE(BO), %xmm2
226	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
227	vmovsd	-10 * SIZE(BO), %xmm3
228	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
229	vmovsd 	-15 * SIZE(AO), %xmm0
230	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
231	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
232	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
233	addq	$ 3*SIZE , BO
234	addq	$ 2*SIZE, AO
235.endm
236
237.macro KERNEL1x3_SUBN
238	vmovsd	-12 * SIZE(BO), %xmm1
239	vmovsd 	-16 * SIZE(AO), %xmm0
240	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
241	vmovsd	-11 * SIZE(BO), %xmm2
242	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
243	vmovsd	-10 * SIZE(BO), %xmm3
244	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
245	addq	$ 3*SIZE , BO
246	addq	$ 1*SIZE, AO
247.endm
248
249
250
251
252
253
254/******************************************************************************************/
255
256.macro KERNEL16x3_1
257	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %zmm1
258	vmovups 	-32 * SIZE(AO, %rax, SIZE), %zmm0
259	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
260	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %zmm2
261	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
262	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %zmm3
263	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
264	vmovups 	-24 * SIZE(AO, %rax, SIZE), %zmm0
265	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
266	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
267	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
268	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %zmm1
269	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %zmm2
270.endm
271
272
273
274
275.macro KERNEL16x3_2
276	vmovups 	-16 * SIZE(AO, %rax, SIZE), %zmm0
277	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
278	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
279	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %zmm3
280	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
281	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %zmm0
282	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
283	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
284	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
285	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %zmm1
286	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %zmm2
287.endm
288
289.macro KERNEL16x3_3
290	vmovups 	  0 * SIZE(AO, %rax, SIZE), %zmm0
291	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
292	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
293	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %zmm3
294	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
295	vmovups 	  8 * SIZE(AO, %rax, SIZE), %zmm0
296	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
297	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
298	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
299	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %zmm1
300	vbroadcastsd	  4 * SIZE(BO, BI, SIZE), %zmm2
301.endm
302
303.macro KERNEL16x3_4
304	vmovups 	 16 * SIZE(AO, %rax, SIZE), %zmm0
305	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
306	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
307	vbroadcastsd	  5 * SIZE(BO, BI, SIZE), %zmm3
308	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
309	vmovups 	 24 * SIZE(AO, %rax, SIZE), %zmm0
310	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
311	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
312	addq	$12, BI
313	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
314	addq	$64, %rax
315.endm
316
317.macro KERNEL16x3_SUB
318	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %zmm1
319	vmovups 	-32 * SIZE(AO, %rax, SIZE), %zmm0
320	VFMADD231PD_  	%zmm4,%zmm1,%zmm0
321	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %zmm2
322	VFMADD231PD_  	%zmm5,%zmm2,%zmm0
323	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %zmm3
324	VFMADD231PD_  	%zmm6,%zmm3,%zmm0
325	vmovups 	-24 * SIZE(AO, %rax, SIZE), %zmm0
326	VFMADD231PD_  	%zmm10,%zmm1,%zmm0
327	VFMADD231PD_  	%zmm11,%zmm2,%zmm0
328	VFMADD231PD_  	%zmm12,%zmm3,%zmm0
329	addq	$3 , BI
330	addq	$16, %rax
331.endm
332
333.macro SAVE16x3
334
335	vbroadcastsd	ALPHA, %zmm0
336
337	vmulpd	%zmm0 , %zmm4 , %zmm4
338	vmulpd	%zmm0 , %zmm10, %zmm10
339
340	vmulpd	%zmm0 , %zmm5 , %zmm5
341	vmulpd	%zmm0 , %zmm11, %zmm11
342
343	vmulpd	%zmm0 , %zmm6 , %zmm6
344	vmulpd	%zmm0 , %zmm12, %zmm12
345
346#if !defined(TRMMKERNEL)
347
348	vaddpd 	        (CO1), %zmm4,%zmm4
349	vaddpd  8 * SIZE(CO1), %zmm10,%zmm10
350
351	vaddpd 	        (CO1, LDC), %zmm5,%zmm5
352	vaddpd  8 * SIZE(CO1, LDC), %zmm11,%zmm11
353
354	vaddpd 	        (CO1, LDC, 2), %zmm6,%zmm6
355	vaddpd  8 * SIZE(CO1, LDC, 2), %zmm12,%zmm12
356
357#endif
358
359	vmovups	%zmm4 ,  	(CO1)
360	vmovups	%zmm10, 8 * SIZE(CO1)
361
362	vmovups	%zmm5 ,  	(CO1, LDC)
363	vmovups	%zmm11, 8 * SIZE(CO1, LDC)
364
365	vmovups	%zmm6 ,  	(CO1, LDC, 2)
366	vmovups	%zmm12, 8 * SIZE(CO1, LDC, 2)
367
368.endm
369
370
371
372/*******************************************************************************************/
373
374.macro KERNEL8x3_1
375	prefetcht0	A_PR1(AO, %rax, SIZE)
376	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %ymm1
377	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
378	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
379	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %ymm2
380	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
381	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm3
382	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
383	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
384	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
385	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
386	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
387.endm
388
389.macro KERNEL8x3_2
390	prefetcht0	64+A_PR1(AO, %rax, SIZE)
391	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm1
392	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
393	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
394	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm2
395	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
396	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm3
397	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
398	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
399	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
400	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
401	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
402.endm
403
404.macro KERNEL8x3_3
405	prefetcht0	128+A_PR1(AO, %rax, SIZE)
406	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
407	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
408	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
409	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
410	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
411	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm3
412	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
413	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
414	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
415	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
416	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
417.endm
418
419.macro KERNEL8x3_4
420	prefetcht0	192+A_PR1(AO, %rax, SIZE)
421	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm1
422	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
423	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
424	vbroadcastsd	  4 * SIZE(BO, BI, SIZE), %ymm2
425	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
426	vbroadcastsd	  5 * SIZE(BO, BI, SIZE), %ymm3
427	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
428	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
429	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
430	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
431	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
432	addq	$12, BI
433	addq	$32, %rax
434.endm
435
436.macro KERNEL8x3_SUB
437	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %ymm1
438	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
439	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
440	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %ymm2
441	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
442	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm3
443	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
444	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
445	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
446	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
447	VFMADD231PD_  	%ymm9,%ymm3,%ymm0
448	addq	$3 , BI
449	addq	$8 , %rax
450.endm
451
452.macro SAVE8x3
453
454	vbroadcastsd	ALPHA, %ymm0
455
456	vmulpd	%ymm0 , %ymm4 , %ymm4
457	vmulpd	%ymm0 , %ymm7 , %ymm7
458
459	vmulpd	%ymm0 , %ymm5 , %ymm5
460	vmulpd	%ymm0 , %ymm8 , %ymm8
461
462	vmulpd	%ymm0 , %ymm6 , %ymm6
463	vmulpd	%ymm0 , %ymm9 , %ymm9
464
465#if !defined(TRMMKERNEL)
466
467	vaddpd 	        (CO1), %ymm4,%ymm4
468	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
469
470	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
471	vaddpd  4 * SIZE(CO1, LDC), %ymm8,%ymm8
472
473	vaddpd 	        (CO1, LDC, 2), %ymm6,%ymm6
474	vaddpd  4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9
475
476#endif
477
478	vmovups	%ymm4 ,  	(CO1)
479	vmovups	%ymm7 , 4 * SIZE(CO1)
480
481	vmovups	%ymm5 ,  	(CO1, LDC)
482	vmovups	%ymm8 , 4 * SIZE(CO1, LDC)
483
484	vmovups	%ymm6 ,  	(CO1, LDC, 2)
485	vmovups	%ymm9 , 4 * SIZE(CO1, LDC, 2)
486
487.endm
488
489
490
491/*******************************************************************************************/
492
493.macro KERNEL4x3_1
494	prefetcht0	A_PR1(AO, %rax, SIZE)
495	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %ymm1
496	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
497	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
498	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %ymm2
499	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
500	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm3
501	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
502.endm
503
504.macro KERNEL4x3_2
505	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm1
506	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
507	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
508	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm2
509	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
510	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm3
511	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
512.endm
513
514.macro KERNEL4x3_3
515	prefetcht0	A_PR1(AO, %rax, SIZE)
516	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
517	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
518	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
519	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
520	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
521	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm3
522	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
523.endm
524
525.macro KERNEL4x3_4
526	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm1
527	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
528	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
529	vbroadcastsd	  4 * SIZE(BO, BI, SIZE), %ymm2
530	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
531	vbroadcastsd	  5 * SIZE(BO, BI, SIZE), %ymm3
532	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
533	addq	$12, BI
534	addq	$16, %rax
535.endm
536
537.macro KERNEL4x3_SUB
538	vbroadcastsd	 -6 * SIZE(BO, BI, SIZE), %ymm1
539	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
540	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
541	vbroadcastsd	 -5 * SIZE(BO, BI, SIZE), %ymm2
542	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
543	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm3
544	VFMADD231PD_  	%ymm6,%ymm3,%ymm0
545	addq	$3 , BI
546	addq	$4 , %rax
547.endm
548
549.macro SAVE4x3
550
551	vbroadcastsd	ALPHA, %ymm0
552
553	vmulpd	%ymm0 , %ymm4 , %ymm4
554	vmulpd	%ymm0 , %ymm5 , %ymm5
555	vmulpd	%ymm0 , %ymm6 , %ymm6
556
557#if !defined(TRMMKERNEL)
558
559	vaddpd 	        (CO1), %ymm4,%ymm4
560	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
561	vaddpd 	        (CO1, LDC, 2), %ymm6,%ymm6
562
563#endif
564
565	vmovups	%ymm4 ,  	(CO1)
566	vmovups	%ymm5 ,  	(CO1, LDC)
567	vmovups	%ymm6 ,  	(CO1, LDC, 2)
568
569.endm
570
571
572/*******************************************************************************************/
573
574.macro KERNEL2x3_1
575	prefetcht0	A_PR1(AO, %rax, SIZE)
576	vmovsd	 -6 * SIZE(BO, BI, SIZE), %xmm1
577	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
578	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
579	vmovsd	 -5 * SIZE(BO, BI, SIZE), %xmm2
580	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
581	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm3
582	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
583	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
584	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
585	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
586	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
587.endm
588
589.macro KERNEL2x3_2
590	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm1
591	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
592	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
593	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm2
594	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
595	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm3
596	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
597	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
598	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
599	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
600	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
601.endm
602
603.macro KERNEL2x3_3
604	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
605	vmovsd 	-28 * SIZE(AO, %rax, SIZE), %xmm0
606	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
607	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm2
608	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
609	vmovsd	  2 * SIZE(BO, BI, SIZE), %xmm3
610	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
611	vmovsd 	-27 * SIZE(AO, %rax, SIZE), %xmm0
612	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
613	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
614	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
615.endm
616
617.macro KERNEL2x3_4
618	vmovsd	  3 * SIZE(BO, BI, SIZE), %xmm1
619	vmovsd 	-26 * SIZE(AO, %rax, SIZE), %xmm0
620	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
621	vmovsd	  4 * SIZE(BO, BI, SIZE), %xmm2
622	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
623	vmovsd	  5 * SIZE(BO, BI, SIZE), %xmm3
624	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
625	vmovsd 	-25 * SIZE(AO, %rax, SIZE), %xmm0
626	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
627	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
628	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
629	addq	$12, BI
630	addq	$8, %rax
631.endm
632
633.macro KERNEL2x3_SUB
634	vmovsd	 -6 * SIZE(BO, BI, SIZE), %xmm1
635	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
636	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
637	vmovsd	 -5 * SIZE(BO, BI, SIZE), %xmm2
638	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
639	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm3
640	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
641	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
642	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
643	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
644	VFMADD231SD_  	%xmm12,%xmm3,%xmm0
645	addq	$3 , BI
646	addq	$2 , %rax
647.endm
648
649.macro SAVE2x3
650
651	vmovsd	ALPHA, %xmm0
652
653	vmulsd	%xmm0 , %xmm4 , %xmm4
654	vmulsd	%xmm0 , %xmm8 , %xmm8
655	vmulsd	%xmm0 , %xmm5 , %xmm5
656	vmulsd	%xmm0 , %xmm10, %xmm10
657	vmulsd	%xmm0 , %xmm6 , %xmm6
658	vmulsd	%xmm0 , %xmm12, %xmm12
659
660#if !defined(TRMMKERNEL)
661
662	vaddsd 	 (CO1), %xmm4,%xmm4
663	vaddsd 1 * SIZE(CO1), %xmm8,%xmm8
664	vaddsd 	 (CO1, LDC), %xmm5,%xmm5
665	vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10
666	vaddsd 	 (CO1, LDC, 2), %xmm6,%xmm6
667	vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12
668
669#endif
670
671	vmovsd	%xmm4 ,  	(CO1)
672	vmovsd	%xmm8 , 1 * SIZE(CO1)
673	vmovsd	%xmm5 ,  	(CO1, LDC)
674	vmovsd	%xmm10, 1 * SIZE(CO1, LDC)
675	vmovsd	%xmm6 ,  	(CO1, LDC, 2)
676	vmovsd	%xmm12, 1 * SIZE(CO1, LDC, 2)
677
678.endm
679
680/*******************************************************************************************/
681
682.macro KERNEL1x3_1
683	vmovsd	 -6 * SIZE(BO, BI, SIZE), %xmm1
684	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
685	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
686	vmovsd	 -5 * SIZE(BO, BI, SIZE), %xmm2
687	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
688	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm3
689	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
690.endm
691
692.macro KERNEL1x3_2
693	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm1
694	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
695	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
696	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm2
697	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
698	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm3
699	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
700.endm
701
702.macro KERNEL1x3_3
703	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
704	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
705	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
706	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm2
707	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
708	vmovsd	  2 * SIZE(BO, BI, SIZE), %xmm3
709	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
710.endm
711
712.macro KERNEL1x3_4
713	vmovsd	  3 * SIZE(BO, BI, SIZE), %xmm1
714	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
715	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
716	vmovsd	  4 * SIZE(BO, BI, SIZE), %xmm2
717	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
718	vmovsd	  5 * SIZE(BO, BI, SIZE), %xmm3
719	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
720	addq	$12, BI
721	addq	$4, %rax
722.endm
723
724.macro KERNEL1x3_SUB
725	vmovsd	 -6 * SIZE(BO, BI, SIZE), %xmm1
726	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
727	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
728	vmovsd	 -5 * SIZE(BO, BI, SIZE), %xmm2
729	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
730	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm3
731	VFMADD231SD_  	%xmm6,%xmm3,%xmm0
732	addq	$3 , BI
733	addq	$1 , %rax
734.endm
735
736.macro SAVE1x3
737
738	vmovsd	ALPHA, %xmm0
739
740	vmulsd	%xmm0 , %xmm4 , %xmm4
741	vmulsd	%xmm0 , %xmm5 , %xmm5
742	vmulsd	%xmm0 , %xmm6 , %xmm6
743
744#if !defined(TRMMKERNEL)
745
746	vaddsd 	 (CO1), %xmm4,%xmm4
747	vaddsd 	 (CO1, LDC), %xmm5,%xmm5
748	vaddsd 	 (CO1, LDC, 2), %xmm6,%xmm6
749
750#endif
751
752	vmovsd	%xmm4 ,  	(CO1)
753	vmovsd	%xmm5 ,  	(CO1, LDC)
754	vmovsd	%xmm6 ,  	(CO1, LDC, 2)
755
756.endm
757
758
759/*******************************************************************************************/
760
761/*******************************************************************************************
762* 2 lines of N
763*******************************************************************************************/
764
765.macro KERNEL16x2_1
766	prefetcht0	A_PR1(AO, %rax, SIZE)
767	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
768	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
769	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
770	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
771	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
772	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
773	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
774	prefetcht0	64+A_PR1(AO, %rax, SIZE)
775	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
776	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
777	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
778	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
779	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
780	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
781	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
782.endm
783
784.macro KERNEL16x2_2
785	prefetcht0	128+A_PR1(AO, %rax, SIZE)
786	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
787	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
788	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
789	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm2
790	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
791	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
792	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
793	prefetcht0	192+A_PR1(AO, %rax, SIZE)
794	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
795	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
796	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
797	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
798	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
799	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
800	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
801.endm
802
803.macro KERNEL16x2_3
804	prefetcht0	256+A_PR1(AO, %rax, SIZE)
805	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
806	vmovups 	  0 * SIZE(AO, %rax, SIZE), %ymm0
807	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
808	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
809	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
810	vmovups 	  4 * SIZE(AO, %rax, SIZE), %ymm0
811	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
812	prefetcht0	320+A_PR1(AO, %rax, SIZE)
813	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
814	vmovups 	  8 * SIZE(AO, %rax, SIZE), %ymm0
815	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
816	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
817	vmovups 	 12 * SIZE(AO, %rax, SIZE), %ymm0
818	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
819	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
820.endm
821
822.macro KERNEL16x2_4
823	prefetcht0	384+A_PR1(AO, %rax, SIZE)
824	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm1
825	vmovups 	 16 * SIZE(AO, %rax, SIZE), %ymm0
826	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
827	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm2
828	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
829	vmovups 	 20 * SIZE(AO, %rax, SIZE), %ymm0
830	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
831	prefetcht0	448+A_PR1(AO, %rax, SIZE)
832	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
833	vmovups 	 24 * SIZE(AO, %rax, SIZE), %ymm0
834	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
835	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
836	vmovups 	 28 * SIZE(AO, %rax, SIZE), %ymm0
837	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
838	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
839	addq	$8, BI
840	addq	$64, %rax
841.endm
842
843.macro KERNEL16x2_SUB
844	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
845	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
846	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
847	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
848	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
849	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
850	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
851	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
852	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
853	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
854	VFMADD231PD_  	%ymm11,%ymm2,%ymm0
855	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
856	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
857	VFMADD231PD_  	%ymm14,%ymm2,%ymm0
858	addq	$2, BI
859	addq	$16, %rax
860.endm
861
862.macro SAVE16x2
863
864	vbroadcastsd	ALPHA, %ymm0
865
866	vmulpd	%ymm0 , %ymm4 , %ymm4
867	vmulpd	%ymm0 , %ymm7 , %ymm7
868	vmulpd	%ymm0 , %ymm10, %ymm10
869	vmulpd	%ymm0 , %ymm13, %ymm13
870
871	vmulpd	%ymm0 , %ymm5 , %ymm5
872	vmulpd	%ymm0 , %ymm8 , %ymm8
873	vmulpd	%ymm0 , %ymm11, %ymm11
874	vmulpd	%ymm0 , %ymm14, %ymm14
875
876#if !defined(TRMMKERNEL)
877
878	vaddpd 	        (CO1), %ymm4,%ymm4
879	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
880	vaddpd  8 * SIZE(CO1), %ymm10,%ymm10
881	vaddpd 12 * SIZE(CO1), %ymm13,%ymm13
882
883	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
884	vaddpd  4 * SIZE(CO1, LDC), %ymm8,%ymm8
885	vaddpd  8 * SIZE(CO1, LDC), %ymm11,%ymm11
886	vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14
887
888#endif
889
890	vmovups	%ymm4 ,  	(CO1)
891	vmovups	%ymm7 , 4 * SIZE(CO1)
892	vmovups	%ymm10, 8 * SIZE(CO1)
893	vmovups	%ymm13,12 * SIZE(CO1)
894
895	vmovups	%ymm5 ,  	(CO1, LDC)
896	vmovups	%ymm8 , 4 * SIZE(CO1, LDC)
897	vmovups	%ymm11, 8 * SIZE(CO1, LDC)
898	vmovups	%ymm14,12 * SIZE(CO1, LDC)
899
900.endm
901
902
903
904/*******************************************************************************************/
905
906.macro KERNEL8x2_1
907	prefetcht0	A_PR1(AO, %rax, SIZE)
908	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
909	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
910	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
911	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
912	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
913	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
914	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
915	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
916.endm
917
918.macro KERNEL8x2_2
919	prefetcht0	64+A_PR1(AO, %rax, SIZE)
920	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
921	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
922	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
923	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm2
924	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
925	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
926	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
927	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
928.endm
929
930.macro KERNEL8x2_3
931	prefetcht0	128+A_PR1(AO, %rax, SIZE)
932	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
933	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
934	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
935	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
936	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
937	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
938	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
939	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
940.endm
941
942.macro KERNEL8x2_4
943	prefetcht0	192+A_PR1(AO, %rax, SIZE)
944	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm1
945	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
946	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
947	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm2
948	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
949	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
950	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
951	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
952	addq	$8, BI
953	addq	$32, %rax
954.endm
955
956.macro KERNEL8x2_SUB
957	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
958	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
959	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
960	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
961	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
962	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
963	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
964	VFMADD231PD_  	%ymm8,%ymm2,%ymm0
965	addq	$2, BI
966	addq	$8 , %rax
967.endm
968
969.macro SAVE8x2
970
971	vbroadcastsd	ALPHA, %ymm0
972
973	vmulpd	%ymm0 , %ymm4 , %ymm4
974	vmulpd	%ymm0 , %ymm7 , %ymm7
975
976	vmulpd	%ymm0 , %ymm5 , %ymm5
977	vmulpd	%ymm0 , %ymm8 , %ymm8
978
979#if !defined(TRMMKERNEL)
980
981	vaddpd 	        (CO1), %ymm4,%ymm4
982	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
983
984	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
985	vaddpd  4 * SIZE(CO1, LDC), %ymm8,%ymm8
986
987#endif
988
989	vmovups	%ymm4 ,  	(CO1)
990	vmovups	%ymm7 , 4 * SIZE(CO1)
991
992	vmovups	%ymm5 ,  	(CO1, LDC)
993	vmovups	%ymm8 , 4 * SIZE(CO1, LDC)
994
995.endm
996
997
998
999/*******************************************************************************************/
1000
1001.macro KERNEL4x2_1
1002	prefetcht0	A_PR1(AO, %rax, SIZE)
1003	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
1004	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
1005	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1006	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
1007	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
1008.endm
1009
1010.macro KERNEL4x2_2
1011	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
1012	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
1013	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1014	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm2
1015	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
1016.endm
1017
1018.macro KERNEL4x2_3
1019	prefetcht0	64+A_PR1(AO, %rax, SIZE)
1020	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
1021	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
1022	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1023	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm2
1024	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
1025.endm
1026
1027.macro KERNEL4x2_4
1028	vbroadcastsd	  2 * SIZE(BO, BI, SIZE), %ymm1
1029	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
1030	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1031	vbroadcastsd	  3 * SIZE(BO, BI, SIZE), %ymm2
1032	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
1033	addq	$8, BI
1034	addq	$16, %rax
1035.endm
1036
1037.macro KERNEL4x2_SUB
1038	vbroadcastsd	 -4 * SIZE(BO, BI, SIZE), %ymm1
1039	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
1040	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1041	vbroadcastsd	 -3 * SIZE(BO, BI, SIZE), %ymm2
1042	VFMADD231PD_  	%ymm5,%ymm2,%ymm0
1043	addq	$2, BI
1044	addq	$4 , %rax
1045.endm
1046
1047.macro SAVE4x2
1048
1049	vbroadcastsd	ALPHA, %ymm0
1050
1051	vmulpd	%ymm0 , %ymm4 , %ymm4
1052	vmulpd	%ymm0 , %ymm5 , %ymm5
1053
1054#if !defined(TRMMKERNEL)
1055
1056	vaddpd 	        (CO1), %ymm4,%ymm4
1057	vaddpd 	        (CO1, LDC), %ymm5,%ymm5
1058
1059#endif
1060
1061	vmovups	%ymm4 ,  	(CO1)
1062	vmovups	%ymm5 ,  	(CO1, LDC)
1063
1064.endm
1065
1066
1067/*******************************************************************************************/
1068
1069.macro KERNEL2x2_1
1070	prefetcht0	A_PR1(AO, %rax, SIZE)
1071	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm1
1072	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
1073	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1074	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm2
1075	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1076	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
1077	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1078	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
1079.endm
1080
1081.macro KERNEL2x2_2
1082	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
1083	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
1084	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1085	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm2
1086	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1087	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
1088	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1089	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
1090.endm
1091
1092.macro KERNEL2x2_3
1093	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
1094	vmovsd 	-28 * SIZE(AO, %rax, SIZE), %xmm0
1095	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1096	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm2
1097	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1098	vmovsd 	-27 * SIZE(AO, %rax, SIZE), %xmm0
1099	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1100	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
1101.endm
1102
1103.macro KERNEL2x2_4
1104	vmovsd	  2 * SIZE(BO, BI, SIZE), %xmm1
1105	vmovsd 	-26 * SIZE(AO, %rax, SIZE), %xmm0
1106	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1107	vmovsd	  3 * SIZE(BO, BI, SIZE), %xmm2
1108	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1109	vmovsd 	-25 * SIZE(AO, %rax, SIZE), %xmm0
1110	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1111	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
1112	addq	$8, BI
1113	addq	$8, %rax
1114.endm
1115
1116.macro KERNEL2x2_SUB
1117	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm1
1118	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
1119	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1120	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm2
1121	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1122	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
1123	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1124	VFMADD231SD_  	%xmm10,%xmm2,%xmm0
1125	addq	$2, BI
1126	addq	$2, %rax
1127.endm
1128
1129.macro SAVE2x2
1130
1131	vmovsd	ALPHA, %xmm0
1132
1133	vmulsd	%xmm0 , %xmm4 , %xmm4
1134	vmulsd	%xmm0 , %xmm8 , %xmm8
1135	vmulsd	%xmm0 , %xmm5 , %xmm5
1136	vmulsd	%xmm0 , %xmm10, %xmm10
1137
1138#if !defined(TRMMKERNEL)
1139
1140	vaddsd 	 (CO1), %xmm4,%xmm4
1141	vaddsd 1 * SIZE(CO1), %xmm8,%xmm8
1142	vaddsd 	 (CO1, LDC), %xmm5,%xmm5
1143	vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10
1144
1145#endif
1146
1147	vmovsd	%xmm4 ,  	(CO1)
1148	vmovsd	%xmm8 , 1 * SIZE(CO1)
1149	vmovsd	%xmm5 ,  	(CO1, LDC)
1150	vmovsd	%xmm10, 1 * SIZE(CO1, LDC)
1151
1152.endm
1153
1154
1155/*******************************************************************************************/
1156
1157.macro KERNEL1x2_1
1158	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm1
1159	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
1160	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1161	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm2
1162	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1163.endm
1164
1165.macro KERNEL1x2_2
1166	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
1167	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
1168	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1169	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm2
1170	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1171.endm
1172
1173.macro KERNEL1x2_3
1174	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
1175	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
1176	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1177	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm2
1178	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1179.endm
1180
1181.macro KERNEL1x2_4
1182	vmovsd	  2 * SIZE(BO, BI, SIZE), %xmm1
1183	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
1184	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1185	vmovsd	  3 * SIZE(BO, BI, SIZE), %xmm2
1186	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1187	addq	$8, BI
1188	addq	$4, %rax
1189.endm
1190
1191.macro KERNEL1x2_SUB
1192	vmovsd	 -4 * SIZE(BO, BI, SIZE), %xmm1
1193	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
1194	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1195	vmovsd	 -3 * SIZE(BO, BI, SIZE), %xmm2
1196	VFMADD231SD_  	%xmm5,%xmm2,%xmm0
1197	addq	$2, BI
1198	addq	$1, %rax
1199.endm
1200
1201.macro SAVE1x2
1202
1203	vmovsd	ALPHA, %xmm0
1204
1205	vmulsd	%xmm0 , %xmm4 , %xmm4
1206	vmulsd	%xmm0 , %xmm5 , %xmm5
1207
1208#if !defined(TRMMKERNEL)
1209
1210	vaddsd 	 (CO1), %xmm4,%xmm4
1211	vaddsd 	 (CO1, LDC), %xmm5,%xmm5
1212
1213#endif
1214
1215	vmovsd	%xmm4 ,  	(CO1)
1216	vmovsd	%xmm5 ,  	(CO1, LDC)
1217
1218.endm
1219
1220
1221/*******************************************************************************************/
1222
1223/*******************************************************************************************
1224* 1 line of N
1225*******************************************************************************************/
1226
1227.macro KERNEL16x1_1
1228	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
1229	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
1230	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1231	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
1232	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1233	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
1234	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
1235	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
1236	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
1237.endm
1238
1239.macro KERNEL16x1_2
1240	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm1
1241	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
1242	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1243	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
1244	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1245	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
1246	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
1247	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
1248	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
1249.endm
1250
1251.macro KERNEL16x1_3
1252	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
1253	vmovups 	  0 * SIZE(AO, %rax, SIZE), %ymm0
1254	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1255	vmovups 	  4 * SIZE(AO, %rax, SIZE), %ymm0
1256	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1257	vmovups 	  8 * SIZE(AO, %rax, SIZE), %ymm0
1258	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
1259	vmovups 	 12 * SIZE(AO, %rax, SIZE), %ymm0
1260	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
1261.endm
1262
1263.macro KERNEL16x1_4
1264	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm1
1265	vmovups 	 16 * SIZE(AO, %rax, SIZE), %ymm0
1266	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1267	vmovups 	 20 * SIZE(AO, %rax, SIZE), %ymm0
1268	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1269	vmovups 	 24 * SIZE(AO, %rax, SIZE), %ymm0
1270	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
1271	vmovups 	 28 * SIZE(AO, %rax, SIZE), %ymm0
1272	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
1273	addq	$4, BI
1274	addq	$64, %rax
1275.endm
1276
1277.macro KERNEL16x1_SUB
1278	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
1279	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
1280	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1281	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
1282	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1283	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
1284	VFMADD231PD_  	%ymm10,%ymm1,%ymm0
1285	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
1286	VFMADD231PD_  	%ymm13,%ymm1,%ymm0
1287	addq	$1, BI
1288	addq	$16, %rax
1289.endm
1290
1291.macro SAVE16x1
1292
1293	vbroadcastsd	ALPHA, %ymm0
1294
1295	vmulpd	%ymm0 , %ymm4 , %ymm4
1296	vmulpd	%ymm0 , %ymm7 , %ymm7
1297	vmulpd	%ymm0 , %ymm10, %ymm10
1298	vmulpd	%ymm0 , %ymm13, %ymm13
1299
1300#if !defined(TRMMKERNEL)
1301
1302	vaddpd 	        (CO1), %ymm4,%ymm4
1303	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
1304	vaddpd  8 * SIZE(CO1), %ymm10,%ymm10
1305	vaddpd 12 * SIZE(CO1), %ymm13,%ymm13
1306
1307#endif
1308
1309	vmovups	%ymm4 ,  	(CO1)
1310	vmovups	%ymm7 , 4 * SIZE(CO1)
1311	vmovups	%ymm10, 8 * SIZE(CO1)
1312	vmovups	%ymm13,12 * SIZE(CO1)
1313
1314.endm
1315
1316
1317
1318/*******************************************************************************************/
1319
1320.macro KERNEL8x1_1
1321	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
1322	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
1323	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1324	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
1325	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1326.endm
1327
1328.macro KERNEL8x1_2
1329	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm1
1330	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
1331	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1332	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
1333	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1334.endm
1335
1336.macro KERNEL8x1_3
1337	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
1338	vmovups 	-16 * SIZE(AO, %rax, SIZE), %ymm0
1339	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1340	vmovups 	-12 * SIZE(AO, %rax, SIZE), %ymm0
1341	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1342.endm
1343
1344.macro KERNEL8x1_4
1345	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm1
1346	vmovups 	 -8 * SIZE(AO, %rax, SIZE), %ymm0
1347	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1348	vmovups 	 -4 * SIZE(AO, %rax, SIZE), %ymm0
1349	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1350	addq	$4, BI
1351	addq	$32, %rax
1352.endm
1353
1354.macro KERNEL8x1_SUB
1355	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
1356	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
1357	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1358	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
1359	VFMADD231PD_  	%ymm7,%ymm1,%ymm0
1360	addq	$1, BI
1361	addq	$8 , %rax
1362.endm
1363
1364.macro SAVE8x1
1365
1366	vbroadcastsd	ALPHA, %ymm0
1367
1368	vmulpd	%ymm0 , %ymm4 , %ymm4
1369	vmulpd	%ymm0 , %ymm7 , %ymm7
1370
1371#if !defined(TRMMKERNEL)
1372
1373	vaddpd 	        (CO1), %ymm4,%ymm4
1374	vaddpd  4 * SIZE(CO1), %ymm7,%ymm7
1375
1376#endif
1377
1378	vmovups	%ymm4 ,  	(CO1)
1379	vmovups	%ymm7 , 4 * SIZE(CO1)
1380
1381.endm
1382
1383
1384
1385/*******************************************************************************************/
1386
1387.macro KERNEL4x1_1
1388	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
1389	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
1390	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1391.endm
1392
1393.macro KERNEL4x1_2
1394	vbroadcastsd	 -1 * SIZE(BO, BI, SIZE), %ymm1
1395	vmovups 	-28 * SIZE(AO, %rax, SIZE), %ymm0
1396	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1397.endm
1398
1399.macro KERNEL4x1_3
1400	vbroadcastsd	  0 * SIZE(BO, BI, SIZE), %ymm1
1401	vmovups 	-24 * SIZE(AO, %rax, SIZE), %ymm0
1402	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1403.endm
1404
1405.macro KERNEL4x1_4
1406	vbroadcastsd	  1 * SIZE(BO, BI, SIZE), %ymm1
1407	vmovups 	-20 * SIZE(AO, %rax, SIZE), %ymm0
1408	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1409	addq	$4, BI
1410	addq	$16, %rax
1411.endm
1412
1413.macro KERNEL4x1_SUB
1414	vbroadcastsd	 -2 * SIZE(BO, BI, SIZE), %ymm1
1415	vmovups 	-32 * SIZE(AO, %rax, SIZE), %ymm0
1416	VFMADD231PD_  	%ymm4,%ymm1,%ymm0
1417	addq	$1, BI
1418	addq	$4 , %rax
1419.endm
1420
1421.macro SAVE4x1
1422
1423	vbroadcastsd	ALPHA, %ymm0
1424
1425	vmulpd	%ymm0 , %ymm4 , %ymm4
1426
1427#if !defined(TRMMKERNEL)
1428
1429	vaddpd 	        (CO1), %ymm4,%ymm4
1430
1431#endif
1432
1433	vmovups	%ymm4 ,  	(CO1)
1434
1435.endm
1436
1437
1438/*******************************************************************************************/
1439
1440.macro KERNEL2x1_1
1441	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
1442	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
1443	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1444	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
1445	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1446.endm
1447
1448.macro KERNEL2x1_2
1449	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm1
1450	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
1451	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1452	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
1453	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1454.endm
1455
1456.macro KERNEL2x1_3
1457	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
1458	vmovsd 	-28 * SIZE(AO, %rax, SIZE), %xmm0
1459	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1460	vmovsd 	-27 * SIZE(AO, %rax, SIZE), %xmm0
1461	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1462.endm
1463
1464.macro KERNEL2x1_4
1465	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm1
1466	vmovsd 	-26 * SIZE(AO, %rax, SIZE), %xmm0
1467	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1468	vmovsd 	-25 * SIZE(AO, %rax, SIZE), %xmm0
1469	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1470	addq	$4, BI
1471	addq	$8, %rax
1472.endm
1473
1474.macro KERNEL2x1_SUB
1475	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
1476	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
1477	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1478	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
1479	VFMADD231SD_  	%xmm8,%xmm1,%xmm0
1480	addq	$1, BI
1481	addq	$2 , %rax
1482.endm
1483
1484.macro SAVE2x1
1485
1486	vmovsd	ALPHA, %xmm0
1487
1488	vmulsd	%xmm0 , %xmm4 , %xmm4
1489	vmulsd	%xmm0 , %xmm8 , %xmm8
1490
1491#if !defined(TRMMKERNEL)
1492
1493	vaddsd 	 (CO1), %xmm4,%xmm4
1494	vaddsd 1 * SIZE(CO1), %xmm8,%xmm8
1495
1496#endif
1497
1498	vmovsd	%xmm4 ,  	(CO1)
1499	vmovsd	%xmm8 , 1 * SIZE(CO1)
1500
1501.endm
1502
1503
1504/*******************************************************************************************/
1505
1506.macro KERNEL1x1_1
1507	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
1508	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
1509	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1510.endm
1511
1512.macro KERNEL1x1_2
1513	vmovsd	 -1 * SIZE(BO, BI, SIZE), %xmm1
1514	vmovsd 	-31 * SIZE(AO, %rax, SIZE), %xmm0
1515	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1516.endm
1517
1518.macro KERNEL1x1_3
1519	vmovsd	  0 * SIZE(BO, BI, SIZE), %xmm1
1520	vmovsd 	-30 * SIZE(AO, %rax, SIZE), %xmm0
1521	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1522.endm
1523
1524.macro KERNEL1x1_4
1525	vmovsd	  1 * SIZE(BO, BI, SIZE), %xmm1
1526	vmovsd 	-29 * SIZE(AO, %rax, SIZE), %xmm0
1527	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1528	addq	$ 4, BI
1529	addq	$ 4, %rax
1530.endm
1531
1532.macro KERNEL1x1_SUB
1533	vmovsd	 -2 * SIZE(BO, BI, SIZE), %xmm1
1534	vmovsd 	-32 * SIZE(AO, %rax, SIZE), %xmm0
1535	VFMADD231SD_  	%xmm4,%xmm1,%xmm0
1536	addq	$ 1, BI
1537	addq	$ 1 , %rax
1538.endm
1539
1540.macro SAVE1x1
1541
1542	vmovsd	ALPHA, %xmm0
1543
1544	vmulsd	%xmm0 , %xmm4 , %xmm4
1545
1546#if !defined(TRMMKERNEL)
1547
1548	vaddsd 	 (CO1), %xmm4,%xmm4
1549
1550#endif
1551
1552	vmovsd	%xmm4 ,  	(CO1)
1553
1554.endm
1555
1556
1557/*******************************************************************************************/
1558
1559#if !defined(TRMMKERNEL)
1560
1561
1562	PROLOGUE
1563	PROFCODE
1564
1565	subq	$STACKSIZE, %rsp
1566	movq	%rbx,   (%rsp)
1567	movq	%rbp,  8(%rsp)
1568	movq	%r12, 16(%rsp)
1569	movq	%r13, 24(%rsp)
1570	movq	%r14, 32(%rsp)
1571	movq	%r15, 40(%rsp)
1572
1573	vzeroupper
1574
1575#ifdef WINDOWS_ABI
1576	movq	%rdi,    48(%rsp)
1577	movq	%rsi,    56(%rsp)
1578	movups	%xmm6,   64(%rsp)
1579	movups	%xmm7,   80(%rsp)
1580	movups	%xmm8,   96(%rsp)
1581	movups	%xmm9,  112(%rsp)
1582	movups	%xmm10, 128(%rsp)
1583	movups	%xmm11, 144(%rsp)
1584	movups	%xmm12, 160(%rsp)
1585	movups	%xmm13, 176(%rsp)
1586	movups	%xmm14, 192(%rsp)
1587	movups	%xmm15, 208(%rsp)
1588
1589	movq	ARG1,      OLD_M
1590	movq	ARG2,      OLD_N
1591	movq	ARG3,      OLD_K
1592	movq	OLD_A,     A
1593	movq	OLD_B,     B
1594	movq	OLD_C,     C
1595	movq	OLD_LDC,   LDC
1596
1597	vmovaps	%xmm3, %xmm0
1598
1599#else
1600	movq	STACKSIZE +  8(%rsp), LDC
1601
1602#endif
1603
1604	movq    %rsp, SP      # save old stack
1605        subq    $128 + L_BUFFER_SIZE, %rsp
1606        andq    $-4096, %rsp    # align stack
1607
1608        STACK_TOUCH
1609
1610	cmpq	$0, OLD_M
1611	je	.L999
1612
1613	cmpq	$0, OLD_N
1614	je	.L999
1615
1616	cmpq	$0, OLD_K
1617	je	.L999
1618
1619	movq	OLD_M, M
1620	movq	OLD_N, N
1621	movq	OLD_K, K
1622
1623	vmovsd	 %xmm0, ALPHA
1624
1625	salq	$BASE_SHIFT, LDC
1626
1627	movq    N, %rax
1628        xorq    %rdx, %rdx
1629        movq    $6,  %rdi
1630        divq    %rdi                    //    N / 6
1631        movq    %rax, Ndiv6             //    N / 6
1632        movq    %rdx, Nmod6             //    N % 6
1633
1634
1635	movq	Ndiv6,  J
1636	cmpq	$0, J
1637	je	.L2_0
1638	ALIGN_4
1639
1640.L6_01:
1641        // copy to sub buffer
1642        movq    K, %rax
1643        salq    $1,%rax                 // K * 2 ; read 2 values
1644        movq    B, BO1
1645        leaq    (B,%rax, SIZE), BO2     // next offset to BO2
1646        leaq    BUFFER1, BO             // first buffer to BO
1647        movq    K, %rax
1648	sarq	$3 , %rax		// K / 8
1649	jz	.L6_01a_2
1650        ALIGN_4
1651
1652.L6_01a_1:
1653
1654        prefetcht0 512(BO1)
1655        prefetcht0 512(BO2)
1656        prefetchw  512(BO)
1657
1658
1659	vmovups	0 * SIZE(BO1), %xmm0
1660	vmovups	2 * SIZE(BO1), %xmm2
1661	vmovups	4 * SIZE(BO1), %xmm4
1662	vmovups	6 * SIZE(BO1), %xmm6
1663	vmovsd  0 * SIZE(BO2), %xmm1
1664	vmovsd  2 * SIZE(BO2), %xmm3
1665	vmovsd  4 * SIZE(BO2), %xmm5
1666	vmovsd  6 * SIZE(BO2), %xmm7
1667	vmovups	%xmm0, 0*SIZE(BO)
1668	vmovsd	%xmm1, 2*SIZE(BO)
1669	vmovups	%xmm2, 3*SIZE(BO)
1670	vmovsd	%xmm3, 5*SIZE(BO)
1671	vmovups	%xmm4, 6*SIZE(BO)
1672	vmovsd	%xmm5, 8*SIZE(BO)
1673	vmovups	%xmm6, 9*SIZE(BO)
1674	vmovsd	%xmm7,11*SIZE(BO)
1675	addq	$ 8*SIZE,BO1
1676	addq	$ 8*SIZE,BO2
1677	addq	$ 12*SIZE,BO
1678
1679	vmovups	0 * SIZE(BO1), %xmm0
1680	vmovups	2 * SIZE(BO1), %xmm2
1681	vmovups	4 * SIZE(BO1), %xmm4
1682	vmovups	6 * SIZE(BO1), %xmm6
1683	vmovsd  0 * SIZE(BO2), %xmm1
1684	vmovsd  2 * SIZE(BO2), %xmm3
1685	vmovsd  4 * SIZE(BO2), %xmm5
1686	vmovsd  6 * SIZE(BO2), %xmm7
1687	vmovups	%xmm0, 0*SIZE(BO)
1688	vmovsd	%xmm1, 2*SIZE(BO)
1689	vmovups	%xmm2, 3*SIZE(BO)
1690	vmovsd	%xmm3, 5*SIZE(BO)
1691	vmovups	%xmm4, 6*SIZE(BO)
1692	vmovsd	%xmm5, 8*SIZE(BO)
1693	vmovups	%xmm6, 9*SIZE(BO)
1694	vmovsd	%xmm7,11*SIZE(BO)
1695	addq	$ 8*SIZE,BO1
1696	addq	$ 8*SIZE,BO2
1697	addq	$ 12*SIZE,BO
1698
1699	decq	%rax
1700	jnz	.L6_01a_1
1701
1702
1703
1704.L6_01a_2:
1705
1706	movq    K, %rax
1707        andq    $7, %rax                // K % 8
1708        jz      .L6_02c
1709        ALIGN_4
1710
1711
1712.L6_02b:
1713
1714	vmovups	0 * SIZE(BO1), %xmm0
1715	vmovsd  0 * SIZE(BO2), %xmm2
1716	vmovups	%xmm0, 0*SIZE(BO)
1717	vmovsd	%xmm2, 2*SIZE(BO)
1718	addq	$ 2*SIZE,BO1
1719	addq	$ 2*SIZE,BO2
1720	addq	$ 3*SIZE,BO
1721	decq	%rax
1722	jnz	.L6_02b
1723
1724.L6_02c:
1725
1726	movq	K, %rax
1727	salq	$1,%rax			// K * 2
1728	leaq	(B,%rax, SIZE), BO1	// next offset to BO1
1729	leaq	(BO1,%rax, SIZE), BO2	// next offset to BO2
1730	leaq    BUFFER2, BO		// second buffer to BO
1731	movq	K, %rax
1732	sarq	$3 , %rax		// K / 8
1733	jz	.L6_02c_2
1734	ALIGN_4
1735
1736.L6_02c_1:
1737
1738	prefetcht0 512(BO2)
1739        prefetchw  512(BO)
1740
1741	vmovups	0 * SIZE(BO2), %xmm0
1742	vmovups	2 * SIZE(BO2), %xmm2
1743	vmovups	4 * SIZE(BO2), %xmm4
1744	vmovups	6 * SIZE(BO2), %xmm6
1745	vmovsd  1 * SIZE(BO1), %xmm1
1746	vmovsd  3 * SIZE(BO1), %xmm3
1747	vmovsd  5 * SIZE(BO1), %xmm5
1748	vmovsd  7 * SIZE(BO1), %xmm7
1749	vmovsd	%xmm1, 0*SIZE(BO)
1750	vmovups	%xmm0, 1*SIZE(BO)
1751	vmovsd	%xmm3, 3*SIZE(BO)
1752	vmovups	%xmm2, 4*SIZE(BO)
1753	vmovsd	%xmm5, 6*SIZE(BO)
1754	vmovups	%xmm4, 7*SIZE(BO)
1755	vmovsd	%xmm7, 9*SIZE(BO)
1756	vmovups	%xmm6,10*SIZE(BO)
1757	addq	$8*SIZE,BO1
1758	addq	$8*SIZE,BO2
1759	addq	$12*SIZE,BO
1760
1761
1762	vmovups	0 * SIZE(BO2), %xmm0
1763	vmovups	2 * SIZE(BO2), %xmm2
1764	vmovups	4 * SIZE(BO2), %xmm4
1765	vmovups	6 * SIZE(BO2), %xmm6
1766	vmovsd  1 * SIZE(BO1), %xmm1
1767	vmovsd  3 * SIZE(BO1), %xmm3
1768	vmovsd  5 * SIZE(BO1), %xmm5
1769	vmovsd  7 * SIZE(BO1), %xmm7
1770	vmovsd	%xmm1, 0*SIZE(BO)
1771	vmovups	%xmm0, 1*SIZE(BO)
1772	vmovsd	%xmm3, 3*SIZE(BO)
1773	vmovups	%xmm2, 4*SIZE(BO)
1774	vmovsd	%xmm5, 6*SIZE(BO)
1775	vmovups	%xmm4, 7*SIZE(BO)
1776	vmovsd	%xmm7, 9*SIZE(BO)
1777	vmovups	%xmm6,10*SIZE(BO)
1778	addq	$8*SIZE,BO1
1779	addq	$8*SIZE,BO2
1780	addq	$12*SIZE,BO
1781
1782	decq	%rax
1783	jnz	.L6_02c_1
1784
1785
1786.L6_02c_2:
1787
1788	movq    K, %rax
1789        andq    $7, %rax                // K % 8
1790        jz      .L6_03c
1791        ALIGN_4
1792
1793.L6_03b:
1794
1795	vmovsd	  1*SIZE(BO1), %xmm0
1796	vmovups	  0*SIZE(BO2), %xmm1
1797	vmovsd	%xmm0, 0*SIZE(BO)
1798	vmovups	%xmm1, 1*SIZE(BO)
1799	addq	$2*SIZE,BO1
1800	addq	$2*SIZE,BO2
1801	addq	$3*SIZE,BO
1802	decq	%rax
1803	jnz	.L6_03b
1804
1805
1806.L6_03c:
1807
1808	movq	BO2, B			// next offset of B
1809
1810.L6_10:
1811	movq	C, CO1
1812	leaq	(C, LDC, 2), C
1813	leaq	(C, LDC, 1), C		// c += 3 * ldc
1814
1815
1816	movq	A, AO		 	// aoffset = a
1817	addq	$16 * SIZE, AO
1818
1819	movq	M,  I
1820	sarq	$4, I			// i = (m >> 4)
1821	je	.L6_20
1822
1823	ALIGN_4
1824
1825.L6_11:
1826        leaq    BUFFER1, BO             // first buffer to BO
1827        addq    $12 * SIZE, BO
1828
1829	prefetcht0	(CO1)
1830	prefetcht0	(CO1,LDC,1)
1831	prefetcht0	(CO1,LDC,2)
1832	prefetcht0	64(CO1)
1833	prefetcht0	64(CO1,LDC,1)
1834	prefetcht0	64(CO1,LDC,2)
1835
1836	vzeroall
1837
1838        movq    K, %rax
1839
1840	sarq $1, %rax			//  K / 8
1841	je	.L6_16
1842
1843	ALIGN_5
1844
1845.L6_12:
1846/*
1847	prefetcht0	B_PR1(BO)
1848	prefetcht0	B_PR1+64(BO)
1849	prefetcht0	B_PR1+128(BO)
1850*/
1851	KERNEL16x3_SUBN
1852	KERNEL16x3_SUBN
1853/*
1854	KERNEL16x3_SUBN
1855	KERNEL16x3_SUBN
1856
1857	KERNEL16x3_SUBN
1858	KERNEL16x3_SUBN
1859	KERNEL16x3_SUBN
1860	KERNEL16x3_SUBN
1861*/
1862	dec	%rax
1863	jne	.L6_12
1864
1865.L6_16:
1866        movq    K, %rax
1867
1868	andq	$1, %rax		# if (k & 1)
1869	je .L6_19
1870
1871	ALIGN_4
1872
1873.L6_17:
1874
1875	KERNEL16x3_SUBN
1876
1877	dec	%rax
1878	jne	.L6_17
1879	ALIGN_4
1880
1881
1882.L6_19:
1883
1884	SAVE16x3
1885
1886	addq	$16 * SIZE, CO1		# coffset += 16
1887	decq	I			# i --
1888	jg	.L6_11
1889	ALIGN_4
1890
1891/**************************************************************************
1892* Rest of M
1893***************************************************************************/
1894.L6_20:
1895	// Test rest of M
1896
1897	testq	$15, M
1898	jz	.L7_10		// to next 3 lines of N
1899
1900	testq	$8, M
1901	jz	.L6_21pre
1902	ALIGN_4
1903
1904/**************************************************************************/
1905
1906.L6_20_1:
1907        leaq    BUFFER1, BO             // first buffer to BO
1908        addq    $12 * SIZE, BO
1909
1910	vzeroall
1911
1912        movq    K, %rax
1913
1914	sarq	$3, %rax
1915	je	.L6_20_6
1916
1917	ALIGN_4
1918
1919.L6_20_2:
1920
1921	KERNEL8x3_SUBN
1922	KERNEL8x3_SUBN
1923	KERNEL8x3_SUBN
1924	KERNEL8x3_SUBN
1925
1926	KERNEL8x3_SUBN
1927	KERNEL8x3_SUBN
1928	KERNEL8x3_SUBN
1929	KERNEL8x3_SUBN
1930	dec	%rax
1931	jne	.L6_20_2
1932	ALIGN_4
1933
1934.L6_20_6:
1935        movq    K, %rax
1936
1937	andq	$7, %rax		# if (k & 1)
1938	je .L6_20_9
1939
1940
1941	ALIGN_4
1942
1943.L6_20_7:
1944
1945	KERNEL8x3_SUBN
1946
1947	dec	%rax
1948	jne	.L6_20_7
1949	ALIGN_4
1950
1951
1952.L6_20_9:
1953
1954	SAVE8x3
1955
1956	addq	$8 * SIZE, CO1		# coffset += 8
1957	ALIGN_4
1958
1959
1960
1961/**************************************************************************/
1962
1963.L6_21pre:
1964
1965	testq	$4, M
1966	jz	.L6_30
1967	ALIGN_4
1968
1969.L6_21:
1970        leaq    BUFFER1, BO             // first buffer to BO
1971        addq    $12 * SIZE, BO
1972
1973	vzeroall
1974
1975        movq    K, %rax
1976
1977	sarq	$3, %rax
1978	je	.L6_26
1979
1980	ALIGN_4
1981
1982.L6_22:
1983
1984	KERNEL4x3_SUBN
1985	KERNEL4x3_SUBN
1986	KERNEL4x3_SUBN
1987	KERNEL4x3_SUBN
1988
1989	KERNEL4x3_SUBN
1990	KERNEL4x3_SUBN
1991	KERNEL4x3_SUBN
1992	KERNEL4x3_SUBN
1993	dec	%rax
1994	jne	.L6_22
1995	ALIGN_4
1996
1997.L6_26:
1998        movq    K, %rax
1999
2000	andq	$7, %rax		# if (k & 1)
2001	je .L6_29
2002
2003	ALIGN_4
2004
2005.L6_27:
2006
2007	KERNEL4x3_SUBN
2008
2009	dec %rax
2010	jne	.L6_27
2011	ALIGN_4
2012
2013
2014.L6_29:
2015
2016	SAVE4x3
2017
2018	addq	$4 * SIZE, CO1		# coffset += 4
2019	ALIGN_4
2020
2021
2022.L6_30:
2023	testq	$2, M
2024	jz	.L6_40
2025
2026	ALIGN_4
2027
2028.L6_31:
2029        leaq    BUFFER1, BO             // first buffer to BO
2030        addq    $12 * SIZE, BO
2031
2032	vzeroall
2033
2034        movq    K, %rax
2035
2036	sarq	$3, %rax
2037	je	.L6_36
2038	ALIGN_4
2039
2040.L6_32:
2041
2042	KERNEL2x3_SUBN
2043	KERNEL2x3_SUBN
2044	KERNEL2x3_SUBN
2045	KERNEL2x3_SUBN
2046
2047	KERNEL2x3_SUBN
2048	KERNEL2x3_SUBN
2049	KERNEL2x3_SUBN
2050	KERNEL2x3_SUBN
2051	dec %rax
2052	jne	.L6_32
2053	ALIGN_4
2054
2055.L6_36:
2056        movq    K, %rax
2057
2058	andq	$7, %rax		# if (k & 1)
2059	je .L6_39
2060
2061	ALIGN_4
2062
2063.L6_37:
2064
2065	KERNEL2x3_SUBN
2066
2067	dec %rax
2068	jne	.L6_37
2069	ALIGN_4
2070
2071
2072.L6_39:
2073
2074	SAVE2x3
2075
2076	addq	$2 * SIZE, CO1		# coffset += 2
2077	ALIGN_4
2078
2079.L6_40:
2080	testq	$1, M
2081	jz	.L7_10		// to next 3 lines of N
2082
2083	ALIGN_4
2084
2085.L6_41:
2086        leaq    BUFFER1, BO             // first buffer to BO
2087        addq    $12 * SIZE, BO
2088
2089	vzeroall
2090
2091        movq    K, %rax
2092
2093	sarq	$3,%rax
2094	je	.L6_46
2095
2096	ALIGN_4
2097
2098.L6_42:
2099
2100	KERNEL1x3_SUBN
2101	KERNEL1x3_SUBN
2102	KERNEL1x3_SUBN
2103	KERNEL1x3_SUBN
2104
2105	KERNEL1x3_SUBN
2106	KERNEL1x3_SUBN
2107	KERNEL1x3_SUBN
2108	KERNEL1x3_SUBN
2109
2110	dec %rax
2111	jne	.L6_42
2112	ALIGN_4
2113
2114.L6_46:
2115        movq    K, %rax
2116
2117	andq	$7, %rax		# if (k & 1)
2118	je .L6_49
2119
2120	ALIGN_4
2121
2122.L6_47:
2123
2124	KERNEL1x3_SUBN
2125
2126	dec	%rax
2127	jne	.L6_47
2128	ALIGN_4
2129
2130
2131.L6_49:
2132
2133	SAVE1x3
2134
2135	addq	$1 * SIZE, CO1		# coffset += 1
2136	ALIGN_4
2137
2138
2139
2140
2141/***************************************************************************************************************/
2142
2143.L7_10:
2144	movq	C, CO1
2145	leaq	(C, LDC, 2), C
2146	leaq	(C, LDC, 1), C		// c += 3 * ldc
2147
2148
2149	movq	A, AO		 	// aoffset = a
2150	addq	$16 * SIZE, AO
2151
2152	movq	M,  I
2153	sarq	$4, I			// i = (m >> 4)
2154	je	.L7_20
2155
2156	ALIGN_4
2157
2158.L7_11:
2159        leaq    BUFFER2, BO             // second buffer to BO
2160        addq    $12 * SIZE, BO
2161
2162	prefetcht0	(CO1)
2163	prefetcht0	(CO1,LDC,1)
2164	prefetcht0	(CO1,LDC,2)
2165	prefetcht0	64(CO1)
2166	prefetcht0	64(CO1,LDC,1)
2167	prefetcht0	64(CO1,LDC,2)
2168
2169	vzeroall
2170
2171        movq    K, %rax
2172
2173	sarq $3, %rax			// K / 8
2174	je	.L7_16
2175	ALIGN_5
2176
2177.L7_12:
2178/*
2179	prefetcht0	B_PR1(BO)
2180	prefetcht0	B_PR1+64(BO)
2181	prefetcht0	B_PR1+128(BO)
2182*/
2183	KERNEL16x3_SUBN
2184	KERNEL16x3_SUBN
2185	KERNEL16x3_SUBN
2186	KERNEL16x3_SUBN
2187
2188	KERNEL16x3_SUBN
2189	KERNEL16x3_SUBN
2190	KERNEL16x3_SUBN
2191	KERNEL16x3_SUBN
2192	dec %rax
2193	jne	.L7_12
2194	ALIGN_4
2195
2196.L7_16:
2197        movq    K, %rax
2198
2199	andq	$7, %rax		# if (k & 1)
2200	je .L7_19
2201
2202	ALIGN_5
2203
2204.L7_17:
2205
2206	KERNEL16x3_SUBN
2207
2208	dec	%rax
2209	jne	.L7_17
2210
2211
2212.L7_19:
2213
2214	SAVE16x3
2215
2216	addq	$16 * SIZE, CO1		# coffset += 16
2217	decq	I			# i --
2218	jg	.L7_11
2219	ALIGN_4
2220
2221/**************************************************************************
2222* Rest of M
2223***************************************************************************/
2224.L7_20:
2225	// Test rest of M
2226
2227	testq	$15, M
2228	jz	.L7_60		// to next 3 lines of N
2229
2230	testq	$8, M
2231	jz	.L7_21pre
2232	ALIGN_4
2233
2234/**************************************************************************/
2235
2236.L7_20_1:
2237        leaq    BUFFER2, BO             // first buffer to BO
2238        addq    $12 * SIZE, BO
2239
2240	vzeroall
2241
2242        movq    K, %rax
2243
2244	sarq	$3, %rax
2245	je	.L7_20_6
2246
2247	ALIGN_4
2248
2249.L7_20_2:
2250
2251	KERNEL8x3_SUBN
2252	KERNEL8x3_SUBN
2253	KERNEL8x3_SUBN
2254	KERNEL8x3_SUBN
2255
2256	KERNEL8x3_SUBN
2257	KERNEL8x3_SUBN
2258	KERNEL8x3_SUBN
2259	KERNEL8x3_SUBN
2260
2261	dec %rax
2262	jne	.L7_20_2
2263	ALIGN_4
2264
2265.L7_20_6:
2266        movq    K, %rax
2267
2268	andq	$7, %rax		# if (k & 1)
2269	je .L7_20_9
2270
2271	ALIGN_4
2272
2273.L7_20_7:
2274
2275	KERNEL8x3_SUBN
2276
2277	dec %rax
2278	jne	.L7_20_7
2279	ALIGN_4
2280
2281.L7_20_9:
2282
2283	SAVE8x3
2284
2285	addq	$8 * SIZE, CO1		# coffset += 8
2286	ALIGN_4
2287
2288
2289
2290/**************************************************************************/
2291
2292.L7_21pre:
2293
2294	testq	$4, M
2295	jz	.L7_30
2296	ALIGN_4
2297
2298.L7_21:
2299        leaq    BUFFER2, BO             // second buffer to BO
2300        addq    $12 * SIZE, BO
2301
2302	vzeroall
2303
2304        movq    K, %rax
2305
2306	sarq	$3, %rax
2307	je	.L7_26
2308
2309	ALIGN_4
2310
2311.L7_22:
2312
2313	KERNEL4x3_SUBN
2314	KERNEL4x3_SUBN
2315	KERNEL4x3_SUBN
2316	KERNEL4x3_SUBN
2317
2318	KERNEL4x3_SUBN
2319	KERNEL4x3_SUBN
2320	KERNEL4x3_SUBN
2321	KERNEL4x3_SUBN
2322
2323	dec %rax
2324	jne	.L7_22
2325	ALIGN_4
2326
2327.L7_26:
2328        movq    K, %rax
2329
2330	andq	$7, %rax		# if (k & 1)
2331	je .L7_29
2332
2333	ALIGN_4
2334
2335.L7_27:
2336
2337	KERNEL4x3_SUBN
2338
2339	dec %rax
2340	jne	.L7_27
2341	ALIGN_4
2342
2343
2344.L7_29:
2345
2346	SAVE4x3
2347
2348	addq	$4 * SIZE, CO1		# coffset += 4
2349	ALIGN_4
2350
2351
2352.L7_30:
2353	testq	$2, M
2354	jz	.L7_40
2355
2356	ALIGN_4
2357
2358.L7_31:
2359        leaq    BUFFER2, BO             // second buffer to BO
2360        addq    $12 * SIZE, BO
2361
2362	vzeroall
2363
2364        movq    K, %rax
2365
2366	sarq	$3, %rax
2367	je	.L7_36
2368
2369	ALIGN_4
2370
2371.L7_32:
2372
2373	KERNEL2x3_SUBN
2374	KERNEL2x3_SUBN
2375	KERNEL2x3_SUBN
2376	KERNEL2x3_SUBN
2377
2378	KERNEL2x3_SUBN
2379	KERNEL2x3_SUBN
2380	KERNEL2x3_SUBN
2381	KERNEL2x3_SUBN
2382
2383	dec %rax
2384	jne	.L7_32
2385	ALIGN_4
2386
2387.L7_36:
2388        movq    K, %rax
2389
2390	andq	$7, %rax		# if (k & 1)
2391	je .L7_39
2392
2393	ALIGN_4
2394
2395.L7_37:
2396
2397	KERNEL2x3_SUBN
2398
2399	dec %rax
2400	jne	.L7_37
2401	ALIGN_4
2402
2403
2404.L7_39:
2405
2406	SAVE2x3
2407
2408	addq	$2 * SIZE, CO1		# coffset += 2
2409	ALIGN_4
2410
2411.L7_40:
2412	testq	$1, M
2413	jz	.L7_60		// to next 3 lines of N
2414
2415	ALIGN_4
2416
2417.L7_41:
2418        leaq    BUFFER2, BO             // second buffer to BO
2419        addq    $12 * SIZE, BO
2420
2421	vzeroall
2422
2423        movq    K, %rax
2424
2425	sarq	$3, %rax
2426	je	.L7_46
2427
2428	ALIGN_4
2429
2430.L7_42:
2431	KERNEL1x3_SUBN
2432	KERNEL1x3_SUBN
2433	KERNEL1x3_SUBN
2434	KERNEL1x3_SUBN
2435
2436	KERNEL1x3_SUBN
2437	KERNEL1x3_SUBN
2438	KERNEL1x3_SUBN
2439	KERNEL1x3_SUBN
2440
2441	dec %rax
2442	jne	.L7_42
2443	ALIGN_4
2444
2445.L7_46:
2446        movq    K, %rax
2447
2448	andq	$7, %rax		# if (k & 1)
2449	je .L7_49
2450
2451	ALIGN_4
2452
2453.L7_47:
2454
2455	KERNEL1x3_SUBN
2456
2457	dec %rax
2458	jne	.L7_47
2459	ALIGN_4
2460
2461
2462.L7_49:
2463
2464	SAVE1x3
2465
2466	addq	$1 * SIZE, CO1		# coffset += 1
2467	ALIGN_4
2468
2469
2470
2471.L7_60:
2472
2473	decq	J			// j --
2474	jg	.L6_01
2475
2476
2477.L2_0:
2478	cmpq	$0, Nmod6		// N % 6 == 0
2479	je	.L999
2480
2481/************************************************************************************************
2482* Loop for Nmod6 / 2 > 0
2483*************************************************************************************************/
2484
2485	movq	Nmod6, J
2486	sarq	$1, J			// j = j / 2
2487	je	.L1_0
2488	ALIGN_4
2489
2490.L2_01:
2491	// copy to sub buffer
2492	movq	B, BO1
2493	leaq    BUFFER1, BO		// first buffer to BO
2494	movq	K, %rax
2495	sarq	$2, %rax		// K / 4
2496	jz	.L2_01b
2497	ALIGN_4
2498
2499.L2_01a:
2500        prefetcht0 512(BO1)
2501        prefetchw  512(BO)
2502
2503	vmovups	      (BO1), %xmm0
2504	vmovups	2*SIZE(BO1), %xmm1
2505	vmovups	4*SIZE(BO1), %xmm2
2506	vmovups	6*SIZE(BO1), %xmm3
2507
2508	vmovups	%xmm0,       (BO)
2509	vmovups	%xmm1, 2*SIZE(BO)
2510	vmovups	%xmm2, 4*SIZE(BO)
2511	vmovups	%xmm3, 6*SIZE(BO)
2512
2513	addq	$8*SIZE,BO1
2514	addq	$8*SIZE,BO
2515	decq	%rax
2516	jnz	.L2_01a
2517
2518
2519.L2_01b:
2520
2521        movq    K, %rax
2522        andq    $3, %rax                // K % 4
2523        jz      .L2_02d
2524        ALIGN_4
2525
2526.L2_02c:
2527
2528	vmovups	(BO1), %xmm0
2529	vmovups	%xmm0, (BO)
2530	addq	$2*SIZE,BO1
2531	addq	$2*SIZE,BO
2532	decq	%rax
2533	jnz	.L2_02c
2534
2535.L2_02d:
2536
2537	movq	BO1, B			// next offset of B
2538
2539.L2_10:
2540	movq	C, CO1
2541	leaq	(C, LDC, 2), C		// c += 2 * ldc
2542
2543
2544	movq	A, AO		 	// aoffset = a
2545	addq	$32 * SIZE, AO
2546
2547	movq	M,  I
2548	sarq	$4, I			// i = (m >> 4)
2549	je	.L2_20
2550
2551	ALIGN_4
2552
2553.L2_11:
2554        leaq    BUFFER1, BO             // first buffer to BO
2555        addq    $4 * SIZE, BO
2556
2557	vzeroall
2558
2559        movq    K, %rax
2560
2561	andq	$-8, %rax			//  K = K - ( K % 8 )
2562	je	.L2_16
2563	movq    %rax, BI                        //  Index for BO
2564        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
2565
2566	salq	$4, %rax			// rax = rax * 16 ; number of values
2567	leaq	(AO, %rax, SIZE), AO
2568	leaq	(BO, BI, SIZE), BO
2569	negq	BI
2570	negq	%rax
2571	ALIGN_4
2572
2573.L2_12:
2574
2575	prefetcht0      B_PR1(BO,BI,8)
2576	KERNEL16x2_1
2577	KERNEL16x2_2
2578	KERNEL16x2_3
2579	KERNEL16x2_4
2580
2581	prefetcht0      B_PR1(BO,BI,8)
2582	KERNEL16x2_1
2583	KERNEL16x2_2
2584	KERNEL16x2_3
2585	KERNEL16x2_4
2586
2587	je	.L2_16
2588
2589	prefetcht0      B_PR1(BO,BI,8)
2590	KERNEL16x2_1
2591	KERNEL16x2_2
2592	KERNEL16x2_3
2593	KERNEL16x2_4
2594
2595	prefetcht0      B_PR1(BO,BI,8)
2596	KERNEL16x2_1
2597	KERNEL16x2_2
2598	KERNEL16x2_3
2599	KERNEL16x2_4
2600
2601	je	.L2_16
2602
2603	jmp	.L2_12
2604	ALIGN_4
2605
2606.L2_16:
2607        movq    K, %rax
2608
2609	andq	$7, %rax		# if (k & 1)
2610	je .L2_19
2611
2612	movq    %rax, BI                        //  Index for BO
2613        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
2614
2615	salq	$4, %rax			// rax = rax * 16 ; number of values
2616	leaq	(AO, %rax, SIZE), AO
2617	leaq	(BO, BI, SIZE), BO
2618	negq	BI
2619	negq	%rax
2620	ALIGN_4
2621
2622.L2_17:
2623
2624	KERNEL16x2_SUB
2625
2626	jl	.L2_17
2627	ALIGN_4
2628
2629
2630.L2_19:
2631
2632	SAVE16x2
2633
2634	addq	$16 * SIZE, CO1		# coffset += 16
2635	decq	I			# i --
2636	jg	.L2_11
2637	ALIGN_4
2638
2639/**************************************************************************
2640* Rest of M
2641***************************************************************************/
2642.L2_20:
2643	// Test rest of M
2644
2645	testq	$15, M
2646	jz	.L2_60		// to next 3 lines of N
2647
2648	testq	$8, M
2649	jz	.L2_21pre
2650	ALIGN_4
2651
2652/**************************************************************************/
2653
2654.L2_20_1:
2655        leaq    BUFFER1, BO             // first buffer to BO
2656        addq    $4 * SIZE, BO
2657
2658	vzeroall
2659
2660        movq    K, %rax
2661
2662	andq	$-8, %rax
2663	je	.L2_20_6
2664	movq    %rax, BI                        //  Index for BO
2665        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
2666
2667	salq	$3, %rax			// rax = rax * 8 ; number of values
2668	leaq	(AO, %rax, SIZE), AO
2669	leaq	(BO, BI, SIZE), BO
2670	negq	BI
2671	negq	%rax
2672	ALIGN_4
2673
2674.L2_20_2:
2675
2676	prefetcht0      B_PR1(BO,BI,8)
2677	KERNEL8x2_1
2678	KERNEL8x2_2
2679	KERNEL8x2_3
2680	KERNEL8x2_4
2681
2682	prefetcht0      B_PR1(BO,BI,8)
2683	KERNEL8x2_1
2684	KERNEL8x2_2
2685	KERNEL8x2_3
2686	KERNEL8x2_4
2687
2688	je	.L2_20_6
2689
2690	prefetcht0      B_PR1(BO,BI,8)
2691	KERNEL8x2_1
2692	KERNEL8x2_2
2693	KERNEL8x2_3
2694	KERNEL8x2_4
2695
2696	prefetcht0      B_PR1(BO,BI,8)
2697	KERNEL8x2_1
2698	KERNEL8x2_2
2699	KERNEL8x2_3
2700	KERNEL8x2_4
2701
2702	je	.L2_20_6
2703
2704	jmp	.L2_20_2
2705	ALIGN_4
2706
2707.L2_20_6:
2708        movq    K, %rax
2709
2710	andq	$7, %rax		# if (k & 1)
2711	je .L2_20_9
2712
2713	movq    %rax, BI                        //  Index for BO
2714        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
2715
2716	salq	$3, %rax			// rax = rax * 8 ; number of values
2717	leaq	(AO, %rax, SIZE), AO
2718	leaq	(BO, BI, SIZE), BO
2719	negq	BI
2720	negq	%rax
2721	ALIGN_4
2722
2723.L2_20_7:
2724
2725	KERNEL8x2_SUB
2726
2727	jl	.L2_20_7
2728	ALIGN_4
2729
2730
2731.L2_20_9:
2732
2733	SAVE8x2
2734
2735	addq	$8 * SIZE, CO1		# coffset += 8
2736	ALIGN_4
2737
2738
2739
2740/**************************************************************************/
2741
2742.L2_21pre:
2743
2744	testq	$4, M
2745	jz	.L2_30
2746	ALIGN_4
2747
2748.L2_21:
2749        leaq    BUFFER1, BO             // first buffer to BO
2750        addq    $4 * SIZE, BO
2751
2752	vzeroall
2753
2754        movq    K, %rax
2755
2756	andq	$-8, %rax
2757	je	.L2_26
2758	movq    %rax, BI                        //  Index for BO
2759        leaq    (BI,BI,1), BI                   //  BI = BI * 1 ; number of values
2760
2761	salq	$2, %rax			// rax = rax * 4 ; number of values
2762	leaq	(AO, %rax, SIZE), AO
2763	leaq	(BO, BI, SIZE), BO
2764	negq	BI
2765	negq	%rax
2766	ALIGN_4
2767
2768.L2_22:
2769
2770	prefetcht0      B_PR1(BO,BI,8)
2771	KERNEL4x2_1
2772	KERNEL4x2_2
2773	KERNEL4x2_3
2774	KERNEL4x2_4
2775
2776	prefetcht0      B_PR1(BO,BI,8)
2777	KERNEL4x2_1
2778	KERNEL4x2_2
2779	KERNEL4x2_3
2780	KERNEL4x2_4
2781
2782	je	.L2_26
2783
2784	prefetcht0      B_PR1(BO,BI,8)
2785	KERNEL4x2_1
2786	KERNEL4x2_2
2787	KERNEL4x2_3
2788	KERNEL4x2_4
2789
2790	prefetcht0      B_PR1(BO,BI,8)
2791	KERNEL4x2_1
2792	KERNEL4x2_2
2793	KERNEL4x2_3
2794	KERNEL4x2_4
2795
2796	je	.L2_26
2797
2798	jmp	.L2_22
2799	ALIGN_4
2800
2801.L2_26:
2802        movq    K, %rax
2803
2804	andq	$7, %rax		# if (k & 1)
2805	je .L2_29
2806
2807	movq    %rax, BI                        //  Index for BO
2808        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
2809
2810	salq	$2, %rax			// rax = rax * 4 ; number of values
2811	leaq	(AO, %rax, SIZE), AO
2812	leaq	(BO, BI, SIZE), BO
2813	negq	BI
2814	negq	%rax
2815	ALIGN_4
2816
2817.L2_27:
2818
2819	KERNEL4x2_SUB
2820
2821	jl	.L2_27
2822	ALIGN_4
2823
2824
2825.L2_29:
2826
2827	SAVE4x2
2828
2829	addq	$4 * SIZE, CO1		# coffset += 4
2830	ALIGN_4
2831
2832
2833.L2_30:
2834	testq	$2, M
2835	jz	.L2_40
2836
2837	ALIGN_4
2838
2839.L2_31:
2840        leaq    BUFFER1, BO             // first buffer to BO
2841        addq    $4 * SIZE, BO
2842
2843	vzeroall
2844
2845        movq    K, %rax
2846
2847	andq	$-8, %rax
2848	je	.L2_36
2849	movq    %rax, BI                        //  Index for BO
2850        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
2851
2852	salq	$1, %rax			// rax = rax *2 ; number of values
2853	leaq	(AO, %rax, SIZE), AO
2854	leaq	(BO, BI, SIZE), BO
2855	negq	BI
2856	negq	%rax
2857	ALIGN_4
2858
2859.L2_32:
2860
2861	KERNEL2x2_1
2862	KERNEL2x2_2
2863	KERNEL2x2_3
2864	KERNEL2x2_4
2865
2866	KERNEL2x2_1
2867	KERNEL2x2_2
2868	KERNEL2x2_3
2869	KERNEL2x2_4
2870
2871	je	.L2_36
2872
2873	KERNEL2x2_1
2874	KERNEL2x2_2
2875	KERNEL2x2_3
2876	KERNEL2x2_4
2877
2878	KERNEL2x2_1
2879	KERNEL2x2_2
2880	KERNEL2x2_3
2881	KERNEL2x2_4
2882
2883	je	.L2_36
2884
2885	jmp	.L2_32
2886	ALIGN_4
2887
2888.L2_36:
2889        movq    K, %rax
2890
2891	andq	$7, %rax		# if (k & 1)
2892	je .L2_39
2893
2894	movq    %rax, BI                        //  Index for BO
2895        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
2896
2897	salq	$1, %rax			// rax = rax *2 ; number of values
2898	leaq	(AO, %rax, SIZE), AO
2899	leaq	(BO, BI, SIZE), BO
2900	negq	BI
2901	negq	%rax
2902	ALIGN_4
2903
2904.L2_37:
2905
2906	KERNEL2x2_SUB
2907
2908	jl	.L2_37
2909	ALIGN_4
2910
2911
2912.L2_39:
2913
2914	SAVE2x2
2915
2916	addq	$2 * SIZE, CO1		# coffset += 2
2917	ALIGN_4
2918
2919.L2_40:
2920	testq	$1, M
2921	jz	.L2_60		// to next 2 lines of N
2922
2923	ALIGN_4
2924
2925.L2_41:
2926        leaq    BUFFER1, BO             // first buffer to BO
2927        addq    $4 * SIZE, BO
2928
2929	vzeroall
2930
2931        movq    K, %rax
2932
2933	andq	$-8, %rax
2934	je	.L2_46
2935	movq    %rax, BI                        //  Index for BO
2936        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
2937
2938	leaq	(AO, %rax, SIZE), AO
2939	leaq	(BO, BI, SIZE), BO
2940	negq	BI
2941	negq	%rax
2942	ALIGN_4
2943
2944.L2_42:
2945
2946	KERNEL1x2_1
2947	KERNEL1x2_2
2948	KERNEL1x2_3
2949	KERNEL1x2_4
2950
2951	KERNEL1x2_1
2952	KERNEL1x2_2
2953	KERNEL1x2_3
2954	KERNEL1x2_4
2955
2956	je	.L2_46
2957
2958	KERNEL1x2_1
2959	KERNEL1x2_2
2960	KERNEL1x2_3
2961	KERNEL1x2_4
2962
2963	KERNEL1x2_1
2964	KERNEL1x2_2
2965	KERNEL1x2_3
2966	KERNEL1x2_4
2967
2968	je	.L2_46
2969
2970	jmp	.L2_42
2971	ALIGN_4
2972
2973.L2_46:
2974        movq    K, %rax
2975
2976	andq	$7, %rax		# if (k & 1)
2977	je .L2_49
2978
2979	movq    %rax, BI                        //  Index for BO
2980        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
2981
2982	leaq	(AO, %rax, SIZE), AO
2983	leaq	(BO, BI, SIZE), BO
2984	negq	BI
2985	negq	%rax
2986	ALIGN_4
2987
2988.L2_47:
2989
2990	KERNEL1x2_SUB
2991
2992	jl	.L2_47
2993	ALIGN_4
2994
2995
2996.L2_49:
2997
2998	SAVE1x2
2999
3000	addq	$1 * SIZE, CO1		# coffset += 1
3001	ALIGN_4
3002
3003.L2_60:
3004
3005	decq	J			// j --
3006	jg	.L2_01			// next 2 lines of N
3007
3008
3009
3010.L1_0:
3011
3012/************************************************************************************************
3013* Loop for Nmod6 % 2 > 0
3014*************************************************************************************************/
3015
3016	movq	Nmod6, J
3017	andq	$1, J			// j % 2
3018	je	.L999
3019	ALIGN_4
3020
3021.L1_01:
3022	// copy to sub buffer
3023	movq	B, BO1
3024	leaq    BUFFER1, BO		// first buffer to BO
3025	movq	K, %rax
3026	ALIGN_4
3027
3028.L1_02b:
3029
3030	vmovsd	(BO1), %xmm0
3031	vmovsd	%xmm0,       (BO)
3032	addq	$1*SIZE,BO1
3033	addq	$1*SIZE,BO
3034	decq	%rax
3035	jnz	.L1_02b
3036
3037.L1_02c:
3038
3039	movq	BO1, B			// next offset of B
3040
3041.L1_10:
3042	movq	C, CO1
3043	leaq	(C, LDC, 1), C		// c += 1 * ldc
3044
3045
3046	movq	A, AO		 	// aoffset = a
3047	addq	$32 * SIZE, AO
3048
3049	movq	M,  I
3050	sarq	$4, I			// i = (m >> 4)
3051	je	.L1_20
3052
3053	ALIGN_4
3054
3055.L1_11:
3056        leaq    BUFFER1, BO             // first buffer to BO
3057        addq    $2 * SIZE, BO
3058
3059	vzeroall
3060
3061        movq    K, %rax
3062
3063	andq	$-8, %rax			//  K = K - ( K % 8 )
3064	je	.L1_16
3065	movq    %rax, BI                        //  Index for BO
3066
3067	salq	$4, %rax			// rax = rax * 16 ; number of values
3068	leaq	(AO, %rax, SIZE), AO
3069	leaq	(BO, BI, SIZE), BO
3070	negq	BI
3071	negq	%rax
3072	ALIGN_4
3073
3074.L1_12:
3075
3076	prefetcht0      B_PR1(BO,BI,8)
3077	KERNEL16x1_1
3078	KERNEL16x1_2
3079	KERNEL16x1_3
3080	KERNEL16x1_4
3081
3082	KERNEL16x1_1
3083	KERNEL16x1_2
3084	KERNEL16x1_3
3085	KERNEL16x1_4
3086
3087	je	.L1_16
3088
3089	prefetcht0      B_PR1(BO,BI,8)
3090	KERNEL16x1_1
3091	KERNEL16x1_2
3092	KERNEL16x1_3
3093	KERNEL16x1_4
3094
3095	KERNEL16x1_1
3096	KERNEL16x1_2
3097	KERNEL16x1_3
3098	KERNEL16x1_4
3099
3100	je	.L1_16
3101
3102	jmp	.L1_12
3103	ALIGN_4
3104
3105.L1_16:
3106        movq    K, %rax
3107
3108	andq	$7, %rax		# if (k & 1)
3109	je .L1_19
3110
3111	movq    %rax, BI                        //  Index for BO
3112
3113	salq	$4, %rax			// rax = rax * 16 ; number of values
3114	leaq	(AO, %rax, SIZE), AO
3115	leaq	(BO, BI, SIZE), BO
3116	negq	BI
3117	negq	%rax
3118	ALIGN_4
3119
3120.L1_17:
3121
3122	KERNEL16x1_SUB
3123
3124	jl	.L1_17
3125	ALIGN_4
3126
3127
3128.L1_19:
3129
3130	SAVE16x1
3131
3132	addq	$16 * SIZE, CO1		# coffset += 16
3133	decq	I			# i --
3134	jg	.L1_11
3135	ALIGN_4
3136
3137/**************************************************************************
3138* Rest of M
3139***************************************************************************/
3140.L1_20:
3141	// Test rest of M
3142
3143	testq	$15, M
3144	jz	.L999
3145
3146	testq	$8, M
3147	jz	.L1_21pre
3148	ALIGN_4
3149
3150/**************************************************************************/
3151
3152.L1_20_1:
3153        leaq    BUFFER1, BO             // first buffer to BO
3154        addq    $2 * SIZE, BO
3155
3156	vzeroall
3157
3158        movq    K, %rax
3159
3160	andq	$-8, %rax
3161	je	.L1_20_6
3162	movq    %rax, BI                        //  Index for BO
3163
3164	salq	$3, %rax			// rax = rax * 8 ; number of values
3165	leaq	(AO, %rax, SIZE), AO
3166	leaq	(BO, BI, SIZE), BO
3167	negq	BI
3168	negq	%rax
3169	ALIGN_4
3170
3171.L1_20_2:
3172
3173	prefetcht0      B_PR1(BO,BI,8)
3174	KERNEL8x1_1
3175	KERNEL8x1_2
3176	KERNEL8x1_3
3177	KERNEL8x1_4
3178
3179	KERNEL8x1_1
3180	KERNEL8x1_2
3181	KERNEL8x1_3
3182	KERNEL8x1_4
3183
3184	je	.L1_20_6
3185
3186	prefetcht0      B_PR1(BO,BI,8)
3187	KERNEL8x1_1
3188	KERNEL8x1_2
3189	KERNEL8x1_3
3190	KERNEL8x1_4
3191
3192	KERNEL8x1_1
3193	KERNEL8x1_2
3194	KERNEL8x1_3
3195	KERNEL8x1_4
3196
3197	je	.L1_20_6
3198
3199	jmp	.L1_20_2
3200	ALIGN_4
3201
3202.L1_20_6:
3203        movq    K, %rax
3204
3205	andq	$7, %rax		# if (k & 1)
3206	je .L1_20_9
3207
3208	movq    %rax, BI                        //  Index for BO
3209
3210	salq	$3, %rax			// rax = rax * 8 ; number of values
3211	leaq	(AO, %rax, SIZE), AO
3212	leaq	(BO, BI, SIZE), BO
3213	negq	BI
3214	negq	%rax
3215	ALIGN_4
3216
3217.L1_20_7:
3218
3219	KERNEL8x1_SUB
3220
3221	jl	.L1_20_7
3222	ALIGN_4
3223
3224
3225.L1_20_9:
3226
3227	SAVE8x1
3228
3229	addq	$8 * SIZE, CO1		# coffset += 8
3230	ALIGN_4
3231
3232
3233
3234/**************************************************************************/
3235
3236.L1_21pre:
3237
3238	testq	$4, M
3239	jz	.L1_30
3240	ALIGN_4
3241
3242.L1_21:
3243        leaq    BUFFER1, BO             // first buffer to BO
3244        addq    $2 * SIZE, BO
3245
3246	vzeroall
3247
3248        movq    K, %rax
3249
3250	andq	$-8, %rax
3251	je	.L1_26
3252	movq    %rax, BI                        //  Index for BO
3253
3254	salq	$2, %rax			// rax = rax * 4 ; number of values
3255	leaq	(AO, %rax, SIZE), AO
3256	leaq	(BO, BI, SIZE), BO
3257	negq	BI
3258	negq	%rax
3259	ALIGN_4
3260
3261.L1_22:
3262
3263	prefetcht0      B_PR1(BO,BI,8)
3264	KERNEL4x1_1
3265	KERNEL4x1_2
3266	KERNEL4x1_3
3267	KERNEL4x1_4
3268
3269	KERNEL4x1_1
3270	KERNEL4x1_2
3271	KERNEL4x1_3
3272	KERNEL4x1_4
3273
3274	je	.L1_26
3275
3276	prefetcht0      B_PR1(BO,BI,8)
3277	KERNEL4x1_1
3278	KERNEL4x1_2
3279	KERNEL4x1_3
3280	KERNEL4x1_4
3281
3282	KERNEL4x1_1
3283	KERNEL4x1_2
3284	KERNEL4x1_3
3285	KERNEL4x1_4
3286
3287	je	.L1_26
3288
3289	jmp	.L1_22
3290	ALIGN_4
3291
3292.L1_26:
3293        movq    K, %rax
3294
3295	andq	$7, %rax		# if (k & 1)
3296	je .L1_29
3297
3298	movq    %rax, BI                        //  Index for BO
3299
3300	salq	$2, %rax			// rax = rax * 4 ; number of values
3301	leaq	(AO, %rax, SIZE), AO
3302	leaq	(BO, BI, SIZE), BO
3303	negq	BI
3304	negq	%rax
3305	ALIGN_4
3306
3307.L1_27:
3308
3309	KERNEL4x1_SUB
3310
3311	jl	.L1_27
3312	ALIGN_4
3313
3314
3315.L1_29:
3316
3317	SAVE4x1
3318
3319	addq	$4 * SIZE, CO1		# coffset += 4
3320	ALIGN_4
3321
3322
3323.L1_30:
3324	testq	$2, M
3325	jz	.L1_40
3326
3327	ALIGN_4
3328
3329.L1_31:
3330        leaq    BUFFER1, BO             // first buffer to BO
3331        addq    $2 * SIZE, BO
3332
3333	vzeroall
3334
3335        movq    K, %rax
3336
3337	andq	$-8, %rax
3338	je	.L1_36
3339	movq    %rax, BI                        //  Index for BO
3340
3341	salq	$1, %rax			// rax = rax *2 ; number of values
3342	leaq	(AO, %rax, SIZE), AO
3343	leaq	(BO, BI, SIZE), BO
3344	negq	BI
3345	negq	%rax
3346	ALIGN_4
3347
3348.L1_32:
3349
3350	KERNEL2x1_1
3351	KERNEL2x1_2
3352	KERNEL2x1_3
3353	KERNEL2x1_4
3354
3355	KERNEL2x1_1
3356	KERNEL2x1_2
3357	KERNEL2x1_3
3358	KERNEL2x1_4
3359
3360	je	.L1_36
3361
3362	KERNEL2x1_1
3363	KERNEL2x1_2
3364	KERNEL2x1_3
3365	KERNEL2x1_4
3366
3367	KERNEL2x1_1
3368	KERNEL2x1_2
3369	KERNEL2x1_3
3370	KERNEL2x1_4
3371
3372	je	.L1_36
3373
3374	jmp	.L1_32
3375	ALIGN_4
3376
3377.L1_36:
3378        movq    K, %rax
3379
3380	andq	$7, %rax		# if (k & 1)
3381	je .L1_39
3382
3383	movq    %rax, BI                        //  Index for BO
3384
3385	salq	$1, %rax			// rax = rax *2 ; number of values
3386	leaq	(AO, %rax, SIZE), AO
3387	leaq	(BO, BI, SIZE), BO
3388	negq	BI
3389	negq	%rax
3390	ALIGN_4
3391
3392.L1_37:
3393
3394	KERNEL2x1_SUB
3395
3396	jl	.L1_37
3397	ALIGN_4
3398
3399
3400.L1_39:
3401
3402	SAVE2x1
3403
3404	addq	$2 * SIZE, CO1		# coffset += 2
3405	ALIGN_4
3406
3407.L1_40:
3408	testq	$1, M
3409	jz	.L999
3410
3411	ALIGN_4
3412
3413.L1_41:
3414        leaq    BUFFER1, BO             // first buffer to BO
3415        addq    $2 * SIZE, BO
3416
3417	vzeroall
3418
3419        movq    K, %rax
3420
3421	andq	$-8, %rax
3422	je	.L1_46
3423	movq    %rax, BI                        //  Index for BO
3424
3425	leaq	(AO, %rax, SIZE), AO
3426	leaq	(BO, BI, SIZE), BO
3427	negq	BI
3428	negq	%rax
3429	ALIGN_4
3430
3431.L1_42:
3432
3433	KERNEL1x1_1
3434	KERNEL1x1_2
3435	KERNEL1x1_3
3436	KERNEL1x1_4
3437
3438	KERNEL1x1_1
3439	KERNEL1x1_2
3440	KERNEL1x1_3
3441	KERNEL1x1_4
3442
3443	je	.L1_46
3444
3445	KERNEL1x1_1
3446	KERNEL1x1_2
3447	KERNEL1x1_3
3448	KERNEL1x1_4
3449
3450	KERNEL1x1_1
3451	KERNEL1x1_2
3452	KERNEL1x1_3
3453	KERNEL1x1_4
3454
3455	je	.L1_46
3456
3457	jmp	.L1_42
3458	ALIGN_4
3459
3460.L1_46:
3461        movq    K, %rax
3462
3463	andq	$7, %rax		# if (k & 1)
3464	je .L1_49
3465
3466	movq    %rax, BI                        //  Index for BO
3467
3468	leaq	(AO, %rax, SIZE), AO
3469	leaq	(BO, BI, SIZE), BO
3470	negq	BI
3471	negq	%rax
3472	ALIGN_4
3473
3474.L1_47:
3475
3476	KERNEL1x1_SUB
3477
3478	jl	.L1_47
3479	ALIGN_4
3480
3481
3482.L1_49:
3483
3484	SAVE1x1
3485
3486	addq	$1 * SIZE, CO1		# coffset += 1
3487	ALIGN_4
3488
3489
3490.L999:
3491	movq   		SP, %rsp
3492	movq	   (%rsp), %rbx
3493	movq	  8(%rsp), %rbp
3494	movq	 16(%rsp), %r12
3495	movq	 24(%rsp), %r13
3496	movq	 32(%rsp), %r14
3497	movq	 40(%rsp), %r15
3498
3499#ifdef WINDOWS_ABI
3500	movq	 48(%rsp), %rdi
3501	movq	 56(%rsp), %rsi
3502	movups	 64(%rsp), %xmm6
3503	movups	 80(%rsp), %xmm7
3504	movups	 96(%rsp), %xmm8
3505	movups	112(%rsp), %xmm9
3506	movups	128(%rsp), %xmm10
3507	movups	144(%rsp), %xmm11
3508	movups	160(%rsp), %xmm12
3509	movups	176(%rsp), %xmm13
3510	movups	192(%rsp), %xmm14
3511	movups	208(%rsp), %xmm15
3512#endif
3513
3514	addq	$STACKSIZE, %rsp
3515	ret
3516
3517	EPILOGUE
3518
3519
3520#else
3521/*************************************************************************************
3522* TRMM Kernel
3523*************************************************************************************/
3524
3525
3526	PROLOGUE
3527	PROFCODE
3528
3529	subq	$STACKSIZE, %rsp
3530	movq	%rbx,   (%rsp)
3531	movq	%rbp,  8(%rsp)
3532	movq	%r12, 16(%rsp)
3533	movq	%r13, 24(%rsp)
3534	movq	%r14, 32(%rsp)
3535	movq	%r15, 40(%rsp)
3536
3537	vzeroupper
3538
3539#ifdef WINDOWS_ABI
3540	movq	%rdi,    48(%rsp)
3541	movq	%rsi,    56(%rsp)
3542	movups	%xmm6,   64(%rsp)
3543	movups	%xmm7,   80(%rsp)
3544	movups	%xmm8,   96(%rsp)
3545	movups	%xmm9,  112(%rsp)
3546	movups	%xmm10, 128(%rsp)
3547	movups	%xmm11, 144(%rsp)
3548	movups	%xmm12, 160(%rsp)
3549	movups	%xmm13, 176(%rsp)
3550	movups	%xmm14, 192(%rsp)
3551	movups	%xmm15, 208(%rsp)
3552
3553	movq	ARG1,      OLD_M
3554	movq	ARG2,      OLD_N
3555	movq	ARG3,      OLD_K
3556	movq	OLD_A,     A
3557	movq	OLD_B,     B
3558	movq	OLD_C,     C
3559	movq	OLD_LDC,   LDC
3560#ifdef TRMMKERNEL
3561	movsd	OLD_OFFSET, %xmm12
3562#endif
3563	vmovaps	%xmm3, %xmm0
3564
3565#else
3566	movq	STACKSIZE +  8(%rsp), LDC
3567#ifdef TRMMKERNEL
3568	movsd	STACKSIZE + 16(%rsp), %xmm12
3569#endif
3570
3571#endif
3572
3573	movq    %rsp, SP      # save old stack
3574        subq    $128 + L_BUFFER_SIZE, %rsp
3575        andq    $-4096, %rsp    # align stack
3576
3577        STACK_TOUCH
3578
3579	cmpq	$0, OLD_M
3580	je	.L999
3581
3582	cmpq	$0, OLD_N
3583	je	.L999
3584
3585	cmpq	$0, OLD_K
3586	je	.L999
3587
3588	movq	OLD_M, M
3589	movq	OLD_N, N
3590	movq	OLD_K, K
3591
3592	vmovsd	 %xmm0, ALPHA
3593
3594	salq	$BASE_SHIFT, LDC
3595
3596	movq    N, %rax
3597        xorq    %rdx, %rdx
3598        movq    $2,  %rdi
3599        divq    %rdi                    //    N / 6
3600        movq    %rax, Ndiv6             //    N / 6
3601        movq    %rdx, Nmod6             //    N % 6
3602
3603
3604
3605#ifdef TRMMKERNEL
3606	vmovsd	%xmm12, OFFSET
3607	vmovsd	%xmm12, KK
3608#ifndef LEFT
3609	negq	KK
3610#endif
3611#endif
3612
3613	movq	Ndiv6,  J
3614	cmpq	$0, J
3615	je	.L1_0
3616	ALIGN_4
3617
3618.L2_01:
3619	// copy to sub buffer
3620	movq	B, BO1
3621	leaq    BUFFER1, BO		// first buffer to BO
3622	movq	K, %rax
3623	sarq	$2, %rax		// K / 4
3624	jz	.L2_01b
3625	ALIGN_4
3626
3627.L2_01a:
3628        prefetcht0 512(BO1)
3629        prefetchw  512(BO)
3630
3631	vmovups	      (BO1), %xmm0
3632	vmovups	2*SIZE(BO1), %xmm1
3633	vmovups	4*SIZE(BO1), %xmm2
3634	vmovups	6*SIZE(BO1), %xmm3
3635
3636	vmovups	%xmm0,       (BO)
3637	vmovups	%xmm1, 2*SIZE(BO)
3638	vmovups	%xmm2, 4*SIZE(BO)
3639	vmovups	%xmm3, 6*SIZE(BO)
3640
3641	addq	$8*SIZE,BO1
3642	addq	$8*SIZE,BO
3643	decq	%rax
3644	jnz	.L2_01a
3645
3646
3647.L2_01b:
3648
3649        movq    K, %rax
3650        andq    $3, %rax                // K % 4
3651        jz      .L2_02d
3652        ALIGN_4
3653
3654.L2_02c:
3655
3656	vmovups	(BO1), %xmm0
3657	vmovups	%xmm0, (BO)
3658	addq	$2*SIZE,BO1
3659	addq	$2*SIZE,BO
3660	decq	%rax
3661	jnz	.L2_02c
3662
3663.L2_02d:
3664
3665	movq	BO1, B			// next offset of B
3666
3667.L2_10:
3668	movq	C, CO1
3669	leaq	(C, LDC, 2), C		// c += 2 * ldc
3670
3671#if defined(TRMMKERNEL) && defined(LEFT)
3672        movq    OFFSET, %rax
3673        movq    %rax, KK
3674#endif
3675
3676	movq	A, AO		 	// aoffset = a
3677	addq	$32 * SIZE, AO
3678
3679	movq	M,  I
3680	sarq	$4, I			// i = (m >> 4)
3681	je	.L2_20
3682
3683	ALIGN_4
3684
3685.L2_11:
3686#if !defined(TRMMKERNEL) || \
3687        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3688        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3689        leaq    BUFFER1, BO             // first buffer to BO
3690        addq    $4 * SIZE, BO
3691#else
3692        movq    KK, %rax
3693        leaq    BUFFER1, BO             // first buffer to BO
3694        addq    $4 * SIZE, BO
3695        movq    %rax, BI                        //  Index for BO
3696        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
3697        leaq    (BO, BI, SIZE), BO
3698        salq    $4, %rax                        // rax = rax * 16 ; number of values
3699        leaq    (AO, %rax, SIZE), AO
3700#endif
3701
3702
3703	vzeroall
3704
3705#ifndef TRMMKERNEL
3706        movq    K, %rax
3707#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3708        movq    K, %rax
3709        subq    KK, %rax
3710        movq    %rax, KKK
3711#else
3712        movq    KK, %rax
3713#ifdef LEFT
3714        addq    $16, %rax	// number of values in AO
3715#else
3716        addq    $2, %rax	// number of values in BO
3717#endif
3718        movq    %rax, KKK
3719#endif
3720
3721	andq	$-8, %rax			//  K = K - ( K % 8 )
3722	je	.L2_16
3723	movq    %rax, BI                        //  Index for BO
3724        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
3725
3726	salq	$4, %rax			// rax = rax * 16 ; number of values
3727	leaq	(AO, %rax, SIZE), AO
3728	leaq	(BO, BI, SIZE), BO
3729	negq	BI
3730	negq	%rax
3731	ALIGN_4
3732
3733.L2_12:
3734
3735	prefetcht0      B_PR1(BO,BI,8)
3736	KERNEL16x2_1
3737	KERNEL16x2_2
3738	KERNEL16x2_3
3739	KERNEL16x2_4
3740
3741	prefetcht0      B_PR1(BO,BI,8)
3742	KERNEL16x2_1
3743	KERNEL16x2_2
3744	KERNEL16x2_3
3745	KERNEL16x2_4
3746
3747	je	.L2_16
3748
3749	prefetcht0      B_PR1(BO,BI,8)
3750	KERNEL16x2_1
3751	KERNEL16x2_2
3752	KERNEL16x2_3
3753	KERNEL16x2_4
3754
3755	prefetcht0      B_PR1(BO,BI,8)
3756	KERNEL16x2_1
3757	KERNEL16x2_2
3758	KERNEL16x2_3
3759	KERNEL16x2_4
3760
3761	je	.L2_16
3762
3763	jmp	.L2_12
3764	ALIGN_4
3765
3766.L2_16:
3767#ifndef TRMMKERNEL
3768        movq    K, %rax
3769#else
3770        movq    KKK, %rax
3771#endif
3772
3773	andq	$7, %rax		# if (k & 1)
3774	je .L2_19
3775
3776	movq    %rax, BI                        //  Index for BO
3777        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
3778
3779	salq	$4, %rax			// rax = rax * 16 ; number of values
3780	leaq	(AO, %rax, SIZE), AO
3781	leaq	(BO, BI, SIZE), BO
3782	negq	BI
3783	negq	%rax
3784	ALIGN_4
3785
3786.L2_17:
3787
3788	KERNEL16x2_SUB
3789
3790	jl	.L2_17
3791	ALIGN_4
3792
3793
3794.L2_19:
3795
3796	SAVE16x2
3797
3798#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3799    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3800        movq    K, %rax
3801        subq    KKK, %rax
3802        movq    %rax, BI                        //  Index for BO
3803        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
3804        leaq    (BO, BI, SIZE), BO
3805        salq    $4, %rax                        // rax = rax * 16 ; number of values
3806        leaq    (AO, %rax, SIZE), AO
3807#endif
3808
3809
3810#if defined(TRMMKERNEL) && defined(LEFT)
3811        addq    $16, KK
3812#endif
3813
3814	addq	$16 * SIZE, CO1		# coffset += 16
3815	decq	I			# i --
3816	jg	.L2_11
3817	ALIGN_4
3818
3819/**************************************************************************
3820* Rest of M
3821***************************************************************************/
3822.L2_20:
3823	// Test rest of M
3824
3825	testq	$15, M
3826	jz	.L2_60		// to next 3 lines of N
3827
3828	testq	$8, M
3829	jz	.L2_21pre
3830	ALIGN_4
3831
3832/**************************************************************************/
3833
3834.L2_20_1:
3835#if !defined(TRMMKERNEL) || \
3836        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3837        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3838        leaq    BUFFER1, BO             // first buffer to BO
3839        addq    $4 * SIZE, BO
3840#else
3841        movq    KK, %rax
3842        leaq    BUFFER1, BO             // first buffer to BO
3843        addq    $4 * SIZE, BO
3844        movq    %rax, BI                        //  Index for BO
3845        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
3846        leaq    (BO, BI, SIZE), BO
3847        salq    $3, %rax                        // rax = rax * 8 ; number of values
3848        leaq    (AO, %rax, SIZE), AO
3849#endif
3850
3851
3852	vzeroall
3853
3854#ifndef TRMMKERNEL
3855        movq    K, %rax
3856#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3857        movq    K, %rax
3858        subq    KK, %rax
3859        movq    %rax, KKK
3860#else
3861        movq    KK, %rax
3862#ifdef LEFT
3863        addq    $8, %rax        // number of values in A
3864#else
3865        addq    $2, %rax        // number of values in BO
3866#endif
3867        movq    %rax, KKK
3868#endif
3869
3870
3871	andq	$-8, %rax
3872	je	.L2_20_6
3873	movq    %rax, BI                        //  Index for BO
3874        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
3875
3876	salq	$3, %rax			// rax = rax * 8 ; number of values
3877	leaq	(AO, %rax, SIZE), AO
3878	leaq	(BO, BI, SIZE), BO
3879	negq	BI
3880	negq	%rax
3881	ALIGN_4
3882
3883.L2_20_2:
3884
3885	prefetcht0      B_PR1(BO,BI,8)
3886	KERNEL8x2_1
3887	KERNEL8x2_2
3888	KERNEL8x2_3
3889	KERNEL8x2_4
3890
3891	prefetcht0      B_PR1(BO,BI,8)
3892	KERNEL8x2_1
3893	KERNEL8x2_2
3894	KERNEL8x2_3
3895	KERNEL8x2_4
3896
3897	je	.L2_20_6
3898
3899	prefetcht0      B_PR1(BO,BI,8)
3900	KERNEL8x2_1
3901	KERNEL8x2_2
3902	KERNEL8x2_3
3903	KERNEL8x2_4
3904
3905	prefetcht0      B_PR1(BO,BI,8)
3906	KERNEL8x2_1
3907	KERNEL8x2_2
3908	KERNEL8x2_3
3909	KERNEL8x2_4
3910
3911	je	.L2_20_6
3912
3913	jmp	.L2_20_2
3914	ALIGN_4
3915
3916.L2_20_6:
3917#ifndef TRMMKERNEL
3918        movq    K, %rax
3919#else
3920        movq    KKK, %rax
3921#endif
3922
3923	andq	$7, %rax		# if (k & 1)
3924	je .L2_20_9
3925
3926	movq    %rax, BI                        //  Index for BO
3927        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
3928
3929	salq	$3, %rax			// rax = rax * 8 ; number of values
3930	leaq	(AO, %rax, SIZE), AO
3931	leaq	(BO, BI, SIZE), BO
3932	negq	BI
3933	negq	%rax
3934	ALIGN_4
3935
3936.L2_20_7:
3937
3938	KERNEL8x2_SUB
3939
3940	jl	.L2_20_7
3941	ALIGN_4
3942
3943
3944.L2_20_9:
3945
3946	SAVE8x2
3947
3948#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3949    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3950        movq    K, %rax
3951        subq    KKK, %rax
3952        movq    %rax, BI                        //  Index for BO
3953        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
3954        leaq    (BO, BI, SIZE), BO
3955        salq    $3, %rax                        // rax = rax * 8 ; number of values
3956        leaq    (AO, %rax, SIZE), AO
3957#endif
3958
3959
3960#if defined(TRMMKERNEL) && defined(LEFT)
3961        addq    $8, KK
3962#endif
3963
3964	addq	$8 * SIZE, CO1		# coffset += 8
3965	ALIGN_4
3966
3967
3968
3969/**************************************************************************/
3970
3971.L2_21pre:
3972
3973	testq	$4, M
3974	jz	.L2_30
3975	ALIGN_4
3976
3977.L2_21:
3978#if !defined(TRMMKERNEL) || \
3979        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3980        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3981        leaq    BUFFER1, BO             // first buffer to BO
3982        addq    $4 * SIZE, BO
3983#else
3984        movq    KK, %rax
3985        leaq    BUFFER1, BO             // first buffer to BO
3986        addq    $4 * SIZE, BO
3987        movq    %rax, BI                        //  Index for BO
3988        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
3989        leaq    (BO, BI, SIZE), BO
3990        salq    $2, %rax                        // rax = rax * 4 ; number of values
3991        leaq    (AO, %rax, SIZE), AO
3992#endif
3993
3994
3995	vzeroall
3996
3997#ifndef TRMMKERNEL
3998        movq    K, %rax
3999#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4000        movq    K, %rax
4001        subq    KK, %rax
4002        movq    %rax, KKK
4003#else
4004        movq    KK, %rax
4005#ifdef LEFT
4006        addq    $4, %rax        // number of values in A
4007#else
4008        addq    $2, %rax        // number of values in BO
4009#endif
4010        movq    %rax, KKK
4011#endif
4012
4013
4014	andq	$-8, %rax
4015	je	.L2_26
4016	movq    %rax, BI                        //  Index for BO
4017        leaq    (BI,BI,1), BI                   //  BI = BI * 1 ; number of values
4018
4019	salq	$2, %rax			// rax = rax * 4 ; number of values
4020	leaq	(AO, %rax, SIZE), AO
4021	leaq	(BO, BI, SIZE), BO
4022	negq	BI
4023	negq	%rax
4024	ALIGN_4
4025
4026.L2_22:
4027
4028	prefetcht0      B_PR1(BO,BI,8)
4029	KERNEL4x2_1
4030	KERNEL4x2_2
4031	KERNEL4x2_3
4032	KERNEL4x2_4
4033
4034	prefetcht0      B_PR1(BO,BI,8)
4035	KERNEL4x2_1
4036	KERNEL4x2_2
4037	KERNEL4x2_3
4038	KERNEL4x2_4
4039
4040	je	.L2_26
4041
4042	prefetcht0      B_PR1(BO,BI,8)
4043	KERNEL4x2_1
4044	KERNEL4x2_2
4045	KERNEL4x2_3
4046	KERNEL4x2_4
4047
4048	prefetcht0      B_PR1(BO,BI,8)
4049	KERNEL4x2_1
4050	KERNEL4x2_2
4051	KERNEL4x2_3
4052	KERNEL4x2_4
4053
4054	je	.L2_26
4055
4056	jmp	.L2_22
4057	ALIGN_4
4058
4059.L2_26:
4060#ifndef TRMMKERNEL
4061        movq    K, %rax
4062#else
4063        movq    KKK, %rax
4064#endif
4065
4066	andq	$7, %rax		# if (k & 1)
4067	je .L2_29
4068
4069	movq    %rax, BI                        //  Index for BO
4070        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4071
4072	salq	$2, %rax			// rax = rax * 4 ; number of values
4073	leaq	(AO, %rax, SIZE), AO
4074	leaq	(BO, BI, SIZE), BO
4075	negq	BI
4076	negq	%rax
4077	ALIGN_4
4078
4079.L2_27:
4080
4081	KERNEL4x2_SUB
4082
4083	jl	.L2_27
4084	ALIGN_4
4085
4086
4087.L2_29:
4088
4089	SAVE4x2
4090
4091#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4092    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4093        movq    K, %rax
4094        subq    KKK, %rax
4095        movq    %rax, BI                        //  Index for BO
4096        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4097        leaq    (BO, BI, SIZE), BO
4098        salq    $2, %rax                        // rax = rax * 4 ; number of values
4099        leaq    (AO, %rax, SIZE), AO
4100#endif
4101
4102
4103#if defined(TRMMKERNEL) && defined(LEFT)
4104        addq    $4, KK
4105#endif
4106
4107	addq	$4 * SIZE, CO1		# coffset += 4
4108	ALIGN_4
4109
4110
4111.L2_30:
4112	testq	$2, M
4113	jz	.L2_40
4114
4115	ALIGN_4
4116
4117.L2_31:
4118#if !defined(TRMMKERNEL) || \
4119        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4120        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4121        leaq    BUFFER1, BO             // first buffer to BO
4122        addq    $4 * SIZE, BO
4123#else
4124        movq    KK, %rax
4125        leaq    BUFFER1, BO             // first buffer to BO
4126        addq    $4 * SIZE, BO
4127        movq    %rax, BI                        //  Index for BO
4128        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4129        leaq    (BO, BI, SIZE), BO
4130        salq    $1, %rax                        // rax = rax * 2 ; number of values
4131        leaq    (AO, %rax, SIZE), AO
4132#endif
4133
4134
4135	vzeroall
4136
4137#ifndef TRMMKERNEL
4138        movq    K, %rax
4139#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4140        movq    K, %rax
4141        subq    KK, %rax
4142        movq    %rax, KKK
4143#else
4144        movq    KK, %rax
4145#ifdef LEFT
4146        addq    $2, %rax        // number of values in AO
4147#else
4148        addq    $2, %rax        // number of values in BO
4149#endif
4150        movq    %rax, KKK
4151#endif
4152
4153
4154	andq	$-8, %rax
4155	je	.L2_36
4156	movq    %rax, BI                        //  Index for BO
4157        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4158
4159	salq	$1, %rax			// rax = rax *2 ; number of values
4160	leaq	(AO, %rax, SIZE), AO
4161	leaq	(BO, BI, SIZE), BO
4162	negq	BI
4163	negq	%rax
4164	ALIGN_4
4165
4166.L2_32:
4167
4168	KERNEL2x2_1
4169	KERNEL2x2_2
4170	KERNEL2x2_3
4171	KERNEL2x2_4
4172
4173	KERNEL2x2_1
4174	KERNEL2x2_2
4175	KERNEL2x2_3
4176	KERNEL2x2_4
4177
4178	je	.L2_36
4179
4180	KERNEL2x2_1
4181	KERNEL2x2_2
4182	KERNEL2x2_3
4183	KERNEL2x2_4
4184
4185	KERNEL2x2_1
4186	KERNEL2x2_2
4187	KERNEL2x2_3
4188	KERNEL2x2_4
4189
4190	je	.L2_36
4191
4192	jmp	.L2_32
4193	ALIGN_4
4194
4195.L2_36:
4196#ifndef TRMMKERNEL
4197        movq    K, %rax
4198#else
4199        movq    KKK, %rax
4200#endif
4201
4202	andq	$7, %rax		# if (k & 1)
4203	je .L2_39
4204
4205	movq    %rax, BI                        //  Index for BO
4206        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4207
4208	salq	$1, %rax			// rax = rax *2 ; number of values
4209	leaq	(AO, %rax, SIZE), AO
4210	leaq	(BO, BI, SIZE), BO
4211	negq	BI
4212	negq	%rax
4213	ALIGN_4
4214
4215.L2_37:
4216
4217	KERNEL2x2_SUB
4218
4219	jl	.L2_37
4220	ALIGN_4
4221
4222
4223.L2_39:
4224
4225	SAVE2x2
4226
4227#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4228    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4229        movq    K, %rax
4230        subq    KKK, %rax
4231        movq    %rax, BI                        //  Index for BO
4232        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4233        leaq    (BO, BI, SIZE), BO
4234        salq    $1, %rax                        // rax = rax * 2 ; number of values
4235        leaq    (AO, %rax, SIZE), AO
4236#endif
4237
4238
4239#if defined(TRMMKERNEL) && defined(LEFT)
4240        addq    $2, KK
4241#endif
4242
4243	addq	$2 * SIZE, CO1		# coffset += 2
4244	ALIGN_4
4245
4246.L2_40:
4247	testq	$1, M
4248	jz	.L2_60		// to next 2 lines of N
4249
4250	ALIGN_4
4251
4252.L2_41:
4253#if !defined(TRMMKERNEL) || \
4254        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4255        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4256        leaq    BUFFER1, BO             // first buffer to BO
4257        addq    $4 * SIZE, BO
4258#else
4259        movq    KK, %rax
4260        leaq    BUFFER1, BO             // first buffer to BO
4261        addq    $4 * SIZE, BO
4262        movq    %rax, BI                        //  Index for BO
4263        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4264        leaq    (BO, BI, SIZE), BO
4265        leaq    (AO, %rax, SIZE), AO
4266#endif
4267
4268
4269	vzeroall
4270
4271#ifndef TRMMKERNEL
4272        movq    K, %rax
4273#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4274        movq    K, %rax
4275        subq    KK, %rax
4276        movq    %rax, KKK
4277#else
4278        movq    KK, %rax
4279#ifdef LEFT
4280        addq    $1, %rax        // number of values in AO
4281#else
4282        addq    $2, %rax        // number of values in BO
4283#endif
4284        movq    %rax, KKK
4285#endif
4286
4287	andq	$-8, %rax
4288	je	.L2_46
4289	movq    %rax, BI                        //  Index for BO
4290        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4291
4292	leaq	(AO, %rax, SIZE), AO
4293	leaq	(BO, BI, SIZE), BO
4294	negq	BI
4295	negq	%rax
4296	ALIGN_4
4297
4298.L2_42:
4299
4300	KERNEL1x2_1
4301	KERNEL1x2_2
4302	KERNEL1x2_3
4303	KERNEL1x2_4
4304
4305	KERNEL1x2_1
4306	KERNEL1x2_2
4307	KERNEL1x2_3
4308	KERNEL1x2_4
4309
4310	je	.L2_46
4311
4312	KERNEL1x2_1
4313	KERNEL1x2_2
4314	KERNEL1x2_3
4315	KERNEL1x2_4
4316
4317	KERNEL1x2_1
4318	KERNEL1x2_2
4319	KERNEL1x2_3
4320	KERNEL1x2_4
4321
4322	je	.L2_46
4323
4324	jmp	.L2_42
4325	ALIGN_4
4326
4327.L2_46:
4328#ifndef TRMMKERNEL
4329        movq    K, %rax
4330#else
4331        movq    KKK, %rax
4332#endif
4333
4334	andq	$7, %rax		# if (k & 1)
4335	je .L2_49
4336
4337	movq    %rax, BI                        //  Index for BO
4338        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4339
4340	leaq	(AO, %rax, SIZE), AO
4341	leaq	(BO, BI, SIZE), BO
4342	negq	BI
4343	negq	%rax
4344	ALIGN_4
4345
4346.L2_47:
4347
4348	KERNEL1x2_SUB
4349
4350	jl	.L2_47
4351	ALIGN_4
4352
4353
4354.L2_49:
4355
4356	SAVE1x2
4357
4358#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4359    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4360        movq    K, %rax
4361        subq    KKK, %rax
4362        movq    %rax, BI                        //  Index for BO
4363        leaq    (BI,BI,1), BI                   //  BI = BI * 2 ; number of values
4364        leaq    (BO, BI, SIZE), BO
4365        leaq    (AO, %rax, SIZE), AO
4366#endif
4367
4368
4369#if defined(TRMMKERNEL) && defined(LEFT)
4370        addq    $1, KK
4371#endif
4372
4373	addq	$1 * SIZE, CO1		# coffset += 1
4374	ALIGN_4
4375
4376
4377
4378
4379
4380.L2_60:
4381#if defined(TRMMKERNEL) && !defined(LEFT)
4382        addq    $2, KK
4383#endif
4384
4385	decq	J			// j --
4386	jg	.L2_01			// next 2 lines of N
4387
4388
4389
4390.L1_0:
4391
4392/************************************************************************************************
4393* Loop for Nmod6 % 2 > 0
4394*************************************************************************************************/
4395
4396	movq	Nmod6, J
4397	andq	$1, J			// j % 2
4398	je	.L999
4399	ALIGN_4
4400
4401.L1_01:
4402	// copy to sub buffer
4403	movq	B, BO1
4404	leaq    BUFFER1, BO		// first buffer to BO
4405	movq	K, %rax
4406	ALIGN_4
4407
4408.L1_02b:
4409
4410	vmovsd	(BO1), %xmm0
4411	vmovsd	%xmm0,       (BO)
4412	addq	$1*SIZE,BO1
4413	addq	$1*SIZE,BO
4414	decq	%rax
4415	jnz	.L1_02b
4416
4417.L1_02c:
4418
4419	movq	BO1, B			// next offset of B
4420
4421.L1_10:
4422	movq	C, CO1
4423	leaq	(C, LDC, 1), C		// c += 1 * ldc
4424
4425#if defined(TRMMKERNEL) && defined(LEFT)
4426        movq    OFFSET, %rax
4427        movq    %rax, KK
4428#endif
4429
4430	movq	A, AO		 	// aoffset = a
4431	addq	$32 * SIZE, AO
4432
4433	movq	M,  I
4434	sarq	$4, I			// i = (m >> 4)
4435	je	.L1_20
4436
4437	ALIGN_4
4438
4439.L1_11:
4440#if !defined(TRMMKERNEL) || \
4441        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4442        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4443        leaq    BUFFER1, BO             // first buffer to BO
4444        addq    $2 * SIZE, BO
4445#else
4446        movq    KK, %rax
4447        leaq    BUFFER1, BO             // first buffer to BO
4448        addq    $2 * SIZE, BO
4449        movq    %rax, BI                        //  Index for BO
4450        leaq    (BO, BI, SIZE), BO
4451        salq    $4, %rax                        // rax = rax * 16 ; number of values
4452        leaq    (AO, %rax, SIZE), AO
4453#endif
4454
4455
4456	vzeroall
4457
4458#ifndef TRMMKERNEL
4459        movq    K, %rax
4460#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4461        movq    K, %rax
4462        subq    KK, %rax
4463        movq    %rax, KKK
4464#else
4465        movq    KK, %rax
4466#ifdef LEFT
4467        addq    $16, %rax	// number of values in AO
4468#else
4469        addq    $1, %rax	// number of values in BO
4470#endif
4471        movq    %rax, KKK
4472#endif
4473
4474	andq	$-8, %rax			//  K = K - ( K % 8 )
4475	je	.L1_16
4476	movq    %rax, BI                        //  Index for BO
4477
4478	salq	$4, %rax			// rax = rax * 16 ; number of values
4479	leaq	(AO, %rax, SIZE), AO
4480	leaq	(BO, BI, SIZE), BO
4481	negq	BI
4482	negq	%rax
4483	ALIGN_4
4484
4485.L1_12:
4486
4487	prefetcht0      B_PR1(BO,BI,8)
4488	KERNEL16x1_1
4489	KERNEL16x1_2
4490	KERNEL16x1_3
4491	KERNEL16x1_4
4492
4493	KERNEL16x1_1
4494	KERNEL16x1_2
4495	KERNEL16x1_3
4496	KERNEL16x1_4
4497
4498	je	.L1_16
4499
4500	prefetcht0      B_PR1(BO,BI,8)
4501	KERNEL16x1_1
4502	KERNEL16x1_2
4503	KERNEL16x1_3
4504	KERNEL16x1_4
4505
4506	KERNEL16x1_1
4507	KERNEL16x1_2
4508	KERNEL16x1_3
4509	KERNEL16x1_4
4510
4511	je	.L1_16
4512
4513	jmp	.L1_12
4514	ALIGN_4
4515
4516.L1_16:
4517#ifndef TRMMKERNEL
4518        movq    K, %rax
4519#else
4520        movq    KKK, %rax
4521#endif
4522
4523	andq	$7, %rax		# if (k & 1)
4524	je .L1_19
4525
4526	movq    %rax, BI                        //  Index for BO
4527
4528	salq	$4, %rax			// rax = rax * 16 ; number of values
4529	leaq	(AO, %rax, SIZE), AO
4530	leaq	(BO, BI, SIZE), BO
4531	negq	BI
4532	negq	%rax
4533	ALIGN_4
4534
4535.L1_17:
4536
4537	KERNEL16x1_SUB
4538
4539	jl	.L1_17
4540	ALIGN_4
4541
4542
4543.L1_19:
4544
4545	SAVE16x1
4546
4547#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4548    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4549        movq    K, %rax
4550        subq    KKK, %rax
4551        movq    %rax, BI                        //  Index for BO
4552        leaq    (BO, BI, SIZE), BO
4553        salq    $4, %rax                        // rax = rax * 16 ; number of values
4554        leaq    (AO, %rax, SIZE), AO
4555#endif
4556
4557
4558#if defined(TRMMKERNEL) && defined(LEFT)
4559        addq    $16, KK
4560#endif
4561
4562	addq	$16 * SIZE, CO1		# coffset += 16
4563	decq	I			# i --
4564	jg	.L1_11
4565	ALIGN_4
4566
4567/**************************************************************************
4568* Rest of M
4569***************************************************************************/
4570.L1_20:
4571	// Test rest of M
4572
4573	testq	$15, M
4574	jz	.L999
4575
4576	testq	$8, M
4577	jz	.L1_21pre
4578	ALIGN_4
4579
4580/**************************************************************************/
4581
4582.L1_20_1:
4583#if !defined(TRMMKERNEL) || \
4584        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4585        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4586        leaq    BUFFER1, BO             // first buffer to BO
4587        addq    $2 * SIZE, BO
4588#else
4589        movq    KK, %rax
4590        leaq    BUFFER1, BO             // first buffer to BO
4591        addq    $2 * SIZE, BO
4592        movq    %rax, BI                        //  Index for BO
4593        leaq    (BO, BI, SIZE), BO
4594        salq    $3, %rax                        // rax = rax * 8 ; number of values
4595        leaq    (AO, %rax, SIZE), AO
4596#endif
4597
4598
4599	vzeroall
4600
4601#ifndef TRMMKERNEL
4602        movq    K, %rax
4603#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4604        movq    K, %rax
4605        subq    KK, %rax
4606        movq    %rax, KKK
4607#else
4608        movq    KK, %rax
4609#ifdef LEFT
4610        addq    $8, %rax        // number of values in A
4611#else
4612        addq    $1, %rax        // number of values in BO
4613#endif
4614        movq    %rax, KKK
4615#endif
4616
4617
4618	andq	$-8, %rax
4619	je	.L1_20_6
4620	movq    %rax, BI                        //  Index for BO
4621
4622	salq	$3, %rax			// rax = rax * 8 ; number of values
4623	leaq	(AO, %rax, SIZE), AO
4624	leaq	(BO, BI, SIZE), BO
4625	negq	BI
4626	negq	%rax
4627	ALIGN_4
4628
4629.L1_20_2:
4630
4631	prefetcht0      B_PR1(BO,BI,8)
4632	KERNEL8x1_1
4633	KERNEL8x1_2
4634	KERNEL8x1_3
4635	KERNEL8x1_4
4636
4637	KERNEL8x1_1
4638	KERNEL8x1_2
4639	KERNEL8x1_3
4640	KERNEL8x1_4
4641
4642	je	.L1_20_6
4643
4644	prefetcht0      B_PR1(BO,BI,8)
4645	KERNEL8x1_1
4646	KERNEL8x1_2
4647	KERNEL8x1_3
4648	KERNEL8x1_4
4649
4650	KERNEL8x1_1
4651	KERNEL8x1_2
4652	KERNEL8x1_3
4653	KERNEL8x1_4
4654
4655	je	.L1_20_6
4656
4657	jmp	.L1_20_2
4658	ALIGN_4
4659
4660.L1_20_6:
4661#ifndef TRMMKERNEL
4662        movq    K, %rax
4663#else
4664        movq    KKK, %rax
4665#endif
4666
4667	andq	$7, %rax		# if (k & 1)
4668	je .L1_20_9
4669
4670	movq    %rax, BI                        //  Index for BO
4671
4672	salq	$3, %rax			// rax = rax * 8 ; number of values
4673	leaq	(AO, %rax, SIZE), AO
4674	leaq	(BO, BI, SIZE), BO
4675	negq	BI
4676	negq	%rax
4677	ALIGN_4
4678
4679.L1_20_7:
4680
4681	KERNEL8x1_SUB
4682
4683	jl	.L1_20_7
4684	ALIGN_4
4685
4686
4687.L1_20_9:
4688
4689	SAVE8x1
4690
4691#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4692    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4693        movq    K, %rax
4694        subq    KKK, %rax
4695        movq    %rax, BI                        //  Index for BO
4696        leaq    (BO, BI, SIZE), BO
4697        salq    $3, %rax                        // rax = rax * 8 ; number of values
4698        leaq    (AO, %rax, SIZE), AO
4699#endif
4700
4701
4702#if defined(TRMMKERNEL) && defined(LEFT)
4703        addq    $8, KK
4704#endif
4705
4706	addq	$8 * SIZE, CO1		# coffset += 8
4707	ALIGN_4
4708
4709
4710
4711/**************************************************************************/
4712
4713.L1_21pre:
4714
4715	testq	$4, M
4716	jz	.L1_30
4717	ALIGN_4
4718
4719.L1_21:
4720#if !defined(TRMMKERNEL) || \
4721        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4722        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4723        leaq    BUFFER1, BO             // first buffer to BO
4724        addq    $2 * SIZE, BO
4725#else
4726        movq    KK, %rax
4727        leaq    BUFFER1, BO             // first buffer to BO
4728        addq    $2 * SIZE, BO
4729        movq    %rax, BI                        //  Index for BO
4730        leaq    (BO, BI, SIZE), BO
4731        salq    $2, %rax                        // rax = rax * 4 ; number of values
4732        leaq    (AO, %rax, SIZE), AO
4733#endif
4734
4735
4736	vzeroall
4737
4738#ifndef TRMMKERNEL
4739        movq    K, %rax
4740#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4741        movq    K, %rax
4742        subq    KK, %rax
4743        movq    %rax, KKK
4744#else
4745        movq    KK, %rax
4746#ifdef LEFT
4747        addq    $4, %rax        // number of values in A
4748#else
4749        addq    $1, %rax        // number of values in BO
4750#endif
4751        movq    %rax, KKK
4752#endif
4753
4754
4755	andq	$-8, %rax
4756	je	.L1_26
4757	movq    %rax, BI                        //  Index for BO
4758
4759	salq	$2, %rax			// rax = rax * 4 ; number of values
4760	leaq	(AO, %rax, SIZE), AO
4761	leaq	(BO, BI, SIZE), BO
4762	negq	BI
4763	negq	%rax
4764	ALIGN_4
4765
4766.L1_22:
4767
4768	prefetcht0      B_PR1(BO,BI,8)
4769	KERNEL4x1_1
4770	KERNEL4x1_2
4771	KERNEL4x1_3
4772	KERNEL4x1_4
4773
4774	KERNEL4x1_1
4775	KERNEL4x1_2
4776	KERNEL4x1_3
4777	KERNEL4x1_4
4778
4779	je	.L1_26
4780
4781	prefetcht0      B_PR1(BO,BI,8)
4782	KERNEL4x1_1
4783	KERNEL4x1_2
4784	KERNEL4x1_3
4785	KERNEL4x1_4
4786
4787	KERNEL4x1_1
4788	KERNEL4x1_2
4789	KERNEL4x1_3
4790	KERNEL4x1_4
4791
4792	je	.L1_26
4793
4794	jmp	.L1_22
4795	ALIGN_4
4796
4797.L1_26:
4798#ifndef TRMMKERNEL
4799        movq    K, %rax
4800#else
4801        movq    KKK, %rax
4802#endif
4803
4804	andq	$7, %rax		# if (k & 1)
4805	je .L1_29
4806
4807	movq    %rax, BI                        //  Index for BO
4808
4809	salq	$2, %rax			// rax = rax * 4 ; number of values
4810	leaq	(AO, %rax, SIZE), AO
4811	leaq	(BO, BI, SIZE), BO
4812	negq	BI
4813	negq	%rax
4814	ALIGN_4
4815
4816.L1_27:
4817
4818	KERNEL4x1_SUB
4819
4820	jl	.L1_27
4821	ALIGN_4
4822
4823
4824.L1_29:
4825
4826	SAVE4x1
4827
4828#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4829    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4830        movq    K, %rax
4831        subq    KKK, %rax
4832        movq    %rax, BI                        //  Index for BO
4833        leaq    (BO, BI, SIZE), BO
4834        salq    $2, %rax                        // rax = rax * 4 ; number of values
4835        leaq    (AO, %rax, SIZE), AO
4836#endif
4837
4838
4839#if defined(TRMMKERNEL) && defined(LEFT)
4840        addq    $4, KK
4841#endif
4842
4843	addq	$4 * SIZE, CO1		# coffset += 4
4844	ALIGN_4
4845
4846
4847.L1_30:
4848	testq	$2, M
4849	jz	.L1_40
4850
4851	ALIGN_4
4852
4853.L1_31:
4854#if !defined(TRMMKERNEL) || \
4855        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4856        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4857        leaq    BUFFER1, BO             // first buffer to BO
4858        addq    $2 * SIZE, BO
4859#else
4860        movq    KK, %rax
4861        leaq    BUFFER1, BO             // first buffer to BO
4862        addq    $2 * SIZE, BO
4863        movq    %rax, BI                        //  Index for BO
4864        leaq    (BO, BI, SIZE), BO
4865        salq    $1, %rax                        // rax = rax * 2 ; number of values
4866        leaq    (AO, %rax, SIZE), AO
4867#endif
4868
4869
4870	vzeroall
4871
4872#ifndef TRMMKERNEL
4873        movq    K, %rax
4874#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
4875        movq    K, %rax
4876        subq    KK, %rax
4877        movq    %rax, KKK
4878#else
4879        movq    KK, %rax
4880#ifdef LEFT
4881        addq    $2, %rax        // number of values in AO
4882#else
4883        addq    $1, %rax        // number of values in BO
4884#endif
4885        movq    %rax, KKK
4886#endif
4887
4888
4889	andq	$-8, %rax
4890	je	.L1_36
4891	movq    %rax, BI                        //  Index for BO
4892
4893	salq	$1, %rax			// rax = rax *2 ; number of values
4894	leaq	(AO, %rax, SIZE), AO
4895	leaq	(BO, BI, SIZE), BO
4896	negq	BI
4897	negq	%rax
4898	ALIGN_4
4899
4900.L1_32:
4901
4902	KERNEL2x1_1
4903	KERNEL2x1_2
4904	KERNEL2x1_3
4905	KERNEL2x1_4
4906
4907	KERNEL2x1_1
4908	KERNEL2x1_2
4909	KERNEL2x1_3
4910	KERNEL2x1_4
4911
4912	je	.L1_36
4913
4914	KERNEL2x1_1
4915	KERNEL2x1_2
4916	KERNEL2x1_3
4917	KERNEL2x1_4
4918
4919	KERNEL2x1_1
4920	KERNEL2x1_2
4921	KERNEL2x1_3
4922	KERNEL2x1_4
4923
4924	je	.L1_36
4925
4926	jmp	.L1_32
4927	ALIGN_4
4928
4929.L1_36:
4930#ifndef TRMMKERNEL
4931        movq    K, %rax
4932#else
4933        movq    KKK, %rax
4934#endif
4935
4936	andq	$7, %rax		# if (k & 1)
4937	je .L1_39
4938
4939	movq    %rax, BI                        //  Index for BO
4940
4941	salq	$1, %rax			// rax = rax *2 ; number of values
4942	leaq	(AO, %rax, SIZE), AO
4943	leaq	(BO, BI, SIZE), BO
4944	negq	BI
4945	negq	%rax
4946	ALIGN_4
4947
4948.L1_37:
4949
4950	KERNEL2x1_SUB
4951
4952	jl	.L1_37
4953	ALIGN_4
4954
4955
4956.L1_39:
4957
4958	SAVE2x1
4959
4960#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4961    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4962        movq    K, %rax
4963        subq    KKK, %rax
4964        movq    %rax, BI                        //  Index for BO
4965        leaq    (BO, BI, SIZE), BO
4966        salq    $1, %rax                        // rax = rax * 2 ; number of values
4967        leaq    (AO, %rax, SIZE), AO
4968#endif
4969
4970
4971#if defined(TRMMKERNEL) && defined(LEFT)
4972        addq    $2, KK
4973#endif
4974
4975	addq	$2 * SIZE, CO1		# coffset += 2
4976	ALIGN_4
4977
4978.L1_40:
4979	testq	$1, M
4980	jz	.L999
4981
4982	ALIGN_4
4983
4984.L1_41:
4985#if !defined(TRMMKERNEL) || \
4986        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
4987        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
4988        leaq    BUFFER1, BO             // first buffer to BO
4989        addq    $2 * SIZE, BO
4990#else
4991        movq    KK, %rax
4992        leaq    BUFFER1, BO             // first buffer to BO
4993        addq    $2 * SIZE, BO
4994        movq    %rax, BI                        //  Index for BO
4995        leaq    (BO, BI, SIZE), BO
4996        leaq    (AO, %rax, SIZE), AO
4997#endif
4998
4999
5000	vzeroall
5001
5002#ifndef TRMMKERNEL
5003        movq    K, %rax
5004#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
5005        movq    K, %rax
5006        subq    KK, %rax
5007        movq    %rax, KKK
5008#else
5009        movq    KK, %rax
5010#ifdef LEFT
5011        addq    $1, %rax        // number of values in AO
5012#else
5013        addq    $1, %rax        // number of values in BO
5014#endif
5015        movq    %rax, KKK
5016#endif
5017
5018	andq	$-8, %rax
5019	je	.L1_46
5020	movq    %rax, BI                        //  Index for BO
5021
5022	leaq	(AO, %rax, SIZE), AO
5023	leaq	(BO, BI, SIZE), BO
5024	negq	BI
5025	negq	%rax
5026	ALIGN_4
5027
5028.L1_42:
5029
5030	KERNEL1x1_1
5031	KERNEL1x1_2
5032	KERNEL1x1_3
5033	KERNEL1x1_4
5034
5035	KERNEL1x1_1
5036	KERNEL1x1_2
5037	KERNEL1x1_3
5038	KERNEL1x1_4
5039
5040	je	.L1_46
5041
5042	KERNEL1x1_1
5043	KERNEL1x1_2
5044	KERNEL1x1_3
5045	KERNEL1x1_4
5046
5047	KERNEL1x1_1
5048	KERNEL1x1_2
5049	KERNEL1x1_3
5050	KERNEL1x1_4
5051
5052	je	.L1_46
5053
5054	jmp	.L1_42
5055	ALIGN_4
5056
5057.L1_46:
5058#ifndef TRMMKERNEL
5059        movq    K, %rax
5060#else
5061        movq    KKK, %rax
5062#endif
5063
5064	andq	$7, %rax		# if (k & 1)
5065	je .L1_49
5066
5067	movq    %rax, BI                        //  Index for BO
5068
5069	leaq	(AO, %rax, SIZE), AO
5070	leaq	(BO, BI, SIZE), BO
5071	negq	BI
5072	negq	%rax
5073	ALIGN_4
5074
5075.L1_47:
5076
5077	KERNEL1x1_SUB
5078
5079	jl	.L1_47
5080	ALIGN_4
5081
5082
5083.L1_49:
5084
5085	SAVE1x1
5086
5087#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
5088    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
5089        movq    K, %rax
5090        subq    KKK, %rax
5091        movq    %rax, BI                        //  Index for BO
5092        leaq    (BO, BI, SIZE), BO
5093        leaq    (AO, %rax, SIZE), AO
5094#endif
5095
5096
5097#if defined(TRMMKERNEL) && defined(LEFT)
5098        addq    $1, KK
5099#endif
5100
5101	addq	$1 * SIZE, CO1		# coffset += 1
5102	ALIGN_4
5103
5104
5105.L999:
5106	movq   		SP, %rsp
5107	movq	   (%rsp), %rbx
5108	movq	  8(%rsp), %rbp
5109	movq	 16(%rsp), %r12
5110	movq	 24(%rsp), %r13
5111	movq	 32(%rsp), %r14
5112	movq	 40(%rsp), %r15
5113
5114#ifdef WINDOWS_ABI
5115	movq	 48(%rsp), %rdi
5116	movq	 56(%rsp), %rsi
5117	movups	 64(%rsp), %xmm6
5118	movups	 80(%rsp), %xmm7
5119	movups	 96(%rsp), %xmm8
5120	movups	112(%rsp), %xmm9
5121	movups	128(%rsp), %xmm10
5122	movups	144(%rsp), %xmm11
5123	movups	160(%rsp), %xmm12
5124	movups	176(%rsp), %xmm13
5125	movups	192(%rsp), %xmm14
5126	movups	208(%rsp), %xmm15
5127#endif
5128
5129	addq	$STACKSIZE, %rsp
5130	ret
5131
5132	EPILOGUE
5133
5134
5135
5136
5137
5138#endif
5139