/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
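
/* Double-precision GEMM kernel with 4x4 register blocking for x86-64.
   It uses SSE2 packed arithmetic plus the SSE3 movddup broadcast; SIZE
   is the element size in bytes provided by common.h (8 for doubles).
   The kernel computes C = alpha * A * B over packed panels of A and B,
   accumulating into C unless TRMMKERNEL or BETAZERO is defined, and
   doubles as a TRMM kernel via the TRMMKERNEL/LEFT/TRANSA macros. */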

#define ASSEMBLER
#include "common.h"

#define M	%rdi
#define N	%rsi
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define J	%r12
#define AO	%r13
#define BO	%r14
#define	CO1	%r15
#define CO2	%rbx
#define BB	%rbp

#ifndef WINDOWS_ABI

#define STACKSIZE 128

#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#define ALPHA	 48(%rsp)
#define OFFSET	 56(%rsp)
#define KKK	 64(%rsp)
#define KK	 72(%rsp)

#else

#define STACKSIZE 256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#define ALPHA	 224(%rsp)
#define OFFSET	 232(%rsp)
#define KK	 240(%rsp)
#define KKK	 248(%rsp)

#endif

#define PREFETCH     prefetcht1
#define PREFETCHSIZE (16 * 12 + 3)
#define PREFETCH_R    (4 *  4 + 0)

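/* KERNEL1..KERNEL16 unroll eight k iterations of the 4x4 micro-kernel.
   xmm0-xmm3 accumulate rows 0-1 of C against columns 0-3, xmm4-xmm7
   rows 2-3; xmm8/xmm10/xmm12/xmm14 rotate through pairs of A elements
   and xmm9/xmm11/xmm13/xmm15 through movddup-broadcast B elements.
   PREFETCHSIZE looks hand-tuned for the target microarchitecture. */
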
#define KERNEL1(address) \
	mulpd	%xmm8, %xmm9 ;\
	PREFETCH  (PREFETCHSIZE +  0) * SIZE + (address) * 2 * SIZE(AO);\
	addpd	%xmm9, %xmm0;\
	movddup	 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm1;\
	movddup	 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm2;\
	movddup	 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm3;\
	movddup	 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL2(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm4;\
	movddup	 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm5;\
	movddup	 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm6;\
	movddup	 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm7;\
	movddup	 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL3(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm0;\
	movddup	 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm1;\
	movddup	 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm2;\
	movddup	 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm3;\
	movddup	 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL4(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm4;\
	movddup	 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm5;\
	movddup	 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm6;\
	movddup	 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm7;\
	movddup	32 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL5(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm0;\
	movddup	 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm1;\
	movddup	10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm2;\
	movddup	11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm3;\
	movddup	 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL6(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm4;\
	movddup	 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm5;\
	movddup	10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm6;\
	movddup	11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm7;\
	movddup	12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL7(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm0;\
	movddup	13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm1;\
	movddup	14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm2;\
	movddup	15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm3;\
	movddup	12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL8(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm4;\
	movddup	13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm5;\
	movddup	14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm6;\
	movddup	15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm7;\
	movddup	40 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL9(address) \
	mulpd	%xmm12, %xmm13;\
	PREFETCH  (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\
	addpd	%xmm13, %xmm0;\
	movddup	17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm1;\
	movddup	18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm2;\
	movddup	19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm3;\
	movddup	16 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL10(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm4;\
	movddup	17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm5;\
	movddup	18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm6;\
	movddup	19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm7;\
	movddup	20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL11(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm0;\
	movddup	21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm1;\
	movddup	22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm2;\
	movddup	23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm3;\
	movddup	20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL12(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm4;\
	movddup	21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm5;\
	movddup	22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm6;\
	movddup	23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm7;\
	movddup	48 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL13(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm0;\
	movddup	25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm1;\
	movddup	26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm2;\
	movddup	27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm3;\
	movddup	24 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL14(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm4;\
	movddup	25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm5;\
	movddup	26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm6;\
	movddup	27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm7;\
	movddup	28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL15(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm0;\
	movddup	29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm1;\
	movddup	30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm2;\
	movddup	31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm3;\
	movddup	28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL16(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm4;\
	movddup	29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm5;\
	movddup	30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm6;\
	movddup	31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm7;\
	movddup	56 * SIZE + (address) * 2 * SIZE(BO), %xmm15

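/* Entry: allocate the stack frame and save callee-saved registers.
   Under WINDOWS_ABI, rdi/rsi and xmm6-xmm15 are callee-saved as well
   and the arguments arrive differently, hence the extra spills and the
   OLD_* reloads. */
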
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1,      M
	movq	ARG2,      N
	movq	ARG3,      K
	movq	OLD_A,     A
	movq	OLD_B,     B
	movq	OLD_C,     C
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm4
#endif
	movaps	%xmm3, %xmm0

#else
	movq	OLD_LDC,   LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm4
#endif

#endif

	movsd	%xmm0, ALPHA

#ifdef TRMMKERNEL
	movsd	%xmm4, OFFSET
	movsd	%xmm4, KK
#ifndef LEFT
	negq	KK
#endif
#endif

	leaq	(, LDC, SIZE), LDC

	movq	N,  J
	sarq	$2, J		# j = (n >> 2)
	jle	.L40
	ALIGN_4
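
/* Outer loop over column panels: four columns of C per iteration
   (J = N >> 2). */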

.L10:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO		# aoffset = a

	movq	K, %rax
	salq	$BASE_SHIFT + 2, %rax
	leaq	(B, %rax), BB

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L20
	ALIGN_4

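/* 4x4 blocks over M.  CO1/CO2 address the first two C columns; the
   other two are reached as (CO1,LDC,2) and (CO2,LDC,2).  BB appears to
   serve as a prefetch cursor that walks ahead through the packed B
   panel. */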
.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(B,  %rax, 4), BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

	movapd	16 * SIZE(AO), %xmm12
	pxor	%xmm4, %xmm4
	movddup	16 * SIZE(BO), %xmm13
	pxor	%xmm5, %xmm5
	movapd	24 * SIZE(AO), %xmm14
	pxor	%xmm6, %xmm6
	movddup	24 * SIZE(BO), %xmm15
	pxor	%xmm7, %xmm7

	prefetchnta     3 * SIZE(CO1)
	prefetchnta     3 * SIZE(CO2)
	prefetchnta     3 * SIZE(CO1, LDC, 2)
	prefetchnta     3 * SIZE(CO2, LDC, 2)

	prefetcht0	  0 * SIZE(BB)
	subq	   $-8 * SIZE, BB

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif

#if 1
	andq	$-8, %rax
	salq	$4, %rax
	NOBRANCH
	je	.L15

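/* Computed unroll: %rax = (k & ~7) << 4, so eight k iterations
   correspond to 128 units of %rax.  Each cmpq $128 * n below decides
   whether another KERNEL1..KERNEL16 group (eight more k iterations)
   is still pending before looping back to .L1X. */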
.L1X:
	KERNEL1 (16  *  0)
	KERNEL2 (16  *  0)
	KERNEL3 (16  *  0)
	KERNEL4 (16  *  0)
	KERNEL5 (16  *  0)
	KERNEL6 (16  *  0)
	KERNEL7 (16  *  0)
	KERNEL8 (16  *  0)
	KERNEL9 (16  *  0)
	KERNEL10(16  *  0)
	KERNEL11(16  *  0)
	KERNEL12(16  *  0)
	KERNEL13(16  *  0)
	KERNEL14(16  *  0)
	KERNEL15(16  *  0)
	KERNEL16(16  *  0)
	cmpq	$128 *  1, %rax
	NOBRANCH
	jle	.L12
	KERNEL1 (16  *  1)
	KERNEL2 (16  *  1)
	KERNEL3 (16  *  1)
	KERNEL4 (16  *  1)
	KERNEL5 (16  *  1)
	KERNEL6 (16  *  1)
	KERNEL7 (16  *  1)
	KERNEL8 (16  *  1)
	KERNEL9 (16  *  1)
	KERNEL10(16  *  1)
	KERNEL11(16  *  1)
	KERNEL12(16  *  1)
	KERNEL13(16  *  1)
	KERNEL14(16  *  1)
	KERNEL15(16  *  1)
	KERNEL16(16  *  1)
	cmpq	$128 *  2, %rax
	NOBRANCH
	jle	.L12
	KERNEL1 (16  *  2)
	KERNEL2 (16  *  2)
	KERNEL3 (16  *  2)
	KERNEL4 (16  *  2)
	KERNEL5 (16  *  2)
	KERNEL6 (16  *  2)
	KERNEL7 (16  *  2)
	KERNEL8 (16  *  2)
	KERNEL9 (16  *  2)
	KERNEL10(16  *  2)
	KERNEL11(16  *  2)
	KERNEL12(16  *  2)
	KERNEL13(16  *  2)
	KERNEL14(16  *  2)
	KERNEL15(16  *  2)
	KERNEL16(16  *  2)
	cmpq	$128 *  3, %rax
	NOBRANCH
	jle	.L12
	KERNEL1 (16  *  3)
	KERNEL2 (16  *  3)
	KERNEL3 (16  *  3)
	KERNEL4 (16  *  3)
	KERNEL5 (16  *  3)
	KERNEL6 (16  *  3)
	KERNEL7 (16  *  3)
	KERNEL8 (16  *  3)
	KERNEL9 (16  *  3)
	KERNEL10(16  *  3)
	KERNEL11(16  *  3)
	KERNEL12(16  *  3)
	KERNEL13(16  *  3)
	KERNEL14(16  *  3)
	KERNEL15(16  *  3)
	KERNEL16(16  *  3)
	cmpq	$128 *  4, %rax
	NOBRANCH
	jle	.L12
	KERNEL1 (16  *  4)
	KERNEL2 (16  *  4)
	KERNEL3 (16  *  4)
	KERNEL4 (16  *  4)
	KERNEL5 (16  *  4)
	KERNEL6 (16  *  4)
	KERNEL7 (16  *  4)
	KERNEL8 (16  *  4)
	KERNEL9 (16  *  4)
	KERNEL10(16  *  4)
	KERNEL11(16  *  4)
	KERNEL12(16  *  4)
	KERNEL13(16  *  4)
	KERNEL14(16  *  4)
	KERNEL15(16  *  4)
	KERNEL16(16  *  4)
	cmpq	$128 *  5, %rax
	NOBRANCH
	jle	.L12
	KERNEL1 (16  *  5)
	KERNEL2 (16  *  5)
	KERNEL3 (16  *  5)
	KERNEL4 (16  *  5)
	KERNEL5 (16  *  5)
	KERNEL6 (16  *  5)
	KERNEL7 (16  *  5)
	KERNEL8 (16  *  5)
	KERNEL9 (16  *  5)
	KERNEL10(16  *  5)
	KERNEL11(16  *  5)
	KERNEL12(16  *  5)
	KERNEL13(16  *  5)
	KERNEL14(16  *  5)
	KERNEL15(16  *  5)
	KERNEL16(16  *  5)
	cmpq	$128 *  6, %rax
	NOBRANCH
	jle	.L12
	KERNEL1 (16  *  6)
	KERNEL2 (16  *  6)
	KERNEL3 (16  *  6)
	KERNEL4 (16  *  6)
	KERNEL5 (16  *  6)
	KERNEL6 (16  *  6)
	KERNEL7 (16  *  6)
	KERNEL8 (16  *  6)
	KERNEL9 (16  *  6)
	KERNEL10(16  *  6)
	KERNEL11(16  *  6)
	KERNEL12(16  *  6)
	KERNEL13(16  *  6)
	KERNEL14(16  *  6)
	KERNEL15(16  *  6)
	KERNEL16(16  *  6)
	cmpq	$128 *  7, %rax
	NOBRANCH
	jle	.L12
	KERNEL1 (16  *  7)
	KERNEL2 (16  *  7)
	KERNEL3 (16  *  7)
	KERNEL4 (16  *  7)
	KERNEL5 (16  *  7)
	KERNEL6 (16  *  7)
	KERNEL7 (16  *  7)
	KERNEL8 (16  *  7)
	KERNEL9 (16  *  7)
	KERNEL10(16  *  7)
	KERNEL11(16  *  7)
	KERNEL12(16  *  7)
	KERNEL13(16  *  7)
	KERNEL14(16  *  7)
	KERNEL15(16  *  7)
	KERNEL16(16  *  7)

	addq	$32 * 8  * SIZE, AO
	addq	$32 * 8  * SIZE, BO
	subq	$128 * 8, %rax
	BRANCH
	jg	.L1X

.L12:
	leaq	(AO, %rax, 2), AO	# * 16
	leaq	(BO, %rax, 2), BO	# * 64

#else
	sarq	$3, %rax
	je	.L15
	ALIGN_4

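/* Alternative path (#else above): a conventional software-pipelined
   loop processing eight k iterations per pass, with the same register
   rotation as the KERNEL macros. */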
.L12:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm5
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm6
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm7
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm5
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm6
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	32 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm7

	movddup	32 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3

	movddup	 8 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm4
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm5
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm6
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm7
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3

	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm4
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm5
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm6
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	40 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm7
	movddup	40 * SIZE(BO), %xmm11

	mulpd	%xmm12, %xmm13
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm13, %xmm0
	movddup	17 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm1
	movddup	18 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm2
	movddup	19 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	movapd	18 * SIZE(AO), %xmm12
	addpd	%xmm13, %xmm3

	movddup	16 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm4
	movddup	17 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm5
	movddup	18 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm6
	movddup	19 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	movapd	20 * SIZE(AO), %xmm12
	addpd	%xmm13, %xmm7

	movddup	20 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm0
	movddup	21 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm1
	movddup	22 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm2
	movddup	23 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	movapd	22 * SIZE(AO), %xmm12
	addpd	%xmm13, %xmm3

	movddup	20 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm4
	movddup	21 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm5
	movddup	22 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	addpd	%xmm13, %xmm6
	movddup	23 * SIZE(BO), %xmm13
	mulpd	%xmm12, %xmm13
	movapd	48 * SIZE(AO), %xmm12
	addpd	%xmm13, %xmm7
	movddup	48 * SIZE(BO), %xmm13

	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm0
	movddup	25 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm1
	movddup	26 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm2
	movddup	27 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	movapd	26 * SIZE(AO), %xmm14
	addpd	%xmm15, %xmm3

	movddup	24 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm4
	movddup	25 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm5
	movddup	26 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm6
	movddup	27 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	movapd	28 * SIZE(AO), %xmm14
	addpd	%xmm15, %xmm7

	movddup	28 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm0
	movddup	29 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm1
	movddup	30 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm2
	movddup	31 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	movapd	30 * SIZE(AO), %xmm14
	addpd	%xmm15, %xmm3

	movddup	28 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm4
	movddup	29 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm5
	movddup	30 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	addpd	%xmm15, %xmm6
	movddup	31 * SIZE(BO), %xmm15
	mulpd	%xmm14, %xmm15
	movapd	56 * SIZE(AO), %xmm14
	addpd	%xmm15, %xmm7
	movddup	56 * SIZE(BO), %xmm15

	addq   $32 * SIZE, BO
	addq   $32 * SIZE, AO
	decq   %rax
	BRANCH
	jne    .L12
#endif
	ALIGN_4

.L15:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movddup	ALPHA, %xmm15
	andq	$7, %rax		# k remainder (k & 7)
	BRANCH
	BRANCH
	je	.L19
	ALIGN_4

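/* Tail loop: the remaining k & 7 iterations, one rank-1 update of the
   4x4 block per pass. */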
.L16:
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 0 * SIZE(BO), %xmm11
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm4
	movddup	 1 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm5
	movddup	 2 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm6
	movddup	 3 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm7

	addq	$4 * SIZE, AO		# aoffset += 4
	addq	$4 * SIZE, BO		# boffset += 4
	decq	%rax
	BRANCH
	jg	.L16
	ALIGN_4

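/* Write-back: scale the accumulators by alpha.  If either the C
   pointer or LDC is not 16-byte aligned, branch to the movsd/movhpd
   path at .L19x; otherwise aligned movapd accesses are used below. */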
.L19:
	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm4
	mulpd	%xmm15, %xmm1
	mulpd	%xmm15, %xmm5

	testq	$15, CO1
	NOBRANCH
	jne	.L19x
	testq	$15, LDC
	NOBRANCH
	jne	.L19x

	mulpd	%xmm15, %xmm2
	mulpd	%xmm15, %xmm3
	mulpd	%xmm15, %xmm6
	mulpd	%xmm15, %xmm7

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd	0 * SIZE(CO1), %xmm0
	addpd	2 * SIZE(CO1), %xmm4
	addpd	0 * SIZE(CO2), %xmm1
	addpd	2 * SIZE(CO2), %xmm5

	addpd	0 * SIZE(CO1, LDC, 2), %xmm2
	addpd	2 * SIZE(CO1, LDC, 2), %xmm6
	addpd	0 * SIZE(CO2, LDC, 2), %xmm3
	addpd	2 * SIZE(CO2, LDC, 2), %xmm7
#endif

	movapd	%xmm0, 0 * SIZE(CO1)
	movapd	%xmm4, 2 * SIZE(CO1)
	movapd	%xmm1, 0 * SIZE(CO2)
	movapd	%xmm5, 2 * SIZE(CO2)

	movapd	%xmm2, 0 * SIZE(CO1, LDC, 2)
	movapd	%xmm6, 2 * SIZE(CO1, LDC, 2)
	movapd	%xmm3, 0 * SIZE(CO2, LDC, 2)
	movapd	%xmm7, 2 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4

	decq	I			# i --
	jg	.L11
	jmp	.L20
	ALIGN_4

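/* Unaligned write-back: load and store each pair of C entries with
   movsd/movhpd instead of movapd. */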
.L19x:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	2 * SIZE(CO1), %xmm9
	movhpd	3 * SIZE(CO1), %xmm9

	movsd	0 * SIZE(CO2), %xmm10
	movhpd	1 * SIZE(CO2), %xmm10
	movsd	2 * SIZE(CO2), %xmm11
	movhpd	3 * SIZE(CO2), %xmm11

	addpd	%xmm8,  %xmm0
	addpd	%xmm9,  %xmm4
	addpd	%xmm10, %xmm1
	addpd	%xmm11, %xmm5
#endif

	mulpd	%xmm15, %xmm2
	mulpd	%xmm15, %xmm3
	mulpd	%xmm15, %xmm6
	mulpd	%xmm15, %xmm7

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1, LDC, 2), %xmm12
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm12
	movsd	2 * SIZE(CO1, LDC, 2), %xmm13
	movhpd	3 * SIZE(CO1, LDC, 2), %xmm13

	movsd	0 * SIZE(CO2, LDC, 2), %xmm14
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm14
	movsd	2 * SIZE(CO2, LDC, 2), %xmm15
	movhpd	3 * SIZE(CO2, LDC, 2), %xmm15

	addpd	%xmm12, %xmm2
	addpd	%xmm13, %xmm6
	addpd	%xmm14, %xmm3
	addpd	%xmm15, %xmm7
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
	movsd	%xmm4, 2 * SIZE(CO1)
	movhpd	%xmm4, 3 * SIZE(CO1)

	movsd	%xmm1, 0 * SIZE(CO2)
	movhpd	%xmm1, 1 * SIZE(CO2)
	movsd	%xmm5, 2 * SIZE(CO2)
	movhpd	%xmm5, 3 * SIZE(CO2)

	movsd	%xmm2, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm2, 1 * SIZE(CO1, LDC, 2)
	movsd	%xmm6, 2 * SIZE(CO1, LDC, 2)
	movhpd	%xmm6, 3 * SIZE(CO1, LDC, 2)

	movsd	%xmm3, 0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm3, 1 * SIZE(CO2, LDC, 2)
	movsd	%xmm7, 2 * SIZE(CO2, LDC, 2)
	movhpd	%xmm7, 3 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4

	decq	I			# i --
	jg	.L11
	ALIGN_4

.L20:
	testq	$2, M
	BRANCH
	je	.L30
	ALIGN_4

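/* M remainder: 2x4 blocks (two rows of C against four columns). */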
.L21:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 4), BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L25
	ALIGN_4

.L22:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	16 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm2
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm2
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm3
	movddup	24 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	17 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm1
	movddup	18 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm2
	movddup	19 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm3
	movddup	20 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	21 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm1
	movddup	22 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm2
	movddup	23 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm3
	movddup	32 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	25 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm1
	movddup	26 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	27 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	28 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	29 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm1
	movddup	30 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	31 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	24 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	40 * SIZE(BO), %xmm11

	addq   $16 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movddup	ALPHA, %xmm15
	andq	$7, %rax		# k remainder (k & 7)
	BRANCH
	je .L29
	ALIGN_4

.L26:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO		# aoffset += 2
	addq	$4 * SIZE, BO		# boffset += 4
	decq	%rax
	jg	.L26
	ALIGN_4

.L29:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	0 * SIZE(CO2), %xmm10
	movhpd	1 * SIZE(CO2), %xmm10
	movsd	0 * SIZE(CO1, LDC, 2), %xmm12
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm12
	movsd	0 * SIZE(CO2, LDC, 2), %xmm14
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm14
#endif

	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1
	mulpd	%xmm15, %xmm2
	mulpd	%xmm15, %xmm3

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd	%xmm8,  %xmm0
	addpd	%xmm10, %xmm1
	addpd	%xmm12, %xmm2
	addpd	%xmm14, %xmm3
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO2)
	movhpd	%xmm1, 1 * SIZE(CO2)
	movsd	%xmm2, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm2, 1 * SIZE(CO1, LDC, 2)
	movsd	%xmm3, 0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm3, 1 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_4

.L30:
	testq	$1, M
	je	.L39
	ALIGN_4

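/* M remainder: 1x4 blocks (a single row of C). */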
.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 4), BO
#endif

	movddup	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movddup	 4 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movapd	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L35
	ALIGN_4

.L32:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movapd	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	16 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movapd	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movddup	 3 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movapd	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movapd	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movddup	 8 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movapd	24 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movapd	18 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movddup	 5 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movapd	20 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movapd	22 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movddup	 6 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movapd	32 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movapd	26 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movddup	 7 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movapd	28 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movapd	30 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movddup	12 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movapd	40 * SIZE(BO), %xmm11

	addq   $ 8 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movddup	ALPHA, %xmm15
	andq	$7, %rax		# k remainder (k & 7)
	BRANCH
	je .L38
	ALIGN_4

.L36:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	 4 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# aoffset += 1
	addq	$4 * SIZE, BO		# boffset += 4
	decq	%rax
	jg	.L36
	ALIGN_4

.L38:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	0 * SIZE(CO2), %xmm8
	movsd	0 * SIZE(CO1, LDC, 2), %xmm9
	movhpd	0 * SIZE(CO2, LDC, 2), %xmm9
#endif
	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd	%xmm8,  %xmm0
	addpd	%xmm9,  %xmm1
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 0 * SIZE(CO2)
	movsd	%xmm1, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm1, 0 * SIZE(CO2, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$4, KK
#endif

	leaq	(C, LDC, 4), C		# c += 4 * ldc
	movq	BO, B
	decq	J			# j --
	jg	.L10
	ALIGN_4

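/* N remainder: a two-column panel of C, handled as 4x2, 2x2 and 1x2
   blocks over M. */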
.L40:
	testq	$2, N
	je	.L80
	ALIGN_4

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO		# aoffset = a

	movq	K, %rax
	salq	$BASE_SHIFT + 1, %rax
	leaq	(B, %rax), BB

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L60
	ALIGN_4

.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(B,  %rax, 2), BO
#endif

	prefetcht0	  0 * SIZE(BB)
	subq	   $-4 * SIZE, BB

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm4, %xmm4
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm5, %xmm5

#ifdef HAVE_3DNOW
	prefetchw      4 * SIZE(CO1)
	prefetchw      4 * SIZE(CO2)
#else
	prefetchnta     4 * SIZE(CO1)
	prefetchnta     4 * SIZE(CO2)
#endif

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L55
	ALIGN_4

.L52:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm5
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm5
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm5
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	40 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm5
	movddup	16 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm11
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	18 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	 8 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	20 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	22 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	24 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	26 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	28 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	30 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	32 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	24 * SIZE(BO), %xmm11

	addq   $32 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L52
	ALIGN_4

.L55:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movddup	ALPHA, %xmm15
	andq	$7, %rax		# k remainder (k & 7)
	BRANCH
	je .L59
	ALIGN_4

.L56:
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 0 * SIZE(BO), %xmm11
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm4
	movddup	 1 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm5

	addq	$4 * SIZE, AO		# aoffset += 4
	addq	$2 * SIZE, BO		# boffset += 2
	decq	%rax
	jg	.L56
	ALIGN_4

.L59:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	2 * SIZE(CO1), %xmm9
	movhpd	3 * SIZE(CO1), %xmm9
	movsd	0 * SIZE(CO2), %xmm10
	movhpd	1 * SIZE(CO2), %xmm10
	movsd	2 * SIZE(CO2), %xmm11
	movhpd	3 * SIZE(CO2), %xmm11
#endif

	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1
	mulpd	%xmm15, %xmm4
	mulpd	%xmm15, %xmm5

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd	%xmm8,  %xmm0
	addpd	%xmm9,  %xmm4
	addpd	%xmm10, %xmm1
	addpd	%xmm11, %xmm5
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
	movsd	%xmm4, 2 * SIZE(CO1)
	movhpd	%xmm4, 3 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO2)
	movhpd	%xmm1, 1 * SIZE(CO2)
	movsd	%xmm5, 2 * SIZE(CO2)
	movhpd	%xmm5, 3 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	jg	.L51
	ALIGN_4

.L60:
	testq	$2, M
	je	.L70
	ALIGN_4

.L61:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L65
	ALIGN_4

.L62:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	16 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	24 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	24 * SIZE(BO), %xmm11

	addq   $16 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L62
	ALIGN_4

.L65:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movddup	ALPHA, %xmm15
	andq	$7, %rax		# k remainder (k & 7)
	BRANCH
	je .L69
	ALIGN_4

.L66:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO		# aoffset += 2
	addq	$2 * SIZE, BO		# boffset += 2
	decq	%rax
	jg	.L66
	ALIGN_4

.L69:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	0 * SIZE(CO2), %xmm10
	movhpd	1 * SIZE(CO2), %xmm10
#endif

	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd	%xmm8,  %xmm0
	addpd	%xmm10, %xmm1
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO2)
	movhpd	%xmm1, 1 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif
	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_4

.L70:
	testq	$1, M
	je	.L79
	ALIGN_4

.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

	movddup	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movddup	 4 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movapd	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L75
	ALIGN_4

.L72:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm0
	mulpd	 2 * SIZE(BO), %xmm8
	movapd	16 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm1
	movddup	 2 * SIZE(AO), %xmm8
	mulpd	 4 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm2
	movddup	 3 * SIZE(AO), %xmm8
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm3
	movddup	 8 * SIZE(AO), %xmm8
	mulpd	%xmm10, %xmm11
	movddup	 5 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm0
	mulpd	10 * SIZE(BO), %xmm10
	movapd	24 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm1
	movddup	 6 * SIZE(AO), %xmm10
	mulpd	12 * SIZE(BO), %xmm10
	addpd	%xmm10, %xmm2
	movddup	 7 * SIZE(AO), %xmm10
	mulpd	14 * SIZE(BO), %xmm10
	addpd	%xmm10, %xmm3
	movddup	12 * SIZE(AO), %xmm10

	addq   $ 8 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L72
	ALIGN_4

.L75:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movddup	ALPHA, %xmm15
	andq	$7, %rax		# k remainder (k & 7)
	BRANCH
	je .L78
	ALIGN_4

.L76:
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# aoffset += 1
	addq	$2 * SIZE, BO		# boffset += 2
	decq	%rax
	jg	.L76
	ALIGN_4

.L78:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	0 * SIZE(CO2), %xmm8
#endif

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0

	mulpd	%xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd	%xmm8,  %xmm0
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif
	leaq	(C, LDC, 2), C
	movq	BO, B
	ALIGN_4

.L80:
	testq	$1, N
	je	.L999
	ALIGN_4

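/* N remainder: the final single column of C, handled as 4x1 blocks
   and smaller tails over M (continuing at .L100/.L110). */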
2099#if defined(TRMMKERNEL) && defined(LEFT)
2100	movq	OFFSET, %rax
2101	movq	%rax, KK
2102#endif
2103
2104	movq	C, CO1
2105	movq	A, AO
2106
2107	movq	M,  I
2108	sarq	$2, I	# i = (m >> 2)
2109	jle	.L100
2110	ALIGN_4
2111
2112.L91:
2113#if !defined(TRMMKERNEL) || \
2114	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2115	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2116
2117	movq	B, BO
2118#else
2119	movq	KK, %rax
2120	leaq	(, %rax, SIZE), %rax
2121	leaq	(AO, %rax, 4), AO
2122	leaq	(B,  %rax, 1), BO
2123#endif
2124
2125	movapd	 0 * SIZE(AO), %xmm8
2126	pxor	%xmm0, %xmm0
2127	movddup	 0 * SIZE(BO), %xmm9
2128	pxor	%xmm1, %xmm1
2129	movapd	 8 * SIZE(AO), %xmm10
2130	pxor	%xmm2, %xmm2
2131	movddup	 4 * SIZE(BO), %xmm11
2132	pxor	%xmm3, %xmm3
2133
2134#ifdef HAVE_3DNOW
2135	prefetchw      4 * SIZE(CO1)
2136#else
2137	prefetchnta     4 * SIZE(CO1)
2138#endif
2139
2140#ifndef TRMMKERNEL
2141	movq	K, %rax
2142#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2143	movq	K, %rax
2144	subq	KK, %rax
2145	movq	%rax, KKK
2146#else
2147	movq	KK, %rax
2148#ifdef LEFT
2149	addq	$4, %rax
2150#else
2151	addq	$1, %rax
2152#endif
2153	movq	%rax, KKK
2154#endif
2155	sarq	$3, %rax
2156	je	.L95
2157	ALIGN_4
2158
.L92:
	mulpd	%xmm9, %xmm8
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	mulpd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm8
	mulpd	 6 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm2
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm10
	mulpd	10 * SIZE(AO), %xmm9
	addpd	%xmm10, %xmm0
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm10
	mulpd	14 * SIZE(AO), %xmm9
	addpd	%xmm10, %xmm2
	movapd	24 * SIZE(AO), %xmm10
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm9, %xmm3
	movddup	 8 * SIZE(BO), %xmm9
	mulpd	%xmm11, %xmm8
	mulpd	18 * SIZE(AO), %xmm11
	addpd	%xmm8, %xmm0
	movapd	20 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	 5 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm8
	mulpd	22 * SIZE(AO), %xmm11
	addpd	%xmm8, %xmm2
	movapd	32 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm3
	movddup	 6 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm10
	mulpd	26 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm0
	movapd	28 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	 7 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm10
	mulpd	30 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm2
	movapd	40 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11

	addq   $32 * SIZE, AO
	addq   $8 * SIZE, BO
	decq   %rax
	jne    .L92
	ALIGN_4

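/* .L95: K-remainder (K & 7) loop for the 4x1 tile, one k step per pass.  */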
.L95:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movddup	ALPHA, %xmm15
	andq	$7, %rax		# k & 7 (remainder of the 8x unrolling)
	BRANCH
	je .L99
	ALIGN_4

.L96:
	mulpd	%xmm9, %xmm8
	mulpd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 1 * SIZE(BO), %xmm9

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$1 * SIZE, BO		# boffset1 += 1
	decq	%rax
	jg	.L96
	ALIGN_4

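/* .L99: write back the 4x1 tile.  In rough C terms (a sketch), for
   i = 0..3:

       c[i] = alpha * acc[i] + c[i];

   where the "+ c[i]" is skipped for TRMM and BETAZERO builds.  */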
.L99:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	2 * SIZE(CO1), %xmm9
	movhpd	3 * SIZE(CO1), %xmm9
#endif

	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd	%xmm8,  %xmm0
	addpd	%xmm9,  %xmm1
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
	movsd	%xmm1, 2 * SIZE(CO1)
	movhpd	%xmm1, 3 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L91
	ALIGN_4

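/* .L100: if M has a remainder of 2, compute a 2x1 tile the same way.  */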
.L100:
	testq	$2, M
	je	.L110
	ALIGN_4

.L101:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 4 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L105
	ALIGN_4

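/* .L102: 8x-unrolled inner loop for the 2x1 tile.  The four partial
   accumulators xmm0..xmm3 each take every fourth k step and are combined
   at .L109.  */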
.L102:
	mulpd	%xmm9, %xmm8
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	movddup	 1 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm0
	mulpd	 2 * SIZE(AO), %xmm9
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	 4 * SIZE(AO), %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	 6 * SIZE(AO), %xmm9
	addpd	%xmm9, %xmm3
	movddup	 8 * SIZE(BO), %xmm9
	mulpd	%xmm11, %xmm10
	movddup	 5 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm0
	mulpd	10 * SIZE(AO), %xmm11
	movapd	24 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	 6 * SIZE(BO), %xmm11
	mulpd	12 * SIZE(AO), %xmm11
	addpd	%xmm11, %xmm2
	movddup	 7 * SIZE(BO), %xmm11
	mulpd	14 * SIZE(AO), %xmm11
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11

	addq   $16 * SIZE, AO
	addq   $ 8 * SIZE, BO
	decq   %rax
	jne    .L102
	ALIGN_4

.L105:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movddup	ALPHA, %xmm15
	andq	$7, %rax		# k & 7 (remainder of the 8x unrolling)
	BRANCH
	je .L109
	ALIGN_4

.L106:
	mulpd	%xmm9, %xmm8
	movddup	 1 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 2 * SIZE(AO), %xmm8

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$1 * SIZE, BO		# boffset1 += 1
	decq	%rax
	jg	.L106
	ALIGN_4

.L109:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
#endif

	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0

	mulpd	%xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addpd	%xmm8,  %xmm0
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 1), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	ALIGN_4

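/* .L110: if M is odd, one 1x1 element remains -- a dot product of the
   last row of A with the last column of B.  */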
.L110:
	testq	$1, M
	je	.L999
	ALIGN_4

.L111:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), BO
#endif

	movapd	 0 * SIZE(AO), %xmm9
	pxor	%xmm0, %xmm0
	movapd	 0 * SIZE(BO), %xmm8
	pxor	%xmm1, %xmm1
	movapd	 4 * SIZE(AO), %xmm11
	pxor	%xmm2, %xmm2
	movapd	 4 * SIZE(BO), %xmm10
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
	addq	$1, %rax	# LEFT and non-LEFT both add 1 for a 1x1 tile
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L115
	ALIGN_4

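/* .L112: 8x-unrolled dot-product loop.  The packed accumulators xmm0 and
   xmm1 each collect two products per step and are reduced with haddpd at
   .L118.  In rough C terms (a sketch):

       for (k = 0; k < K; k++)
           acc += a[k] * b[k];
       c[0] = alpha * acc + c[0];

   where the "+ c[0]" is skipped for TRMM and BETAZERO builds.  */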
.L112:
	mulpd	%xmm9, %xmm8
	movapd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	mulpd	 2 * SIZE(BO), %xmm9
	movapd	 8 * SIZE(BO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	 8 * SIZE(AO), %xmm9
	mulpd	%xmm11, %xmm10
	movapd	 6 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm0
	mulpd	 6 * SIZE(BO), %xmm11
	movapd	12 * SIZE(BO), %xmm10
	addpd	%xmm11, %xmm1
	movapd	12 * SIZE(AO), %xmm11

	addq   $8 * SIZE, AO
	addq   $8 * SIZE, BO
	decq   %rax
	jne    .L112
	ALIGN_4

.L115:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movddup	ALPHA, %xmm15
	andq	$7, %rax		# k & 7 (remainder of the 8x unrolling)
	BRANCH
	je .L118
	ALIGN_4

.L116:
	mulsd	 0 * SIZE(BO), %xmm9
	addsd	%xmm9, %xmm0
	movsd	 1 * SIZE(AO), %xmm9

	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$1 * SIZE, BO		# boffset1 += 1
	decq	%rax
	jg	.L116
	ALIGN_4

.L118:
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
#endif

	addpd	%xmm1, %xmm0
	haddpd	%xmm0, %xmm0
	mulsd	%xmm15, %xmm0
#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	addsd	%xmm8, %xmm0
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	ALIGN_4

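/* .L999: epilogue -- restore the callee-saved registers spilled in the
   prologue (plus %rdi/%rsi and %xmm6-%xmm15 under the Windows ABI),
   release the stack frame, and return.  */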
.L999:
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE
