1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* Register roles.  On the non-Windows path these are exactly the System V
   AMD64 argument registers (rdi, rsi, rdx, rcx, r8, r9); on WINDOWS_ABI the
   prologue below reloads A/B/C/LDC from the OLD_* stack slots instead. */
#define M	%rdi
#define N	%rsi
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

/* Loop counters and moving panel/row pointers.  r12-r15, rbx and rbp are
   callee-saved; the prologue spills them to the frame before use. */
#define I	%r11
#define J	%r12
#define AO	%r13
#define BO	%r14
#define	CO1	%r15
#define CO2	%rbx
#define BB	%rbp
58
#ifndef WINDOWS_ABI

#define STACKSIZE 128

/* Incoming stack arguments: addressed above our STACKSIZE-byte frame and
   the pushed return address. */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

/* Local spill slots inside the frame (offsets 0..40 hold the six saved
   callee-saved GP registers, see the prologue). */
#define ALPHA_R	 48(%rsp)
#define ALPHA_I	 56(%rsp)
#define OFFSET	 64(%rsp)
#define KKK	 72(%rsp)
#define KK	 80(%rsp)

#else

#define STACKSIZE 512

/* Windows x64: first four args in registers; the 5th and later arguments
   live on the stack above the 32-byte shadow space (hence offset 40+). */
#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_B		56 + STACKSIZE(%rsp)
#define OLD_C		64 + STACKSIZE(%rsp)
#define OLD_LDC		72 + STACKSIZE(%rsp)
#define OLD_OFFSET	80 + STACKSIZE(%rsp)

/* Local slots placed above the xmm6-xmm15 save area (offsets 64..208 in
   the prologue). */
#define ALPHA_R	 224(%rsp)
#define ALPHA_I	 232(%rsp)
#define OFFSET	 240(%rsp)
#define KK	 248(%rsp)
#define KKK	 256(%rsp)

#endif

/* Software prefetch of the A panel, PREFETCHSIZE elements ahead of the
   current read position; T2 hint targets a far cache level.  The distance
   is a per-core tuning constant. */
#define PREFETCH     prefetcht2
#define PREFETCHSIZE (16 * 12 + 3)
93
/*
 * KERNEL1..KERNEL4: first quarter of the 8-way unrolled inner loop.
 * xmm8 holds a pair of consecutive A elements, xmm9 a broadcast B element
 * (movddup).  Products of the A pair with B[j] accumulate into xmm0..xmm3
 * (first A pair) and xmm4..xmm7 (second A pair).  (address) is the unroll
 * offset, passed as multiples of 16 and scaled by 2*SIZE on both AO and BO.
 * NOTE(review): instruction interleaving (load-ahead of the next xmm8/xmm9
 * values before the final addpd) is deliberate software pipelining — do not
 * reorder.
 */
/* A[0:2] x B[0..3] -> xmm0..xmm3; prefetch ahead in A; preload A[2:4], B[0]. */
#define KERNEL1(address) \
	mulpd	%xmm8, %xmm9 ;\
	PREFETCH  (PREFETCHSIZE +  0) * SIZE + (address) * 2 * SIZE(AO);\
	addpd	%xmm9, %xmm0;\
	movddup	 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm1;\
	movddup	 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm2;\
	movddup	 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 2 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm3;\
	movddup	 0 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* A[2:4] x B[0..3] -> xmm4..xmm7; preload A[4:6], B[4]. */
#define KERNEL2(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm4;\
	movddup	 1 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm5;\
	movddup	 2 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm6;\
	movddup	 3 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 4 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm7;\
	movddup	 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* A[4:6] x B[4..7] -> xmm0..xmm3; preload A[6:8], rewind xmm9 to B[4]. */
#define KERNEL3(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm0;\
	movddup	 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm1;\
	movddup	 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm2;\
	movddup	 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	 6 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm3;\
	movddup	 4 * SIZE + (address) * 2 * SIZE(BO), %xmm9

/* A[6:8] x B[4..7] -> xmm4..xmm7; preload A[32]/B[32], the next values the
   xmm8/xmm9 pair will need one full unroll (32 elements) later. */
#define KERNEL4(address) \
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm4;\
	movddup	 5 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm5;\
	movddup	 6 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	addpd	%xmm9, %xmm6;\
	movddup	 7 * SIZE + (address) * 2 * SIZE(BO), %xmm9;\
	mulpd	%xmm8, %xmm9;\
	movapd	32 * SIZE + (address) * 2 * SIZE(AO), %xmm8;\
	addpd	%xmm9, %xmm7;\
	movddup	32 * SIZE + (address) * 2 * SIZE(BO), %xmm9
154
/*
 * KERNEL5..KERNEL8: second quarter of the 8-way unrolled inner loop.
 * Same pattern as KERNEL1..4 but on the xmm10 (A pair) / xmm11 (broadcast B)
 * register pair, covering A elements 8..15 and B elements 8..15 of the
 * current unroll window.  The two register pairs alternate so loads for one
 * pair overlap arithmetic on the other (software pipelining — keep order).
 */
/* A[8:10] x B[8..11] -> xmm0..xmm3; preload A[10:12], rewind xmm11 to B[8]. */
#define KERNEL5(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm0;\
	movddup	 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm1;\
	movddup	10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm2;\
	movddup	11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	10 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm3;\
	movddup	 8 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* A[10:12] x B[8..11] -> xmm4..xmm7; preload A[12:14], B[12]. */
#define KERNEL6(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm4;\
	movddup	 9 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm5;\
	movddup	10 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm6;\
	movddup	11 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	12 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm7;\
	movddup	12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* A[12:14] x B[12..15] -> xmm0..xmm3; preload A[14:16], rewind to B[12]. */
#define KERNEL7(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm0;\
	movddup	13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm1;\
	movddup	14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm2;\
	movddup	15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	14 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm3;\
	movddup	12 * SIZE + (address) * 2 * SIZE(BO), %xmm11

/* A[14:16] x B[12..15] -> xmm4..xmm7; preload A[40]/B[40] for the
   xmm10/xmm11 pair's next use, one full unroll ahead. */
#define KERNEL8(address) \
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm4;\
	movddup	13 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm5;\
	movddup	14 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	addpd	%xmm11, %xmm6;\
	movddup	15 * SIZE + (address) * 2 * SIZE(BO), %xmm11;\
	mulpd	%xmm10, %xmm11;\
	movapd	40 * SIZE + (address) * 2 * SIZE(AO), %xmm10;\
	addpd	%xmm11, %xmm7;\
	movddup	40 * SIZE + (address) * 2 * SIZE(BO), %xmm11
214
/*
 * KERNEL9..KERNEL12: third quarter of the 8-way unrolled inner loop, on the
 * xmm12 (A pair) / xmm13 (broadcast B) register pair, covering elements
 * 16..23 of the unroll window.  KERNEL9 also issues the second A-panel
 * prefetch of the window (PREFETCHSIZE + 16).  Instruction order is
 * deliberate software pipelining — do not reorder.
 */
/* A[16:18] x B[16..19] -> xmm0..xmm3; prefetch; preload A[18:20], B[16]. */
#define KERNEL9(address) \
	mulpd	%xmm12, %xmm13;\
	PREFETCH  (PREFETCHSIZE + 16) * SIZE + (address) * 2 * SIZE(AO);\
	addpd	%xmm13, %xmm0;\
	movddup	17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm1;\
	movddup	18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm2;\
	movddup	19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	18 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm3;\
	movddup	16 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* A[18:20] x B[16..19] -> xmm4..xmm7; preload A[20:22], B[20]. */
#define KERNEL10(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm4;\
	movddup	17 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm5;\
	movddup	18 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm6;\
	movddup	19 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	20 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm7;\
	movddup	20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* A[20:22] x B[20..23] -> xmm0..xmm3; preload A[22:24], rewind to B[20]. */
#define KERNEL11(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm0;\
	movddup	21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm1;\
	movddup	22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm2;\
	movddup	23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	22 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm3;\
	movddup	20 * SIZE + (address) * 2 * SIZE(BO), %xmm13

/* A[22:24] x B[20..23] -> xmm4..xmm7; preload A[48]/B[48] for the
   xmm12/xmm13 pair's next use, one full unroll ahead. */
#define KERNEL12(address) \
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm4;\
	movddup 21 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm5;\
	movddup	22 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	addpd	%xmm13, %xmm6;\
	movddup	23 * SIZE + (address) * 2 * SIZE(BO), %xmm13;\
	mulpd	%xmm12, %xmm13;\
	movapd	48 * SIZE + (address) * 2 * SIZE(AO), %xmm12;\
	addpd	%xmm13, %xmm7;\
	movddup	48 * SIZE + (address) * 2 * SIZE(BO), %xmm13
275
/*
 * KERNEL13..KERNEL16: final quarter of the 8-way unrolled inner loop, on the
 * xmm14 (A pair) / xmm15 (broadcast B) register pair, covering elements
 * 24..31 of the unroll window.  KERNEL16's far preload (offset 56) keeps
 * this pair one full 32-element unroll ahead, matching KERNEL4/8/12.
 * Instruction order is deliberate software pipelining — do not reorder.
 */
/* A[24:26] x B[24..27] -> xmm0..xmm3; preload A[26:28], rewind to B[24]. */
#define KERNEL13(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm0;\
	movddup	25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm1;\
	movddup	26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm2;\
	movddup	27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	26 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm3;\
	movddup	24 * SIZE + (address) * 2 * SIZE(BO), %xmm15

/* A[26:28] x B[24..27] -> xmm4..xmm7; preload A[28:30], B[28]. */
#define KERNEL14(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm4;\
	movddup	25 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm5;\
	movddup	26 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm6;\
	movddup	27 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	28 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm7;\
	movddup	28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

/* A[28:30] x B[28..31] -> xmm0..xmm3; preload A[30:32], rewind to B[28]. */
#define KERNEL15(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm0;\
	movddup	29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm1;\
	movddup	30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm2;\
	movddup	31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	30 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm3;\
	movddup	28 * SIZE + (address) * 2 * SIZE(BO), %xmm15

/* A[30:32] x B[28..31] -> xmm4..xmm7; preload A[56]/B[56] for the
   xmm14/xmm15 pair's next use, one full unroll ahead. */
#define KERNEL16(address) \
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm4;\
	movddup	29 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm5;\
	movddup	30 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	addpd	%xmm15, %xmm6;\
	movddup	31 * SIZE + (address) * 2 * SIZE(BO), %xmm15;\
	mulpd	%xmm14, %xmm15;\
	movapd	56 * SIZE + (address) * 2 * SIZE(AO), %xmm14;\
	addpd	%xmm15, %xmm7;\
	movddup	56 * SIZE + (address) * 2 * SIZE(BO), %xmm15
335
336#if defined(OS_LINUX) && defined(CORE_BARCELONA)
337	.align 32768
338#endif
339	PROLOGUE
340	PROFCODE
341
342	subq	$STACKSIZE, %rsp
343	movq	%rbx,  0(%rsp)
344	movq	%rbp,  8(%rsp)
345	movq	%r12, 16(%rsp)
346	movq	%r13, 24(%rsp)
347	movq	%r14, 32(%rsp)
348	movq	%r15, 40(%rsp)
349
350#ifdef WINDOWS_ABI
351	movq	%rdi,    48(%rsp)
352	movq	%rsi,    56(%rsp)
353	movups	%xmm6,   64(%rsp)
354	movups	%xmm7,   80(%rsp)
355	movups	%xmm8,   96(%rsp)
356	movups	%xmm9,  112(%rsp)
357	movups	%xmm10, 128(%rsp)
358	movups	%xmm11, 144(%rsp)
359	movups	%xmm12, 160(%rsp)
360	movups	%xmm13, 176(%rsp)
361	movups	%xmm14, 192(%rsp)
362	movups	%xmm15, 208(%rsp)
363
364	movq	ARG1,      M
365	movq	ARG2,      N
366	movq	ARG3,      K
367	movq	OLD_A,     A
368	movq	OLD_B,     B
369	movq	OLD_C,     C
370	movq	OLD_LDC,   LDC
371
372	movaps	%xmm3, %xmm0
373	movsd	OLD_ALPHA_I, %xmm1
374#else
375	movq	OLD_LDC,   LDC
376#endif
377
378	movsd	 %xmm0, ALPHA_R
379	movsd	 %xmm1, ALPHA_I
380
381	salq	$ZBASE_SHIFT, LDC
382
383	movq	N,  J
384	sarq	$2, J		# j = (n >> 2)
385	jle	.L40
386	ALIGN_4
387
388.L10:
389#if defined(TRMMKERNEL) && defined(LEFT)
390	movq	OFFSET, %rax
391	movq	%rax, KK
392#endif
393
394	movq	C, CO1			# coffset1 = c
395	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
396	movq	A, AO		# aoffset = a
397
398	leaq	(, K, 4), BB
399	leaq	(B, BB, SIZE), BB
400
401	movq	M,  I
402	sarq	$2, I	# i = (m >> 2)
403	jle	.L20
404	ALIGN_4
405
406.L11:
407	prefetcht0	0 * SIZE(BB)
408	prefetcht0	8 * SIZE(BB)
409	subq	   $-8 * SIZE, BB
410
411#if !defined(TRMMKERNEL) || \
412	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
413	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
414
415	movq	B, BO
416#else
417	movq	KK, %rax
418	leaq	(, %rax, SIZE), %rax
419	leaq	(AO, %rax, 4), AO
420	leaq	(B,  %rax, 4), BO
421#endif
422
423	movapd	 0 * SIZE(AO), %xmm8
424	pxor	%xmm0, %xmm0
425	movddup	 0 * SIZE(BO), %xmm9
426	pxor	%xmm1, %xmm1
427	movapd	 8 * SIZE(AO), %xmm10
428	pxor	%xmm2, %xmm2
429	movddup	 8 * SIZE(BO), %xmm11
430	pxor	%xmm3, %xmm3
431
432	movapd	16 * SIZE(AO), %xmm12
433	pxor	%xmm4, %xmm4
434	movddup 16 * SIZE(BO), %xmm13
435	pxor	%xmm5, %xmm5
436	movapd	24 * SIZE(AO), %xmm14
437	pxor	%xmm6, %xmm6
438	movddup	24 * SIZE(BO), %xmm15
439	pxor	%xmm7, %xmm7
440
441	prefetchnta     7 * SIZE(CO1)
442	prefetchnta     7 * SIZE(CO2)
443	prefetchnta     7 * SIZE(CO1, LDC, 2)
444	prefetchnta     7 * SIZE(CO2, LDC, 2)
445
446#ifndef TRMMKERNEL
447	movq	K, %rax
448#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
449	movq	K, %rax
450	subq	KK, %rax
451	movq	%rax, KKK
452#else
453	movq	KK, %rax
454#ifdef LEFT
455	addq	$4, %rax
456#else
457	addq	$4, %rax
458#endif
459	movq	%rax, KKK
460#endif
461
462#if 1
463	andq	$-8, %rax
464	salq	$4, %rax
465	NOBRANCH
466	je	.L15
467
468.L1X:
469	KERNEL1 (16  *  0)
470	KERNEL2 (16  *  0)
471	KERNEL3 (16  *  0)
472	KERNEL4 (16  *  0)
473	KERNEL5 (16  *  0)
474	KERNEL6 (16  *  0)
475	KERNEL7 (16  *  0)
476	KERNEL8 (16  *  0)
477	KERNEL9 (16  *  0)
478	KERNEL10(16  *  0)
479	KERNEL11(16  *  0)
480	KERNEL12(16  *  0)
481	KERNEL13(16  *  0)
482	KERNEL14(16  *  0)
483	KERNEL15(16  *  0)
484	KERNEL16(16  *  0)
485	cmpq	$128 *  1, %rax
486	NOBRANCH
487	jle	.L12
488	KERNEL1 (16  *  1)
489	KERNEL2 (16  *  1)
490	KERNEL3 (16  *  1)
491	KERNEL4 (16  *  1)
492	KERNEL5 (16  *  1)
493	KERNEL6 (16  *  1)
494	KERNEL7 (16  *  1)
495	KERNEL8 (16  *  1)
496	KERNEL9 (16  *  1)
497	KERNEL10(16  *  1)
498	KERNEL11(16  *  1)
499	KERNEL12(16  *  1)
500	KERNEL13(16  *  1)
501	KERNEL14(16  *  1)
502	KERNEL15(16  *  1)
503	KERNEL16(16  *  1)
504	cmpq	$128 *  2, %rax
505	NOBRANCH
506	jle	.L12
507	KERNEL1 (16  *  2)
508	KERNEL2 (16  *  2)
509	KERNEL3 (16  *  2)
510	KERNEL4 (16  *  2)
511	KERNEL5 (16  *  2)
512	KERNEL6 (16  *  2)
513	KERNEL7 (16  *  2)
514	KERNEL8 (16  *  2)
515	KERNEL9 (16  *  2)
516	KERNEL10(16  *  2)
517	KERNEL11(16  *  2)
518	KERNEL12(16  *  2)
519	KERNEL13(16  *  2)
520	KERNEL14(16  *  2)
521	KERNEL15(16  *  2)
522	KERNEL16(16  *  2)
523	cmpq	$128 *  3, %rax
524	NOBRANCH
525	jle	.L12
526	KERNEL1 (16  *  3)
527	KERNEL2 (16  *  3)
528	KERNEL3 (16  *  3)
529	KERNEL4 (16  *  3)
530	KERNEL5 (16  *  3)
531	KERNEL6 (16  *  3)
532	KERNEL7 (16  *  3)
533	KERNEL8 (16  *  3)
534	KERNEL9 (16  *  3)
535	KERNEL10(16  *  3)
536	KERNEL11(16  *  3)
537	KERNEL12(16  *  3)
538	KERNEL13(16  *  3)
539	KERNEL14(16  *  3)
540	KERNEL15(16  *  3)
541	KERNEL16(16  *  3)
542	cmpq	$128 *  4, %rax
543	NOBRANCH
544	jle	.L12
545	KERNEL1 (16  *  4)
546	KERNEL2 (16  *  4)
547	KERNEL3 (16  *  4)
548	KERNEL4 (16  *  4)
549	KERNEL5 (16  *  4)
550	KERNEL6 (16  *  4)
551	KERNEL7 (16  *  4)
552	KERNEL8 (16  *  4)
553	KERNEL9 (16  *  4)
554	KERNEL10(16  *  4)
555	KERNEL11(16  *  4)
556	KERNEL12(16  *  4)
557	KERNEL13(16  *  4)
558	KERNEL14(16  *  4)
559	KERNEL15(16  *  4)
560	KERNEL16(16  *  4)
561	cmpq	$128 *  5, %rax
562	NOBRANCH
563	jle	.L12
564	KERNEL1 (16  *  5)
565	KERNEL2 (16  *  5)
566	KERNEL3 (16  *  5)
567	KERNEL4 (16  *  5)
568	KERNEL5 (16  *  5)
569	KERNEL6 (16  *  5)
570	KERNEL7 (16  *  5)
571	KERNEL8 (16  *  5)
572	KERNEL9 (16  *  5)
573	KERNEL10(16  *  5)
574	KERNEL11(16  *  5)
575	KERNEL12(16  *  5)
576	KERNEL13(16  *  5)
577	KERNEL14(16  *  5)
578	KERNEL15(16  *  5)
579	KERNEL16(16  *  5)
580	cmpq	$128 *  6, %rax
581	NOBRANCH
582	jle	.L12
583	KERNEL1 (16  *  6)
584	KERNEL2 (16  *  6)
585	KERNEL3 (16  *  6)
586	KERNEL4 (16  *  6)
587	KERNEL5 (16  *  6)
588	KERNEL6 (16  *  6)
589	KERNEL7 (16  *  6)
590	KERNEL8 (16  *  6)
591	KERNEL9 (16  *  6)
592	KERNEL10(16  *  6)
593	KERNEL11(16  *  6)
594	KERNEL12(16  *  6)
595	KERNEL13(16  *  6)
596	KERNEL14(16  *  6)
597	KERNEL15(16  *  6)
598	KERNEL16(16  *  6)
599	cmpq	$128 *  7, %rax
600	NOBRANCH
601	jle	.L12
602	KERNEL1 (16  *  7)
603	KERNEL2 (16  *  7)
604	KERNEL3 (16  *  7)
605	KERNEL4 (16  *  7)
606	KERNEL5 (16  *  7)
607	KERNEL6 (16  *  7)
608	KERNEL7 (16  *  7)
609	KERNEL8 (16  *  7)
610	KERNEL9 (16  *  7)
611	KERNEL10(16  *  7)
612	KERNEL11(16  *  7)
613	KERNEL12(16  *  7)
614	KERNEL13(16  *  7)
615	KERNEL14(16  *  7)
616	KERNEL15(16  *  7)
617	KERNEL16(16  *  7)
618
619	addq	$32 * 8  * SIZE, AO
620	addq	$32 * 8  * SIZE, BO
621	subq	$128 * 8, %rax
622	BRANCH
623	jg	.L1X
624
625.L12:
626	leaq	(AO, %rax, 2), AO	# * 16
627	leaq	(BO, %rax, 2), BO	# * 64
628
629#else
630	sarq	$3, %rax
631	je	.L15
632	ALIGN_4
633
634.L12:
635	mulpd	%xmm8, %xmm9
636	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
637	addpd	%xmm9, %xmm0
638	movddup	 1 * SIZE(BO), %xmm9
639	mulpd	%xmm8, %xmm9
640	addpd	%xmm9, %xmm1
641	movddup	 2 * SIZE(BO), %xmm9
642	mulpd	%xmm8, %xmm9
643	addpd	%xmm9, %xmm2
644	movddup	 3 * SIZE(BO), %xmm9
645	mulpd	%xmm8, %xmm9
646	movapd	 2 * SIZE(AO), %xmm8
647	addpd	%xmm9, %xmm3
648	movddup	 0 * SIZE(BO), %xmm9
649	mulpd	%xmm8, %xmm9
650	addpd	%xmm9, %xmm4
651	movddup	 1 * SIZE(BO), %xmm9
652	mulpd	%xmm8, %xmm9
653	addpd	%xmm9, %xmm5
654	movddup	 2 * SIZE(BO), %xmm9
655	mulpd	%xmm8, %xmm9
656	addpd	%xmm9, %xmm6
657	movddup	 3 * SIZE(BO), %xmm9
658	mulpd	%xmm8, %xmm9
659	movapd	 4 * SIZE(AO), %xmm8
660	addpd	%xmm9, %xmm7
661	movddup	 4 * SIZE(BO), %xmm9
662	mulpd	%xmm8, %xmm9
663	addpd	%xmm9, %xmm0
664	movddup	 5 * SIZE(BO), %xmm9
665	mulpd	%xmm8, %xmm9
666	addpd	%xmm9, %xmm1
667	movddup	 6 * SIZE(BO), %xmm9
668	mulpd	%xmm8, %xmm9
669	addpd	%xmm9, %xmm2
670	movddup	 7 * SIZE(BO), %xmm9
671	mulpd	%xmm8, %xmm9
672	movapd	 6 * SIZE(AO), %xmm8
673	addpd	%xmm9, %xmm3
674	movddup	 4 * SIZE(BO), %xmm9
675	mulpd	%xmm8, %xmm9
676	addpd	%xmm9, %xmm4
677	movddup	 5 * SIZE(BO), %xmm9
678	mulpd	%xmm8, %xmm9
679	addpd	%xmm9, %xmm5
680	movddup	 6 * SIZE(BO), %xmm9
681	mulpd	%xmm8, %xmm9
682	addpd	%xmm9, %xmm6
683	movddup	 7 * SIZE(BO), %xmm9
684	mulpd	%xmm8, %xmm9
685	movapd	32 * SIZE(AO), %xmm8
686	addpd	%xmm9, %xmm7
687
688	movddup	32 * SIZE(BO), %xmm9
689	mulpd	%xmm10, %xmm11
690	addpd	%xmm11, %xmm0
691	movddup	 9 * SIZE(BO), %xmm11
692	mulpd	%xmm10, %xmm11
693	addpd	%xmm11, %xmm1
694	movddup	10 * SIZE(BO), %xmm11
695	mulpd	%xmm10, %xmm11
696	addpd	%xmm11, %xmm2
697	movddup	11 * SIZE(BO), %xmm11
698	mulpd	%xmm10, %xmm11
699	movapd	10 * SIZE(AO), %xmm10
700	addpd	%xmm11, %xmm3
701
702	movddup	 8 * SIZE(BO), %xmm11
703	mulpd	%xmm10, %xmm11
704	addpd	%xmm11, %xmm4
705	movddup	 9 * SIZE(BO), %xmm11
706	mulpd	%xmm10, %xmm11
707	addpd	%xmm11, %xmm5
708	movddup	10 * SIZE(BO), %xmm11
709	mulpd	%xmm10, %xmm11
710	addpd	%xmm11, %xmm6
711	movddup	11 * SIZE(BO), %xmm11
712	mulpd	%xmm10, %xmm11
713	movapd	12 * SIZE(AO), %xmm10
714	addpd	%xmm11, %xmm7
715	movddup	12 * SIZE(BO), %xmm11
716	mulpd	%xmm10, %xmm11
717	addpd	%xmm11, %xmm0
718	movddup	13 * SIZE(BO), %xmm11
719	mulpd	%xmm10, %xmm11
720	addpd	%xmm11, %xmm1
721	movddup	14 * SIZE(BO), %xmm11
722	mulpd	%xmm10, %xmm11
723	addpd	%xmm11, %xmm2
724	movddup	15 * SIZE(BO), %xmm11
725	mulpd	%xmm10, %xmm11
726	movapd	14 * SIZE(AO), %xmm10
727	addpd	%xmm11, %xmm3
728
729	movddup	12 * SIZE(BO), %xmm11
730	mulpd	%xmm10, %xmm11
731	addpd	%xmm11, %xmm4
732	movddup	13 * SIZE(BO), %xmm11
733	mulpd	%xmm10, %xmm11
734	addpd	%xmm11, %xmm5
735	movddup	14 * SIZE(BO), %xmm11
736	mulpd	%xmm10, %xmm11
737	addpd	%xmm11, %xmm6
738	movddup	15 * SIZE(BO), %xmm11
739	mulpd	%xmm10, %xmm11
740	movapd	40 * SIZE(AO), %xmm10
741	addpd	%xmm11, %xmm7
742	movddup	40 * SIZE(BO), %xmm11
743
744	mulpd	%xmm12, %xmm13
745	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
746	addpd	%xmm13, %xmm0
747	movddup	17 * SIZE(BO), %xmm13
748	mulpd	%xmm12, %xmm13
749	addpd	%xmm13, %xmm1
750	movddup	18 * SIZE(BO), %xmm13
751	mulpd	%xmm12, %xmm13
752	addpd	%xmm13, %xmm2
753	movddup	19 * SIZE(BO), %xmm13
754	mulpd	%xmm12, %xmm13
755	movapd	18 * SIZE(AO), %xmm12
756	addpd	%xmm13, %xmm3
757
758	movddup	16 * SIZE(BO), %xmm13
759	mulpd	%xmm12, %xmm13
760	addpd	%xmm13, %xmm4
761	movddup	17 * SIZE(BO), %xmm13
762	mulpd	%xmm12, %xmm13
763	addpd	%xmm13, %xmm5
764	movddup	18 * SIZE(BO), %xmm13
765	mulpd	%xmm12, %xmm13
766	addpd	%xmm13, %xmm6
767	movddup	19 * SIZE(BO), %xmm13
768	mulpd	%xmm12, %xmm13
769	movapd	20 * SIZE(AO), %xmm12
770	addpd	%xmm13, %xmm7
771
772	movddup	20 * SIZE(BO), %xmm13
773	mulpd	%xmm12, %xmm13
774	addpd	%xmm13, %xmm0
775	movddup	21 * SIZE(BO), %xmm13
776	mulpd	%xmm12, %xmm13
777	addpd	%xmm13, %xmm1
778	movddup	22 * SIZE(BO), %xmm13
779	mulpd	%xmm12, %xmm13
780	addpd	%xmm13, %xmm2
781	movddup	23 * SIZE(BO), %xmm13
782	mulpd	%xmm12, %xmm13
783	movapd	22 * SIZE(AO), %xmm12
784	addpd	%xmm13, %xmm3
785
786	movddup	20 * SIZE(BO), %xmm13
787	mulpd	%xmm12, %xmm13
788	addpd	%xmm13, %xmm4
789	movddup 21 * SIZE(BO), %xmm13
790	mulpd	%xmm12, %xmm13
791	addpd	%xmm13, %xmm5
792	movddup	22 * SIZE(BO), %xmm13
793	mulpd	%xmm12, %xmm13
794	addpd	%xmm13, %xmm6
795	movddup	23 * SIZE(BO), %xmm13
796	mulpd	%xmm12, %xmm13
797	movapd	48 * SIZE(AO), %xmm12
798	addpd	%xmm13, %xmm7
799	movddup	48 * SIZE(BO), %xmm13
800
801	mulpd	%xmm14, %xmm15
802	addpd	%xmm15, %xmm0
803	movddup	25 * SIZE(BO), %xmm15
804	mulpd	%xmm14, %xmm15
805	addpd	%xmm15, %xmm1
806	movddup	26 * SIZE(BO), %xmm15
807	mulpd	%xmm14, %xmm15
808	addpd	%xmm15, %xmm2
809	movddup	27 * SIZE(BO), %xmm15
810	mulpd	%xmm14, %xmm15
811	movapd	26 * SIZE(AO), %xmm14
812	addpd	%xmm15, %xmm3
813
814	movddup	24 * SIZE(BO), %xmm15
815	mulpd	%xmm14, %xmm15
816	addpd	%xmm15, %xmm4
817	movddup	25 * SIZE(BO), %xmm15
818	mulpd	%xmm14, %xmm15
819	addpd	%xmm15, %xmm5
820	movddup	26 * SIZE(BO), %xmm15
821	mulpd	%xmm14, %xmm15
822	addpd	%xmm15, %xmm6
823	movddup	27 * SIZE(BO), %xmm15
824	mulpd	%xmm14, %xmm15
825	movapd	28 * SIZE(AO), %xmm14
826	addpd	%xmm15, %xmm7
827
828	movddup	28 * SIZE(BO), %xmm15
829	mulpd	%xmm14, %xmm15
830	addpd	%xmm15, %xmm0
831	movddup	29 * SIZE(BO), %xmm15
832	mulpd	%xmm14, %xmm15
833	addpd	%xmm15, %xmm1
834	movddup	30 * SIZE(BO), %xmm15
835	mulpd	%xmm14, %xmm15
836	addpd	%xmm15, %xmm2
837	movddup	31 * SIZE(BO), %xmm15
838	mulpd	%xmm14, %xmm15
839	movapd	30 * SIZE(AO), %xmm14
840	addpd	%xmm15, %xmm3
841
842	movddup	28 * SIZE(BO), %xmm15
843	mulpd	%xmm14, %xmm15
844	addpd	%xmm15, %xmm4
845	movddup	29 * SIZE(BO), %xmm15
846	mulpd	%xmm14, %xmm15
847	addpd	%xmm15, %xmm5
848	movddup	30 * SIZE(BO), %xmm15
849	mulpd	%xmm14, %xmm15
850	addpd	%xmm15, %xmm6
851	movddup	31 * SIZE(BO), %xmm15
852	mulpd	%xmm14, %xmm15
853	movapd	56 * SIZE(AO), %xmm14
854	addpd	%xmm15, %xmm7
855	movddup	56 * SIZE(BO), %xmm15
856
857	addq   $32 * SIZE, BO
858	addq   $32 * SIZE, AO
859	decq   %rax
860	BRANCH
861	jne    .L12
862#endif
863	ALIGN_4
864
865.L15:
866#ifndef TRMMKERNEL
867	movq	K, %rax
868#else
869	movq	KKK, %rax
870#endif
871	movsd	ALPHA_R, %xmm15
872	movhpd	ALPHA_I, %xmm15
873	andq	$7, %rax		# if (k & 1)
874	BRANCH
875	BRANCH
876	je	.L19
877	ALIGN_4
878
879.L16:
880	mulpd	%xmm8, %xmm9
881	movapd	 2 * SIZE(AO), %xmm10
882	addpd	%xmm9, %xmm0
883	movddup	 1 * SIZE(BO), %xmm9
884	mulpd	%xmm8, %xmm9
885	movddup	 0 * SIZE(BO), %xmm11
886	addpd	%xmm9, %xmm1
887	movddup	 2 * SIZE(BO), %xmm9
888	mulpd	%xmm8, %xmm9
889	addpd	%xmm9, %xmm2
890	movddup	 3 * SIZE(BO), %xmm9
891	mulpd	%xmm8, %xmm9
892	movapd	 4 * SIZE(AO), %xmm8
893	addpd	%xmm9, %xmm3
894	movddup	 4 * SIZE(BO), %xmm9
895	mulpd	%xmm10, %xmm11
896	addpd	%xmm11, %xmm4
897	movddup	 1 * SIZE(BO), %xmm11
898	mulpd	%xmm10, %xmm11
899	addpd	%xmm11, %xmm5
900	movddup	 2 * SIZE(BO), %xmm11
901	mulpd	%xmm10, %xmm11
902	addpd	%xmm11, %xmm6
903	movddup	 3 * SIZE(BO), %xmm11
904	mulpd	%xmm10, %xmm11
905	addpd	%xmm11, %xmm7
906
907	addq	$4 * SIZE, AO		# aoffset  += 4
908	addq	$4 * SIZE, BO		# boffset1 += 8
909	decq	%rax
910	BRANCH
911	jg	.L16
912	ALIGN_4
913
914.L19:
915	movsd	0 * SIZE(CO1), %xmm8
916	movhpd	1 * SIZE(CO1), %xmm8
917	movsd	2 * SIZE(CO1), %xmm9
918	movhpd	3 * SIZE(CO1), %xmm9
919
920	movsd	4 * SIZE(CO1), %xmm10
921	movhpd	5 * SIZE(CO1), %xmm10
922	movsd	6 * SIZE(CO1), %xmm11
923	movhpd	7 * SIZE(CO1), %xmm11
924
925	movddup	 %xmm0, %xmm12
926	unpckhpd %xmm0, %xmm0
927	movddup	 %xmm4, %xmm13
928	unpckhpd %xmm4, %xmm4
929
930	mulpd	 %xmm15, %xmm12
931	mulpd	 %xmm15, %xmm0
932	mulpd	 %xmm15, %xmm13
933	mulpd	 %xmm15, %xmm4
934
935	addpd	 %xmm12, %xmm8
936	addpd	 %xmm0,  %xmm9
937	addpd	 %xmm13, %xmm10
938	addpd	 %xmm4,  %xmm11
939
940	movsd	%xmm8,  0 * SIZE(CO1)
941	movhpd	%xmm8,  1 * SIZE(CO1)
942	movsd	%xmm9,  2 * SIZE(CO1)
943	movhpd	%xmm9,  3 * SIZE(CO1)
944
945	movsd	%xmm10, 4 * SIZE(CO1)
946	movhpd	%xmm10, 5 * SIZE(CO1)
947	movsd	%xmm11, 6 * SIZE(CO1)
948	movhpd	%xmm11, 7 * SIZE(CO1)
949
950	movsd	0 * SIZE(CO2), %xmm8
951	movhpd	1 * SIZE(CO2), %xmm8
952	movsd	2 * SIZE(CO2), %xmm9
953	movhpd	3 * SIZE(CO2), %xmm9
954
955	movsd	4 * SIZE(CO2), %xmm10
956	movhpd	5 * SIZE(CO2), %xmm10
957	movsd	6 * SIZE(CO2), %xmm11
958	movhpd	7 * SIZE(CO2), %xmm11
959
960	movddup	 %xmm1, %xmm12
961	unpckhpd %xmm1, %xmm1
962	movddup	 %xmm5, %xmm13
963	unpckhpd %xmm5, %xmm5
964
965	mulpd	 %xmm15, %xmm12
966	mulpd	 %xmm15, %xmm1
967	mulpd	 %xmm15, %xmm13
968	mulpd	 %xmm15, %xmm5
969
970	addpd	 %xmm12, %xmm8
971	addpd	 %xmm1,  %xmm9
972	addpd	 %xmm13, %xmm10
973	addpd	 %xmm5,  %xmm11
974
975	movsd	%xmm8,  0 * SIZE(CO2)
976	movhpd	%xmm8,  1 * SIZE(CO2)
977	movsd	%xmm9,  2 * SIZE(CO2)
978	movhpd	%xmm9,  3 * SIZE(CO2)
979
980	movsd	%xmm10, 4 * SIZE(CO2)
981	movhpd	%xmm10, 5 * SIZE(CO2)
982	movsd	%xmm11, 6 * SIZE(CO2)
983	movhpd	%xmm11, 7 * SIZE(CO2)
984
985	movsd	0 * SIZE(CO1, LDC, 2), %xmm8
986	movhpd	1 * SIZE(CO1, LDC, 2), %xmm8
987	movsd	2 * SIZE(CO1, LDC, 2), %xmm9
988	movhpd	3 * SIZE(CO1, LDC, 2), %xmm9
989
990	movsd	4 * SIZE(CO1, LDC, 2), %xmm10
991	movhpd	5 * SIZE(CO1, LDC, 2), %xmm10
992	movsd	6 * SIZE(CO1, LDC, 2), %xmm11
993	movhpd	7 * SIZE(CO1, LDC, 2), %xmm11
994
995	movddup	 %xmm2, %xmm12
996	unpckhpd %xmm2, %xmm2
997	movddup	 %xmm6, %xmm13
998	unpckhpd %xmm6, %xmm6
999
1000	mulpd	 %xmm15, %xmm12
1001	mulpd	 %xmm15, %xmm2
1002	mulpd	 %xmm15, %xmm13
1003	mulpd	 %xmm15, %xmm6
1004
1005	addpd	 %xmm12, %xmm8
1006	addpd	 %xmm2,  %xmm9
1007	addpd	 %xmm13, %xmm10
1008	addpd	 %xmm6,  %xmm11
1009
1010	movsd	%xmm8,  0 * SIZE(CO1, LDC, 2)
1011	movhpd	%xmm8,  1 * SIZE(CO1, LDC, 2)
1012	movsd	%xmm9,  2 * SIZE(CO1, LDC, 2)
1013	movhpd	%xmm9,  3 * SIZE(CO1, LDC, 2)
1014
1015	movsd	%xmm10, 4 * SIZE(CO1, LDC, 2)
1016	movhpd	%xmm10, 5 * SIZE(CO1, LDC, 2)
1017	movsd	%xmm11, 6 * SIZE(CO1, LDC, 2)
1018	movhpd	%xmm11, 7 * SIZE(CO1, LDC, 2)
1019
1020	movsd	0 * SIZE(CO2, LDC, 2), %xmm8
1021	movhpd	1 * SIZE(CO2, LDC, 2), %xmm8
1022	movsd	2 * SIZE(CO2, LDC, 2), %xmm9
1023	movhpd	3 * SIZE(CO2, LDC, 2), %xmm9
1024
1025	movsd	4 * SIZE(CO2, LDC, 2), %xmm10
1026	movhpd	5 * SIZE(CO2, LDC, 2), %xmm10
1027	movsd	6 * SIZE(CO2, LDC, 2), %xmm11
1028	movhpd	7 * SIZE(CO2, LDC, 2), %xmm11
1029
1030	movddup	 %xmm3, %xmm12
1031	unpckhpd %xmm3, %xmm3
1032	movddup	 %xmm7, %xmm13
1033	unpckhpd %xmm7, %xmm7
1034
1035	mulpd	 %xmm15, %xmm12
1036	mulpd	 %xmm15, %xmm3
1037	mulpd	 %xmm15, %xmm13
1038	mulpd	 %xmm15, %xmm7
1039
1040	addpd	 %xmm12, %xmm8
1041	addpd	 %xmm3,  %xmm9
1042	addpd	 %xmm13, %xmm10
1043	addpd	 %xmm7,  %xmm11
1044
1045	movsd	%xmm8,  0 * SIZE(CO2, LDC, 2)
1046	movhpd	%xmm8,  1 * SIZE(CO2, LDC, 2)
1047	movsd	%xmm9,  2 * SIZE(CO2, LDC, 2)
1048	movhpd	%xmm9,  3 * SIZE(CO2, LDC, 2)
1049
1050	movsd	%xmm10, 4 * SIZE(CO2, LDC, 2)
1051	movhpd	%xmm10, 5 * SIZE(CO2, LDC, 2)
1052	movsd	%xmm11, 6 * SIZE(CO2, LDC, 2)
1053	movhpd	%xmm11, 7 * SIZE(CO2, LDC, 2)
1054
1055	addq	$8 * SIZE, CO1		# coffset += 4
1056	addq	$8 * SIZE, CO2		# coffset += 4
1057
1058	decq	I			# i --
1059	jg	.L11
1060	jmp	.L20
1061	ALIGN_4
1062
1063.L20:
1064	testq	$2, M
1065	BRANCH
1066	je	.L30
1067	ALIGN_4
1068
1069.L21:
1070#if !defined(TRMMKERNEL) || \
1071	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1072	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1073
	/* ---------------------------------------------------------------
	 * N = 4, M = 2 micro-kernel body.  (The #if guarding this BO
	 * setup starts above this chunk.)  Accumulators xmm0..xmm3 hold
	 * the 2-wide partial sums for the four B columns.
	 * --------------------------------------------------------------- */
	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO	# A advances 2 elements per k step
	leaq	(B,  %rax, 4), BO	# B advances 4 elements per k step
#endif

	movapd	 0 * SIZE(AO), %xmm8	# preload first A pair
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9	# preload/broadcast first B scalar
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax		# KKK = KK + MR (2)
#else
	addq	$4, %rax		# KKK = KK + NR (4)
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# trip count = k / 8 (loop is 8x unrolled)
	je	.L25
	ALIGN_4

/* Main 8x-unrolled rank-1 update loop: each step multiplies one A pair
   by four broadcast B values and accumulates into xmm0..xmm3. */
.L22:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	16 * SIZE(BO), %xmm9	# preload B for the second half
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm2
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm2
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm3
	movddup	24 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	17 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm1
	movddup	18 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm2
	movddup	19 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm3
	movddup	20 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	21 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm1
	movddup	22 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm2
	movddup	23 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm3
	movddup	32 * SIZE(BO), %xmm9	# preload B for next iteration
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	25 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm1
	movddup	26 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	27 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	28 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	29 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm1
	movddup	30 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	31 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	24 * SIZE(AO), %xmm10	# preload A for next iteration
	addpd	%xmm11, %xmm3
	movddup	40 * SIZE(BO), %xmm11

	addq   $16 * SIZE, AO		# 8 k-steps * 2 A elements
	addq   $32 * SIZE, BO		# 8 k-steps * 4 B elements
	decq   %rax
	jne    .L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA_R, %xmm15		# xmm15 = [alpha_r | alpha_i]
	movhpd	ALPHA_I, %xmm15
	andq	$7, %rax		# k_left = k & 7 (remainder iterations)
	BRANCH
	je .L29
	ALIGN_4

/* Remainder loop: one k-step per iteration. */
.L26:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L26
	ALIGN_4

/* Write-back: split each accumulator into its low (movddup) and high
   (unpckhpd) lane, scale both by the [alpha_r | alpha_i] pair in xmm15,
   and add into C.  NOTE(review): this [re,im]-pair alpha application
   looks like a 3M-style complex kernel — confirm against the companion
   kernels in this family. */
.L29:
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	2 * SIZE(CO1), %xmm9
	movhpd	3 * SIZE(CO1), %xmm9

	movddup	 %xmm0, %xmm12
	unpckhpd %xmm0, %xmm0

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm0
	addpd	 %xmm12, %xmm8
	addpd	 %xmm0,  %xmm9

	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movhpd	%xmm9,  3 * SIZE(CO1)

	movsd	0 * SIZE(CO2), %xmm8
	movhpd	1 * SIZE(CO2), %xmm8
	movsd	2 * SIZE(CO2), %xmm9
	movhpd	3 * SIZE(CO2), %xmm9

	movddup	 %xmm1, %xmm12
	unpckhpd %xmm1, %xmm1

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm1
	addpd	 %xmm12, %xmm8
	addpd	 %xmm1,  %xmm9

	movsd	%xmm8,  0 * SIZE(CO2)
	movhpd	%xmm8,  1 * SIZE(CO2)
	movsd	%xmm9,  2 * SIZE(CO2)
	movhpd	%xmm9,  3 * SIZE(CO2)

	movsd	0 * SIZE(CO1, LDC, 2), %xmm8	# column 3 = CO1 + 2*ldc
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm8
	movsd	2 * SIZE(CO1, LDC, 2), %xmm9
	movhpd	3 * SIZE(CO1, LDC, 2), %xmm9

	movddup	 %xmm2, %xmm12
	unpckhpd %xmm2, %xmm2

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm2
	addpd	 %xmm12, %xmm8
	addpd	 %xmm2,  %xmm9

	movsd	%xmm8,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm8,  1 * SIZE(CO1, LDC, 2)
	movsd	%xmm9,  2 * SIZE(CO1, LDC, 2)
	movhpd	%xmm9,  3 * SIZE(CO1, LDC, 2)

	movsd	0 * SIZE(CO2, LDC, 2), %xmm8	# column 4 = CO2 + 2*ldc
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm8
	movsd	2 * SIZE(CO2, LDC, 2), %xmm9
	movhpd	3 * SIZE(CO2, LDC, 2), %xmm9

	movddup	 %xmm3, %xmm12
	unpckhpd %xmm3, %xmm3

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm3
	addpd	 %xmm12, %xmm8
	addpd	 %xmm3,  %xmm9

	movsd	%xmm8,  0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm8,  1 * SIZE(CO2, LDC, 2)
	movsd	%xmm9,  2 * SIZE(CO2, LDC, 2)
	movhpd	%xmm9,  3 * SIZE(CO2, LDC, 2)

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	ALIGN_4
1333
/* N = 4, M = 1 edge case: handle the last row when M is odd. */
.L30:
	testq	$1, M
	je	.L39
	ALIGN_4

.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO	# A advances 1 element per k step
	leaq	(B,  %rax, 4), BO	# B advances 4 elements per k step
#endif

	/* Here the roles flip: the A scalar is broadcast (movddup) and B
	   is loaded pairwise, so xmm0/xmm1 each hold two column sums. */
	movddup	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movddup	 4 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movapd	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		# KKK = KK + MR (1)
#else
	addq	$4, %rax		# KKK = KK + NR (4)
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# trip count = k / 8
	je	.L35
	ALIGN_4

/* Main 8x-unrolled loop for the 1x4 tile. */
.L32:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movapd	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	16 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movapd	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movddup	 3 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movapd	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movapd	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movddup	 8 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movapd	24 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movapd	18 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movddup	 5 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movapd	20 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movapd	22 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movddup	 6 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movapd	32 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movapd	26 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movddup	 7 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movapd	28 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movapd	30 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movddup	12 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movapd	40 * SIZE(BO), %xmm11

	addq   $ 8 * SIZE, AO		# 8 k-steps * 1 A element
	addq   $32 * SIZE, BO		# 8 k-steps * 4 B elements
	decq   %rax
	jne    .L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA_R, %xmm15		# xmm15 = [alpha_r | alpha_i]
	movhpd	ALPHA_I, %xmm15
	andq	$7, %rax		# k_left = k & 7
	BRANCH
	je .L38
	ALIGN_4

/* Remainder loop: one k-step per iteration. */
.L36:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movapd	 4 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L36
	ALIGN_4

/* Write-back of the 1x4 result: xmm0 low/high feed columns 1/2,
   xmm1 low/high feed columns 3/4, each scaled by [alpha_r|alpha_i]. */
.L38:
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8

	movddup	 %xmm0, %xmm12

	mulpd	 %xmm15, %xmm12
	addpd	 %xmm12, %xmm8

	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)

	movsd	0 * SIZE(CO2), %xmm8
	movhpd	1 * SIZE(CO2), %xmm8

	unpckhpd  %xmm0, %xmm0

	mulpd	 %xmm15, %xmm0
	addpd	 %xmm0,  %xmm8

	movsd	%xmm8,  0 * SIZE(CO2)
	movhpd	%xmm8,  1 * SIZE(CO2)

	movsd	0 * SIZE(CO1, LDC, 2), %xmm8
	movhpd	1 * SIZE(CO1, LDC, 2), %xmm8

	movddup	 %xmm1,  %xmm12

	mulpd	 %xmm15, %xmm12
	addpd	 %xmm12, %xmm8

	movsd	%xmm8,  0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm8,  1 * SIZE(CO1, LDC, 2)

	movsd	0 * SIZE(CO2, LDC, 2), %xmm8
	movhpd	1 * SIZE(CO2, LDC, 2), %xmm8

	unpckhpd %xmm1, %xmm1

	mulpd	 %xmm15, %xmm1
	addpd	 %xmm1, %xmm8

	movsd	%xmm8,  0 * SIZE(CO2, LDC, 2)
	movhpd	%xmm8,  1 * SIZE(CO2, LDC, 2)
	ALIGN_4
1518
/* End of the N=4 column block: bump KK, advance C by 4 columns, and
   loop back for the next group of 4 columns. */
.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	/* Use a 64-bit add: KK is read and written with 64-bit movq/subq
	   everywhere else in this kernel, so the previous 32-bit `addl`
	   updated only the low dword of the slot and could corrupt KK
	   (lost carry/sign) for negative offsets. */
	addq	$4, KK
#endif

	leaq	(C, LDC, 4), C		# c += 4 * ldc
	movq	BO, B
	decq	J			# j --
	jg	.L10
	ALIGN_4
1529
/* ================================================================
 * N = 2 column block: executed once when N is congruent to 2 or 3
 * mod 4.  Same structure as the N=4 block, with two accumulator
 * pairs per row tile.
 * ================================================================ */
.L40:
	testq	$2, N
	je	.L80
	ALIGN_4

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax		# restart KK at OFFSET for a new column block
	movq	%rax, KK
#endif

	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L60
	ALIGN_4

/* N = 2, M = 4 micro-kernel. xmm0/xmm1: columns 1,2 of rows 0-1;
   xmm4/xmm5: columns 1,2 of rows 2-3. */
.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO	# A advances 4 elements per k step
	leaq	(B,  %rax, 2), BO	# B advances 2 elements per k step
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm4, %xmm4
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm5, %xmm5

#ifdef HAVE_3DNOW
	prefetchw      4 * SIZE(CO1)	# prefetch C for write
	prefetchw      4 * SIZE(CO2)
#else
	prefetchnta     4 * SIZE(CO1)
	prefetchnta     4 * SIZE(CO2)
#endif

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax		# KKK = KK + MR (4)
#else
	addq	$2, %rax		# KKK = KK + NR (2)
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# trip count = k / 8
	je	.L55
	ALIGN_4

/* Main 8x-unrolled loop for the 4x2 tile. */
.L52:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm5
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm5
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm5
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	addpd	%xmm9, %xmm4
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm9
	movapd	40 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm5
	movddup	16 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm11
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	18 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	 8 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	20 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	22 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	24 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	26 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	28 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm0
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	30 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	addpd	%xmm11, %xmm4
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm8, %xmm11
	movapd	32 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm5
	movddup	24 * SIZE(BO), %xmm11

	addq   $32 * SIZE, AO		# 8 k-steps * 4 A elements
	addq   $16 * SIZE, BO		# 8 k-steps * 2 B elements
	decq   %rax
	jne    .L52
	ALIGN_4

.L55:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA_R, %xmm15		# xmm15 = [alpha_r | alpha_i]
	movhpd	ALPHA_I, %xmm15
	andq	$7, %rax		# k_left = k & 7
	BRANCH
	je .L59
	ALIGN_4

/* Remainder loop: one k-step per iteration. */
.L56:
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movddup	 0 * SIZE(BO), %xmm11
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm4
	movddup	 1 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm5

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L56
	ALIGN_4

/* Write-back of the 4x2 tile, scaled by [alpha_r | alpha_i]. */
.L59:
	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	2 * SIZE(CO1), %xmm9
	movhpd	3 * SIZE(CO1), %xmm9

	movsd	4 * SIZE(CO1), %xmm10
	movhpd	5 * SIZE(CO1), %xmm10
	movsd	6 * SIZE(CO1), %xmm11
	movhpd	7 * SIZE(CO1), %xmm11

	movddup	 %xmm0, %xmm12
	unpckhpd %xmm0, %xmm0
	movddup	 %xmm4, %xmm13
	unpckhpd %xmm4, %xmm4

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm0
	mulpd	 %xmm15, %xmm13
	mulpd	 %xmm15, %xmm4

	addpd	 %xmm12, %xmm8
	addpd	 %xmm0,  %xmm9
	addpd	 %xmm13, %xmm10
	addpd	 %xmm4,  %xmm11

	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movhpd	%xmm9,  3 * SIZE(CO1)

	movsd	%xmm10, 4 * SIZE(CO1)
	movhpd	%xmm10, 5 * SIZE(CO1)
	movsd	%xmm11, 6 * SIZE(CO1)
	movhpd	%xmm11, 7 * SIZE(CO1)

	movsd	0 * SIZE(CO2), %xmm8
	movhpd	1 * SIZE(CO2), %xmm8
	movsd	2 * SIZE(CO2), %xmm9
	movhpd	3 * SIZE(CO2), %xmm9

	movsd	4 * SIZE(CO2), %xmm10
	movhpd	5 * SIZE(CO2), %xmm10
	movsd	6 * SIZE(CO2), %xmm11
	movhpd	7 * SIZE(CO2), %xmm11

	movddup	 %xmm1, %xmm12
	unpckhpd %xmm1, %xmm1
	movddup	 %xmm5, %xmm13
	unpckhpd %xmm5, %xmm5

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm1
	mulpd	 %xmm15, %xmm13
	mulpd	 %xmm15, %xmm5

	addpd	 %xmm12, %xmm8
	addpd	 %xmm1,  %xmm9
	addpd	 %xmm13, %xmm10
	addpd	 %xmm5,  %xmm11

	movsd	%xmm8,  0 * SIZE(CO2)
	movhpd	%xmm8,  1 * SIZE(CO2)
	movsd	%xmm9,  2 * SIZE(CO2)
	movhpd	%xmm9,  3 * SIZE(CO2)

	movsd	%xmm10, 4 * SIZE(CO2)
	movhpd	%xmm10, 5 * SIZE(CO2)
	movsd	%xmm11, 6 * SIZE(CO2)
	movhpd	%xmm11, 7 * SIZE(CO2)

	addq	$8 * SIZE, CO1		# coffset += 8
	addq	$8 * SIZE, CO2		# coffset += 8

	decq	I			# i --
	jg	.L51
	ALIGN_4

/* N = 2, M = 2 edge case. */
.L60:
	testq	$2, M
	je	.L70
	ALIGN_4

.L61:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 2), BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax		# MR and NR are both 2 here,
#else
	addq	$2, %rax		# so both branches add 2
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# trip count = k / 8
	je	.L65
	ALIGN_4

/* Main 8x-unrolled loop for the 2x2 tile; xmm0/xmm2 and xmm1/xmm3
   are paired accumulators, merged at .L69. */
.L62:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 5 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 6 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 6 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	movddup	 7 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	16 * SIZE(BO), %xmm9
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	 9 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	10 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	10 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	11 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm0
	movddup	13 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	14 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	14 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	addpd	%xmm11, %xmm2
	movddup	15 * SIZE(BO), %xmm11
	mulpd	%xmm10, %xmm11
	movapd	24 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	24 * SIZE(BO), %xmm11

	addq   $16 * SIZE, AO		# 8 k-steps * 2 A elements
	addq   $16 * SIZE, BO		# 8 k-steps * 2 B elements
	decq   %rax
	jne    .L62
	ALIGN_4

.L65:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA_R, %xmm15		# xmm15 = [alpha_r | alpha_i]
	movhpd	ALPHA_I, %xmm15
	andq	$7, %rax		# k_left = k & 7
	BRANCH
	je .L69
	ALIGN_4

/* Remainder loop: one k-step per iteration (xmm2/xmm3 stay idle). */
.L66:
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	movapd	 2 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L66
	ALIGN_4

/* Fold the paired accumulators, then write back the 2x2 tile. */
.L69:
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	2 * SIZE(CO1), %xmm9
	movhpd	3 * SIZE(CO1), %xmm9

	movddup	 %xmm0, %xmm12
	unpckhpd %xmm0, %xmm0

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm0
	addpd	 %xmm12, %xmm8
	addpd	 %xmm0,  %xmm9

	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movhpd	%xmm9,  3 * SIZE(CO1)

	movsd	0 * SIZE(CO2), %xmm8
	movhpd	1 * SIZE(CO2), %xmm8
	movsd	2 * SIZE(CO2), %xmm9
	movhpd	3 * SIZE(CO2), %xmm9

	movddup	 %xmm1, %xmm12
	unpckhpd %xmm1, %xmm1

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm1
	addpd	 %xmm12, %xmm8
	addpd	 %xmm1,  %xmm9

	movsd	%xmm8,  0 * SIZE(CO2)
	movhpd	%xmm8,  1 * SIZE(CO2)
	movsd	%xmm9,  2 * SIZE(CO2)
	movhpd	%xmm9,  3 * SIZE(CO2)

	addq	$4 * SIZE, CO1
	addq	$4 * SIZE, CO2
	ALIGN_4

/* N = 2, M = 1 edge case. */
.L70:
	testq	$1, M
	je	.L79
	ALIGN_4

.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 2), BO
#endif

	movddup	 0 * SIZE(AO), %xmm8	# broadcast A scalar; B loaded pairwise
	pxor	%xmm0, %xmm0
	movapd	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movddup	 4 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movapd	 8 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		# KKK = KK + MR (1)
#else
	addq	$2, %rax		# KKK = KK + NR (2)
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# trip count = k / 8
	je	.L75
	ALIGN_4

/* Main 8x-unrolled loop for the 1x2 tile; four rotating accumulators
   xmm0..xmm3, folded at .L78. */
.L72:
	mulpd	%xmm8, %xmm9
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm0
	mulpd	 2 * SIZE(BO), %xmm8
	movapd	16 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm1
	movddup	 2 * SIZE(AO), %xmm8
	mulpd	 4 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm2
	movddup	 3 * SIZE(AO), %xmm8
	mulpd	 6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm3
	movddup	 8 * SIZE(AO), %xmm8
	mulpd	%xmm10, %xmm11
	movddup	 5 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm0
	mulpd	10 * SIZE(BO), %xmm10
	movapd	24 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm1
	movddup	 6 * SIZE(AO), %xmm10
	mulpd	12 * SIZE(BO), %xmm10
	addpd	%xmm10, %xmm2
	movddup	 7 * SIZE(AO), %xmm10
	mulpd	14 * SIZE(BO), %xmm10
	addpd	%xmm10, %xmm3
	movddup	12 * SIZE(AO), %xmm10

	addq   $ 8 * SIZE, AO		# 8 k-steps * 1 A element
	addq   $16 * SIZE, BO		# 8 k-steps * 2 B elements
	decq   %rax
	jne    .L72
	ALIGN_4

.L75:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA_R, %xmm15		# xmm15 = [alpha_r | alpha_i]
	movhpd	ALPHA_I, %xmm15
	andq	$7, %rax		# k_left = k & 7
	BRANCH
	je .L78
	ALIGN_4

/* Remainder loop: one k-step per iteration. */
.L76:
	mulpd	%xmm8, %xmm9
	movddup	 1 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm0
	movapd	 2 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# aoffset  += 1
	addq	$2 * SIZE, BO		# boffset1 += 2
	decq	%rax
	jg	.L76
	ALIGN_4

/* Fold all four accumulators into xmm0, then write the 1x2 result. */
.L78:
	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0

	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8

	movddup	 %xmm0,  %xmm12
	mulpd	 %xmm15, %xmm12
	addpd	 %xmm12, %xmm8

	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)

	movsd	0 * SIZE(CO2), %xmm8
	movhpd	1 * SIZE(CO2), %xmm8

	unpckhpd %xmm0, %xmm0

	mulpd	 %xmm15, %xmm0
	addpd	 %xmm0,  %xmm8

	movsd	%xmm8,  0 * SIZE(CO2)
	movhpd	%xmm8,  1 * SIZE(CO2)
	ALIGN_4
2148
/* End of the N=2 column block: bump KK and advance C by 2 columns. */
.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	/* Use a 64-bit add: KK is read and written with 64-bit movq/subq
	   everywhere else in this kernel, so the previous 32-bit `addl`
	   updated only the low dword of the slot and could corrupt KK
	   (lost carry/sign) for negative offsets. */
	addq	$2, KK
#endif
	leaq	(C, LDC, 2), C
	movq	BO, B
	ALIGN_4
2156
/* ================================================================
 * N = 1 column block: executed once when N is odd.  Only CO1 is
 * written; there is no second column pointer.
 * ================================================================ */
.L80:
	testq	$1, N
	je	.L999
	ALIGN_4

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax		# restart KK at OFFSET for a new column block
	movq	%rax, KK
#endif

	movq	C, CO1
	movq	A, AO

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L100
	ALIGN_4

/* N = 1, M = 4 micro-kernel.  xmm0/xmm1 accumulate rows 0-1 / 2-3;
   xmm2/xmm3 are the second rotation, folded at .L99. */
.L91:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO	# A advances 4 elements per k step
	leaq	(B,  %rax, 1), BO	# B advances 1 element per k step
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 4 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifdef HAVE_3DNOW
	prefetchw      4 * SIZE(CO1)	# prefetch C for write
#else
	prefetchnta     4 * SIZE(CO1)
#endif

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax		# KKK = KK + MR (4)
#else
	addq	$1, %rax		# KKK = KK + NR (1)
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# trip count = k / 8
	je	.L95
	ALIGN_4

/* Main 8x-unrolled loop for the 4x1 tile. */
.L92:
	mulpd	%xmm9, %xmm8
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	mulpd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 1 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm8
	mulpd	 6 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm2
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm3
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm10
	mulpd	10 * SIZE(AO), %xmm9
	addpd	%xmm10, %xmm0
	movapd	12 * SIZE(AO), %xmm10
	addpd	%xmm9, %xmm1
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	%xmm9, %xmm10
	mulpd	14 * SIZE(AO), %xmm9
	addpd	%xmm10, %xmm2
	movapd	24 * SIZE(AO), %xmm10
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm9, %xmm3
	movddup	 8 * SIZE(BO), %xmm9
	mulpd	%xmm11, %xmm8
	mulpd	18 * SIZE(AO), %xmm11
	addpd	%xmm8, %xmm0
	movapd	20 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm1
	movddup	 5 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm8
	mulpd	22 * SIZE(AO), %xmm11
	addpd	%xmm8, %xmm2
	movapd	32 * SIZE(AO), %xmm8
	addpd	%xmm11, %xmm3
	movddup	 6 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm10
	mulpd	26 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm0
	movapd	28 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	 7 * SIZE(BO), %xmm11
	mulpd	%xmm11, %xmm10
	mulpd	30 * SIZE(AO), %xmm11
	addpd	%xmm10, %xmm2
	movapd	40 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11

	addq   $32 * SIZE, AO		# 8 k-steps * 4 A elements
	addq   $8 * SIZE, BO		# 8 k-steps * 1 B element
	decq   %rax
	jne    .L92
	ALIGN_4

.L95:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA_R, %xmm15		# xmm15 = [alpha_r | alpha_i]
	movhpd	ALPHA_I, %xmm15
	andq	$7, %rax		# k_left = k & 7
	BRANCH
	je .L99
	ALIGN_4

/* Remainder loop: one k-step per iteration. */
.L96:
	mulpd	%xmm9, %xmm8
	mulpd	 2 * SIZE(AO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 4 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 1 * SIZE(BO), %xmm9

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$1 * SIZE, BO		# boffset1 += 1
	decq	%rax
	jg	.L96
	ALIGN_4

/* Fold the rotated accumulators, then write back the 4x1 tile. */
.L99:
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	2 * SIZE(CO1), %xmm9
	movhpd	3 * SIZE(CO1), %xmm9

	movsd	4 * SIZE(CO1), %xmm10
	movhpd	5 * SIZE(CO1), %xmm10
	movsd	6 * SIZE(CO1), %xmm11
	movhpd	7 * SIZE(CO1), %xmm11

	movddup	 %xmm0, %xmm12
	unpckhpd %xmm0, %xmm0
	movddup	 %xmm1, %xmm13
	unpckhpd %xmm1, %xmm1

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm0
	mulpd	 %xmm15, %xmm13
	mulpd	 %xmm15, %xmm1

	addpd	 %xmm12, %xmm8
	addpd	 %xmm0,  %xmm9
	addpd	 %xmm13, %xmm10
	addpd	 %xmm1,  %xmm11

	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movhpd	%xmm9,  3 * SIZE(CO1)

	movsd	%xmm10, 4 * SIZE(CO1)
	movhpd	%xmm10, 5 * SIZE(CO1)
	movsd	%xmm11, 6 * SIZE(CO1)
	movhpd	%xmm11, 7 * SIZE(CO1)

	addq	$8 * SIZE, CO1		# coffset += 8
	decq	I			# i --
	jg	.L91
	ALIGN_4

/* N = 1, M = 2 edge case. */
.L100:
	testq	$2, M
	je	.L110
	ALIGN_4

.L101:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), BO
#endif

	movapd	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 0 * SIZE(BO), %xmm9
	pxor	%xmm1, %xmm1
	movapd	 8 * SIZE(AO), %xmm10
	pxor	%xmm2, %xmm2
	movddup	 4 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax		# KKK = KK + MR (2)
#else
	addq	$1, %rax		# KKK = KK + NR (1)
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# trip count = k / 8
	je	.L105
	ALIGN_4

/* Main 8x-unrolled loop for the 2x1 tile; accumulators xmm0..xmm3
   are folded at .L109. */
.L102:
	mulpd	%xmm9, %xmm8
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AO)
	movddup	 1 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm0
	mulpd	 2 * SIZE(AO), %xmm9
	movapd	16 * SIZE(AO), %xmm8
	addpd	%xmm9, %xmm1
	movddup	 2 * SIZE(BO), %xmm9
	mulpd	 4 * SIZE(AO), %xmm9
	addpd	%xmm9, %xmm2
	movddup	 3 * SIZE(BO), %xmm9
	mulpd	 6 * SIZE(AO), %xmm9
	addpd	%xmm9, %xmm3
	movddup	 8 * SIZE(BO), %xmm9
	mulpd	%xmm11, %xmm10
	movddup	 5 * SIZE(BO), %xmm11
	addpd	%xmm10, %xmm0
	mulpd	10 * SIZE(AO), %xmm11
	movapd	24 * SIZE(AO), %xmm10
	addpd	%xmm11, %xmm1
	movddup	 6 * SIZE(BO), %xmm11
	mulpd	12 * SIZE(AO), %xmm11
	addpd	%xmm11, %xmm2
	movddup	 7 * SIZE(BO), %xmm11
	mulpd	14 * SIZE(AO), %xmm11
	addpd	%xmm11, %xmm3
	movddup	12 * SIZE(BO), %xmm11

	addq   $16 * SIZE, AO		# 8 k-steps * 2 A elements
	addq   $ 8 * SIZE, BO		# 8 k-steps * 1 B element
	decq   %rax
	jne    .L102
	ALIGN_4

.L105:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA_R, %xmm15		# xmm15 = [alpha_r | alpha_i]
	movhpd	ALPHA_I, %xmm15
	andq	$7, %rax		# k_left = k & 7
	BRANCH
	je .L109
	ALIGN_4

/* Remainder loop: one k-step per iteration. */
.L106:
	mulpd	%xmm9, %xmm8
	movddup	 1 * SIZE(BO), %xmm9
	addpd	%xmm8, %xmm0
	movapd	 2 * SIZE(AO), %xmm8

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$1 * SIZE, BO		# boffset1 += 1
	decq	%rax
	jg	.L106
	ALIGN_4

/* Fold all four accumulators, then write back the 2x1 tile. */
.L109:
	addpd	%xmm1, %xmm0
	addpd	%xmm3, %xmm2
	addpd	%xmm2, %xmm0

	movsd	0 * SIZE(CO1), %xmm8
	movhpd	1 * SIZE(CO1), %xmm8
	movsd	2 * SIZE(CO1), %xmm9
	movhpd	3 * SIZE(CO1), %xmm9

	movddup	 %xmm0, %xmm12
	unpckhpd %xmm0, %xmm0

	mulpd	 %xmm15, %xmm12
	mulpd	 %xmm15, %xmm0
	addpd	 %xmm12, %xmm8
	addpd	 %xmm0,  %xmm9

	movsd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movsd	%xmm9,  2 * SIZE(CO1)
	movhpd	%xmm9,  3 * SIZE(CO1)

	addq	$4 * SIZE, CO1
	ALIGN_4

/* N = 1, M = 1 edge case: single remaining element. */
.L110:
	testq	$1, M
	je	.L999
	ALIGN_4
/* N = 1, M = 1 setup: position AO/BO for the TRMM case, clear the
   accumulators, preload operands, and compute the k trip count. */
.L111:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movq	B, BO
#else
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO	# A advances 1 element per k step
	leaq	(B,  %rax, 1), BO	# B advances 1 element per k step
#endif

	/* Clear the accumulators.  (The original code also issued four
	   movsd preloads into %xmm8-%xmm11 here; they were dead code,
	   unconditionally overwritten by the movapd loads below, and
	   have been removed.) */
	pxor	%xmm0, %xmm0
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3

	movapd	 0 * SIZE(AO), %xmm9	# preload A and B pairs
	movapd	 0 * SIZE(BO), %xmm8
	movapd	 4 * SIZE(AO), %xmm11
	movapd	 4 * SIZE(BO), %xmm10

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax		# MR and NR are both 1 here,
#else
	addq	$1, %rax		# so both branches add 1
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# trip count = k / 8
	je	.L115
	ALIGN_4
2533
# 8x-unrolled dot-product loop: four 2-wide mulpd/addpd pairs per pass,
# alternating accumulators xmm0/xmm1 to hide latency; next pass's
# operands are preloaded into xmm8-xmm11.
2534.L112:
2535	mulpd	%xmm9, %xmm8		# a[0..1] * b[0..1]
2536	movapd	 2 * SIZE(AO), %xmm9
2537	addpd	%xmm8, %xmm0
2538	mulpd	 2 * SIZE(BO), %xmm9	# a[2..3] * b[2..3]
2539	movapd	 8 * SIZE(BO), %xmm8	# preload B for next pass
2540	addpd	%xmm9, %xmm1
2541	movapd	 8 * SIZE(AO), %xmm9	# preload A for next pass
2542	mulpd	%xmm11, %xmm10		# a[4..5] * b[4..5]
2543	movapd	 6 * SIZE(AO), %xmm11
2544	addpd	%xmm10, %xmm0
2545	mulpd	 6 * SIZE(BO), %xmm11	# a[6..7] * b[6..7]
2546	movapd	12 * SIZE(BO), %xmm10	# preload B for next pass
2547	addpd	%xmm11, %xmm1
2548	movapd	12 * SIZE(AO), %xmm11	# preload A for next pass
2549
2550	addq   $8 * SIZE, AO		# 8 k-steps consumed from A and B
2551	addq   $8 * SIZE, BO
2552	decq   %rax
2553	jne    .L112
2554	ALIGN_4
2555
# Scalar k % 8 remainder, then horizontal-sum the accumulators and write
# the single result (scaled by { ALPHA_R, ALPHA_I }) into 2 doubles of C.
2556.L115:
2557#ifndef TRMMKERNEL
2558	movq	K, %rax
2559#else
2560	movq	KKK, %rax		# TRMM: effective k for this tile
2561#endif
2562	movsd	ALPHA_R, %xmm15		# xmm15 = { alpha_r, alpha_i }
2563	movhpd	ALPHA_I, %xmm15
2564	andq	$7, %rax		# remaining iterations: k & 7
2565	BRANCH
2566	je .L118
2567	ALIGN_4
2568
2569.L116:
2570	mulsd	 0 * SIZE(BO), %xmm9	# one scalar k-step: a * b
2571	addsd	%xmm9, %xmm0
2572	movsd	 1 * SIZE(AO), %xmm9	# load next a
2573
2574	addq	$1 * SIZE, AO		# aoffset  += 1
2575	addq	$1 * SIZE, BO		# boffset1 += 1
2576	decq	%rax
2577	jg	.L116
2578	ALIGN_4
2579
2580.L118:
2581	addpd	%xmm1, %xmm0		# merge the two accumulators
2582	haddpd	%xmm0, %xmm0		# horizontal add -> scalar sum in both lanes
2583
2584	movsd	0 * SIZE(CO1), %xmm8	# load 2 doubles of C
2585	movhpd	1 * SIZE(CO1), %xmm8
2586
2587	movddup	 %xmm0, %xmm12		# duplicate the result into both lanes
2588
2589	mulpd	 %xmm15, %xmm12		# scale by { alpha_r, alpha_i }
2590	addpd	 %xmm12, %xmm8		# accumulate into C
2591
2592	movsd	%xmm8,  0 * SIZE(CO1)	# store C back
2593	movhpd	%xmm8,  1 * SIZE(CO1)
2594	ALIGN_4
2595
# Function exit: restore callee-saved registers spilled by the prologue
# (above this view) and return. Windows x64 additionally treats rdi, rsi
# and xmm6-xmm15 as callee-saved, hence the extra restores.
2596.L999:
2597	movq	  0(%rsp), %rbx
2598	movq	  8(%rsp), %rbp
2599	movq	 16(%rsp), %r12
2600	movq	 24(%rsp), %r13
2601	movq	 32(%rsp), %r14
2602	movq	 40(%rsp), %r15
2603
2604#ifdef WINDOWS_ABI
2605	movq	 48(%rsp), %rdi
2606	movq	 56(%rsp), %rsi
2607	movups	 64(%rsp), %xmm6
2608	movups	 80(%rsp), %xmm7
2609	movups	 96(%rsp), %xmm8
2610	movups	112(%rsp), %xmm9
2611	movups	128(%rsp), %xmm10
2612	movups	144(%rsp), %xmm11
2613	movups	160(%rsp), %xmm12
2614	movups	176(%rsp), %xmm13
2615	movups	192(%rsp), %xmm14
2616	movups	208(%rsp), %xmm15
2617#endif
2618
2619	addq	$STACKSIZE, %rsp	# release the frame reserved in the prologue
2620	ret
2621
2622	EPILOGUE
2623