1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* Incoming integer arguments (System V AMD64: m in %rdi, n in %rsi,
   k in %rdx, a in %rcx, b in %r8, c in %r9).  M and N are moved into
   callee-saved registers early so that %rdi/%rsi can be reused below
   as the packed-panel pointers AO/BO. */
#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

/* Working registers: I = column-block loop counter, AO/BO = current
   positions in the A panel and the expanded B buffer, CO1/CO2 = the
   two C column pointers written per iteration, BB = B prefetch cursor. */
#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define	CO1	%r15
#define CO2	%rbp
#define BB	%r12

#ifndef WINDOWS_ABI

#define STACKSIZE 64

/* System V: the 7th and later arguments live on the caller's stack,
   just above the return address; STACKSIZE accounts for our own
   register-save area pushed on entry. */
#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

/* Windows x64: only four register args, so a/b/c/ldc/offset are all
   stack arguments starting at 40(%rsp) (32-byte shadow space plus the
   return address), offset by our STACKSIZE adjustment. */
#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#endif

/* Locals kept in the page-aligned scratch frame carved out below the
   saved registers; BUFFER is the expanded (duplicated) copy of B. */
#define ALPHA	  0(%rsp)
#define J	 16(%rsp)
#define OFFSET	 24(%rsp)
#define KK	 32(%rsp)
#define KKK	 40(%rsp)
#define BUFFER	256(%rsp)

/* CPU-specific prefetch instructions and lookahead distances.
   On Opteron, movsd/movapd are aliased to the single-precision forms
   (movlps/movaps), which move the same bits with a shorter encoding. */
#ifdef OPTERON
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (8 * 9 + 4)
#define movsd	movlps
#define movapd	movaps
#endif

#ifdef GENERIC
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE (8 * 13 + 4)
#define movapd	movaps
#endif
100
#ifndef GENERIC

/*
 * 4x4 rank-1 update macros, non-GENERIC variant: AO/BO have been
 * advanced past the unrolled region and %rax counts up (negative)
 * toward zero, so all operands use indexed addressing off %rax.
 *
 * Register roles inside the unrolled body:
 *   xmm0, xmm2, xmm4, xmm6 : pairs of A values (two rows each)
 *   xmm1, xmm3, xmm5, xmm7 : duplicated B values from the expanded buffer
 *   xmm8  - xmm11          : C accumulators for rows 0-1, columns 0-3
 *   xmm12 - xmm15          : C accumulators for rows 2-3, columns 0-3
 *
 * KERNEL1..KERNEL8 together consume 8 K-iterations; (xx) shifts the
 * base displacement so the sequence can be repeated back to back.
 *
 * Fix: the first B reload in KERNEL1 used a literal `movaps` where
 * every other aligned load in these macros uses `movapd`; normalized
 * to `movapd` for consistency (identical bits moved either way, and
 * this also honors the OPTERON movapd->movaps alias).
 */
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#else

/*
 * GENERIC variant of the same eight macros: AO/BO are advanced
 * explicitly between iterations, so plain (non-indexed) addressing is
 * used.  Displacements and register roles are otherwise identical to
 * the non-GENERIC versions above.
 */
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
#endif
332
333	PROLOGUE
334	PROFCODE
335
336	subq	$STACKSIZE, %rsp
337	movq	%rbx,  0(%rsp)
338	movq	%rbp,  8(%rsp)
339	movq	%r12, 16(%rsp)
340	movq	%r13, 24(%rsp)
341	movq	%r14, 32(%rsp)
342	movq	%r15, 40(%rsp)
343
344#ifdef WINDOWS_ABI
345	movq	%rdi,    48(%rsp)
346	movq	%rsi,    56(%rsp)
347	movups	%xmm6,   64(%rsp)
348	movups	%xmm7,   80(%rsp)
349	movups	%xmm8,   96(%rsp)
350	movups	%xmm9,  112(%rsp)
351	movups	%xmm10, 128(%rsp)
352	movups	%xmm11, 144(%rsp)
353	movups	%xmm12, 160(%rsp)
354	movups	%xmm13, 176(%rsp)
355	movups	%xmm14, 192(%rsp)
356	movups	%xmm15, 208(%rsp)
357
358	movq	ARG1,      OLD_M
359	movq	ARG2,      OLD_N
360	movq	ARG3,      K
361	movq	OLD_A,     A
362	movq	OLD_B,     B
363	movq	OLD_C,     C
364	movq	OLD_LDC,   LDC
365#ifdef TRMMKERNEL
366	movsd	OLD_OFFSET, %xmm12
367#endif
368	movaps	%xmm3, %xmm0
369
370#else
371	movq	OLD_LDC,   LDC
372#ifdef TRMMKERNEL
373	movsd	OLD_OFFSET, %xmm12
374#endif
375
376#endif
377
378	EMMS
379
380	movq	%rsp, %rbx	# save old stack
381	subq	$256 + LOCAL_BUFFER_SIZE, %rsp
382	andq	$-4096, %rsp	# align stack
383
384	STACK_TOUCHING
385
386	movq	OLD_M, M
387	movq	OLD_N, N
388
389	subq	$-16 * SIZE, A
390
391	unpcklpd %xmm0, %xmm0
392	movapd	 %xmm0, ALPHA
393
394	leaq	(, LDC, SIZE), LDC
395
396#ifdef TRMMKERNEL
397	movsd	%xmm12, OFFSET
398	movsd	%xmm12, KK
399#ifndef LEFT
400	negq	KK
401#endif
402#endif
403	movq	N,  J
404	sarq	$2, J		# j = (n >> 2)
405	jle	.L40
406	ALIGN_3
407
408.L01:
409/* Copying to Sub Buffer */
410	leaq	16 * SIZE + BUFFER, BO
411	movq	C, CO1			# coffset1 = c
412	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
413
414#if defined(TRMMKERNEL) && defined(LEFT)
415	movq	OFFSET, %rax
416	movq	%rax, KK
417#endif
418
419	movq	K, %rax
420	sarq	$2, %rax
421	jle	.L03
422	ALIGN_3
423
424
425#define RPREFETCHSIZE (8 *  7 + 4)
426#define WPREFETCHSIZE (8 *  8 + 4)
427
428.L02:
429	PREFETCH	 (RPREFETCHSIZE +  0)  * SIZE(B)
430
431	movq	 0 * SIZE(B), %mm0
432	movq	%mm0,  -16 * SIZE(BO)
433	movq	%mm0,  -15 * SIZE(BO)
434	movq	 1 * SIZE(B), %mm1
435	movq	%mm1,  -14 * SIZE(BO)
436	movq	%mm1,  -13 * SIZE(BO)
437
438	movq	 2 * SIZE(B), %mm2
439	movq	%mm2,  -12 * SIZE(BO)
440	movq	%mm2,  -11 * SIZE(BO)
441	movq	 3 * SIZE(B), %mm3
442	movq	%mm3,  -10 * SIZE(BO)
443	movq	%mm3,   -9 * SIZE(BO)
444
445	PREFETCHW	 (WPREFETCHSIZE +  0)  * SIZE(BO)
446
447	movq	 4 * SIZE(B), %mm4
448	movq	%mm4,   -8 * SIZE(BO)
449	movq	%mm4,   -7 * SIZE(BO)
450	movq	 5 * SIZE(B), %mm5
451	movq	%mm5,   -6 * SIZE(BO)
452	movq	%mm5,   -5 * SIZE(BO)
453
454	PREFETCHW	 (WPREFETCHSIZE +  8)  * SIZE(BO)
455
456	movq	 6 * SIZE(B), %mm6
457	movq	%mm6,   -4 * SIZE(BO)
458	movq	%mm6,   -3 * SIZE(BO)
459	movq	 7 * SIZE(B), %mm7
460	movq	%mm7,   -2 * SIZE(BO)
461	movq	%mm7,   -1 * SIZE(BO)
462
463	PREFETCH	 (RPREFETCHSIZE +  8)  * SIZE(B)
464
465	movq	 8 * SIZE(B), %mm0
466	movq	%mm0,   0 * SIZE(BO)
467	movq	%mm0,   1 * SIZE(BO)
468	movq	 9 * SIZE(B), %mm1
469	movq	%mm1,   2 * SIZE(BO)
470	movq	%mm1,   3 * SIZE(BO)
471
472	movq	10 * SIZE(B), %mm2
473	movq	%mm2,   4 * SIZE(BO)
474	movq	%mm2,   5 * SIZE(BO)
475	movq	11 * SIZE(B), %mm3
476	movq	%mm3,   6 * SIZE(BO)
477	movq	%mm3,   7 * SIZE(BO)
478
479	PREFETCHW	 (WPREFETCHSIZE + 16)  * SIZE(BO)
480
481	movq	12 * SIZE(B), %mm4
482	movq	%mm4,   8 * SIZE(BO)
483	movq	%mm4,   9 * SIZE(BO)
484	movq	13 * SIZE(B), %mm5
485	movq	%mm5,  10 * SIZE(BO)
486	movq	%mm5,  11 * SIZE(BO)
487
488	PREFETCHW	 (WPREFETCHSIZE + 24)  * SIZE(BO)
489
490	movq	14 * SIZE(B), %mm6
491	movq	%mm6,  12 * SIZE(BO)
492	movq	%mm6,  13 * SIZE(BO)
493	movq	15 * SIZE(B), %mm7
494	movq	%mm7,  14 * SIZE(BO)
495	movq	%mm7,  15 * SIZE(BO)
496
497	addq	$ 32 * SIZE, BO
498	subq	$-16 * SIZE, B
499
500	subq	$1, %rax
501	jne	.L02
502	ALIGN_3
503
504.L03:
505	movq	K, %rax
506	andq	$3, %rax
507	BRANCH
508	jle	.L10
509	ALIGN_3
510
511.L04:
512	movq	 0 * SIZE(B), %mm0
513	movq	%mm0, -16 * SIZE(BO)
514	movq	%mm0, -15 * SIZE(BO)
515	movq	 1 * SIZE(B), %mm1
516	movq	%mm1, -14 * SIZE(BO)
517	movq	%mm1, -13 * SIZE(BO)
518
519	movq	 2 * SIZE(B), %mm2
520	movq	%mm2, -12 * SIZE(BO)
521	movq	%mm2, -11 * SIZE(BO)
522	movq	 3 * SIZE(B), %mm3
523	movq	%mm3, -10 * SIZE(BO)
524	movq	%mm3,  -9 * SIZE(BO)
525
526	addq	$4 * SIZE, B
527	addq	$8 * SIZE, BO
528	subq	$1, %rax
529	jne	.L04
530	ALIGN_3
531
532.L10:
533	movq	A, AO		# aoffset = a
534
535	leaq	 (RPREFETCHSIZE +  0)  * SIZE(B), BB
536
537	movq	M,  I
538	sarq	$2, I	# i = (m >> 2)
539	jle	.L20
540	ALIGN_3
541
542.L11:
543#if !defined(TRMMKERNEL) || \
544	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
545	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
546
547	leaq	16 * SIZE + BUFFER, BO
548#else
549	leaq	16 * SIZE + BUFFER, BO
550	movq	KK, %rax
551	leaq	(, %rax, SIZE), %rax
552	leaq	(AO, %rax, 4), AO
553	leaq	(BO, %rax, 8), BO
554#endif
555
556	movapd	-16 * SIZE(AO), %xmm0
557	movapd	-16 * SIZE(BO), %xmm1
558	pxor	%xmm8, %xmm8
559	movapd	-14 * SIZE(AO), %xmm2
560	movapd	-14 * SIZE(BO), %xmm3
561	pxor	%xmm9, %xmm9
562	movapd	-12 * SIZE(AO), %xmm4
563	movapd	-12 * SIZE(BO), %xmm5
564	pxor	%xmm10, %xmm10
565	movapd	-10 * SIZE(AO), %xmm6
566	movapd	 -8 * SIZE(BO), %xmm7
567	pxor	%xmm11, %xmm11
568
569	PREFETCHW      3 * SIZE(CO1)
570	pxor	%xmm12, %xmm12
571	PREFETCHW      7 * SIZE(CO2)
572	pxor	%xmm13, %xmm13
573	PREFETCHW      3 * SIZE(CO1, LDC, 2)
574	pxor	%xmm14, %xmm14
575	PREFETCHW      7 * SIZE(CO2, LDC, 2)
576	pxor	%xmm15, %xmm15
577
578	PREFETCH	 0  * SIZE(BB)
579
580#ifndef TRMMKERNEL
581	movq	K, %rax
582#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
583	movq	K, %rax
584	subq	KK, %rax
585	movq	%rax, KKK
586#else
587	movq	KK, %rax
588#ifdef LEFT
589	addq	$4, %rax
590#else
591	addq	$4, %rax
592#endif
593	movq	%rax, KKK
594#endif
595
596#ifndef GENERIC
597	andq	$-8, %rax
598
599	leaq	(, %rax, SIZE), %rax
600	leaq	(AO, %rax, 4), AO
601	leaq	(BO, %rax, 8), BO
602	negq	%rax
603	NOBRANCH
604	je	.L15
605	ALIGN_3
606
607.L12:
608	KERNEL1(16 *  0)
609	KERNEL2(16 *  0)
610	KERNEL3(16 *  0)
611	KERNEL4(16 *  0)
612	KERNEL5(16 *  0)
613	KERNEL6(16 *  0)
614	KERNEL7(16 *  0)
615	KERNEL8(16 *  0)
616
617	KERNEL1(16 *  1)
618	KERNEL2(16 *  1)
619	KERNEL3(16 *  1)
620	KERNEL4(16 *  1)
621	KERNEL5(16 *  1)
622	KERNEL6(16 *  1)
623	KERNEL7(16 *  1)
624	KERNEL8(16 *  1)
625
626	addq	$8 * SIZE, %rax
627	NOBRANCH
628	je	.L15
629
630	KERNEL1(16 *  0)
631	KERNEL2(16 *  0)
632	KERNEL3(16 *  0)
633	KERNEL4(16 *  0)
634	KERNEL5(16 *  0)
635	KERNEL6(16 *  0)
636	KERNEL7(16 *  0)
637	KERNEL8(16 *  0)
638
639	KERNEL1(16 *  1)
640	KERNEL2(16 *  1)
641	KERNEL3(16 *  1)
642	KERNEL4(16 *  1)
643	KERNEL5(16 *  1)
644	KERNEL6(16 *  1)
645	KERNEL7(16 *  1)
646	KERNEL8(16 *  1)
647
648	addq	$8 * SIZE, %rax
649	NOBRANCH
650	je	.L15
651
652	KERNEL1(16 *  0)
653	KERNEL2(16 *  0)
654	KERNEL3(16 *  0)
655	KERNEL4(16 *  0)
656	KERNEL5(16 *  0)
657	KERNEL6(16 *  0)
658	KERNEL7(16 *  0)
659	KERNEL8(16 *  0)
660
661	KERNEL1(16 *  1)
662	KERNEL2(16 *  1)
663	KERNEL3(16 *  1)
664	KERNEL4(16 *  1)
665	KERNEL5(16 *  1)
666	KERNEL6(16 *  1)
667	KERNEL7(16 *  1)
668	KERNEL8(16 *  1)
669
670	addq	$8 * SIZE, %rax
671	NOBRANCH
672	je	.L15
673
674	KERNEL1(16 *  0)
675	KERNEL2(16 *  0)
676	KERNEL3(16 *  0)
677	KERNEL4(16 *  0)
678	KERNEL5(16 *  0)
679	KERNEL6(16 *  0)
680	KERNEL7(16 *  0)
681	KERNEL8(16 *  0)
682
683	KERNEL1(16 *  1)
684	KERNEL2(16 *  1)
685	KERNEL3(16 *  1)
686	KERNEL4(16 *  1)
687	KERNEL5(16 *  1)
688	KERNEL6(16 *  1)
689	KERNEL7(16 *  1)
690	KERNEL8(16 *  1)
691
692	addq	$8 * SIZE, %rax
693	NOBRANCH
694	je	.L15
695
696	KERNEL1(16 *  0)
697	KERNEL2(16 *  0)
698	KERNEL3(16 *  0)
699	KERNEL4(16 *  0)
700	KERNEL5(16 *  0)
701	KERNEL6(16 *  0)
702	KERNEL7(16 *  0)
703	KERNEL8(16 *  0)
704
705	KERNEL1(16 *  1)
706	KERNEL2(16 *  1)
707	KERNEL3(16 *  1)
708	KERNEL4(16 *  1)
709	KERNEL5(16 *  1)
710	KERNEL6(16 *  1)
711	KERNEL7(16 *  1)
712	KERNEL8(16 *  1)
713
714	addq	$8 * SIZE, %rax
715	NOBRANCH
716	je	.L15
717
718	KERNEL1(16 *  0)
719	KERNEL2(16 *  0)
720	KERNEL3(16 *  0)
721	KERNEL4(16 *  0)
722	KERNEL5(16 *  0)
723	KERNEL6(16 *  0)
724	KERNEL7(16 *  0)
725	KERNEL8(16 *  0)
726
727	KERNEL1(16 *  1)
728	KERNEL2(16 *  1)
729	KERNEL3(16 *  1)
730	KERNEL4(16 *  1)
731	KERNEL5(16 *  1)
732	KERNEL6(16 *  1)
733	KERNEL7(16 *  1)
734	KERNEL8(16 *  1)
735
736	addq	$8 * SIZE, %rax
737	NOBRANCH
738	je	.L15
739
740	KERNEL1(16 *  0)
741	KERNEL2(16 *  0)
742	KERNEL3(16 *  0)
743	KERNEL4(16 *  0)
744	KERNEL5(16 *  0)
745	KERNEL6(16 *  0)
746	KERNEL7(16 *  0)
747	KERNEL8(16 *  0)
748
749	KERNEL1(16 *  1)
750	KERNEL2(16 *  1)
751	KERNEL3(16 *  1)
752	KERNEL4(16 *  1)
753	KERNEL5(16 *  1)
754	KERNEL6(16 *  1)
755	KERNEL7(16 *  1)
756	KERNEL8(16 *  1)
757
758	addq	$8 * SIZE, %rax
759	NOBRANCH
760	je	.L15
761
762	KERNEL1(16 *  0)
763	KERNEL2(16 *  0)
764	KERNEL3(16 *  0)
765	KERNEL4(16 *  0)
766	KERNEL5(16 *  0)
767	KERNEL6(16 *  0)
768	KERNEL7(16 *  0)
769	KERNEL8(16 *  0)
770
771	KERNEL1(16 *  1)
772	KERNEL2(16 *  1)
773	KERNEL3(16 *  1)
774	KERNEL4(16 *  1)
775	KERNEL5(16 *  1)
776	KERNEL6(16 *  1)
777	KERNEL7(16 *  1)
778	KERNEL8(16 *  1)
779
780	addq	$8 * SIZE, %rax
781	BRANCH
782	jl	.L12
783	ALIGN_3
784
785.L15:
786#ifndef TRMMKERNEL
787	movq	K, %rax
788#else
789	movq	KKK, %rax
790#endif
791	testq	$4, %rax
792	je .L16
793	xorq	%rax, %rax
794	ALIGN_3
795
796	KERNEL1(16 *  0)
797	KERNEL2(16 *  0)
798	KERNEL3(16 *  0)
799	KERNEL4(16 *  0)
800	KERNEL5(16 *  0)
801	KERNEL6(16 *  0)
802	KERNEL7(16 *  0)
803	KERNEL8(16 *  0)
804
805	addq	$32 * SIZE, BO
806	addq	$16 * SIZE, AO
807	ALIGN_3
808
809#else
810	sarq	$2, %rax
811	NOBRANCH
812	jle	.L16
813	ALIGN_3
814
815.L12:
816	KERNEL1(16 *  0)
817	KERNEL2(16 *  0)
818	KERNEL3(16 *  0)
819	KERNEL4(16 *  0)
820	KERNEL5(16 *  0)
821	KERNEL6(16 *  0)
822	KERNEL7(16 *  0)
823	KERNEL8(16 *  0)
824
825	addq	$ 32 * SIZE, BO
826	subq	$-16 * SIZE, AO
827	decq	%rax
828	BRANCH
829	jg	.L12
830#endif
831
832.L16:
833	movapd	ALPHA, %xmm7
834
835#ifndef TRMMKERNEL
836	movq	K, %rax
837#else
838	movq	KKK, %rax
839#endif
840	andq	$3, %rax		# if (k & 1)
841	je .L19
842
843	leaq	(, %rax, SIZE), %rax
844	leaq	(AO, %rax, 4), AO
845	leaq	(BO, %rax, 8), BO
846	negq	%rax
847	ALIGN_3
848
849.L17:
850	mulpd	%xmm0, %xmm1
851	addpd	%xmm1, %xmm8
852	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
853	mulpd	%xmm0, %xmm1
854	addpd	%xmm1, %xmm9
855	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
856	mulpd	%xmm0, %xmm1
857	mulpd	-10 * SIZE(BO, %rax, 8), %xmm0
858	addpd	%xmm1, %xmm10
859	movapd	-16 * SIZE(BO, %rax, 8), %xmm1
860	addpd	%xmm0, %xmm11
861	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
862	mulpd	%xmm2, %xmm1
863	addpd	%xmm1, %xmm12
864	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
865	mulpd	%xmm2, %xmm1
866	addpd	%xmm1, %xmm13
867	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
868	mulpd	%xmm2, %xmm1
869	mulpd	-10 * SIZE(BO, %rax, 8), %xmm2
870	addpd	%xmm1, %xmm14
871	movapd	 -8 * SIZE(BO, %rax, 8), %xmm1
872	addpd	%xmm2, %xmm15
873	movapd	-10 * SIZE(AO, %rax, 4), %xmm2
874
875	addq	$SIZE, %rax
876	jl	.L17
877	ALIGN_3
878
879.L19:
880	PREFETCH	 8  * SIZE(BB)
881	subq		 $-12 * SIZE, BB
882
883#ifndef TRMMKERNEL
884	movsd	0 * SIZE(CO1), %xmm0
885	movhpd	1 * SIZE(CO1), %xmm0
886	movsd	2 * SIZE(CO1), %xmm1
887	movhpd	3 * SIZE(CO1), %xmm1
888
889	movsd	0 * SIZE(CO2), %xmm2
890	movhpd	1 * SIZE(CO2), %xmm2
891	movsd	2 * SIZE(CO2), %xmm3
892	movhpd	3 * SIZE(CO2), %xmm3
893#endif
894
895	mulpd	%xmm7, %xmm8
896	mulpd	%xmm7, %xmm9
897	mulpd	%xmm7, %xmm10
898	mulpd	%xmm7, %xmm11
899
900	mulpd	%xmm7, %xmm12
901	mulpd	%xmm7, %xmm13
902	mulpd	%xmm7, %xmm14
903	mulpd	%xmm7, %xmm15
904
905#ifndef TRMMKERNEL
906	movlpd	0 * SIZE(CO1, LDC, 2), %xmm4
907	movhpd	1 * SIZE(CO1, LDC, 2), %xmm4
908	movlpd	2 * SIZE(CO1, LDC, 2), %xmm5
909	movhpd	3 * SIZE(CO1, LDC, 2), %xmm5
910
911	movlpd	0 * SIZE(CO2, LDC, 2), %xmm6
912	movhpd	1 * SIZE(CO2, LDC, 2), %xmm6
913	movlpd	2 * SIZE(CO2, LDC, 2), %xmm7
914	movhpd	3 * SIZE(CO2, LDC, 2), %xmm7
915
916	addpd	%xmm0, %xmm8
917	addpd	%xmm1, %xmm12
918	addpd	%xmm2, %xmm9
919	addpd	%xmm3, %xmm13
920#endif
921
922	movlpd	%xmm8, 0 * SIZE(CO1)
923	movhpd	%xmm8, 1 * SIZE(CO1)
924	movlpd	%xmm12, 2 * SIZE(CO1)
925	movhpd	%xmm12, 3 * SIZE(CO1)
926
927	movlpd	%xmm9, 0 * SIZE(CO2)
928	movhpd	%xmm9, 1 * SIZE(CO2)
929	movlpd	%xmm13, 2 * SIZE(CO2)
930	movhpd	%xmm13, 3 * SIZE(CO2)
931
932#ifndef TRMMKERNEL
933	addpd	%xmm4, %xmm10
934	addpd	%xmm5, %xmm14
935	addpd	%xmm6, %xmm11
936	addpd	%xmm7, %xmm15
937#endif
938
939	movlpd	%xmm10, 0 * SIZE(CO1, LDC, 2)
940	movhpd	%xmm10, 1 * SIZE(CO1, LDC, 2)
941	movlpd	%xmm14, 2 * SIZE(CO1, LDC, 2)
942	movhpd	%xmm14, 3 * SIZE(CO1, LDC, 2)
943
944	movlpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
945	movhpd	%xmm11, 1 * SIZE(CO2, LDC, 2)
946	movlpd	%xmm15, 2 * SIZE(CO2, LDC, 2)
947	movhpd	%xmm15, 3 * SIZE(CO2, LDC, 2)
948
949#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
950    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
951	movq	K, %rax
952	subq	KKK, %rax
953	leaq	(,%rax, SIZE), %rax
954	leaq	(AO, %rax, 4), AO
955	leaq	(BO, %rax, 8), BO
956#endif
957
958#if defined(TRMMKERNEL) && defined(LEFT)
959	addq	$4, KK
960#endif
961
962	addq	$4 * SIZE, CO1		# coffset += 4
963	addq	$4 * SIZE, CO2		# coffset += 4
964	decq	I			# i --
965	BRANCH
966	jg	.L11
967	ALIGN_3
968
969.L20:
970	testq	$3, M
971	je	.L39
972
973	testq	$2, M
974	je	.L30
975	ALIGN_3
976
977.L21:
978#if !defined(TRMMKERNEL) || \
979	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
980	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
981
982	leaq	BUFFER, BO
983#else
984	leaq	BUFFER, BO
985	movq	KK, %rax
986	leaq	(, %rax, SIZE), %rax
987	leaq	(AO, %rax, 2), AO
988	leaq	(BO, %rax, 8), BO
989#endif
990
991	movapd	-16 * SIZE(AO), %xmm0
992	pxor	%xmm8, %xmm8
993	movapd	 0 * SIZE(BO), %xmm1
994	pxor	%xmm9, %xmm9
995	movapd	 -8 * SIZE(AO), %xmm2
996	pxor	%xmm10, %xmm10
997	movapd	 8 * SIZE(BO), %xmm3
998	pxor	%xmm11, %xmm11
999
1000	movapd	16 * SIZE(BO), %xmm5
1001	movapd	24 * SIZE(BO), %xmm7
1002
1003#ifndef TRMMKERNEL
1004	movq	K, %rax
1005#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1006	movq	K, %rax
1007	subq	KK, %rax
1008	movq	%rax, KKK
1009#else
1010	movq	KK, %rax
1011#ifdef LEFT
1012	addq	$2, %rax
1013#else
1014	addq	$4, %rax
1015#endif
1016	movq	%rax, KKK
1017#endif
1018	sarq	$3, %rax
1019	je	.L25
1020	ALIGN_3
1021
1022.L22:
1023	mulpd	%xmm0, %xmm1
1024	addpd	%xmm1, %xmm8
1025	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
1026	movapd	 2 * SIZE(BO), %xmm1
1027	mulpd	%xmm0, %xmm1
1028	addpd	%xmm1, %xmm9
1029	movapd	 4 * SIZE(BO), %xmm1
1030	mulpd	%xmm0, %xmm1
1031	mulpd	 6 * SIZE(BO), %xmm0
1032	addpd	%xmm1, %xmm10
1033	movapd	32 * SIZE(BO), %xmm1
1034	addpd	%xmm0, %xmm11
1035	movapd	-14 * SIZE(AO), %xmm0
1036
1037	mulpd	%xmm0, %xmm3
1038	addpd	%xmm3, %xmm8
1039	movapd	10 * SIZE(BO), %xmm3
1040	mulpd	%xmm0, %xmm3
1041	addpd	%xmm3, %xmm9
1042	movapd	12 * SIZE(BO), %xmm3
1043	mulpd	%xmm0, %xmm3
1044	mulpd	14 * SIZE(BO), %xmm0
1045	addpd	%xmm3, %xmm10
1046	movapd	40 * SIZE(BO), %xmm3
1047	addpd	%xmm0, %xmm11
1048	movapd	-12 * SIZE(AO), %xmm0
1049
1050	mulpd	%xmm0, %xmm5
1051	addpd	%xmm5, %xmm8
1052	movapd	18 * SIZE(BO), %xmm5
1053	mulpd	%xmm0, %xmm5
1054	addpd	%xmm5, %xmm9
1055	movapd	20 * SIZE(BO), %xmm5
1056	mulpd	%xmm0, %xmm5
1057	mulpd	22 * SIZE(BO), %xmm0
1058	addpd	%xmm5, %xmm10
1059	movapd	48 * SIZE(BO), %xmm5
1060	addpd	%xmm0, %xmm11
1061	movapd	-10 * SIZE(AO), %xmm0
1062
1063	mulpd	%xmm0, %xmm7
1064	addpd	%xmm7, %xmm8
1065	movapd	26 * SIZE(BO), %xmm7
1066	mulpd	%xmm0, %xmm7
1067	addpd	%xmm7, %xmm9
1068	movapd	28 * SIZE(BO), %xmm7
1069	mulpd	%xmm0, %xmm7
1070	mulpd	30 * SIZE(BO), %xmm0
1071	addpd	%xmm7, %xmm10
1072	movapd	56 * SIZE(BO), %xmm7
1073	addpd	%xmm0, %xmm11
1074	movapd	  0 * SIZE(AO), %xmm0
1075
1076	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
1077	mulpd	%xmm2, %xmm1
1078	addpd	%xmm1, %xmm8
1079	movapd	34 * SIZE(BO), %xmm1
1080	mulpd	%xmm2, %xmm1
1081	addpd	%xmm1, %xmm9
1082	movapd	36 * SIZE(BO), %xmm1
1083	mulpd	%xmm2, %xmm1
1084	mulpd	38 * SIZE(BO), %xmm2
1085	addpd	%xmm1, %xmm10
1086	movapd	64 * SIZE(BO), %xmm1
1087	addpd	%xmm2, %xmm11
1088	movapd	-6 * SIZE(AO), %xmm2
1089
1090	mulpd	%xmm2, %xmm3
1091	addpd	%xmm3, %xmm8
1092	movapd	42 * SIZE(BO), %xmm3
1093	mulpd	%xmm2, %xmm3
1094	addpd	%xmm3, %xmm9
1095	movapd	44 * SIZE(BO), %xmm3
1096	mulpd	%xmm2, %xmm3
1097	mulpd	46 * SIZE(BO), %xmm2
1098	addpd	%xmm3, %xmm10
1099	movapd	72 * SIZE(BO), %xmm3
1100	addpd	%xmm2, %xmm11
1101	movapd	-4 * SIZE(AO), %xmm2
1102
1103	mulpd	%xmm2, %xmm5
1104	addpd	%xmm5, %xmm8
1105	movapd	50 * SIZE(BO), %xmm5
1106	mulpd	%xmm2, %xmm5
1107	addpd	%xmm5, %xmm9
1108	movapd	52 * SIZE(BO), %xmm5
1109	mulpd	%xmm2, %xmm5
1110	mulpd	54 * SIZE(BO), %xmm2
1111	addpd	%xmm5, %xmm10
1112	movapd	80 * SIZE(BO), %xmm5
1113	addpd	%xmm2, %xmm11
1114	movapd	-2 * SIZE(AO), %xmm2
1115
1116	mulpd	%xmm2, %xmm7
1117	addpd	%xmm7, %xmm8
1118	movapd	58 * SIZE(BO), %xmm7
1119	mulpd	%xmm2, %xmm7
1120	addpd	%xmm7, %xmm9
1121	movapd	60 * SIZE(BO), %xmm7
1122	mulpd	%xmm2, %xmm7
1123	mulpd	62 * SIZE(BO), %xmm2
1124	addpd	%xmm7, %xmm10
1125	movapd	88 * SIZE(BO), %xmm7
1126	addpd	%xmm2, %xmm11
1127	movapd	 8 * SIZE(AO), %xmm2
1128
1129	addq   $16 * SIZE, AO
1130	addq   $64 * SIZE, BO
1131	decq   %rax
1132	jne    .L22
1133	ALIGN_3
1134
1135.L25:
1136#ifndef TRMMKERNEL
1137	movq	K, %rax
1138#else
1139	movq	KKK, %rax
1140#endif
1141	movapd	ALPHA, %xmm7
1142	andq	$7, %rax		# if (k & 1)
1143	BRANCH
1144	je .L29
1145	ALIGN_3
1146
1147.L26:
1148	mulpd	%xmm0, %xmm1
1149	addpd	%xmm1, %xmm8
1150	movapd	  2 * SIZE(BO), %xmm1
1151	mulpd	%xmm0, %xmm1
1152	addpd	%xmm1, %xmm9
1153	movapd	  4 * SIZE(BO), %xmm1
1154	mulpd	%xmm0, %xmm1
1155	mulpd	  6 * SIZE(BO), %xmm0
1156	addpd	%xmm1, %xmm10
1157	movapd	  8 * SIZE(BO), %xmm1
1158	addpd	%xmm0, %xmm11
1159	movapd	-14 * SIZE(AO), %xmm0
1160
1161	addq	$2 * SIZE, AO		# aoffset  += 4
1162	addq	$8 * SIZE, BO		# boffset1 += 8
1163	decq	%rax
1164	jg	.L26
1165	ALIGN_3
1166
1167.L29:
1168#ifndef TRMMKERNEL
1169	movlpd	0 * SIZE(CO1), %xmm0
1170	movhpd	1 * SIZE(CO1), %xmm0
1171	movlpd	0 * SIZE(CO2), %xmm2
1172	movhpd	1 * SIZE(CO2), %xmm2
1173
1174	movlpd	0 * SIZE(CO1, LDC, 2), %xmm4
1175	movhpd	1 * SIZE(CO1, LDC, 2), %xmm4
1176	movlpd	0 * SIZE(CO2, LDC, 2), %xmm6
1177	movhpd	1 * SIZE(CO2, LDC, 2), %xmm6
1178#endif
1179	mulpd	%xmm7, %xmm8
1180	mulpd	%xmm7, %xmm9
1181	mulpd	%xmm7, %xmm10
1182	mulpd	%xmm7, %xmm11
1183
1184#ifndef TRMMKERNEL
1185	addpd	%xmm0,  %xmm8
1186	addpd	%xmm2, %xmm9
1187	addpd	%xmm4, %xmm10
1188	addpd	%xmm6, %xmm11
1189#endif
1190
1191	movlpd	%xmm8, 0 * SIZE(CO1)
1192	movhpd	%xmm8, 1 * SIZE(CO1)
1193	movlpd	%xmm9, 0 * SIZE(CO2)
1194	movhpd	%xmm9, 1 * SIZE(CO2)
1195	movlpd	%xmm10, 0 * SIZE(CO1, LDC, 2)
1196	movhpd	%xmm10, 1 * SIZE(CO1, LDC, 2)
1197	movlpd	%xmm11, 0 * SIZE(CO2, LDC, 2)
1198	movhpd	%xmm11, 1 * SIZE(CO2, LDC, 2)
1199
1200#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1201    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1202	movq	K, %rax
1203	subq	KKK, %rax
1204	leaq	(,%rax, SIZE), %rax
1205	leaq	(AO, %rax, 2), AO
1206	leaq	(BO, %rax, 8), BO
1207#endif
1208
1209#if defined(TRMMKERNEL) && defined(LEFT)
1210	addq	$2, KK
1211#endif
1212
1213	addq	$2 * SIZE, CO1		# coffset += 4
1214	addq	$2 * SIZE, CO2		# coffset += 4
1215	ALIGN_3
1216
.L30:
/* N=4 columns, last row: process one remaining row of A (M & 1). */
	testq	$1, M
	je	.L39
	ALIGN_3

.L31:
/* Position AO/BO for this 1x4 tile.  The TRMM "skip leading K" cases
   offset both pointers by KK: 1 double per A step, 8 doubles per
   packed-B step (4 columns, each value duplicated by the packer). */
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

/* Preload the first A element and four B values; clear the four
   scalar accumulators, one per C column. */
	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movsd	16 * SIZE(BO), %xmm5
	movsd	24 * SIZE(BO), %xmm7

/* Inner-loop trip count: full K for GEMM; for TRMM either K-KK or
   KK plus the tile dimension (1 row on LEFT, 4 columns otherwise). */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$4, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# main loop is unrolled by 8
	je	.L35
	ALIGN_3

.L32:
/* 8x-unrolled 1x4 kernel body.  Each group of ~12 instructions does
   one K step: four scalar mul/add pairs (one per column) while the
   B registers xmm1/xmm3/xmm5/xmm7 are rotated through and reloaded
   one unroll ahead to hide load latency. */
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm8
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	movsd	 2 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm9
	movsd	 4 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	32 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-15 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	addsd	%xmm3, %xmm8
	movsd	10 * SIZE(BO), %xmm3
	mulsd	%xmm0, %xmm3
	addsd	%xmm3, %xmm9
	movsd	12 * SIZE(BO), %xmm3
	mulsd	%xmm0, %xmm3
	mulsd	14 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm10
	movsd	40 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm11
	movsd	-14 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm5
	addsd	%xmm5, %xmm8
	movsd	18 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	addsd	%xmm5, %xmm9
	movsd	20 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	mulsd	22 * SIZE(BO), %xmm0
	addsd	%xmm5, %xmm10
	movsd	48 * SIZE(BO), %xmm5
	addsd	%xmm0, %xmm11
	movsd	-13 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm7
	addsd	%xmm7, %xmm8
	movsd	26 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm7
	addsd	%xmm7, %xmm9
	movsd	28 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm7
	mulsd	30 * SIZE(BO), %xmm0
	addsd	%xmm7, %xmm10
	movsd	56 * SIZE(BO), %xmm7
	addsd	%xmm0, %xmm11
	movsd	-12 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm8
	movsd	34 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm9
	movsd	36 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	mulsd	38 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	64 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-11 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	addsd	%xmm3, %xmm8
	movsd	42 * SIZE(BO), %xmm3
	mulsd	%xmm0, %xmm3
	addsd	%xmm3, %xmm9
	movsd	44 * SIZE(BO), %xmm3
	mulsd	%xmm0, %xmm3
	mulsd	46 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm10
	movsd	72 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm11
	movsd	-10 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm5
	addsd	%xmm5, %xmm8
	movsd	50 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	addsd	%xmm5, %xmm9
	movsd	52 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm5
	mulsd	54 * SIZE(BO), %xmm0
	addsd	%xmm5, %xmm10
	movsd	80 * SIZE(BO), %xmm5
	addsd	%xmm0, %xmm11
	movsd	-9 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm7
	addsd	%xmm7, %xmm8
	movsd	58 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm7
	addsd	%xmm7, %xmm9
	movsd	60 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm7
	mulsd	62 * SIZE(BO), %xmm0
	addsd	%xmm7, %xmm10
	movsd	88 * SIZE(BO), %xmm7
	addsd	%xmm0, %xmm11
	movsd	-8 * SIZE(AO), %xmm0

/* 8 K-steps consumed: 8 A doubles, 64 packed-B doubles. */
	addq   $ 8 * SIZE, AO
	addq   $64 * SIZE, BO
	decq   %rax
	jne    .L32
	ALIGN_3

.L35:
/* Remainder loop for K & 7; also load alpha (clobbers xmm7, whose
   pipelined B value is no longer needed). */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L38
	ALIGN_3

.L36:
/* One K step: multiply the single A value into all four columns. */
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm8
	movsd	 2 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	addsd	%xmm1, %xmm9
	movsd	 4 * SIZE(BO), %xmm1
	mulsd	%xmm0, %xmm1
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	 8 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-15 * SIZE(AO), %xmm0

	addq	$1 * SIZE, AO		# aoffset  += 4
	addq	$8 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L36
	ALIGN_3

.L38:
/* Writeback of the 1x4 result: one scalar per C column. */
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
	movsd	0 * SIZE(CO1, LDC, 2), %xmm4
	movsd	0 * SIZE(CO2, LDC, 2), %xmm6
#endif

	mulsd	%xmm7, %xmm8
	mulsd	%xmm7, %xmm9
	mulsd	%xmm7, %xmm10
	mulsd	%xmm7, %xmm11

/* GEMM accumulates into C; TRMM overwrites. */
#ifndef TRMMKERNEL
	addsd	%xmm0,  %xmm8
	addsd	%xmm2, %xmm9
	addsd	%xmm4, %xmm10
	addsd	%xmm6, %xmm11
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movsd	%xmm10, 0 * SIZE(CO1, LDC, 2)
	movsd	%xmm11, 0 * SIZE(CO2, LDC, 2)

/* TRMM: step AO/BO over the unexecuted K-KKK iterations. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_3
1449
.L39:
/* End of one 4-column band of C.  For a right-side TRMM the diagonal
   offset KK advances by the 4 columns just processed.  KK is updated
   with 64-bit addq everywhere else in this file (e.g. addq $1/$2, KK);
   the original used addl here, which only touches the low 32 bits of
   the slot -- use addq for consistency and full-width arithmetic. */
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$4, KK
#endif

	leaq	(C, LDC, 4), C		# c += 4 * ldc
	decq	J			# j --
	jg	.L01
	ALIGN_3
1459
.L40:
/* All N/4 bands done; handle the remaining N & 3 columns. */
	testq	$3, N
	je	.L999

	testq	$2, N
	je	.L80
	ALIGN_4

.L41:
/* Copying to Sub Buffer */
/* Pack a 2-column panel of B into BUFFER, storing every element twice
   (pair-duplicated) so the compute loops below can use full-width
   packed multiplies against a broadcast value. */
	leaq	BUFFER, BO

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	K, %rax
	sarq	$2, %rax		# copy loop unrolled by 4 (8 doubles)
	jle	.L43
	ALIGN_3

.L42:
/* MMX movq is used as a 64-bit integer copy; each source double is
   written to two consecutive BO slots. */
	PREFETCH	 56 * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1
	movq	 2 * SIZE(B), %mm2
	movq	 3 * SIZE(B), %mm3
	movq	 4 * SIZE(B), %mm4
	movq	 5 * SIZE(B), %mm5
	movq	 6 * SIZE(B), %mm6
	movq	 7 * SIZE(B), %mm7

	addq	$ 8 * SIZE, B
	addq	$16 * SIZE, BO

	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)

	decq	%rax
	jne	.L42
	ALIGN_3

.L43:
/* Copy the K & 3 leftover rows (2 doubles each). */
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L50
	ALIGN_3

.L44:
	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1

	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  1 * SIZE(BO)
	movq	%mm1,  2 * SIZE(BO)
	movq	%mm1,  3 * SIZE(BO)

	addq	$2 * SIZE, B
	addq	$4 * SIZE, BO
	decq	%rax
	jne	.L44
	ALIGN_3
1539
.L50:
/* N=2 band: set C column pointers and walk M in chunks of 4 rows. */
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L60
	ALIGN_3

.L51:
/* 4x2 micro-kernel.  TRMM skip cases offset AO/BO by KK
   (4 doubles of A, 4 doubles of packed B per K step). */
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

/* Preload A/B pairs; accumulators: xmm8/xmm12 = rows 0-1/2-3 of the
   first column, xmm9/xmm13 = rows 0-1/2-3 of the second column. */
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm12, %xmm12
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm13, %xmm13

	movapd	 0 * SIZE(AO), %xmm4
	movapd	16 * SIZE(BO), %xmm5
	movapd	 8 * SIZE(AO), %xmm6
	movapd	24 * SIZE(BO), %xmm7

/* Prime the store path for the output tile. */
	PREFETCHW      4 * SIZE(CO1)
	PREFETCHW      4 * SIZE(CO2)

/* Trip count selection (same pattern as the N=4 kernels). */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# unrolled by 8
	je	.L55
	ALIGN_3

.L52:
/* 8x-unrolled 4x2 body; A values rotate through xmm0/2/4/6 and B
   through xmm1/3/5/7, each reloaded well ahead of its next use. */
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	32 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	16 * SIZE(AO), %xmm0

	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	%xmm2, %xmm3
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	-4 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm3
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	40 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	24 * SIZE(AO), %xmm2

	PREFETCH	(PREFETCHSIZE     + 16) * SIZE(AO)
	mulpd	%xmm4, %xmm5
	mulpd	18 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm8
	movapd	16 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm9
	movapd	 2 * SIZE(AO), %xmm4
	mulpd	%xmm4, %xmm5
	mulpd	18 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm12
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm13
	movapd	 4 * SIZE(AO), %xmm4

	mulpd	%xmm4, %xmm5
	mulpd	22 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm8
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm9
	movapd	 6 * SIZE(AO), %xmm4
	mulpd	%xmm4, %xmm5
	mulpd	22 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm12
	movapd	48 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm13
	movapd	32 * SIZE(AO), %xmm4

	PREFETCH	(PREFETCHSIZE     + 24) * SIZE(AO)
	mulpd	%xmm6, %xmm7
	mulpd	26 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm8
	movapd	24 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm9
	movapd	10 * SIZE(AO), %xmm6
	mulpd	%xmm6, %xmm7
	mulpd	26 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm12
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm9
	movapd	12 * SIZE(AO), %xmm6

	mulpd	%xmm6, %xmm7
	mulpd	30 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm8
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm9
	movapd	14 * SIZE(AO), %xmm6
	mulpd	%xmm6, %xmm7
	mulpd	30 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm12
	movapd	56 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm13
	movapd	40 * SIZE(AO), %xmm6

	addq   $32 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L52
	ALIGN_3

.L55:
/* Remainder loop for K & 7; xmm7's pipelined B value is dead, so it
   is reloaded with alpha here. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L59
	ALIGN_3

.L56:
/* One K step of the 4x2 tile. */
	movapd	 0 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	movapd	 0 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm12
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L56
	ALIGN_3

.L59:
/* Writeback of the 4x2 tile (two xmm pairs per column). */
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	2 * SIZE(CO1), %xmm1
	movhpd	3 * SIZE(CO1), %xmm1
	movsd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2
	movsd	2 * SIZE(CO2), %xmm3
	movhpd	3 * SIZE(CO2), %xmm3
#endif

	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm7, %xmm12
	mulpd	%xmm7, %xmm13

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm1,  %xmm12
	addpd	%xmm2, %xmm9
	addpd	%xmm3, %xmm13
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)
	movsd	%xmm13, 2 * SIZE(CO2)
	movhpd	%xmm13, 3 * SIZE(CO2)

/* TRMM: advance AO/BO past the unexecuted iterations. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	jg	.L51
	ALIGN_3
1798
.L60:
/* N=2 band: handle 2 remaining rows (M & 2). */
	testq	$2, M
	je	.L70
	ALIGN_3

.L61:
/* 2x2 micro-kernel; TRMM skip cases offset AO by 2 and BO by 4
   doubles per KK step. */
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

/* Two accumulator pairs: (xmm8,xmm9) and (xmm10,xmm11) hold partial
   sums for columns 1 and 2 and are merged at .L69. */
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movapd	16 * SIZE(BO), %xmm5
	movapd	24 * SIZE(BO), %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# unrolled by 8
	je	.L65
	ALIGN_3

.L62:
/* 8x-unrolled 2x2 body. */
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	32 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm3
	mulpd	10 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm3
	mulpd	14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	40 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	 0 * SIZE(AO), %xmm0

	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	mulpd	%xmm2, %xmm5
	mulpd	18 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm8
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm9
	movapd	-6 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm5
	mulpd	22 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm10
	movapd	48 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm11
	movapd	-4 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm7
	mulpd	26 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm8
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm9
	movapd	-2 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm7
	mulpd	30 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm10
	movapd	56 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm11
	movapd	 8 * SIZE(AO), %xmm2

	addq   $16 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L62
	ALIGN_3

.L65:
/* K & 7 remainder; alpha replaces xmm7's dead pipelined value. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L69
	ALIGN_3

.L66:
	mulpd	%xmm0, %xmm1
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L66
	ALIGN_3

.L69:
/* Fold the duplicated accumulators, scale by alpha, write 2x2 tile. */
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2
#endif

	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm2, %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 4
	addq	$2 * SIZE, CO2		# coffset += 4
	ALIGN_3
1980
.L70:
/* N=2 band, last row (M & 1): 1x2 micro-kernel. */
	testq	$1, M
	je	.L79
	ALIGN_3

.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

/* Scalar accumulators; (xmm8,xmm9) and (xmm10,xmm11) are partial
   sums per column, merged at .L78. */
	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-12 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movsd	16 * SIZE(BO), %xmm5
	movsd	24 * SIZE(BO), %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# unrolled by 8
	je	.L75
	ALIGN_3

.L72:
/* 8x-unrolled 1x2 body; first four steps use xmm0, last four xmm2. */
	mulsd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	 4 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm9
	movsd	-15 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm1
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	32 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-14 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	mulsd	10 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm8
	movsd	12 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm9
	movsd	-13 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	mulsd	14 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm10
	movsd	40 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm11
	movsd	-8 * SIZE(AO), %xmm0

	mulsd	%xmm2, %xmm5
	mulsd	18 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm8
	movsd	20 * SIZE(BO), %xmm5
	addsd	%xmm2, %xmm9
	movsd	-11 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm5
	mulsd	22 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm10
	movsd	48 * SIZE(BO), %xmm5
	addsd	%xmm2, %xmm11
	movsd	-10 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm7
	mulsd	26 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm8
	movsd	28 * SIZE(BO), %xmm7
	addsd	%xmm2, %xmm9
	movsd	-9 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm7
	mulsd	30 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm10
	movsd	56 * SIZE(BO), %xmm7
	addsd	%xmm2, %xmm11
	movsd	-4 * SIZE(AO), %xmm2

	addq   $ 8 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L72
	ALIGN_3

.L75:
/* K & 7 remainder; alpha loaded into xmm7. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L78
	ALIGN_3

.L76:
	mulsd	%xmm0, %xmm1
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm8
	addsd	%xmm0, %xmm9
	movsd	-15 * SIZE(AO), %xmm0
	movsd	 4 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L76
	ALIGN_3

.L78:
/* Merge partial sums, scale, write the two scalars. */
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
#endif

	addsd	%xmm10, %xmm8
	addsd	%xmm11, %xmm9

	mulsd	%xmm7, %xmm8
	mulsd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addsd	%xmm0,  %xmm8
	addsd	%xmm2, %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_3
2154
.L79:
/* End of the 2-column band.  Right-side TRMM advances KK by the 2
   columns processed.  KK is a 64-bit quantity updated with addq
   everywhere else in this file; the original addl only modified the
   low 32 bits -- use addq for consistency. */
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif
	leaq	(C, LDC, 2), C
	ALIGN_3
2161
.L80:
/* Final remainder: a single column of C (N & 1). */
	testq	$1, N
	je	.L999
	ALIGN_4

.L81:
/* Copying to Sub Buffer */
/* Pack the 1-column B panel into BUFFER, duplicating each element
   into a pair of slots (same scheme as .L41). */
	leaq	BUFFER, BO

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	movq	K, %rax
	sarq	$3, %rax		# copy loop unrolled by 8
	jle	.L83
	ALIGN_3

.L82:
	PREFETCH	 56 * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1
	movq	 2 * SIZE(B), %mm2
	movq	 3 * SIZE(B), %mm3
	movq	 4 * SIZE(B), %mm4
	movq	 5 * SIZE(B), %mm5
	movq	 6 * SIZE(B), %mm6
	movq	 7 * SIZE(B), %mm7

	addq	$ 8 * SIZE, B
	addq	$16 * SIZE, BO

	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)

	decq	%rax
	jne	.L82
	ALIGN_3

.L83:
/* Copy the K & 7 leftover values. */
	movq	K, %rax
	andq	$7, %rax
	BRANCH
	jle	.L90
	ALIGN_3

.L84:
	movq	 0 * SIZE(B), %mm0

	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  1 * SIZE(BO)

	addq	$1 * SIZE, B
	addq	$2 * SIZE, BO
	decq	%rax
	jne	.L84
	ALIGN_3
2235
.L90:
/* N=1 band: single C column, walk M in chunks of 4 rows. */
	movq	C, CO1			# coffset1 = c
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$2, I	# i = (m >> 2)
	jle	.L100
	ALIGN_3

.L91:
/* 4x1 micro-kernel; TRMM skip cases offset AO by 4 and BO by 2
   doubles per KK step. */
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

/* (xmm8,xmm10) accumulate rows 0-1, (xmm9,xmm11) rows 2-3; the
   pairs are folded at .L99. */
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movapd	 0 * SIZE(AO), %xmm4
	movapd	 8 * SIZE(AO), %xmm6

	PREFETCHW      4 * SIZE(CO1)

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# unrolled by 8
	je	.L95
	ALIGN_3

.L92:
/* 8x-unrolled 4x1 body; here the broadcast B value multiplies two
   A pairs per K step. */
	mulpd	%xmm1, %xmm0
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	-14 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movapd	 2 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm0
	mulpd	-10 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm10
	movapd	 16 * SIZE(AO), %xmm0
	PREFETCH	(PREFETCHSIZE     +  8) * SIZE(AO)
	addpd	%xmm1, %xmm11
	movapd	 4 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	-6 * SIZE(AO), %xmm1
	addpd	%xmm2, %xmm8
	movapd	-4 * SIZE(AO), %xmm2
	addpd	%xmm1, %xmm9
	movapd	 6 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	-2 * SIZE(AO), %xmm1
	addpd	%xmm2, %xmm10
	movapd	24 * SIZE(AO), %xmm2
	PREFETCH	(PREFETCHSIZE     + 16) * SIZE(AO)
	addpd	%xmm1, %xmm11
	movapd	16 * SIZE(BO), %xmm1
	mulpd	%xmm3, %xmm4
	mulpd	 2 * SIZE(AO), %xmm3
	addpd	%xmm4, %xmm8
	movapd	 4 * SIZE(AO), %xmm4
	addpd	%xmm3, %xmm9
	movapd	10 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm4
	mulpd	 6 * SIZE(AO), %xmm3
	addpd	%xmm4, %xmm10
	movapd	32 * SIZE(AO), %xmm4
	PREFETCH	(PREFETCHSIZE     + 24) * SIZE(AO)
	addpd	%xmm3, %xmm11
	movapd	12 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm6
	mulpd	10 * SIZE(AO), %xmm3
	addpd	%xmm6, %xmm8
	movapd	12 * SIZE(AO), %xmm6
	addpd	%xmm3, %xmm9
	movapd	14 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm6
	mulpd	14 * SIZE(AO), %xmm3
	addpd	%xmm6, %xmm10
	movapd	40 * SIZE(AO), %xmm6
	addpd	%xmm3, %xmm11
	movapd	24 * SIZE(BO), %xmm3

	addq   $32 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L92
	ALIGN_3

.L95:
/* K & 7 remainder; alpha into xmm7. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L99
	ALIGN_3

.L96:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movapd	 2 * SIZE(BO), %xmm1

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$2 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L96
	ALIGN_3

.L99:
/* Fold partial sums, scale by alpha, write the 4x1 tile. */
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	2 * SIZE(CO1), %xmm1
	movhpd	3 * SIZE(CO1), %xmm1
#endif

	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm1,  %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 2 * SIZE(CO1)
	movhpd	%xmm9, 3 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L91
	ALIGN_3
2419
.L100:
/* N=1 band, 2 remaining rows (M & 2): 2x1 micro-kernel. */
	testq	$2, M
	je	.L110
	ALIGN_3

.L101:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

/* Four partial accumulators xmm8..xmm11, folded at .L109. */
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	-8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# unrolled by 8
	je	.L105
	ALIGN_3

.L102:
/* 8x-unrolled 2x1 body.
   NOTE(review): the second PREFETCH below reuses offset +0 while the
   sibling loops advance to +8 -- possibly intentional; verify. */
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	 4 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm10
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm11
	movapd	 0 * SIZE(AO), %xmm0
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	mulpd	%xmm2, %xmm3
	movapd	-6 * SIZE(AO), %xmm2
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	24 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	-4 * SIZE(AO), %xmm2
	mulpd	12 * SIZE(BO), %xmm2
	addpd	%xmm2, %xmm10
	movapd	-2 * SIZE(AO), %xmm2
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm2, %xmm11
	movapd	 8 * SIZE(AO), %xmm2

	addq   $16 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L102
	ALIGN_3

.L105:
/* K & 7 remainder; alpha into xmm7. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L109
	ALIGN_3

.L106:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(AO), %xmm0
	movapd	  2 * SIZE(BO), %xmm1

	addq	$2 * SIZE, AO		# aoffset  += 4
	addq	$2 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L106
	ALIGN_3

.L109:
/* Fold all four partials into xmm8, scale, write two doubles. */
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm10, %xmm8

	mulpd	%xmm7, %xmm8

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0

	addpd	%xmm0,  %xmm8
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	addq	$2 * SIZE, CO1		# coffset += 4

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif
	ALIGN_3
2558
.L110:
/* Final corner: 1x1 tile (last row, last column). */
	testq	$1, M
	je	.L999
	ALIGN_3

.L111:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

/* Four scalar partial accumulators, folded at .L118. */
	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	 0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-12 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	 8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax		# unrolled by 8
	je	.L115
	ALIGN_3

.L112:
/* 8x-unrolled dot-product body. */
	mulsd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE     +  0) * SIZE(AO)
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	16 * SIZE(BO), %xmm1
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm9
	movsd	-14 * SIZE(AO), %xmm0
	mulsd	 4 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm10
	movsd	-13 * SIZE(AO), %xmm0
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm11
	movsd	 -8 * SIZE(AO), %xmm0
	mulsd	%xmm2, %xmm3
	movsd	-11 * SIZE(AO), %xmm2
	addsd	%xmm3, %xmm8
	movsd	24 * SIZE(BO), %xmm3
	mulsd	10 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm9
	movsd	-10 * SIZE(AO), %xmm2
	mulsd	12 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm10
	movsd	-9 * SIZE(AO), %xmm2
	mulsd	14 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm11
	movsd	-4 * SIZE(AO), %xmm2

	addq   $ 8 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L112
	ALIGN_3

.L115:
/* K & 7 remainder; alpha into xmm7. */
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je .L118
	ALIGN_3

.L116:
	mulsd	%xmm0, %xmm1
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	 2 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO		# aoffset  += 4
	addq	$2 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L116
	ALIGN_3

.L118:
/* Fold partials, scale by alpha, accumulate/store the single value. */
	addsd	%xmm10, %xmm8
	addsd	%xmm11, %xmm9
	addsd	%xmm9, %xmm8

	mulsd	%xmm7, %xmm8
#ifndef TRMMKERNEL
	addsd	0 * SIZE(CO1), %xmm8
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	ALIGN_3
2676
.L999:
/* Epilogue.  %rbx holds the saved stack pointer (presumably set in
   the prologue, outside this chunk -- confirm); restore it, clear
   MMX state with EMMS since the packing loops used %mm0-%mm7, then
   reload all callee-saved registers from the save area. */
	movq	%rbx, %rsp

	EMMS

	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
/* Win64 additionally treats rdi/rsi and xmm6-xmm15 as callee-saved. */
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE