1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/*
 * Register assignments used throughout the kernel.
 * OLD_M/OLD_N name the registers that hold m and n at entry.  The same
 * two registers are reused as AO/BO below, so the entry code copies
 * them into M and N before AO/BO are first written.
 */
25#define OLD_M	%rdi
26#define OLD_N	%rsi
27
/* Row count, column count and inner (k) dimension. */
28#define M	%r13
29#define N	%r14
30#define K	%rdx
31
/*
 * Matrix base pointers, leading dimension of C, and work registers:
 *   I   - counter for the 8-row tiles of the current column panel
 *   AO  - running pointer into A
 *   BO  - running pointer into the expanded copy of B kept at BUFFER
 *   CO1 - current position in C; CO2 is set to the column LDC bytes later
 *   BB  - address derived from B; in the visible code it is only prefetched
 */
32#define A	%rcx
33#define B	%r8
34#define C	%r9
35#define LDC	%r10
36#define I	%r11
37#define AO	%rdi
38#define BO	%rsi
39#define	CO1	%r15
40#define CO2	%rbp
41#define BB	%r12
42
/*
 * Stack layout.
 * STACKSIZE is the size of the save area the entry code subtracts from
 * %rsp.  The OLD_* names are the stack-passed arguments, addressed
 * relative to %rsp after that subtraction (hence the + STACKSIZE).
 */
43#ifndef WINDOWS_ABI
44
45#define STACKSIZE 64
46
/* Only ldc and offset are read from the stack in this configuration. */
47#define OLD_LDC		 8 + STACKSIZE(%rsp)
48#define OLD_OFFSET	16 + STACKSIZE(%rsp)
49
50#else
51
/* Larger save area: the entry code also stores rdi, rsi and xmm6-xmm15. */
52#define STACKSIZE 256
53
/* Under WINDOWS_ABI a, b, c, ldc and offset are all read from the stack. */
54#define OLD_A		40 + STACKSIZE(%rsp)
55#define OLD_B		48 + STACKSIZE(%rsp)
56#define OLD_C		56 + STACKSIZE(%rsp)
57#define OLD_LDC		64 + STACKSIZE(%rsp)
58#define OLD_OFFSET	72 + STACKSIZE(%rsp)
59
60#endif
61
/*
 * Local variables, addressed from the re-aligned %rsp that the entry
 * code sets up (the OLD_* macros above are only used before that point).
 *   ALPHA  - alpha broadcast to all four lanes (written with movaps)
 *   J      - counter for the 4-column panels (n >> 2)
 *   OFFSET - TRMM offset argument
 *   KK     - TRMM running offset, KKK - TRMM trip count for the current tile
 *   BUFFER - start of the expanded copy of B
 * NOTE(review): BUFFER starts at 256(%rsp) while the entry code reserves
 * 128 + LOCAL_BUFFER_SIZE bytes; confirm LOCAL_BUFFER_SIZE leaves enough
 * room for the largest K that can reach this kernel.
 */
62#define ALPHA	  0(%rsp)
63#define J	 16(%rsp)
64#define OFFSET	 24(%rsp)
65#define KK	 32(%rsp)
66#define KKK	 40(%rsp)
67#define BUFFER	256(%rsp)
68
/* On OPTERON every movsd in this file is assembled as movlps instead. */
69#ifdef OPTERON
70#define movsd movlps
71#endif
72
/*
 * Prefetch instruction selection and distance.  PREFETCHSIZE is always
 * used multiplied by SIZE, i.e. it is a distance in elements.
 * NOTE(review): PREFETCH/PREFETCHW/PREFETCHSIZE are only defined here for
 * the targets listed; other targets presumably get them from common.h.
 */
73#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
74#define PREFETCH     prefetch
75#define PREFETCHW    prefetchw
76#define PREFETCHSIZE (16 * 9 + 8)
77#endif
78
79#if defined(GENERIC) || defined(NANO)
80#define PREFETCH     prefetcht0
81#define PREFETCHW    prefetcht0
82#define PREFETCHSIZE (16 * 5 + 8)
83#endif
84
/*
 * Distances (in elements) used by the B packing loop: RPREFETCHSIZE for
 * reads from B, WPREFETCHSIZE for writes to the expanded buffer.
 */
85#define RPREFETCHSIZE (8 *  7 + 4)
86#define WPREFETCHSIZE (8 *  8 + 4)
87
88#ifndef GENERIC
/*
 * Inner-product building blocks for the 8x4 tile (non-GENERIC build).
 *
 * All addresses are formed as BO + %rax * 8 and AO + %rax * 4 plus a
 * displacement; xx is an extra displacement in elements (scaled by 2 for
 * BO and by 1 for AO).
 *
 * Odd kernels multiply one A vector (xmm0 or xmm4) by four B vectors and
 * accumulate into xmm8..xmm11; even kernels do the same with xmm2 or xmm6
 * and accumulate into xmm12..xmm15.  Three of the four B operands come
 * from xmm1/xmm7, xmm3 and xmm5, the fourth is read straight from memory.
 * Each kernel also reloads the registers it consumed with the operands a
 * later kernel expects, so the macros must be issued in the order
 * KERNEL1 .. KERNEL8; that sequence advances through 64 elements of BO
 * and 32 elements of AO.
 */

/* KERNEL1: xmm0 * B -> xmm8..11; also prefetches ahead in A. */
89#define KERNEL1(xx) \
90	mulps	%xmm0, %xmm1 ;\
91	addps	%xmm1, %xmm8 ;\
92	movaps	-32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
93	mulps	%xmm0, %xmm3 ;\
94	addps	%xmm3, %xmm9 ;\
95	movaps	-28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
96	mulps	%xmm0, %xmm5 ;\
97	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
98	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
99	addps	%xmm5, %xmm10 ;\
100	movaps	-24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
101	addps	%xmm0, %xmm11 ;\
102	movaps	-16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
103
/* KERNEL2: xmm2 * B -> xmm12..15 (same B vectors as KERNEL1). */
104#define KERNEL2(xx) \
105	mulps	%xmm2, %xmm1 ;\
106	addps	%xmm1, %xmm12 ;\
107	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
108	mulps	%xmm2, %xmm3 ;\
109	addps	%xmm3, %xmm13 ;\
110	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
111	mulps	%xmm2, %xmm5 ;\
112	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
113	addps	%xmm5, %xmm14 ;\
114	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
115	addps	%xmm2, %xmm15 ;\
116	movaps	-12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
117
/* KERNEL3: xmm4 * B -> xmm8..11 (first B operand is held in xmm7). */
118#define KERNEL3(xx) \
119	mulps	%xmm4, %xmm7 ;\
120	addps	%xmm7, %xmm8 ;\
121	movaps	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
122	mulps	%xmm4, %xmm3 ;\
123	addps	%xmm3, %xmm9 ;\
124	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
125	mulps	%xmm4, %xmm5 ;\
126	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
127	addps	%xmm5, %xmm10 ;\
128	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
129	addps	%xmm4, %xmm11 ;\
130	movaps	 -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
131
/* KERNEL4: xmm6 * B -> xmm12..15; issues the second A prefetch. */
132#define KERNEL4(xx) \
133	mulps	%xmm6, %xmm7 ;\
134	addps	%xmm7, %xmm12 ;\
135	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
136	mulps	%xmm6, %xmm3 ;\
137	addps	%xmm3, %xmm13 ;\
138	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
139	mulps	%xmm6, %xmm5 ;\
140	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
141	addps	%xmm5, %xmm14 ;\
142	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
143 	PREFETCH	(PREFETCHSIZE     + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
144	addps	%xmm6, %xmm15 ;\
145	movaps	 -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
146
/* KERNEL5: xmm0 * B -> xmm8..11. */
147#define KERNEL5(xx) \
148	mulps	%xmm0, %xmm1 ;\
149	addps	%xmm1, %xmm8 ;\
150	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
151	mulps	%xmm0, %xmm3 ;\
152	addps	%xmm3, %xmm9 ;\
153	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
154	mulps	%xmm0, %xmm5 ;\
155	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
156	addps	%xmm5, %xmm10 ;\
157	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
158	addps	%xmm0, %xmm11 ;\
159	movaps	  0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
160
/* KERNEL6: xmm2 * B -> xmm12..15; reloads xmm1 for the next KERNEL1. */
161#define KERNEL6(xx) \
162	mulps	%xmm2, %xmm1 ;\
163	addps	%xmm1, %xmm12 ;\
164	movaps	 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
165	mulps	%xmm2, %xmm3 ;\
166	addps	%xmm3, %xmm13 ;\
167	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
168	mulps	%xmm2, %xmm5 ;\
169	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
170	addps	%xmm5, %xmm14 ;\
171	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
172	addps	%xmm2, %xmm15 ;\
173	movaps	  4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
174
/* KERNEL7: xmm4 * B -> xmm8..11. */
175#define KERNEL7(xx) \
176	mulps	%xmm4, %xmm7 ;\
177	addps	%xmm7, %xmm8 ;\
178	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
179	mulps	%xmm4, %xmm3 ;\
180	addps	%xmm3, %xmm9 ;\
181	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
182	mulps	%xmm4, %xmm5 ;\
183	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
184	addps	%xmm5, %xmm10 ;\
185	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
186	addps	%xmm4, %xmm11 ;\
187	movaps	  8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
188
/* KERNEL8: xmm6 * B -> xmm12..15; reloads xmm3/xmm5/xmm7 for the next group. */
189#define KERNEL8(xx) \
190	mulps	%xmm6, %xmm7 ;\
191	addps	%xmm7, %xmm12 ;\
192	movaps	 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
193	mulps	%xmm6, %xmm3 ;\
194	addps	%xmm3, %xmm13 ;\
195	movaps	 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
196	mulps	%xmm6, %xmm5 ;\
197	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
198	addps	%xmm5, %xmm14 ;\
199	movaps	 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
200	addps	%xmm6, %xmm15 ;\
201	movaps	 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
202
203#else
/*
 * GENERIC build: same eight kernels, but addressed directly from BO and
 * AO (no %rax index).  xx is still an element displacement, scaled by 2
 * for BO and by 1 for AO.
 *
 * KERNEL1: xmm0 * B -> xmm8..11; also prefetches ahead in A.
 */
204#define KERNEL1(xx) \
205	mulps	%xmm0, %xmm1 ;\
206	addps	%xmm1, %xmm8 ;\
207	movaps	-32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
208	mulps	%xmm0, %xmm3 ;\
209	addps	%xmm3, %xmm9 ;\
210	movaps	-28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
211	mulps	%xmm0, %xmm5 ;\
212	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
213	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
214	addps	%xmm5, %xmm10 ;\
215	movaps	-24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
216	addps	%xmm0, %xmm11 ;\
217	movaps	-16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
218
/*
 * KERNEL2 (GENERIC build): multiplies the A vector in xmm2 by the four B
 * vectors of the current k step and accumulates into xmm12..xmm15.
 * Operands are addressed directly from BO and AO; xx is an element
 * displacement (scaled by 2 for BO, by 1 for AO).  On exit xmm1, xmm3,
 * xmm5 and xmm2 hold the operands the following kernels expect.
 *
 * The final line intentionally has no trailing line continuation, so the
 * macro body ends here and does not depend on the next source line being
 * empty.
 */
219#define KERNEL2(xx) \
220	mulps	%xmm2, %xmm1 ;\
221	addps	%xmm1, %xmm12 ;\
222	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
223	mulps	%xmm2, %xmm3 ;\
224	addps	%xmm3, %xmm13 ;\
225	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
226	mulps	%xmm2, %xmm5 ;\
227	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
228	addps	%xmm5, %xmm14 ;\
229	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
230	addps	%xmm2, %xmm15 ;\
231	movaps	-12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
232
/*
 * KERNEL3..KERNEL8 for the GENERIC build.  They address BO and AO
 * directly (no %rax index); odd kernels accumulate into xmm8..xmm11 and
 * even kernels into xmm12..xmm15, each reloading the operands a later
 * kernel will use.
 */

/* KERNEL3: xmm4 * B -> xmm8..11 (first B operand is held in xmm7). */
233#define KERNEL3(xx) \
234	mulps	%xmm4, %xmm7 ;\
235	addps	%xmm7, %xmm8 ;\
236	movaps	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
237	mulps	%xmm4, %xmm3 ;\
238	addps	%xmm3, %xmm9 ;\
239	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
240	mulps	%xmm4, %xmm5 ;\
241	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
242	addps	%xmm5, %xmm10 ;\
243	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
244	addps	%xmm4, %xmm11 ;\
245	movaps	 -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
246
/* KERNEL4: xmm6 * B -> xmm12..15. */
247#define KERNEL4(xx) \
248	mulps	%xmm6, %xmm7 ;\
249	addps	%xmm7, %xmm12 ;\
250	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
251	mulps	%xmm6, %xmm3 ;\
252	addps	%xmm3, %xmm13 ;\
253	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
254	mulps	%xmm6, %xmm5 ;\
255	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
256	addps	%xmm5, %xmm14 ;\
257	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
258	addps	%xmm6, %xmm15 ;\
259	movaps	 -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
260
/* KERNEL5: xmm0 * B -> xmm8..11; issues the second A prefetch. */
261#define KERNEL5(xx) \
262	mulps	%xmm0, %xmm1 ;\
263 	PREFETCH	(PREFETCHSIZE     + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\
264	addps	%xmm1, %xmm8 ;\
265	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
266	mulps	%xmm0, %xmm3 ;\
267	addps	%xmm3, %xmm9 ;\
268	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
269	mulps	%xmm0, %xmm5 ;\
270	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
271	addps	%xmm5, %xmm10 ;\
272	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
273	addps	%xmm0, %xmm11 ;\
274	movaps	  0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
275
/* KERNEL6: xmm2 * B -> xmm12..15. */
276#define KERNEL6(xx) \
277	mulps	%xmm2, %xmm1 ;\
278	addps	%xmm1, %xmm12 ;\
279	movaps	 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
280	mulps	%xmm2, %xmm3 ;\
281	addps	%xmm3, %xmm13 ;\
282	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
283	mulps	%xmm2, %xmm5 ;\
284	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
285	addps	%xmm5, %xmm14 ;\
286	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
287	addps	%xmm2, %xmm15 ;\
288	movaps	  4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
289
/* KERNEL7: xmm4 * B -> xmm8..11. */
290#define KERNEL7(xx) \
291	mulps	%xmm4, %xmm7 ;\
292	addps	%xmm7, %xmm8 ;\
293	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
294	mulps	%xmm4, %xmm3 ;\
295	addps	%xmm3, %xmm9 ;\
296	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
297	mulps	%xmm4, %xmm5 ;\
298	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
299	addps	%xmm5, %xmm10 ;\
300	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
301	addps	%xmm4, %xmm11 ;\
302	movaps	  8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
303
/* KERNEL8: xmm6 * B -> xmm12..15. */
304#define KERNEL8(xx) \
305	mulps	%xmm6, %xmm7 ;\
306	addps	%xmm7, %xmm12 ;\
307	movaps	 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
308	mulps	%xmm6, %xmm3 ;\
309	addps	%xmm3, %xmm13 ;\
310	movaps	 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
311	mulps	%xmm6, %xmm5 ;\
312	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
313	addps	%xmm5, %xmm14 ;\
314	movaps	 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
315	addps	%xmm6, %xmm15 ;\
316	movaps	 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
317
318#endif
319
/*
 * Function entry.
 * Saves the callee-saved registers, fetches the stack-passed arguments,
 * builds a 4096-byte aligned local frame, stores alpha broadcast to four
 * lanes, and computes the number of 4-column panels (J = n >> 2).
 * PROLOGUE, PROFCODE, EMMS and STACK_TOUCHING are macros defined outside
 * this file.
 */
320	PROLOGUE
321	PROFCODE
322
/* Reserve the register save area and spill rbx, rbp, r12-r15. */
323	subq	$STACKSIZE, %rsp
324
325	movq	%rbx,  0(%rsp)
326	movq	%rbp,  8(%rsp)
327	movq	%r12, 16(%rsp)
328	movq	%r13, 24(%rsp)
329	movq	%r14, 32(%rsp)
330	movq	%r15, 40(%rsp)
331
332#ifdef WINDOWS_ABI
/* Also preserve rdi, rsi and xmm6-xmm15 in this configuration. */
333	movq	%rdi,    48(%rsp)
334	movq	%rsi,    56(%rsp)
335	movups	%xmm6,   64(%rsp)
336	movups	%xmm7,   80(%rsp)
337	movups	%xmm8,   96(%rsp)
338	movups	%xmm9,  112(%rsp)
339	movups	%xmm10, 128(%rsp)
340	movups	%xmm11, 144(%rsp)
341	movups	%xmm12, 160(%rsp)
342	movups	%xmm13, 176(%rsp)
343	movups	%xmm14, 192(%rsp)
344	movups	%xmm15, 208(%rsp)
345
/* Move the arguments into the registers the rest of the kernel expects. */
346	movq	ARG1,      OLD_M
347	movq	ARG2,      OLD_N
348	movq	ARG3,      K
349	movq	OLD_A,     A
350	movq	OLD_B,     B
351	movq	OLD_C,     C
352	movq	OLD_LDC,   LDC
353#ifdef TRMMKERNEL
354	movsd	OLD_OFFSET, %xmm4
355#endif
/* The value broadcast into ALPHA below is taken from xmm3 here. */
356	movaps	%xmm3, %xmm0
357
358#else
/* Only ldc (and the TRMM offset) have to be fetched from the stack. */
359	movq	OLD_LDC,   LDC
360#ifdef TRMMKERNEL
361	movsd	OLD_OFFSET, %xmm4
362#endif
363
364#endif
365
366	EMMS
367
/*
 * Build the local frame.  The old stack pointer is kept in rbx; all
 * OLD_* stack operands must have been read before this point because
 * they are addressed relative to the old rsp.
 */
368	movq	%rsp, %rbx	# save old stack
369	subq	$128 + LOCAL_BUFFER_SIZE, %rsp
370	andq	$-4096, %rsp	# align stack
371
372	STACK_TOUCHING
373
/* rdi/rsi are about to be reused as AO/BO, so move m and n away. */
374	movq	OLD_M, M
375	movq	OLD_N, N
376
/* Replicate lane 0 of xmm0 into all four lanes and keep it as ALPHA. */
377	shufps	$0, %xmm0, %xmm0
378	movaps	%xmm0, ALPHA
379
380#ifdef TRMMKERNEL
/* KK starts at the offset argument, negated when LEFT is not defined. */
381	movsd	%xmm4, OFFSET
382	movsd	%xmm4, KK
383#ifndef LEFT
384	negq	KK
385#endif
386#endif
387
/* Bias A by +32 elements; the compute code addresses it from -32 * SIZE. */
388	subq	$-32 * SIZE, A
389
/* LDC = ldc * SIZE. */
390	leaq	(, LDC, SIZE), LDC
391
392	movq	N,  J
393	sarq	$2, J		# j = (n >> 2)
/* No full 4-column panel: continue at .L50 (outside the visible code). */
394	jle	.L50
395
/*
 * .L01 - start of a 4-column panel: expand the panel of B into BUFFER.
 * Every 32-bit element read from B is stored four times in a row
 * (punpckldq duplicates it inside an MMX register, then two 8-byte
 * stores write it out), so the compute code can load each B value as a
 * ready-made 4-lane vector with movaps.
 */
396.L01:
397#if defined(TRMMKERNEL) && defined(LEFT)
/* Restart the TRMM running offset for this panel. */
398	movq	OFFSET, %rax
399	movq	%rax, KK
400#endif
401
402/* Copying to Sub Buffer */
403	leaq	BUFFER, BO
404
/* Preload the first element; the loop below reloads mm0 for its next pass. */
405	movd	 0 * SIZE(B), %mm0
406
407	movq	K, %rax
408	sarq	$2, %rax
409	jle	.L03
410
/* Each pass of .L02 expands 8 elements, so run 2 * (k >> 2) passes. */
411	addq	%rax, %rax
412	ALIGN_4
413
/* .L02 - expand 8 elements of B into 32 elements of the buffer. */
414.L02:
415	PREFETCH	 (RPREFETCHSIZE +  0)  * SIZE(B)
416
417	movd	 1 * SIZE(B), %mm1
418	movd	 2 * SIZE(B), %mm2
419	movd	 3 * SIZE(B), %mm3
420	movd	 4 * SIZE(B), %mm4
421	movd	 5 * SIZE(B), %mm5
422	movd	 6 * SIZE(B), %mm6
423	movd	 7 * SIZE(B), %mm7
424
425	PREFETCHW	 (WPREFETCHSIZE +  0) * SIZE(BO)
426
427	punpckldq %mm0, %mm0
428	movq	%mm0,  0 * SIZE(BO)
429	movq	%mm0,  2 * SIZE(BO)
430	punpckldq %mm1, %mm1
/* mm0 has been stored; fetch the first element of the next pass. */
431	movd	 8 * SIZE(B), %mm0
432	movq	%mm1,  4 * SIZE(BO)
433	movq	%mm1,  6 * SIZE(BO)
434	punpckldq %mm2, %mm2
435	movq	%mm2,  8 * SIZE(BO)
436	movq	%mm2, 10 * SIZE(BO)
437	punpckldq %mm3, %mm3
438	movq	%mm3, 12 * SIZE(BO)
439	movq	%mm3, 14 * SIZE(BO)
440
441	PREFETCHW	 (WPREFETCHSIZE + 16) * SIZE(BO)
442
443	punpckldq %mm4, %mm4
444	movq	%mm4, 16 * SIZE(BO)
445	movq	%mm4, 18 * SIZE(BO)
446	punpckldq %mm5, %mm5
447	movq	%mm5, 20 * SIZE(BO)
448	movq	%mm5, 22 * SIZE(BO)
449	punpckldq %mm6, %mm6
450	movq	%mm6, 24 * SIZE(BO)
451	movq	%mm6, 26 * SIZE(BO)
452	punpckldq %mm7, %mm7
453	movq	%mm7, 28 * SIZE(BO)
454	movq	%mm7, 30 * SIZE(BO)
455
456
457	addq	$ 8 * SIZE, B
458	addq	$32 * SIZE, BO
459
460	decq	%rax
461	jne	.L02
462	ALIGN_4
463
/* .L03 - handle the k & 3 leftover rows of the panel, 4 elements each. */
464.L03:
465	movq	K, %rax
466	andq	$3, %rax
467	BRANCH
468	jle	.L10
469	ALIGN_4
470
/* .L04 - expand 4 elements of B into 16 elements of the buffer. */
471.L04:
472	movd	 0 * SIZE(B), %mm0
473	movd	 1 * SIZE(B), %mm1
474	movd	 2 * SIZE(B), %mm2
475	movd	 3 * SIZE(B), %mm3
476
477	punpckldq %mm0, %mm0
478	punpckldq %mm1, %mm1
479	punpckldq %mm2, %mm2
480	punpckldq %mm3, %mm3
481
482	movq	%mm0,  0 * SIZE(BO)
483	movq	%mm0,  2 * SIZE(BO)
484	movq	%mm1,  4 * SIZE(BO)
485	movq	%mm1,  6 * SIZE(BO)
486	movq	%mm2,  8 * SIZE(BO)
487	movq	%mm2, 10 * SIZE(BO)
488	movq	%mm3, 12 * SIZE(BO)
489	movq	%mm3, 14 * SIZE(BO)
490
491	addq	$ 4 * SIZE, B
492	addq	$16 * SIZE, BO
493	decq	%rax
494	jne	.L04
495	ALIGN_4
496
/*
 * .L10 - the panel of B is packed; set up the C column pointers and the
 * A pointer, then count the 8-row tiles of this panel.
 */
497.L10:
498	movq	C, CO1			# coffset1 = c
499	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
500	movq	A, AO		# aoffset = a
501
/* BB points ahead in B (which now addresses the next panel); prefetch only. */
502	leaq	 (RPREFETCHSIZE +  0)  * SIZE(B), BB
503
504	movq	M,  I
505	sarq	$3, I	# i = (m >> 3)
/* Fewer than 8 rows: go straight to the 4-row case. */
506	jle	.L20
507	ALIGN_4
508
/*
 * .L11 - start of one 8x4 tile: position AO/BO, preload the first
 * operands, clear the eight accumulators xmm8..xmm15, and work out the
 * number of k iterations for this tile.
 */
509.L11:
510#if !defined(TRMMKERNEL) || \
511	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
512	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
513
/* BO is biased by +32 elements, matching the -32 * SIZE displacements. */
514	leaq	32 * SIZE + BUFFER, BO
515#else
/* TRMM: skip the first KK iterations (AO += KK * 32 bytes, BO += KK * 64 bytes). */
516	leaq	32 * SIZE + BUFFER, BO
517	movq	KK, %rax
518	leaq	(, %rax,   8), %rax
519	leaq	(AO, %rax, 4), AO
520	leaq	(BO, %rax, 8), BO
521#endif
522
/* Preload the operands the first KERNEL1..KERNEL4 expect; zero accumulators. */
523	movaps	-32 * SIZE(AO), %xmm0
524	movaps	-32 * SIZE(BO), %xmm1
525	xorps	%xmm8, %xmm8
526	movaps	-28 * SIZE(AO), %xmm2
527	movaps	-28 * SIZE(BO), %xmm3
528	xorps	%xmm9, %xmm9
529	movaps	-24 * SIZE(AO), %xmm4
530	movaps	-24 * SIZE(BO), %xmm5
531	xorps	%xmm10, %xmm10
532	movaps	-20 * SIZE(AO), %xmm6
533	movaps	-16 * SIZE(BO), %xmm7
534	xorps	%xmm11, %xmm11
535
/* Prefetch the four C columns this tile will write, and a line of B. */
536	PREFETCHW      7 * SIZE(CO1)
537	xorps	%xmm12, %xmm12
538	PREFETCHW     15 * SIZE(CO2)
539	xorps	%xmm13, %xmm13
540	PREFETCHW      7 * SIZE(CO1, LDC, 2)
541	xorps	%xmm14, %xmm14
542	PREFETCHW     15 * SIZE(CO2, LDC, 2)
543	xorps	%xmm15, %xmm15
544	PREFETCH	 -32  * SIZE(BB)
545
/*
 * rax = number of k iterations for this tile.  In the TRMM build the
 * count is also saved in KKK, which the tail code reads instead of K.
 */
546#ifndef TRMMKERNEL
547	movq	K, %rax
548#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
549	movq	K, %rax
550	subq	KK, %rax
551	movq	%rax, KKK
552#else
553	movq	KK, %rax
554#ifdef LEFT
555	addq	$8, %rax
556#else
557	addq	$4, %rax
558#endif
559	movq	%rax, KKK
560#endif
561#ifndef GENERIC
/*
 * Indexed variant: round the count down to a multiple of 8, move AO/BO
 * past that many iterations, and let rax run from the negative byte
 * index up to zero so the loop needs no separate counter.
 */
562	andq	$-8, %rax
563
564	leaq	(, %rax, 8), %rax
565	leaq	(AO, %rax, 4), AO
566	leaq	(BO, %rax, 8), BO
567	negq	%rax
568	NOBRANCH
/* Fewer than 8 iterations: skip the unrolled loop. */
569	je	.L15
570	ALIGN_3
571
/*
 * .L12 - main loop of the 8x4 tile (non-GENERIC build).
 * One KERNEL1..KERNEL8 group with xx = 16 * 0 followed by one with
 * xx = 16 * 2 processes 8 k iterations; rax (negative, counting up) is
 * then advanced by 16 * SIZE.  Eight copies of that step are unrolled,
 * each followed by an exit test, so the loop leaves for .L15 as soon as
 * rax reaches zero.
 */
572.L12:
573	KERNEL1(16 *  0)
574	KERNEL2(16 *  0)
575	KERNEL3(16 *  0)
576	KERNEL4(16 *  0)
577	KERNEL5(16 *  0)
578	KERNEL6(16 *  0)
579	KERNEL7(16 *  0)
580	KERNEL8(16 *  0)
581
582	KERNEL1(16 *  2)
583	KERNEL2(16 *  2)
584	KERNEL3(16 *  2)
585	KERNEL4(16 *  2)
586	KERNEL5(16 *  2)
587	KERNEL6(16 *  2)
588	KERNEL7(16 *  2)
589	KERNEL8(16 *  2)
590
591	addq	$16 * SIZE, %rax
592	NOBRANCH
593	je	.L15
594	KERNEL1(16 *  0)
595	KERNEL2(16 *  0)
596	KERNEL3(16 *  0)
597	KERNEL4(16 *  0)
598	KERNEL5(16 *  0)
599	KERNEL6(16 *  0)
600	KERNEL7(16 *  0)
601	KERNEL8(16 *  0)
602
603	KERNEL1(16 *  2)
604	KERNEL2(16 *  2)
605	KERNEL3(16 *  2)
606	KERNEL4(16 *  2)
607	KERNEL5(16 *  2)
608	KERNEL6(16 *  2)
609	KERNEL7(16 *  2)
610	KERNEL8(16 *  2)
611
612	addq	$16 * SIZE, %rax
613	NOBRANCH
614	je	.L15
615	KERNEL1(16 *  0)
616	KERNEL2(16 *  0)
617	KERNEL3(16 *  0)
618	KERNEL4(16 *  0)
619	KERNEL5(16 *  0)
620	KERNEL6(16 *  0)
621	KERNEL7(16 *  0)
622	KERNEL8(16 *  0)
623
624	KERNEL1(16 *  2)
625	KERNEL2(16 *  2)
626	KERNEL3(16 *  2)
627	KERNEL4(16 *  2)
628	KERNEL5(16 *  2)
629	KERNEL6(16 *  2)
630	KERNEL7(16 *  2)
631	KERNEL8(16 *  2)
632
633	addq	$16 * SIZE, %rax
634	NOBRANCH
635	je	.L15
636	KERNEL1(16 *  0)
637	KERNEL2(16 *  0)
638	KERNEL3(16 *  0)
639	KERNEL4(16 *  0)
640	KERNEL5(16 *  0)
641	KERNEL6(16 *  0)
642	KERNEL7(16 *  0)
643	KERNEL8(16 *  0)
644
645	KERNEL1(16 *  2)
646	KERNEL2(16 *  2)
647	KERNEL3(16 *  2)
648	KERNEL4(16 *  2)
649	KERNEL5(16 *  2)
650	KERNEL6(16 *  2)
651	KERNEL7(16 *  2)
652	KERNEL8(16 *  2)
653
654	addq	$16 * SIZE, %rax
655	NOBRANCH
656	je	.L15
657	KERNEL1(16 *  0)
658	KERNEL2(16 *  0)
659	KERNEL3(16 *  0)
660	KERNEL4(16 *  0)
661	KERNEL5(16 *  0)
662	KERNEL6(16 *  0)
663	KERNEL7(16 *  0)
664	KERNEL8(16 *  0)
665
666	KERNEL1(16 *  2)
667	KERNEL2(16 *  2)
668	KERNEL3(16 *  2)
669	KERNEL4(16 *  2)
670	KERNEL5(16 *  2)
671	KERNEL6(16 *  2)
672	KERNEL7(16 *  2)
673	KERNEL8(16 *  2)
674
675	addq	$16 * SIZE, %rax
676	NOBRANCH
677	je	.L15
678	KERNEL1(16 *  0)
679	KERNEL2(16 *  0)
680	KERNEL3(16 *  0)
681	KERNEL4(16 *  0)
682	KERNEL5(16 *  0)
683	KERNEL6(16 *  0)
684	KERNEL7(16 *  0)
685	KERNEL8(16 *  0)
686
687	KERNEL1(16 *  2)
688	KERNEL2(16 *  2)
689	KERNEL3(16 *  2)
690	KERNEL4(16 *  2)
691	KERNEL5(16 *  2)
692	KERNEL6(16 *  2)
693	KERNEL7(16 *  2)
694	KERNEL8(16 *  2)
695
696	addq	$16 * SIZE, %rax
697	NOBRANCH
698	je	.L15
699	KERNEL1(16 *  0)
700	KERNEL2(16 *  0)
701	KERNEL3(16 *  0)
702	KERNEL4(16 *  0)
703	KERNEL5(16 *  0)
704	KERNEL6(16 *  0)
705	KERNEL7(16 *  0)
706	KERNEL8(16 *  0)
707
708	KERNEL1(16 *  2)
709	KERNEL2(16 *  2)
710	KERNEL3(16 *  2)
711	KERNEL4(16 *  2)
712	KERNEL5(16 *  2)
713	KERNEL6(16 *  2)
714	KERNEL7(16 *  2)
715	KERNEL8(16 *  2)
716
717	addq	$16 * SIZE, %rax
718	NOBRANCH
719	je	.L15
720	KERNEL1(16 *  0)
721	KERNEL2(16 *  0)
722	KERNEL3(16 *  0)
723	KERNEL4(16 *  0)
724	KERNEL5(16 *  0)
725	KERNEL6(16 *  0)
726	KERNEL7(16 *  0)
727	KERNEL8(16 *  0)
728
729	KERNEL1(16 *  2)
730	KERNEL2(16 *  2)
731	KERNEL3(16 *  2)
732	KERNEL4(16 *  2)
733	KERNEL5(16 *  2)
734	KERNEL6(16 *  2)
735	KERNEL7(16 *  2)
736	KERNEL8(16 *  2)
737
738	addq	$16 * SIZE, %rax
739	BRANCH
/* rax still negative: more iterations remain, run the unrolled body again. */
740	jl	.L12
741	ALIGN_3
742
/*
 * .L15 - (non-GENERIC build) the multiple-of-8 part of k is done.
 * If bit 2 of the iteration count is set, run one more KERNEL1..KERNEL8
 * group (4 iterations) with rax = 0 and step AO/BO past it.
 */
743.L15:
744	PREFETCH	  -16 * SIZE(BB)
745	subq		 $-16 * SIZE, BB
746
747#ifndef TRMMKERNEL
748	movq	K, %rax
749#else
750	movq	KKK, %rax
751#endif
752	testq	$4, %rax
753	je .L16
/* Index register must be zero: the macros address AO/BO through rax. */
754	xorq	%rax, %rax
755	ALIGN_3
756
757	KERNEL1(16 *  0)
758	KERNEL2(16 *  0)
759	KERNEL3(16 *  0)
760	KERNEL4(16 *  0)
761	KERNEL5(16 *  0)
762	KERNEL6(16 *  0)
763	KERNEL7(16 *  0)
764	KERNEL8(16 *  0)
765
766	addq	$64 * SIZE, BO
767	addq	$32 * SIZE, AO
768	ALIGN_3
769#else
/*
 * GENERIC build: plain counted loop, rax = iteration count >> 2, one
 * KERNEL1..KERNEL8 group (4 iterations) per pass with explicit AO/BO
 * advance.
 */
770	sarq	$2, %rax
771	NOBRANCH
772	jle	.L16
773	ALIGN_3
774
775.L12:
776	KERNEL1(16 *  0)
777	KERNEL2(16 *  0)
778	KERNEL3(16 *  0)
779	KERNEL4(16 *  0)
780	KERNEL5(16 *  0)
781	KERNEL6(16 *  0)
782	KERNEL7(16 *  0)
783	KERNEL8(16 *  0)
784
785	addq	$ 64 * SIZE, BO
786	subq	$-32 * SIZE, AO
787	decq	%rax
788	BRANCH
789	jg	.L12
790#endif
791
/*
 * .L16 - remainder of the 8x4 tile: the last (count & 3) k iterations,
 * one per pass of .L17.  xmm7 is loaded with ALPHA here for the store
 * code at .L18; the remainder loop itself only uses xmm0, xmm1 and xmm2
 * as operands.
 */
792.L16:
793	movaps	ALPHA, %xmm7
794
795#ifndef TRMMKERNEL
796	movq	K, %rax
797#else
798	movq	KKK, %rax
799#endif
800	andq	$3, %rax		# rax = k & 3 (leftover iterations)
801	je .L18
802
/* Move AO/BO past the leftover part and index it with a negative rax. */
803	leaq	(, %rax, 8), %rax
804	leaq	(AO, %rax, 4), AO
805	leaq	(BO, %rax, 8), BO
806	negq	%rax
807	ALIGN_4
808
/*
 * .L17 - one k iteration: xmm0 (first four A values) times the four B
 * vectors goes to xmm8..xmm11, xmm2 (next four A values) times the same
 * B vectors goes to xmm12..xmm15.  xmm1 is the only B register, so it is
 * reloaded before every multiply.
 */
809.L17:
810	mulps	%xmm0, %xmm1
811	addps	%xmm1, %xmm8
812	movaps	-28 * SIZE(BO, %rax, 8), %xmm1
813	mulps	%xmm0, %xmm1
814	addps	%xmm1, %xmm9
815	movaps	-24 * SIZE(BO, %rax, 8), %xmm1
816	mulps	%xmm0, %xmm1
817	mulps	-20 * SIZE(BO, %rax, 8), %xmm0
818	addps	%xmm1, %xmm10
819	movaps	-32 * SIZE(BO, %rax, 8), %xmm1
820	addps	%xmm0, %xmm11
821	movaps	-24 * SIZE(AO, %rax, 4), %xmm0
822	mulps	%xmm2, %xmm1
823	addps	%xmm1, %xmm12
824	movaps	-28 * SIZE(BO, %rax, 8), %xmm1
825	mulps	%xmm2, %xmm1
826	addps	%xmm1, %xmm13
827	movaps	-24 * SIZE(BO, %rax, 8), %xmm1
828	mulps	%xmm2, %xmm1
829	mulps	-20 * SIZE(BO, %rax, 8), %xmm2
830	addps	%xmm1, %xmm14
831	movaps	-16 * SIZE(BO, %rax, 8), %xmm1
832	addps	%xmm2, %xmm15
833	movaps	-20 * SIZE(AO, %rax, 4), %xmm2
834
835	addq	$SIZE * 2, %rax
836	jl	.L17
837	ALIGN_4
838
/*
 * .L18 - finish the 8x4 tile: scale the accumulators by alpha (xmm7),
 * add the existing C values unless this is the TRMM build, and store.
 * Accumulator to C mapping (rows 0-3 / rows 4-7 of each column):
 *   xmm8  / xmm12 -> CO1            xmm9  / xmm13 -> CO2
 *   xmm10 / xmm14 -> CO1 + 2 * LDC  xmm11 / xmm15 -> CO2 + 2 * LDC
 * C is read and written in 8-byte halves (movsd/movhps, movlps/movhps).
 */
839.L18:
840#ifndef TRMMKERNEL
841	movsd	0 * SIZE(CO1), %xmm0
842	movhps	2 * SIZE(CO1), %xmm0
843	movsd	4 * SIZE(CO1), %xmm1
844	movhps	6 * SIZE(CO1), %xmm1
845
846	movsd	0 * SIZE(CO2), %xmm2
847	movhps	2 * SIZE(CO2), %xmm2
848	movsd	4 * SIZE(CO2), %xmm3
849	movhps	6 * SIZE(CO2), %xmm3
850#endif
851
852	mulps	%xmm7, %xmm8
853	mulps	%xmm7, %xmm9
854	mulps	%xmm7, %xmm10
855	mulps	%xmm7, %xmm11
856
857	mulps	%xmm7, %xmm12
858	mulps	%xmm7, %xmm13
859	mulps	%xmm7, %xmm14
860	mulps	%xmm7, %xmm15
861
862#ifndef TRMMKERNEL
/* Alpha is no longer needed, so xmm7 is reused for a C value below. */
863	movsd	0 * SIZE(CO1, LDC, 2), %xmm4
864	movhps	2 * SIZE(CO1, LDC, 2), %xmm4
865	movsd	4 * SIZE(CO1, LDC, 2), %xmm5
866	movhps	6 * SIZE(CO1, LDC, 2), %xmm5
867
868	movsd	0 * SIZE(CO2, LDC, 2), %xmm6
869	movhps	2 * SIZE(CO2, LDC, 2), %xmm6
870	movsd	4 * SIZE(CO2, LDC, 2), %xmm7
871	movhps	6 * SIZE(CO2, LDC, 2), %xmm7
872
873	addps	%xmm0, %xmm8
874	addps	%xmm1, %xmm12
875	addps	%xmm2, %xmm9
876	addps	%xmm3, %xmm13
877#endif
878
879	movlps	%xmm8,  0 * SIZE(CO1)
880	movhps	%xmm8,  2 * SIZE(CO1)
881	movlps	%xmm12, 4 * SIZE(CO1)
882	movhps	%xmm12, 6 * SIZE(CO1)
883
884	movlps	%xmm9,  0 * SIZE(CO2)
885	movhps	%xmm9,  2 * SIZE(CO2)
886	movlps	%xmm13, 4 * SIZE(CO2)
887	movhps	%xmm13, 6 * SIZE(CO2)
888
889#ifndef TRMMKERNEL
890	addps	%xmm4, %xmm10
891	addps	%xmm5, %xmm14
892	addps	%xmm6, %xmm11
893	addps	%xmm7, %xmm15
894#endif
895
896	movlps	%xmm10, 0 * SIZE(CO1, LDC, 2)
897	movhps	%xmm10, 2 * SIZE(CO1, LDC, 2)
898	movlps	%xmm14, 4 * SIZE(CO1, LDC, 2)
899	movhps	%xmm14, 6 * SIZE(CO1, LDC, 2)
900
901	movlps	%xmm11, 0 * SIZE(CO2, LDC, 2)
902	movhps	%xmm11, 2 * SIZE(CO2, LDC, 2)
903	movlps	%xmm15, 4 * SIZE(CO2, LDC, 2)
904	movhps	%xmm15, 6 * SIZE(CO2, LDC, 2)
905
906#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
907    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
/* TRMM: step AO/BO over the K - KKK iterations this tile did not use. */
908	movq	K, %rax
909	subq	KKK, %rax
910	leaq	(,%rax,    8), %rax
911	leaq	(AO, %rax, 4), AO
912	leaq	(BO, %rax, 8), BO
913#endif
914
915#if defined(TRMMKERNEL) && defined(LEFT)
/* The next tile starts 8 rows further down. */
916	addq	$8, KK
917#endif
918
919	addq	$8 * SIZE, CO1		# coffset += 8
920	addq	$8 * SIZE, CO2		# coffset += 8
921	decq	I			# i --
922	jg	.L11
923	ALIGN_4
924
/*
 * .L20 - 4x4 tile, executed when bit 2 of M is set.
 * Accumulators are xmm0..xmm3 (one per C column); xmm8 and xmm10 carry
 * A vectors, xmm9/xmm11/xmm13/xmm15 carry B vectors.  Unlike the 8x4
 * path, BO is not biased here and is addressed from offset 0.
 */
925.L20:
926	testq	$4, M
927	je	.L30
928
929#if !defined(TRMMKERNEL) || \
930	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
931	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
932
933	leaq	BUFFER, BO
934#else
/* TRMM: skip the first KK iterations (AO += KK * 16 bytes, BO += KK * 64 bytes). */
935	leaq	BUFFER, BO
936	movq	KK, %rax
937	leaq	(, %rax,   8), %rax
938	leaq	(AO, %rax, 2), AO
939	leaq	(BO, %rax, 8), BO
940#endif
941
/* Preload the operands for iterations 0 and 4 of the unrolled loop. */
942	movaps	-32 * SIZE(AO), %xmm8
943	movaps	-16 * SIZE(AO), %xmm10
944
945	movaps	  0 * SIZE(BO), %xmm9
946	movaps	 16 * SIZE(BO), %xmm11
947	movaps	 32 * SIZE(BO), %xmm13
948	movaps	 48 * SIZE(BO), %xmm15
949
950	xorps	%xmm0, %xmm0
951	xorps	%xmm1, %xmm1
952	xorps	%xmm2, %xmm2
953	xorps	%xmm3, %xmm3
954
/* rax = number of k iterations for this tile (saved in KKK for TRMM). */
955#ifndef TRMMKERNEL
956	movq	K, %rax
957#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
958	movq	K, %rax
959	subq	KK, %rax
960	movq	%rax, KKK
961#else
962	movq	KK, %rax
963#ifdef LEFT
964	addq	$4, %rax
965#else
966	addq	$4, %rax
967#endif
968	movq	%rax, KKK
969#endif
/* The main loop handles 8 iterations per pass. */
970	sarq	$3, %rax
971	je	.L25
972	ALIGN_4
973
/*
 * .L22 - main loop of the 4x4 tile, 8 k iterations per pass.
 * Iterations 0-3 use the A vector in xmm8, iterations 4-7 the one in
 * xmm10.  Each iteration multiplies the A vector by four B vectors
 * (three via the B register, the fourth straight from memory) and adds
 * the products to xmm0..xmm3.  The B register used by an iteration is
 * reloaded with the value the matching iteration needs four steps later,
 * which for iterations 4-7 lies in the next pass.
 */
974.L22:
975	mulps	%xmm8, %xmm9
976	addps	%xmm9, %xmm0
977#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
978	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
979#endif
980	movaps	 4 * SIZE(BO), %xmm9
981	mulps	%xmm8, %xmm9
982	addps	%xmm9, %xmm1
983	movaps	 8 * SIZE(BO), %xmm9
984	mulps	%xmm8, %xmm9
985	mulps	12 * SIZE(BO), %xmm8
986	addps	%xmm9, %xmm2
987	movaps	64 * SIZE(BO), %xmm9
988	addps	%xmm8, %xmm3
989	movaps	-28 * SIZE(AO), %xmm8
990
991	mulps	%xmm8, %xmm11
992	addps	%xmm11, %xmm0
993	movaps	20 * SIZE(BO), %xmm11
994	mulps	%xmm8, %xmm11
995	addps	%xmm11, %xmm1
996	movaps	24 * SIZE(BO), %xmm11
997	mulps	%xmm8, %xmm11
998	mulps	28 * SIZE(BO), %xmm8
999	addps	%xmm11, %xmm2
1000	movaps	80 * SIZE(BO), %xmm11
1001	addps	%xmm8, %xmm3
1002	movaps	-24 * SIZE(AO), %xmm8
1003
1004	mulps	%xmm8, %xmm13
1005	addps	%xmm13, %xmm0
1006	movaps	36 * SIZE(BO), %xmm13
1007	mulps	%xmm8, %xmm13
1008	addps	%xmm13, %xmm1
1009	movaps	40 * SIZE(BO), %xmm13
1010	mulps	%xmm8, %xmm13
1011	mulps	44 * SIZE(BO), %xmm8
1012	addps	%xmm13, %xmm2
1013	movaps	96 * SIZE(BO), %xmm13
1014	addps	%xmm8, %xmm3
1015	movaps	-20 * SIZE(AO), %xmm8
1016
1017	mulps	%xmm8, %xmm15
1018	addps	%xmm15, %xmm0
1019	movaps	52 * SIZE(BO), %xmm15
1020	mulps	%xmm8, %xmm15
1021	addps	%xmm15, %xmm1
1022	movaps	56 * SIZE(BO), %xmm15
1023	mulps	%xmm8, %xmm15
1024	mulps	60 * SIZE(BO), %xmm8
1025	addps	%xmm15, %xmm2
1026	movaps	112 * SIZE(BO), %xmm15
1027	addps	%xmm8, %xmm3
/* xmm8 now holds the first A vector of the next pass. */
1028	movaps	 0 * SIZE(AO), %xmm8
1029
1030#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1031	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
1032#endif
1033	mulps	%xmm10, %xmm9
1034	addps	%xmm9, %xmm0
1035	movaps	68 * SIZE(BO), %xmm9
1036	mulps	%xmm10, %xmm9
1037	addps	%xmm9, %xmm1
1038	movaps	72 * SIZE(BO), %xmm9
1039	mulps	%xmm10, %xmm9
1040	mulps	76 * SIZE(BO), %xmm10
1041	addps	%xmm9, %xmm2
1042	movaps	128 * SIZE(BO), %xmm9
1043	addps	%xmm10, %xmm3
1044	movaps	-12 * SIZE(AO), %xmm10
1045
1046	mulps	%xmm10, %xmm11
1047	addps	%xmm11, %xmm0
1048	movaps	84 * SIZE(BO), %xmm11
1049	mulps	%xmm10, %xmm11
1050	addps	%xmm11, %xmm1
1051	movaps	88 * SIZE(BO), %xmm11
1052	mulps	%xmm10, %xmm11
1053	mulps	92 * SIZE(BO), %xmm10
1054	addps	%xmm11, %xmm2
1055	movaps	144 * SIZE(BO), %xmm11
1056	addps	%xmm10, %xmm3
1057	movaps	-8 * SIZE(AO), %xmm10
1058
1059	mulps	%xmm10, %xmm13
1060	addps	%xmm13, %xmm0
1061	movaps	100 * SIZE(BO), %xmm13
1062	mulps	%xmm10, %xmm13
1063	addps	%xmm13, %xmm1
1064	movaps	104 * SIZE(BO), %xmm13
1065	mulps	%xmm10, %xmm13
1066	mulps	108 * SIZE(BO), %xmm10
1067	addps	%xmm13, %xmm2
1068	movaps	160 * SIZE(BO), %xmm13
1069	addps	%xmm10, %xmm3
1070	movaps	-4 * SIZE(AO), %xmm10
1071
1072	mulps	%xmm10, %xmm15
1073	addps	%xmm15, %xmm0
1074	movaps	116 * SIZE(BO), %xmm15
1075	mulps	%xmm10, %xmm15
1076	addps	%xmm15, %xmm1
1077	movaps	120 * SIZE(BO), %xmm15
1078	mulps	%xmm10, %xmm15
1079	mulps	124 * SIZE(BO), %xmm10
1080	addps	%xmm15, %xmm2
1081	movaps	176 * SIZE(BO), %xmm15
1082	addps	%xmm10, %xmm3
/* xmm10 now holds the fifth A vector of the next pass. */
1083	movaps	16 * SIZE(AO), %xmm10
1084
/* 8 iterations consumed: 32 elements of A, 128 elements of expanded B. */
1085	addq   $ 32 * SIZE, AO
1086	addq   $128 * SIZE, BO
1087	decq   %rax
1088	jne    .L22
1089	ALIGN_4
1090
/*
 * .L25 - remainder of the 4x4 tile: the last (count & 7) k iterations,
 * one per pass of .L26.  xmm15 is loaded with ALPHA for the store code
 * at .L28; the remainder loop only uses xmm8 (A) and xmm9 (B) as
 * operands.
 */
1091.L25:
1092#ifndef TRMMKERNEL
1093	movq	K, %rax
1094#else
1095	movq	KKK, %rax
1096#endif
1097	movaps	ALPHA, %xmm15
1098	andq	$7, %rax		# rax = k & 7 (leftover iterations)
1099	BRANCH
1100	je .L28
1101	ALIGN_4
1102
/* .L26 - one k iteration: xmm8 times four B vectors added to xmm0..xmm3. */
1103.L26:
1104	mulps	%xmm8, %xmm9
1105	addps	%xmm9, %xmm0
1106	movaps	 4 * SIZE(BO), %xmm9
1107	mulps	%xmm8, %xmm9
1108	addps	%xmm9, %xmm1
1109	movaps	 8 * SIZE(BO), %xmm9
1110	mulps	%xmm8, %xmm9
1111	mulps	12 * SIZE(BO), %xmm8
1112	addps	%xmm9, %xmm2
1113	movaps	16 * SIZE(BO), %xmm9
1114	addps	%xmm8, %xmm3
1115	movaps	-28 * SIZE(AO), %xmm8
1116
1117	addq	$ 4 * SIZE, AO		# aoffset  += 4
1118	addq	$16 * SIZE, BO		# boffset1 += 16
1119	decq	%rax
1120	jg	.L26
1121	ALIGN_4
1122
/*
 * .L28 - finish the 4x4 tile: scale xmm0..xmm3 by alpha (xmm15), add
 * the existing C values unless this is the TRMM build, and store.
 *   xmm0 -> CO1, xmm1 -> CO2, xmm2 -> CO1 + 2 * LDC, xmm3 -> CO2 + 2 * LDC
 */
1123.L28:
1124	mulps	%xmm15, %xmm0
1125	mulps	%xmm15, %xmm1
1126 	mulps	%xmm15, %xmm2
1127	mulps	%xmm15, %xmm3
1128
1129#ifndef TRMMKERNEL
1130	movsd	0 * SIZE(CO1), %xmm8
1131	movhps	2 * SIZE(CO1), %xmm8
1132	movsd	0 * SIZE(CO2), %xmm10
1133	movhps	2 * SIZE(CO2), %xmm10
1134
1135	movsd	0 * SIZE(CO1, LDC, 2), %xmm12
1136	movhps	2 * SIZE(CO1, LDC, 2), %xmm12
1137	movsd	0 * SIZE(CO2, LDC, 2), %xmm14
1138	movhps	2 * SIZE(CO2, LDC, 2), %xmm14
1139
1140	addps	%xmm8,  %xmm0
1141	addps	%xmm10, %xmm1
1142	addps	%xmm12, %xmm2
1143	addps	%xmm14, %xmm3
1144#endif
1145
1146	movlps	%xmm0, 0 * SIZE(CO1)
1147	movhps	%xmm0, 2 * SIZE(CO1)
1148	movlps	%xmm1, 0 * SIZE(CO2)
1149	movhps	%xmm1, 2 * SIZE(CO2)
1150
1151	movlps	%xmm2, 0 * SIZE(CO1, LDC, 2)
1152	movhps	%xmm2, 2 * SIZE(CO1, LDC, 2)
1153	movlps	%xmm3, 0 * SIZE(CO2, LDC, 2)
1154	movhps	%xmm3, 2 * SIZE(CO2, LDC, 2)
1155
1156#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1157    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
/* TRMM: step AO/BO over the K - KKK iterations this tile did not use. */
1158	movq	K, %rax
1159	subq	KKK, %rax
1160	leaq	(,%rax,    8), %rax
1161	leaq	(AO, %rax, 2), AO
1162	leaq	(BO, %rax, 8), BO
1163#endif
1164
1165#if defined(TRMMKERNEL) && defined(LEFT)
/* The next tile starts 4 rows further down. */
1166	addq	$4, KK
1167#endif
1168
1169	addq	$4 * SIZE, CO1		# coffset += 4
1170	addq	$4 * SIZE, CO2		# coffset += 4
1171	ALIGN_4
1172
/*
 * .L30 - 2x4 tile, executed when bit 1 of M is set.
 * Same register roles as the 4x4 tile: xmm0..xmm3 accumulate, xmm8 and
 * xmm10 carry A values, xmm9/xmm11/xmm13/xmm15 carry B vectors.  The
 * second A preload sits 8 elements after the first, i.e. four k
 * iterations of two values each.
 */
1173.L30:
1174	testq	$2, M
1175	je	.L40
1176
1177#if !defined(TRMMKERNEL) || \
1178	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1179	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1180
1181	leaq	BUFFER, BO
1182#else
/* TRMM: skip the first KK iterations (AO += KK * 8 bytes, BO += KK * 64 bytes). */
1183	leaq	BUFFER, BO
1184	movq	KK, %rax
1185	leaq	(, %rax,   8), %rax
1186	leaq	(AO, %rax, 1), AO
1187	leaq	(BO, %rax, 8), BO
1188#endif
1189
/*
 * Full 16-byte loads here; the loop that follows reloads A with movsd.
 * NOTE(review): this relies on AO being 16-byte aligned at this point.
 */
1190	movaps	-32 * SIZE(AO), %xmm8
1191	movaps	-24 * SIZE(AO), %xmm10
1192
1193	movaps	 0 * SIZE(BO), %xmm9
1194	movaps	16 * SIZE(BO), %xmm11
1195	movaps	32 * SIZE(BO), %xmm13
1196	movaps	48 * SIZE(BO), %xmm15
1197
1198	xorps	%xmm0, %xmm0
1199	xorps	%xmm1, %xmm1
1200	xorps	%xmm2, %xmm2
1201	xorps	%xmm3, %xmm3
1202
/* rax = number of k iterations for this tile (saved in KKK for TRMM). */
1203#ifndef TRMMKERNEL
1204	movq	K, %rax
1205#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1206	movq	K, %rax
1207	subq	KK, %rax
1208	movq	%rax, KKK
1209#else
1210	movq	KK, %rax
1211#ifdef LEFT
1212	addq	$2, %rax
1213#else
1214	addq	$4, %rax
1215#endif
1216	movq	%rax, KKK
1217#endif
/* The main loop handles 8 iterations per pass. */
1218	sarq	$3, %rax
1219	je	.L35
1220	ALIGN_4
1221
1222.L32:
/*
 * Main k-loop for the tile that takes two A elements per k-step against a
 * four-column B panel; eight k-steps are unrolled per pass.
 *   xmm8  : A operand for k-steps 0-3, xmm10 : A operand for k-steps 4-7.
 *   xmm9/xmm11/xmm13/xmm15 must already hold the first BO vector of
 *   k-steps 0/1/2/3 on entry; each is reloaded for k-steps 4..7 and again
 *   (offsets 128..176) for the following pass.
 *   xmm0..xmm3 accumulate the products for the four BO vectors of a step.
 *   rax counts the remaining passes; a pass consumes 16 elements of AO and
 *   128 elements of BO.
 */
/* k-step 0 (A in xmm8, BO +0..+12) */
1223	mulps	%xmm8, %xmm9
1224	addps	%xmm9, %xmm0
1225#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1226	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
1227#endif
1228	movaps	 4 * SIZE(BO), %xmm9
1229	mulps	%xmm8, %xmm9
1230	addps	%xmm9, %xmm1
1231	movaps	 8 * SIZE(BO), %xmm9
1232	mulps	%xmm8, %xmm9
1233	addps	%xmm9, %xmm2
1234	movaps	12 * SIZE(BO), %xmm9
1235	mulps	%xmm8, %xmm9
1236	movsd	-30 * SIZE(AO), %xmm8
1237	addps	%xmm9, %xmm3
1238	movaps	64 * SIZE(BO), %xmm9
1239
/* k-step 1 (BO +16..+28) */
1240	mulps	%xmm8, %xmm11
1241	addps	%xmm11, %xmm0
1242	movaps	20 * SIZE(BO), %xmm11
1243	mulps	%xmm8, %xmm11
1244	addps	%xmm11, %xmm1
1245	movaps	24 * SIZE(BO), %xmm11
1246	mulps	%xmm8, %xmm11
1247	addps	%xmm11, %xmm2
1248	movaps	28 * SIZE(BO), %xmm11
1249	mulps	%xmm8, %xmm11
1250	movsd	-28 * SIZE(AO), %xmm8
1251	addps	%xmm11, %xmm3
1252	movaps	80 * SIZE(BO), %xmm11
1253
/* k-step 2 (BO +32..+44) */
1254	mulps	%xmm8, %xmm13
1255	addps	%xmm13, %xmm0
1256	movaps	36 * SIZE(BO), %xmm13
1257	mulps	%xmm8, %xmm13
1258	addps	%xmm13, %xmm1
1259	movaps	40 * SIZE(BO), %xmm13
1260	mulps	%xmm8, %xmm13
1261	addps	%xmm13, %xmm2
1262	movaps	44 * SIZE(BO), %xmm13
1263	mulps	%xmm8, %xmm13
1264	movsd	-26 * SIZE(AO), %xmm8
1265	addps	%xmm13, %xmm3
1266	movaps	96 * SIZE(BO), %xmm13
1267
/* k-step 3 (BO +48..+60); xmm8 is then reloaded for k-step 0 of the next pass */
1268	mulps	%xmm8, %xmm15
1269	addps	%xmm15, %xmm0
1270	movaps	52 * SIZE(BO), %xmm15
1271	mulps	%xmm8, %xmm15
1272	addps	%xmm15, %xmm1
1273	movaps	56 * SIZE(BO), %xmm15
1274	mulps	%xmm8, %xmm15
1275	addps	%xmm15, %xmm2
1276	movaps	60 * SIZE(BO), %xmm15
1277	mulps	%xmm8, %xmm15
1278	movsd	-16 * SIZE(AO), %xmm8
1279	addps	%xmm15, %xmm3
1280	movaps	112 * SIZE(BO), %xmm15
1281
/* k-step 4 (A in xmm10, BO +64..+76) */
1282	mulps	%xmm10, %xmm9
1283	addps	%xmm9, %xmm0
1284	movaps	68 * SIZE(BO), %xmm9
1285	mulps	%xmm10, %xmm9
1286	addps	%xmm9, %xmm1
1287	movaps	72 * SIZE(BO), %xmm9
1288	mulps	%xmm10, %xmm9
1289	addps	%xmm9, %xmm2
1290	movaps	76 * SIZE(BO), %xmm9
1291	mulps	%xmm10, %xmm9
1292	movsd	-22 * SIZE(AO), %xmm10
1293	addps	%xmm9, %xmm3
1294	movaps	128 * SIZE(BO), %xmm9
1295
/* k-step 5 (BO +80..+92) */
1296	mulps	%xmm10, %xmm11
1297	addps	%xmm11, %xmm0
1298	movaps	84 * SIZE(BO), %xmm11
1299	mulps	%xmm10, %xmm11
1300	addps	%xmm11, %xmm1
1301	movaps	88 * SIZE(BO), %xmm11
1302	mulps	%xmm10, %xmm11
1303	addps	%xmm11, %xmm2
1304	movaps	92 * SIZE(BO), %xmm11
1305	mulps	%xmm10, %xmm11
1306	movsd	-20 * SIZE(AO), %xmm10
1307	addps	%xmm11, %xmm3
1308	movaps	144 * SIZE(BO), %xmm11
1309
/* k-step 6 (BO +96..+108) */
1310	mulps	%xmm10, %xmm13
1311	addps	%xmm13, %xmm0
1312	movaps	100 * SIZE(BO), %xmm13
1313	mulps	%xmm10, %xmm13
1314	addps	%xmm13, %xmm1
1315	movaps	104 * SIZE(BO), %xmm13
1316	mulps	%xmm10, %xmm13
1317	addps	%xmm13, %xmm2
1318	movaps	108 * SIZE(BO), %xmm13
1319	mulps	%xmm10, %xmm13
1320	movsd	-18 * SIZE(AO), %xmm10
1321	addps	%xmm13, %xmm3
1322	movaps	160 * SIZE(BO), %xmm13
1323
/* k-step 7 (BO +112..+124); xmm10 is then reloaded for k-step 4 of the next pass */
1324	mulps	%xmm10, %xmm15
1325	addps	%xmm15, %xmm0
1326	movaps	116 * SIZE(BO), %xmm15
1327	mulps	%xmm10, %xmm15
1328	addps	%xmm15, %xmm1
1329	movaps	120 * SIZE(BO), %xmm15
1330	mulps	%xmm10, %xmm15
1331	addps	%xmm15, %xmm2
1332	movaps	124 * SIZE(BO), %xmm15
1333	mulps	%xmm10, %xmm15
1334	movsd	 -8 * SIZE(AO), %xmm10
1335	addps	%xmm15, %xmm3
1336	movaps	176 * SIZE(BO), %xmm15
1337
/* advance both panels by the eight k-steps just consumed */
1338	addq   $ 16 * SIZE, AO
1339	addq   $128 * SIZE, BO
1340	decq   %rax
1341	jne    .L32
1342	ALIGN_4
1343
1344.L35:
/*
 * k-remainder for the same tile as .L32: reload the k count (K, or the
 * stored KKK in the TRMM build), fetch ALPHA into xmm15 for the store code
 * at .L38, and run .L36 once per leftover k-step (count & 7).
 * BRANCH is a macro defined outside this chunk.
 */
1345#ifndef TRMMKERNEL
1346	movq	K, %rax
1347#else
1348	movq	KKK, %rax
1349#endif
1350	movaps	ALPHA, %xmm15
1351	andq	$7, %rax		# rax &= 7: k-steps left after the unrolled loop
1352	BRANCH
1353	je .L38
1354	ALIGN_4
1355
1356.L36:
/* One k-step: xmm8 (A) times the four BO vectors of this step -> xmm0..xmm3. */
1357	mulps	%xmm8, %xmm9
1358	addps	%xmm9, %xmm0
1359	movaps	  4 * SIZE(BO), %xmm9
1360	mulps	%xmm8, %xmm9
1361	addps	%xmm9, %xmm1
1362	movaps	  8 * SIZE(BO), %xmm9
1363	mulps	%xmm8, %xmm9
1364	addps	%xmm9, %xmm2
1365	movaps	 12 * SIZE(BO), %xmm9
1366	mulps	%xmm8, %xmm9
1367	movsd	-30 * SIZE(AO), %xmm8
1368	addps	%xmm9, %xmm3
1369	movaps	 16 * SIZE(BO), %xmm9
1370
1371	addq	$ 2 * SIZE, AO		# AO += 2 elements (one k-step)
1372	addq	$16 * SIZE, BO		# BO += 16 elements (one k-step)
1373	decq	%rax
1374	jg	.L36
1375	ALIGN_4
1376
1377.L38:
/*
 * Write-back for the tile accumulated in xmm0..xmm3: scale by ALPHA
 * (xmm15), add the existing C values unless this is the TRMM build, and
 * store two elements to each of CO1, CO2, CO1 + 2*LDC and CO2 + 2*LDC.
 */
1378	mulps	%xmm15, %xmm0
1379	mulps	%xmm15, %xmm1
1380 	mulps	%xmm15, %xmm2
1381	mulps	%xmm15, %xmm3
1382
1383#ifndef TRMMKERNEL
/*
 * The xorps lines are only assembled when movsd is a preprocessor macro
 * (defined outside this chunk) - presumably it then maps to an instruction
 * that leaves the upper half of the destination untouched; TODO confirm.
 */
1384#ifdef movsd
1385	xorps	%xmm8,  %xmm8
1386#endif
1387	movsd	0 * SIZE(CO1), %xmm8
1388#ifdef movsd
1389	xorps	%xmm10,  %xmm10
1390#endif
1391	movsd	0 * SIZE(CO2), %xmm10
1392#ifdef movsd
1393	xorps	%xmm12,  %xmm12
1394#endif
1395	movsd	0 * SIZE(CO1, LDC, 2), %xmm12
1396#ifdef movsd
1397	xorps	%xmm14,  %xmm14
1398#endif
1399	movsd	0 * SIZE(CO2, LDC, 2), %xmm14
1400
1401	addps	%xmm8,  %xmm0
1402	addps	%xmm10, %xmm1
1403	addps	%xmm12, %xmm2
1404	addps	%xmm14, %xmm3
1405#endif
1406
/* movlps stores only the low two elements of each accumulator */
1407	movlps	%xmm0, 0 * SIZE(CO1)
1408	movlps	%xmm1, 0 * SIZE(CO2)
1409	movlps	%xmm2, 0 * SIZE(CO1, LDC, 2)
1410	movlps	%xmm3, 0 * SIZE(CO2, LDC, 2)
1411
1412#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1413    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
/*
 * TRMM: skip the K - KKK k-steps this tile did not process:
 * AO += 8 bytes and BO += 64 bytes per skipped step (equal to the 2 and 16
 * elements per step used above if SIZE is 4 bytes; SIZE is defined outside
 * this chunk).
 */
1414	movq	K, %rax
1415	subq	KKK, %rax
1416	leaq	(,%rax,    8), %rax
1417	leaq	(AO, %rax, 1), AO
1418	leaq	(BO, %rax, 8), BO
1419#endif
1420
1421#if defined(TRMMKERNEL) && defined(LEFT)
1422	addq	$2, KK
1423#endif
1424
1425	addq	$2 * SIZE, CO1		# CO1 += 2 elements
1426	addq	$2 * SIZE, CO2		# CO2 += 2 elements
1427	ALIGN_4
1428
1429.L40:
/*
 * Tile with a single A element per k-step against the four-column panel;
 * taken only when bit 0 of M is set. Sets BO to the start of BUFFER
 * (offset by KK in some TRMM variants), preloads the first operands,
 * clears xmm0..xmm3 and computes the k count for the unrolled loop .L42.
 */
1430	testq	$1, M
1431	je	.L49
1432
1433#if !defined(TRMMKERNEL) || \
1434	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1435	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1436
1437	leaq	BUFFER, BO
1438#else
/* skip the first KK k-steps: AO += 4*KK bytes, BO += 64*KK bytes */
1439	leaq	BUFFER, BO
1440	movq	KK, %rax
1441	leaq	(, %rax,   4), %rax
1442	leaq	(AO, %rax, 1), AO
1443	leaq	(BO, %rax, 8), BO
1444	leaq	(BO, %rax, 8), BO
1445#endif
1446
/* A operands for k-steps 0 and 4 */
1447	movss	-32 * SIZE(AO), %xmm8
1448	movss	-28 * SIZE(AO), %xmm10
1449
/* first BO element of k-steps 0..3 */
1450	movss	 0 * SIZE(BO), %xmm9
1451	movss	16 * SIZE(BO), %xmm11
1452	movss	32 * SIZE(BO), %xmm13
1453	movss	48 * SIZE(BO), %xmm15
1454
1455	xorps	%xmm0, %xmm0
1456	xorps	%xmm1, %xmm1
1457	xorps	%xmm2, %xmm2
1458	xorps	%xmm3, %xmm3
1459
/*
 * rax = number of k-steps for this tile. The TRMM variants store it in KKK
 * so that the remainder code at .L45 can reload it.
 */
1460#ifndef TRMMKERNEL
1461	movq	K, %rax
1462#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1463	movq	K, %rax
1464	subq	KK, %rax
1465	movq	%rax, KKK
1466#else
1467	movq	KK, %rax
1468#ifdef LEFT
1469	addq	$1, %rax
1470#else
1471	addq	$4, %rax
1472#endif
1473	movq	%rax, KKK
1474#endif
/* rax = count / 8 = passes of the unrolled loop; none -> remainder code */
1475	sarq	$3, %rax
1476	je	.L45
1477	ALIGN_4
1478
1479.L42:
/*
 * Main k-loop for the one-element-of-A tile against the four-column panel,
 * eight k-steps unrolled per pass, all arithmetic scalar (mulss/addss).
 *   xmm8  : A element for k-steps 0-3, xmm10 : A element for k-steps 4-7.
 *   xmm9/xmm11/xmm13/xmm15 must hold the first BO element of k-steps
 *   0/1/2/3 on entry and are reloaded for steps 4..7 and for the next pass.
 *   xmm0..xmm3 accumulate the four products of each step.
 *   A pass consumes 8 elements of AO and 128 elements of BO.
 */
/* k-step 0 */
1480	mulss	%xmm8, %xmm9
1481	addss	%xmm9, %xmm0
1482#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1483	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
1484#endif
1485	movss	 4 * SIZE(BO), %xmm9
1486	mulss	%xmm8, %xmm9
1487	addss	%xmm9, %xmm1
1488	movss	 8 * SIZE(BO), %xmm9
1489	mulss	%xmm8, %xmm9
1490	addss	%xmm9, %xmm2
1491	movss	12 * SIZE(BO), %xmm9
1492	mulss	%xmm8, %xmm9
1493	movss	-31 * SIZE(AO), %xmm8
1494	addss	%xmm9, %xmm3
1495	movss	64 * SIZE(BO), %xmm9
1496
/* k-step 1 */
1497	mulss	%xmm8, %xmm11
1498	addss	%xmm11, %xmm0
1499	movss	20 * SIZE(BO), %xmm11
1500	mulss	%xmm8, %xmm11
1501	addss	%xmm11, %xmm1
1502	movss	24 * SIZE(BO), %xmm11
1503	mulss	%xmm8, %xmm11
1504	addss	%xmm11, %xmm2
1505	movss	28 * SIZE(BO), %xmm11
1506	mulss	%xmm8, %xmm11
1507	movss	-30 * SIZE(AO), %xmm8
1508	addss	%xmm11, %xmm3
1509	movss	80 * SIZE(BO), %xmm11
1510
/* k-step 2 */
1511	mulss	%xmm8, %xmm13
1512	addss	%xmm13, %xmm0
1513	movss	36 * SIZE(BO), %xmm13
1514	mulss	%xmm8, %xmm13
1515	addss	%xmm13, %xmm1
1516	movss	40 * SIZE(BO), %xmm13
1517	mulss	%xmm8, %xmm13
1518	addss	%xmm13, %xmm2
1519	movss	44 * SIZE(BO), %xmm13
1520	mulss	%xmm8, %xmm13
1521	movss	-29 * SIZE(AO), %xmm8
1522	addss	%xmm13, %xmm3
1523	movss	96 * SIZE(BO), %xmm13
1524
/* k-step 3; xmm8 is then reloaded for k-step 0 of the next pass */
1525	mulss	%xmm8, %xmm15
1526	addss	%xmm15, %xmm0
1527	movss	52 * SIZE(BO), %xmm15
1528	mulss	%xmm8, %xmm15
1529	addss	%xmm15, %xmm1
1530	movss	56 * SIZE(BO), %xmm15
1531	mulss	%xmm8, %xmm15
1532	addss	%xmm15, %xmm2
1533	movss	60 * SIZE(BO), %xmm15
1534	mulss	%xmm8, %xmm15
1535	movss	-24 * SIZE(AO), %xmm8
1536	addss	%xmm15, %xmm3
1537	movss	112 * SIZE(BO), %xmm15
1538
/* k-step 4 (A in xmm10) */
1539	mulss	%xmm10, %xmm9
1540	addss	%xmm9, %xmm0
1541	movss	68 * SIZE(BO), %xmm9
1542	mulss	%xmm10, %xmm9
1543	addss	%xmm9, %xmm1
1544	movss	72 * SIZE(BO), %xmm9
1545	mulss	%xmm10, %xmm9
1546	addss	%xmm9, %xmm2
1547	movss	76 * SIZE(BO), %xmm9
1548	mulss	%xmm10, %xmm9
1549	movss	-27 * SIZE(AO), %xmm10
1550	addss	%xmm9, %xmm3
1551	movss	128 * SIZE(BO), %xmm9
1552
/* k-step 5 */
1553	mulss	%xmm10, %xmm11
1554	addss	%xmm11, %xmm0
1555	movss	84 * SIZE(BO), %xmm11
1556	mulss	%xmm10, %xmm11
1557	addss	%xmm11, %xmm1
1558	movss	88 * SIZE(BO), %xmm11
1559	mulss	%xmm10, %xmm11
1560	addss	%xmm11, %xmm2
1561	movss	92 * SIZE(BO), %xmm11
1562	mulss	%xmm10, %xmm11
1563	movss	-26 * SIZE(AO), %xmm10
1564	addss	%xmm11, %xmm3
1565	movss	144 * SIZE(BO), %xmm11
1566
/* k-step 6 */
1567	mulss	%xmm10, %xmm13
1568	addss	%xmm13, %xmm0
1569	movss	100 * SIZE(BO), %xmm13
1570	mulss	%xmm10, %xmm13
1571	addss	%xmm13, %xmm1
1572	movss	104 * SIZE(BO), %xmm13
1573	mulss	%xmm10, %xmm13
1574	addss	%xmm13, %xmm2
1575	movss	108 * SIZE(BO), %xmm13
1576	mulss	%xmm10, %xmm13
1577	movss	-25 * SIZE(AO), %xmm10
1578	addss	%xmm13, %xmm3
1579	movss	160 * SIZE(BO), %xmm13
1580
/* k-step 7; xmm10 is then reloaded for k-step 4 of the next pass */
1581	mulss	%xmm10, %xmm15
1582	addss	%xmm15, %xmm0
1583	movss	116 * SIZE(BO), %xmm15
1584	mulss	%xmm10, %xmm15
1585	addss	%xmm15, %xmm1
1586	movss	120 * SIZE(BO), %xmm15
1587	mulss	%xmm10, %xmm15
1588	addss	%xmm15, %xmm2
1589	movss	124 * SIZE(BO), %xmm15
1590	mulss	%xmm10, %xmm15
1591	movss	-20 * SIZE(AO), %xmm10
1592	addss	%xmm15, %xmm3
1593	movss	176 * SIZE(BO), %xmm15
1594
/* advance both panels by the eight k-steps just consumed */
1595	addq   $  8 * SIZE, AO
1596	addq   $128 * SIZE, BO
1597	decq   %rax
1598	jne    .L42
1599	ALIGN_4
1600
1601.L45:
/*
 * k-remainder for the same tile as .L42: reload the k count (K, or KKK in
 * the TRMM build), fetch ALPHA into xmm15 for .L48, and run .L46 once per
 * leftover k-step (count & 7). BRANCH is a macro defined outside this chunk.
 */
1602#ifndef TRMMKERNEL
1603	movq	K, %rax
1604#else
1605	movq	KKK, %rax
1606#endif
1607	movaps	ALPHA, %xmm15
1608	andq	$7, %rax		# rax &= 7: k-steps left after the unrolled loop
1609	BRANCH
1610	je .L48
1611	ALIGN_4
1612
1613.L46:
/*
 * One k-step. NOTE(review): this loop uses the packed mulps/addps forms
 * while .L42 uses mulss/addss. The operands reloaded here come from movss
 * memory loads (which zero lanes 1-3) and .L48 stores lane 0 only.
 */
1614	mulps	%xmm8, %xmm9
1615	addps	%xmm9, %xmm0
1616	movss	 4 * SIZE(BO), %xmm9
1617	mulps	%xmm8, %xmm9
1618	addps	%xmm9, %xmm1
1619	movss	 8 * SIZE(BO), %xmm9
1620	mulps	%xmm8, %xmm9
1621	addps	%xmm9, %xmm2
1622	movss	12 * SIZE(BO), %xmm9
1623	mulps	%xmm8, %xmm9
1624	movss	-31 * SIZE(AO), %xmm8
1625	addps	%xmm9, %xmm3
1626	movss	16 * SIZE(BO), %xmm9
1627
1628	addq	$ 1 * SIZE, AO		# AO += 1 element (one k-step)
1629	addq	$16 * SIZE, BO		# BO += 16 elements (one k-step)
1630	decq	%rax
1631	jg	.L46
1632	ALIGN_4
1633
1634.L48:
/*
 * Write-back for the single-element tile: scale lane 0 of xmm0..xmm3 by
 * ALPHA (xmm15), add the existing C values unless this is the TRMM build,
 * and store one element to each of CO1, CO2, CO1 + 2*LDC and CO2 + 2*LDC.
 */
1635	mulss	%xmm15, %xmm0
1636	mulss	%xmm15, %xmm1
1637 	mulss	%xmm15, %xmm2
1638	mulss	%xmm15, %xmm3
1639
1640#ifndef TRMMKERNEL
1641	movss	0 * SIZE(CO1), %xmm8
1642	movss	0 * SIZE(CO2), %xmm10
1643	movss	0 * SIZE(CO1, LDC, 2), %xmm12
1644	movss	0 * SIZE(CO2, LDC, 2), %xmm14
1645
1646	addss	%xmm8,  %xmm0
1647	addss	%xmm10, %xmm1
1648	addss	%xmm12, %xmm2
1649	addss	%xmm14, %xmm3
1650#endif
1651
1652	movss	%xmm0, 0 * SIZE(CO1)
1653	movss	%xmm1, 0 * SIZE(CO2)
1654	movss	%xmm2, 0 * SIZE(CO1, LDC, 2)
1655	movss	%xmm3, 0 * SIZE(CO2, LDC, 2)
1656
1657#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1658    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
/*
 * TRMM: skip the K - KKK k-steps this tile did not process:
 * AO += 4 bytes and BO += 64 bytes (two adds of 32) per skipped step.
 */
1659	movq	K, %rax
1660	subq	KKK, %rax
1661	leaq	(,%rax,    4), %rax
1662	leaq	(AO, %rax, 1), AO
1663	leaq	(BO, %rax, 8), BO
1664	leaq	(BO, %rax, 8), BO
1665#endif
1666
1667#if defined(TRMMKERNEL) && defined(LEFT)
1668	addq	$1, KK
1669#endif
1670	ALIGN_4
1671
1672.L49:
/*
 * End of one four-column panel: bump the TRMM offset (right-side variants
 * only), move C forward by four columns and loop back to .L01 while the
 * panel counter J is still positive.
 *
 * KK is read and written as a 64-bit quantity everywhere else in this
 * kernel (movq / addq), so it is advanced with addq here as well; a 32-bit
 * addl would leave the upper half of the slot untouched on carry.
 */
1673#if defined(TRMMKERNEL) && !defined(LEFT)
1674	addq	$4, KK
1675#endif
1676	leaq	(C, LDC, 4), C		# c += 4 * ldc
1677	decq	J			# j --
1678	jg	.L01
1679
1680.L50:
/* Two-column panel: taken only when bit 1 of N is set. */
1681	testq	$2, N
1682	je	.L100
1683
1684.L51:
/* TRMM left-side variants restart KK from OFFSET for each panel. */
1685#if defined(TRMMKERNEL) && defined(LEFT)
1686	movq	OFFSET, %rax
1687	movq	%rax, KK
1688#endif
1689
1690/* Copying to Sub Buffer */
1691	leaq	BUFFER, BO
1692
/* rax = K / 4 = passes of the 4-k-step copy loop .L52; none -> .L53 */
1693	movq	K, %rax
1694	sarq	$2, %rax
1695	jle	.L53
1696	ALIGN_4
1697
1698.L52:
/*
 * Copy loop for the two-column panel, four k-steps (eight elements of B)
 * per pass. Every source element is replicated into four consecutive
 * slots of the sub buffer, so a pass reads 8 elements from B and writes
 * 32 elements at BO. Two CPU-specific implementations follow; which one is
 * assembled depends on macros defined outside this chunk.
 */
1699#if defined(PENTIUM4) || defined(GENERIC)
/* SSE variant: scalar load, shufps $0 broadcast, aligned 4-element store */
1700	movss	 0 * SIZE(B), %xmm0
1701	movss	 1 * SIZE(B), %xmm1
1702	movss	 2 * SIZE(B), %xmm2
1703	movss	 3 * SIZE(B), %xmm3
1704	movss	 4 * SIZE(B), %xmm4
1705	movss	 5 * SIZE(B), %xmm5
1706	movss	 6 * SIZE(B), %xmm6
1707	movss	 7 * SIZE(B), %xmm7
1708
1709	PREFETCH	 32 * SIZE(B)
1710
1711	shufps	 $0, %xmm0, %xmm0
1712	shufps	 $0, %xmm1, %xmm1
1713	shufps	 $0, %xmm2, %xmm2
1714	shufps	 $0, %xmm3, %xmm3
1715	shufps	 $0, %xmm4, %xmm4
1716	shufps	 $0, %xmm5, %xmm5
1717	shufps	 $0, %xmm6, %xmm6
1718	shufps	 $0, %xmm7, %xmm7
1719
1720	movaps	%xmm0,  0 * SIZE(BO)
1721	movaps	%xmm1,  4 * SIZE(BO)
1722	movaps	%xmm2,  8 * SIZE(BO)
1723	movaps	%xmm3, 12 * SIZE(BO)
1724	movaps	%xmm4, 16 * SIZE(BO)
1725	movaps	%xmm5, 20 * SIZE(BO)
1726	movaps	%xmm6, 24 * SIZE(BO)
1727	movaps	%xmm7, 28 * SIZE(BO)
1728
1729	addq	$ 8 * SIZE, B
1730	addq	$32 * SIZE, BO
1731#endif
1732
1733#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
/*
 * MMX variant: 32-bit load, punpckldq duplicates it into both halves of
 * the 64-bit register, and two 64-bit stores fill the four slots.
 */
1734	PREFETCH	 32 * SIZE(B)
1735
1736	movd	 0 * SIZE(B), %mm0
1737	movd	 1 * SIZE(B), %mm1
1738	movd	 2 * SIZE(B), %mm2
1739	movd	 3 * SIZE(B), %mm3
1740	movd	 4 * SIZE(B), %mm4
1741	movd	 5 * SIZE(B), %mm5
1742	movd	 6 * SIZE(B), %mm6
1743	movd	 7 * SIZE(B), %mm7
1744
1745	punpckldq %mm0, %mm0
1746	punpckldq %mm1, %mm1
1747	punpckldq %mm2, %mm2
1748	punpckldq %mm3, %mm3
1749	punpckldq %mm4, %mm4
1750	punpckldq %mm5, %mm5
1751	punpckldq %mm6, %mm6
1752	punpckldq %mm7, %mm7
1753
1754	movq	%mm0,  0 * SIZE(BO)
1755	movq	%mm0,  2 * SIZE(BO)
1756	movq	%mm1,  4 * SIZE(BO)
1757	movq	%mm1,  6 * SIZE(BO)
1758	movq	%mm2,  8 * SIZE(BO)
1759	movq	%mm2, 10 * SIZE(BO)
1760	movq	%mm3, 12 * SIZE(BO)
1761	movq	%mm3, 14 * SIZE(BO)
1762	movq	%mm4, 16 * SIZE(BO)
1763	movq	%mm4, 18 * SIZE(BO)
1764	movq	%mm5, 20 * SIZE(BO)
1765	movq	%mm5, 22 * SIZE(BO)
1766	movq	%mm6, 24 * SIZE(BO)
1767	movq	%mm6, 26 * SIZE(BO)
1768	movq	%mm7, 28 * SIZE(BO)
1769	movq	%mm7, 30 * SIZE(BO)
1770
1771	addq	$ 8 * SIZE, B
1772	addq	$32 * SIZE, BO
1773#endif
1774
1775	decq	%rax
1776	jne	.L52
1777	ALIGN_4
1778
1779.L53:
/*
 * Copy remainder for the two-column panel: rax = K & 3 leftover k-steps,
 * handled one at a time by .L54. BRANCH is a macro defined outside this
 * chunk.
 */
1780	movq	K, %rax
1781	andq	$3, %rax
1782	BRANCH
1783	jle	.L60
1784	ALIGN_4
1785
1786.L54:
/*
 * One k-step: the two B elements are each replicated into four slots
 * (2 elements read from B, 8 elements written at BO).
 */
1787#if defined(PENTIUM4) || defined(GENERIC)
1788	movss	 0 * SIZE(B), %xmm0
1789	movss	 1 * SIZE(B), %xmm1
1790
1791	shufps	 $0, %xmm0, %xmm0
1792	shufps	 $0, %xmm1, %xmm1
1793
1794	movaps	%xmm0,  0 * SIZE(BO)
1795	movaps	%xmm1,  4 * SIZE(BO)
1796#endif
1797
1798#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1799	movd	 0 * SIZE(B), %mm0
1800	movd	 1 * SIZE(B), %mm1
1801
1802	punpckldq %mm0, %mm0
1803	punpckldq %mm1, %mm1
1804
1805	movq	%mm0,  0 * SIZE(BO)
1806	movq	%mm0,  2 * SIZE(BO)
1807	movq	%mm1,  4 * SIZE(BO)
1808	movq	%mm1,  6 * SIZE(BO)
1809#endif
1810
1811	addq	$ 2 * SIZE, B
1812	addq	$ 8 * SIZE, BO
1813	decq	%rax
1814	jne	.L54
1815	ALIGN_4
1816
1817.L60:
/*
 * Start of the row sweep for the two-column panel: point CO1/CO2 at the
 * two C columns, rewind AO to the start of A, and set I = M / 8, the
 * number of eight-row tiles handled by .L61. No such tile -> .L70.
 */
1818	movq	C, CO1			# coffset1 = c
1819	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
1820	movq	A, AO		# aoffset = a
1821
1822	movq	M,  I
1823	sarq	$3, I	# i = (m >> 3)
1824	jle	.L70
1825	ALIGN_4
1826
1827.L61:
/*
 * Set-up for one eight-row tile of the two-column panel: position BO (and,
 * in some TRMM variants, AO) at the first k-step to process, preload the
 * operands used by the unrolled loop .L62, clear the accumulators
 * xmm0/xmm1/xmm4/xmm5 and compute the k count.
 */
1828#if !defined(TRMMKERNEL) || \
1829	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1830	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1831
1832	leaq	BUFFER, BO
1833#else
/* skip the first KK k-steps: AO += 32*KK bytes, BO += 32*KK bytes */
1834	leaq	BUFFER, BO
1835	movq	KK, %rax
1836	leaq	(, %rax,   8), %rax
1837	leaq	(AO, %rax, 4), AO
1838	leaq	(BO, %rax, 4), BO
1839#endif
1840
/* first A vector of k-steps 0, 2, 4 and 6 */
1841	movaps	-32 * SIZE(AO), %xmm8
1842	movaps	-16 * SIZE(AO), %xmm10
1843	movaps	  0 * SIZE(AO), %xmm12
1844	movaps	 16 * SIZE(AO), %xmm14
1845
/* first BO vector of k-steps 0, 2, 4 and 6 */
1846	movaps	 0 * SIZE(BO), %xmm9
1847	movaps	16 * SIZE(BO), %xmm11
1848	movaps	32 * SIZE(BO), %xmm13
1849	movaps	48 * SIZE(BO), %xmm15
1850
1851	xorps	%xmm0, %xmm0
1852	xorps	%xmm1, %xmm1
1853
/* PREFETCHW is a macro defined outside this chunk; it is given the C addresses this tile will store to */
1854	PREFETCHW      7 * SIZE(CO1)
1855	xorps	%xmm4, %xmm4
1856	PREFETCHW      7 * SIZE(CO2)
1857	xorps	%xmm5, %xmm5
1858
/*
 * rax = number of k-steps for this tile. The TRMM variants store it in KKK
 * so that the remainder code at .L65 can reload it.
 */
1859#ifndef TRMMKERNEL
1860	movq	K, %rax
1861#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1862	movq	K, %rax
1863	subq	KK, %rax
1864	movq	%rax, KKK
1865#else
1866	movq	KK, %rax
1867#ifdef LEFT
1868	addq	$8, %rax
1869#else
1870	addq	$2, %rax
1871#endif
1872	movq	%rax, KKK
1873#endif
/* rax = count / 8 = passes of the unrolled loop; none -> remainder code */
1874	sarq	$3, %rax
1875	je	.L65
1876	ALIGN_4
1877
1878.L62:
/*
 * Main k-loop for the eight-row tile of the two-column panel, eight
 * k-steps unrolled per pass. Each k-step uses two 4-element A vectors and
 * two BO vectors (offsets +0 and +4 elements within the step):
 *   first  A vector: xmm0 += A * BO[+0], xmm1 += A * BO[+4]
 *   second A vector: xmm4 += A * BO[+0], xmm5 += A * BO[+4]
 * Register pairs (A, B) per two k-steps: (xmm8, xmm9) steps 0-1,
 * (xmm10, xmm11) steps 2-3, (xmm12, xmm13) steps 4-5, (xmm14, xmm15)
 * steps 6-7. Each pair must be preloaded on entry and is reloaded at the
 * end of its group for the next pass.
 * A pass consumes 64 elements of AO and 64 elements of BO.
 */
/* k-step 0 */
1879	mulps	%xmm8, %xmm9
1880#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1881	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
1882#endif
1883	mulps	 4 * SIZE(BO), %xmm8
1884	addps	%xmm9, %xmm0
1885	movaps	 0 * SIZE(BO), %xmm9
1886	addps	%xmm8, %xmm1
1887	movaps	-28 * SIZE(AO), %xmm8
1888	mulps	%xmm8, %xmm9
1889	mulps	 4 * SIZE(BO), %xmm8
1890	addps	%xmm9, %xmm4
1891	movaps	 8 * SIZE(BO), %xmm9
1892	addps	%xmm8, %xmm5
1893	movaps	-24 * SIZE(AO), %xmm8
1894
/* k-step 1; xmm8/xmm9 are then reloaded for k-step 0 of the next pass */
1895	mulps	%xmm8, %xmm9
1896	mulps	12 * SIZE(BO), %xmm8
1897	addps	%xmm9, %xmm0
1898	movaps	 8 * SIZE(BO), %xmm9
1899	addps	%xmm8, %xmm1
1900	movaps	-20 * SIZE(AO), %xmm8
1901	mulps	%xmm8, %xmm9
1902	mulps	12 * SIZE(BO), %xmm8
1903	addps	%xmm9, %xmm4
1904	movaps	64 * SIZE(BO), %xmm9
1905	addps	%xmm8, %xmm5
1906	movaps	32 * SIZE(AO), %xmm8
1907
/* k-step 2 */
1908#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1909	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
1910#endif
1911	mulps	%xmm10, %xmm11
1912	mulps	20 * SIZE(BO), %xmm10
1913	addps	%xmm11, %xmm0
1914	movaps	16 * SIZE(BO), %xmm11
1915	addps	%xmm10, %xmm1
1916	movaps	-12 * SIZE(AO), %xmm10
1917	mulps	%xmm10, %xmm11
1918	mulps	20 * SIZE(BO), %xmm10
1919	addps	%xmm11, %xmm4
1920	movaps	24 * SIZE(BO), %xmm11
1921	addps	%xmm10, %xmm5
1922	movaps	 -8 * SIZE(AO), %xmm10
1923
/* k-step 3; xmm10/xmm11 are then reloaded for k-step 2 of the next pass */
1924	mulps	%xmm10, %xmm11
1925	mulps	28 * SIZE(BO), %xmm10
1926	addps	%xmm11, %xmm0
1927	movaps	24 * SIZE(BO), %xmm11
1928	addps	%xmm10, %xmm1
1929	movaps	-4 * SIZE(AO), %xmm10
1930	mulps	%xmm10, %xmm11
1931	mulps	28 * SIZE(BO), %xmm10
1932	addps	%xmm11, %xmm4
1933	movaps	80 * SIZE(BO), %xmm11
1934	addps	%xmm10, %xmm5
1935	movaps	48 * SIZE(AO), %xmm10
1936
/* k-step 4 */
1937#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1938	PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO)
1939#endif
1940	mulps	%xmm12, %xmm13
1941	mulps	36 * SIZE(BO), %xmm12
1942	addps	%xmm13, %xmm0
1943	movaps	32 * SIZE(BO), %xmm13
1944	addps	%xmm12, %xmm1
1945	movaps	 4 * SIZE(AO), %xmm12
1946	mulps	%xmm12, %xmm13
1947	mulps	36 * SIZE(BO), %xmm12
1948	addps	%xmm13, %xmm4
1949	movaps	40 * SIZE(BO), %xmm13
1950	addps	%xmm12, %xmm5
1951	movaps	 8 * SIZE(AO), %xmm12
1952
/* k-step 5; xmm12/xmm13 are then reloaded for k-step 4 of the next pass */
1953	mulps	%xmm12, %xmm13
1954	mulps	44 * SIZE(BO), %xmm12
1955	addps	%xmm13, %xmm0
1956	movaps	40 * SIZE(BO), %xmm13
1957	addps	%xmm12, %xmm1
1958	movaps	12 * SIZE(AO), %xmm12
1959	mulps	%xmm12, %xmm13
1960	mulps	44 * SIZE(BO), %xmm12
1961	addps	%xmm13, %xmm4
1962	movaps	96 * SIZE(BO), %xmm13
1963	addps	%xmm12, %xmm5
1964	movaps	64 * SIZE(AO), %xmm12
1965
/* k-step 6 */
1966#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1967	PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO)
1968#endif
1969	mulps	%xmm14, %xmm15
1970	mulps	52 * SIZE(BO), %xmm14
1971	addps	%xmm15, %xmm0
1972	movaps	48 * SIZE(BO), %xmm15
1973	addps	%xmm14, %xmm1
1974	movaps	20 * SIZE(AO), %xmm14
1975	mulps	%xmm14, %xmm15
1976	mulps	52 * SIZE(BO), %xmm14
1977	addps	%xmm15, %xmm4
1978	movaps	56 * SIZE(BO), %xmm15
1979	addps	%xmm14, %xmm5
1980	movaps	24 * SIZE(AO), %xmm14
1981
/* k-step 7; xmm14/xmm15 are then reloaded for k-step 6 of the next pass */
1982	mulps	%xmm14, %xmm15
1983	mulps	60 * SIZE(BO), %xmm14
1984	addps	%xmm15, %xmm0
1985	movaps	56 * SIZE(BO), %xmm15
1986	addps	%xmm14, %xmm1
1987	movaps	28 * SIZE(AO), %xmm14
1988	mulps	%xmm14, %xmm15
1989	mulps	60 * SIZE(BO), %xmm14
1990	addps	%xmm15, %xmm4
1991	movaps	112 * SIZE(BO), %xmm15
1992	addps	%xmm14, %xmm5
1993	movaps	80 * SIZE(AO), %xmm14
1994
/* advance both panels by the eight k-steps just consumed */
1995	addq   $64 * SIZE, AO
1996	addq   $64 * SIZE, BO
1997	decq   %rax
1998	jne    .L62
1999	ALIGN_4
2000
2001.L65:
/*
 * k-remainder for the same tile as .L62: reload the k count (K, or KKK in
 * the TRMM build), fetch ALPHA into xmm15 for .L68, and run .L66 once per
 * leftover k-step (count & 7). BRANCH is a macro defined outside this chunk.
 */
2002#ifndef TRMMKERNEL
2003	movq	K, %rax
2004#else
2005	movq	KKK, %rax
2006#endif
2007	movaps	ALPHA, %xmm15
2008	andq	$7, %rax		# rax &= 7: k-steps left after the unrolled loop
2009	BRANCH
2010	je .L68
2011	ALIGN_4
2012
2013.L66:
/*
 * One k-step: first A vector -> xmm0/xmm1, second A vector -> xmm4/xmm5,
 * each against the two BO vectors at +0 and +4 elements.
 */
2014	mulps	%xmm8, %xmm9
2015	mulps	 4 * SIZE(BO), %xmm8
2016	addps	%xmm9, %xmm0
2017	movaps	 0 * SIZE(BO), %xmm9
2018	addps	%xmm8, %xmm1
2019	movaps	-28 * SIZE(AO), %xmm8
2020	mulps	%xmm8, %xmm9
2021	mulps	 4 * SIZE(BO), %xmm8
2022	addps	%xmm9, %xmm4
2023	movaps	 8 * SIZE(BO), %xmm9
2024	addps	%xmm8, %xmm5
2025	movaps	-24 * SIZE(AO), %xmm8
2026
2027	addq	$8 * SIZE, AO		# AO += 8 elements (one k-step)
2028	addq	$8 * SIZE, BO		# BO += 8 elements (one k-step)
2029	decq	%rax
2030	jg	.L66
2031	ALIGN_4
2032
2033.L68:
/*
 * Write-back for the eight-row, two-column tile: xmm0/xmm4 hold elements
 * 0-3 and 4-7 for CO1, xmm1/xmm5 the same for CO2. Scale by ALPHA
 * (xmm15), add the existing C values unless this is the TRMM build, store,
 * then step to the next tile and loop while I is positive.
 */
2034#ifndef TRMMKERNEL
/* C is read and written in two-element halves (movsd/movhps, movlps/movhps) */
2035	movsd	0 * SIZE(CO1), %xmm8
2036	movhps	2 * SIZE(CO1), %xmm8
2037	movsd	4 * SIZE(CO1), %xmm9
2038	movhps	6 * SIZE(CO1), %xmm9
2039
2040	movsd	0 * SIZE(CO2), %xmm10
2041	movhps	2 * SIZE(CO2), %xmm10
2042	movsd	4 * SIZE(CO2), %xmm11
2043	movhps	6 * SIZE(CO2), %xmm11
2044#endif
2045
2046	mulps	%xmm15, %xmm0
2047	mulps	%xmm15, %xmm4
2048	mulps	%xmm15, %xmm1
2049	mulps	%xmm15, %xmm5
2050
2051#ifndef TRMMKERNEL
2052	addps	%xmm8,  %xmm0
2053	addps	%xmm9,  %xmm4
2054	addps	%xmm10, %xmm1
2055	addps	%xmm11, %xmm5
2056#endif
2057
2058	movlps	%xmm0, 0 * SIZE(CO1)
2059	movhps	%xmm0, 2 * SIZE(CO1)
2060	movlps	%xmm4, 4 * SIZE(CO1)
2061	movhps	%xmm4, 6 * SIZE(CO1)
2062
2063	movlps	%xmm1, 0 * SIZE(CO2)
2064	movhps	%xmm1, 2 * SIZE(CO2)
2065	movlps	%xmm5, 4 * SIZE(CO2)
2066	movhps	%xmm5, 6 * SIZE(CO2)
2067
2068#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2069    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
/*
 * TRMM: skip the K - KKK k-steps this tile did not process:
 * AO += 32 bytes and BO += 32 bytes per skipped step.
 */
2070	movq	K, %rax
2071	subq	KKK, %rax
2072	leaq	(,%rax,    8), %rax
2073	leaq	(AO, %rax, 4), AO
2074	leaq	(BO, %rax, 4), BO
2075#endif
2076
2077#if defined(TRMMKERNEL) && defined(LEFT)
2078	addq	$8, KK
2079#endif
2080
2081	addq	$8 * SIZE, CO1		# CO1 += 8 elements
2082	addq	$8 * SIZE, CO2		# CO2 += 8 elements
2083	decq	I			# i --
2084	jg	.L61
2085	ALIGN_4
2086
2087.L70:
/*
 * Four-row tile of the two-column panel; taken only when bit 2 of M is
 * set. Positions BO (and AO in some TRMM variants), preloads operands for
 * the unrolled loop .L72, clears xmm0..xmm3 and computes the k count.
 */
2088	testq	$4, M
2089	je	.L80
2090
2091
2092#if !defined(TRMMKERNEL) || \
2093	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2094	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2095
2096	leaq	BUFFER, BO
2097#else
/* skip the first KK k-steps: AO += 16*KK bytes, BO += 32*KK bytes */
2098	leaq	BUFFER, BO
2099	movq	KK, %rax
2100	leaq	(, %rax,   8), %rax
2101	leaq	(AO, %rax, 2), AO
2102	leaq	(BO, %rax, 4), BO
2103#endif
2104
/* A vectors for k-steps 0 and 4 */
2105	movaps	-32 * SIZE(AO), %xmm8
2106	movaps	-16 * SIZE(AO), %xmm10
2107
/* first BO vector of k-steps 0, 2, 4 and 6 */
2108	movaps	 0 * SIZE(BO), %xmm9
2109	movaps	16 * SIZE(BO), %xmm11
2110	movaps	32 * SIZE(BO), %xmm13
2111	movaps	48 * SIZE(BO), %xmm15
2112
2113	xorps	%xmm0, %xmm0
2114	xorps	%xmm1, %xmm1
2115	xorps	%xmm2, %xmm2
2116	xorps	%xmm3, %xmm3
2117
/*
 * rax = number of k-steps for this tile. The TRMM variants store it in KKK
 * so that the remainder code at .L75 can reload it.
 */
2118#ifndef TRMMKERNEL
2119	movq	K, %rax
2120#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2121	movq	K, %rax
2122	subq	KK, %rax
2123	movq	%rax, KKK
2124#else
2125	movq	KK, %rax
2126#ifdef LEFT
2127	addq	$4, %rax
2128#else
2129	addq	$2, %rax
2130#endif
2131	movq	%rax, KKK
2132#endif
/* rax = count / 8 = passes of the unrolled loop; none -> remainder code */
2133	sarq	$3, %rax
2134	je	.L75
2135	ALIGN_4
2136
2137.L72:
/*
 * Main k-loop for the four-row tile of the two-column panel, eight k-steps
 * unrolled per pass. Each k-step multiplies one 4-element A vector by the
 * two BO vectors of that step. Even k-steps accumulate into xmm0/xmm1 and
 * odd k-steps into xmm2/xmm3; the two sets are summed at .L78.
 *   xmm8  : A for k-steps 0-3, xmm10 : A for k-steps 4-7.
 *   xmm9/xmm11/xmm13/xmm15 : first BO vector of k-steps 0/2/4/6, preloaded
 *   on entry and reloaded for the next pass.
 * A pass consumes 32 elements of AO and 64 elements of BO.
 */
/* k-step 0 */
2138	mulps	%xmm8, %xmm9
2139#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2140	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
2141#endif
2142
2143	mulps	 4 * SIZE(BO), %xmm8
2144	addps	%xmm9, %xmm0
2145	movaps	 8 * SIZE(BO), %xmm9
2146	addps	%xmm8, %xmm1
2147	movaps	-28 * SIZE(AO), %xmm8
2148
/* k-step 1 */
2149	mulps	%xmm8, %xmm9
2150	mulps	12 * SIZE(BO), %xmm8
2151	addps	%xmm9, %xmm2
2152	movaps	64 * SIZE(BO), %xmm9
2153	addps	%xmm8, %xmm3
2154	movaps	-24 * SIZE(AO), %xmm8
2155
/* k-step 2 */
2156	mulps	%xmm8, %xmm11
2157	mulps	20 * SIZE(BO), %xmm8
2158	addps	%xmm11, %xmm0
2159	movaps	24 * SIZE(BO), %xmm11
2160	addps	%xmm8, %xmm1
2161	movaps	-20 * SIZE(AO), %xmm8
2162
/* k-step 3; xmm8 is then reloaded for k-step 0 of the next pass */
2163	mulps	%xmm8, %xmm11
2164	mulps	28 * SIZE(BO), %xmm8
2165	addps	%xmm11, %xmm2
2166	movaps	80 * SIZE(BO), %xmm11
2167	addps	%xmm8, %xmm3
2168	movaps	 0 * SIZE(AO), %xmm8
2169
/* k-step 4 (A in xmm10) */
2170	mulps	%xmm10, %xmm13
2171	mulps	36 * SIZE(BO), %xmm10
2172	addps	%xmm13, %xmm0
2173	movaps	40 * SIZE(BO), %xmm13
2174	addps	%xmm10, %xmm1
2175	movaps	-12 * SIZE(AO), %xmm10
2176
/* k-step 5 */
2177	mulps	%xmm10, %xmm13
2178	mulps	44 * SIZE(BO), %xmm10
2179	addps	%xmm13, %xmm2
2180	movaps	96 * SIZE(BO), %xmm13
2181	addps	%xmm10, %xmm3
2182	movaps	 -8 * SIZE(AO), %xmm10
2183
/* k-step 6 */
2184	mulps	%xmm10, %xmm15
2185	mulps	52 * SIZE(BO), %xmm10
2186	addps	%xmm15, %xmm0
2187	movaps	56 * SIZE(BO), %xmm15
2188	addps	%xmm10, %xmm1
2189	movaps	 -4 * SIZE(AO), %xmm10
2190
/* k-step 7; xmm10 is then reloaded for k-step 4 of the next pass */
2191	mulps	%xmm10, %xmm15
2192	mulps	60 * SIZE(BO), %xmm10
2193	addps	%xmm15, %xmm2
2194	movaps	112 * SIZE(BO), %xmm15
2195	addps	%xmm10, %xmm3
2196	movaps	16 * SIZE(AO), %xmm10
2197
/* advance both panels by the eight k-steps just consumed */
2198	addq   $32 * SIZE, AO
2199	addq   $64 * SIZE, BO
2200	decq   %rax
2201	jne    .L72
2202	ALIGN_4
2203
2204.L75:
/*
 * k-remainder for the same tile as .L72: reload the k count (K, or KKK in
 * the TRMM build), fetch ALPHA into xmm15 for .L78, and run .L76 once per
 * leftover k-step (count & 7). BRANCH is a macro defined outside this chunk.
 */
2205#ifndef TRMMKERNEL
2206	movq	K, %rax
2207#else
2208	movq	KKK, %rax
2209#endif
2210	movaps	ALPHA, %xmm15
2211	andq	$7, %rax		# rax &= 7: k-steps left after the unrolled loop
2212	BRANCH
2213	je .L78
2214	ALIGN_4
2215
2216.L76:
/* One k-step: A vector (xmm8) times the two BO vectors -> xmm0, xmm1. */
2217	mulps	%xmm8, %xmm9
2218	mulps	 4 * SIZE(BO), %xmm8
2219	addps	%xmm9, %xmm0
2220	movaps	 8 * SIZE(BO), %xmm9
2221	addps	%xmm8, %xmm1
2222	movaps	-28 * SIZE(AO), %xmm8
2223
2224	addq	$4 * SIZE, AO		# AO += 4 elements (one k-step)
2225	addq	$8 * SIZE, BO		# BO += 8 elements (one k-step)
2226	decq	%rax
2227	jg	.L76
2228	ALIGN_4
2229
2230.L78:
/*
 * Write-back for the four-row, two-column tile: fold the second
 * accumulator set (xmm2/xmm3) into xmm0/xmm1, scale by ALPHA (xmm15), add
 * the existing C values unless this is the TRMM build, and store four
 * elements to each of CO1 and CO2.
 */
2231#ifndef TRMMKERNEL
2232	movsd	0 * SIZE(CO1), %xmm8
2233	movhps	2 * SIZE(CO1), %xmm8
2234	movsd	0 * SIZE(CO2), %xmm10
2235	movhps	2 * SIZE(CO2), %xmm10
2236#endif
2237
2238	addps	%xmm2, %xmm0
2239	addps	%xmm3, %xmm1
2240
2241	mulps	%xmm15, %xmm0
2242	mulps	%xmm15, %xmm1
2243
2244#ifndef TRMMKERNEL
2245	addps	%xmm8,  %xmm0
2246	addps	%xmm10, %xmm1
2247#endif
2248
2249	movlps	%xmm0, 0 * SIZE(CO1)
2250	movhps	%xmm0, 2 * SIZE(CO1)
2251	movlps	%xmm1, 0 * SIZE(CO2)
2252	movhps	%xmm1, 2 * SIZE(CO2)
2253
2254#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2255    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
/*
 * TRMM: skip the K - KKK k-steps this tile did not process:
 * AO += 16 bytes and BO += 32 bytes per skipped step.
 */
2256	movq	K, %rax
2257	subq	KKK, %rax
2258	leaq	(,%rax,    8), %rax
2259	leaq	(AO, %rax, 2), AO
2260	leaq	(BO, %rax, 4), BO
2261#endif
2262
2263#if defined(TRMMKERNEL) && defined(LEFT)
2264	addq	$4, KK
2265#endif
2266
2267	addq	$4 * SIZE, CO1		# coffset += 4
2268	addq	$4 * SIZE, CO2		# coffset += 4
2269	ALIGN_4
2270
2271.L80:
/*
 * Two-row tile of the two-column panel; taken only when bit 1 of M is set.
 * Positions BO (and AO in some TRMM variants), preloads operands for the
 * unrolled loop .L82, clears xmm0..xmm3 and computes the k count.
 */
2272	testq	$2, M
2273	je	.L90
2274
2275#if !defined(TRMMKERNEL) || \
2276	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2277	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2278
2279	leaq	BUFFER, BO
2280#else
/* skip the first KK k-steps: AO += 8*KK bytes, BO += 32*KK bytes */
2281	leaq	BUFFER, BO
2282	movq	KK, %rax
2283	leaq	(, %rax,   8), %rax
2284	leaq	(AO, %rax, 1), AO
2285	leaq	(BO, %rax, 4), BO
2286#endif
2287
/* A operands for k-steps 0 and 4 (the loop advances AO by 2 elements per k-step) */
2288	movaps	-32 * SIZE(AO), %xmm8
2289	movaps	-24 * SIZE(AO), %xmm10
2290
/* first BO vector of k-steps 0, 2, 4 and 6 */
2291	movaps	 0 * SIZE(BO), %xmm9
2292	movaps	16 * SIZE(BO), %xmm11
2293	movaps	32 * SIZE(BO), %xmm13
2294	movaps	48 * SIZE(BO), %xmm15
2295
2296	xorps	%xmm0, %xmm0
2297	xorps	%xmm1, %xmm1
2298	xorps	%xmm2, %xmm2
2299	xorps	%xmm3, %xmm3
2300
/*
 * rax = number of k-steps for this tile. The TRMM variants store it in KKK
 * so that the remainder code at .L85 can reload it. Both arms of the inner
 * LEFT conditional add 2 here.
 */
2301#ifndef TRMMKERNEL
2302	movq	K, %rax
2303#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2304	movq	K, %rax
2305	subq	KK, %rax
2306	movq	%rax, KKK
2307#else
2308	movq	KK, %rax
2309#ifdef LEFT
2310	addq	$2, %rax
2311#else
2312	addq	$2, %rax
2313#endif
2314	movq	%rax, KKK
2315#endif
/* rax = count / 8 = passes of the unrolled loop; none -> remainder code */
2316	sarq	$3, %rax
2317	je	.L85
2318	ALIGN_4
2319
2320.L82:
/*
 * Main k-loop for the two-row tile of the two-column panel, eight k-steps
 * unrolled per pass. Each k-step multiplies the A operand by the two BO
 * vectors of that step. Even k-steps accumulate into xmm0/xmm1 and odd
 * k-steps into xmm2/xmm3; the two sets are summed at .L88.
 *   xmm8  : A for k-steps 0-3, xmm10 : A for k-steps 4-7.
 *   xmm9/xmm11/xmm13/xmm15 : first BO vector of k-steps 0/2/4/6, preloaded
 *   on entry and reloaded for the next pass.
 * A pass consumes 16 elements of AO and 64 elements of BO.
 */
/* k-step 0 */
2321	mulps	%xmm8, %xmm9
2322	addps	%xmm9, %xmm0
2323#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2324	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
2325#endif
2326	movaps	 4 * SIZE(BO), %xmm9
2327	mulps	%xmm8, %xmm9
2328	movsd	-30 * SIZE(AO), %xmm8
2329	addps	%xmm9, %xmm1
2330	movaps	 8 * SIZE(BO), %xmm9
2331
/* k-step 1 */
2332	mulps	%xmm8, %xmm9
2333	addps	%xmm9, %xmm2
2334	movaps	12 * SIZE(BO), %xmm9
2335	mulps	%xmm8, %xmm9
2336	movsd	-28 * SIZE(AO), %xmm8
2337	addps	%xmm9, %xmm3
2338	movaps	64 * SIZE(BO), %xmm9
2339
/* k-step 2 */
2340	mulps	%xmm8, %xmm11
2341	addps	%xmm11, %xmm0
2342	movaps	20 * SIZE(BO), %xmm11
2343	mulps	%xmm8, %xmm11
2344	movsd	-26 * SIZE(AO), %xmm8
2345	addps	%xmm11, %xmm1
2346	movaps	24 * SIZE(BO), %xmm11
2347
/* k-step 3; xmm8 is then reloaded for k-step 0 of the next pass */
2348	mulps	%xmm8, %xmm11
2349	addps	%xmm11, %xmm2
2350	movaps	28 * SIZE(BO), %xmm11
2351	mulps	%xmm8, %xmm11
2352	movsd	-16 * SIZE(AO), %xmm8
2353	addps	%xmm11, %xmm3
2354	movaps	 80 * SIZE(BO), %xmm11
2355
/* k-step 4 (A in xmm10) */
2356	mulps	%xmm10, %xmm13
2357	addps	%xmm13, %xmm0
2358	movaps	36 * SIZE(BO), %xmm13
2359	mulps	%xmm10, %xmm13
2360	movsd	-22 * SIZE(AO), %xmm10
2361	addps	%xmm13, %xmm1
2362	movaps	40 * SIZE(BO), %xmm13
2363
/* k-step 5 */
2364	mulps	%xmm10, %xmm13
2365	addps	%xmm13, %xmm2
2366	movaps	44 * SIZE(BO), %xmm13
2367	mulps	%xmm10, %xmm13
2368	movsd	-20 * SIZE(AO), %xmm10
2369	addps	%xmm13, %xmm3
2370	movaps	 96 * SIZE(BO), %xmm13
2371
/* k-step 6 */
2372	mulps	%xmm10, %xmm15
2373	addps	%xmm15, %xmm0
2374	movaps	52 * SIZE(BO), %xmm15
2375	mulps	%xmm10, %xmm15
2376	movsd	-18 * SIZE(AO), %xmm10
2377	addps	%xmm15, %xmm1
2378	movaps	56 * SIZE(BO), %xmm15
2379
/* k-step 7; xmm10 is then reloaded for k-step 4 of the next pass */
2380	mulps	%xmm10, %xmm15
2381	addps	%xmm15, %xmm2
2382	movaps	60 * SIZE(BO), %xmm15
2383	mulps	%xmm10, %xmm15
2384	movsd	-8 * SIZE(AO), %xmm10
2385	addps	%xmm15, %xmm3
2386	movaps	112 * SIZE(BO), %xmm15
2387
/* advance both panels by the eight k-steps just consumed */
2388	addq   $16 * SIZE, AO
2389	addq   $64 * SIZE, BO
2390	decq   %rax
2391	jne    .L82
2392	ALIGN_4
2393
2394.L85:
/*
 * k-remainder for the same tile as .L82: reload the k count (K, or KKK in
 * the TRMM build), fetch ALPHA into xmm15 for .L88, and run .L86 once per
 * leftover k-step (count & 7). BRANCH is a macro defined outside this chunk.
 */
2395#ifndef TRMMKERNEL
2396	movq	K, %rax
2397#else
2398	movq	KKK, %rax
2399#endif
2400	movaps	ALPHA, %xmm15
2401	andq	$7, %rax		# rax &= 7: k-steps left after the unrolled loop
2402	BRANCH
2403	je .L88
2404	ALIGN_4
2405
2406.L86:
/* One k-step: A operand (xmm8) times the two BO vectors -> xmm0, xmm1. */
2407	mulps	%xmm8, %xmm9
2408	addps	%xmm9, %xmm0
2409	movaps	  4 * SIZE(BO), %xmm9
2410	mulps	%xmm8, %xmm9
2411	movsd	-30 * SIZE(AO), %xmm8
2412	addps	%xmm9, %xmm1
2413	movaps	  8 * SIZE(BO), %xmm9
2414
2415	addq	$2 * SIZE, AO		# AO += 2 elements (one k-step)
2416	addq	$8 * SIZE, BO		# BO += 8 elements (one k-step)
2417	decq	%rax
2418	jg	.L86
2419	ALIGN_4
2420
2421.L88:
/*
 * Write-back for the two-row, two-column tile: fold xmm2/xmm3 into
 * xmm0/xmm1, scale by ALPHA (xmm15), add the existing C values unless this
 * is the TRMM build, and store two elements to each of CO1 and CO2.
 */
2422#ifndef TRMMKERNEL
/*
 * The xorps lines are only assembled when movsd is a preprocessor macro
 * (defined outside this chunk) - presumably it then maps to an instruction
 * that leaves the upper half of the destination untouched; TODO confirm.
 */
2423#ifdef movsd
2424	xorps	%xmm8,  %xmm8
2425#endif
2426	movsd	0 * SIZE(CO1), %xmm8
2427#ifdef movsd
2428	xorps	%xmm10, %xmm10
2429#endif
2430	movsd	0 * SIZE(CO2), %xmm10
2431#endif
2432
2433	addps	%xmm2, %xmm0
2434	addps	%xmm3, %xmm1
2435
2436	mulps	%xmm15, %xmm0
2437	mulps	%xmm15, %xmm1
2438
2439#ifndef TRMMKERNEL
2440	addps	%xmm8,  %xmm0
2441	addps	%xmm10, %xmm1
2442#endif
2443
/* movlps stores only the low two elements of each accumulator */
2444	movlps	%xmm0, 0 * SIZE(CO1)
2445	movlps	%xmm1, 0 * SIZE(CO2)
2446
2447#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2448    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
/*
 * TRMM: skip the K - KKK k-steps this tile did not process:
 * AO += 8 bytes and BO += 32 bytes per skipped step.
 */
2449	movq	K, %rax
2450	subq	KKK, %rax
2451	leaq	(,%rax,    8), %rax
2452	leaq	(AO, %rax, 1), AO
2453	leaq	(BO, %rax, 4), BO
2454#endif
2455
2456#if defined(TRMMKERNEL) && defined(LEFT)
2457	addq	$2, KK
2458#endif
2459
2460	addq	$2 * SIZE, CO1		# CO1 += 2 elements
2461	addq	$2 * SIZE, CO2		# CO2 += 2 elements
2462	ALIGN_4
2463
2464.L90:
/*
 * Single-row tile of the two-column panel; taken only when bit 0 of M is
 * set. Positions BO (and AO in some TRMM variants), preloads operands for
 * the unrolled loop .L92, clears xmm0..xmm3 and computes the k count.
 */
2465	testq	$1, M
2466	je	.L99
2467
2468#if !defined(TRMMKERNEL) || \
2469	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2470	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2471
2472	leaq	BUFFER, BO
2473#else
/* skip the first KK k-steps: AO += 4*KK bytes, BO += 32*KK bytes */
2474	leaq	BUFFER, BO
2475	movq	KK, %rax
2476	leaq	(, %rax,   4), %rax
2477	leaq	(AO, %rax, 1), AO
2478	leaq	(BO, %rax, 8), BO
2479#endif
2480
/* A elements for k-steps 0 and 4 */
2481	movss	-32 * SIZE(AO), %xmm8
2482	movss	-28 * SIZE(AO), %xmm10
2483
/* first BO element of k-steps 0, 2, 4 and 6 */
2484	movss	 0 * SIZE(BO), %xmm9
2485	movss	16 * SIZE(BO), %xmm11
2486	movss	32 * SIZE(BO), %xmm13
2487	movss	48 * SIZE(BO), %xmm15
2488
2489	xorps	%xmm0, %xmm0
2490	xorps	%xmm1, %xmm1
2491	xorps	%xmm2, %xmm2
2492	xorps	%xmm3, %xmm3
2493
/*
 * rax = number of k-steps for this tile. The TRMM variants store it in KKK
 * so that the remainder code at .L95 can reload it.
 */
2494#ifndef TRMMKERNEL
2495	movq	K, %rax
2496#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2497	movq	K, %rax
2498	subq	KK, %rax
2499	movq	%rax, KKK
2500#else
2501	movq	KK, %rax
2502#ifdef LEFT
2503	addq	$1, %rax
2504#else
2505	addq	$2, %rax
2506#endif
2507	movq	%rax, KKK
2508#endif
/* rax = count / 8 = passes of the unrolled loop; none -> remainder code */
2509	sarq	$3, %rax
2510	je	.L95
2511	ALIGN_4
2512
2513.L92:
/*
 * Main k-loop for the single-row tile of the two-column panel, eight
 * k-steps unrolled per pass. Operands are loaded with movss; the
 * arithmetic uses the packed mulps/addps forms, and the write-back at .L98
 * consumes lane 0 only. Even k-steps accumulate into xmm0/xmm1 and odd
 * k-steps into xmm2/xmm3; the two sets are summed at .L98.
 *   xmm8  : A for k-steps 0-3, xmm10 : A for k-steps 4-7.
 *   xmm9/xmm11/xmm13/xmm15 : first BO element of k-steps 0/2/4/6,
 *   preloaded on entry and reloaded for the next pass.
 * A pass consumes 8 elements of AO and 64 elements of BO.
 */
/* k-step 0 */
2514	mulps	%xmm8, %xmm9
2515	addps	%xmm9, %xmm0
2516#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2517	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
2518#endif
2519	movss	 4 * SIZE(BO), %xmm9
2520	mulps	%xmm8, %xmm9
2521	movss	-31 * SIZE(AO), %xmm8
2522	addps	%xmm9, %xmm1
2523	movss	 8 * SIZE(BO), %xmm9
2524
/* k-step 1 */
2525	mulps	%xmm8, %xmm9
2526	addps	%xmm9, %xmm2
2527	movss	12 * SIZE(BO), %xmm9
2528	mulps	%xmm8, %xmm9
2529	movss	-30 * SIZE(AO), %xmm8
2530	addps	%xmm9, %xmm3
2531	movss	64 * SIZE(BO), %xmm9
2532
/* k-step 2 */
2533	mulps	%xmm8, %xmm11
2534	addps	%xmm11, %xmm0
2535	movss	20 * SIZE(BO), %xmm11
2536	mulps	%xmm8, %xmm11
2537	movss	-29 * SIZE(AO), %xmm8
2538	addps	%xmm11, %xmm1
2539	movss	24 * SIZE(BO), %xmm11
2540
/* k-step 3; xmm8 is then reloaded for k-step 0 of the next pass */
2541	mulps	%xmm8, %xmm11
2542	addps	%xmm11, %xmm2
2543	movss	28 * SIZE(BO), %xmm11
2544	mulps	%xmm8, %xmm11
2545	movss	-24 * SIZE(AO), %xmm8
2546	addps	%xmm11, %xmm3
2547	movss	 80 * SIZE(BO), %xmm11
2548
/* k-step 4 (A in xmm10) */
2549	mulps	%xmm10, %xmm13
2550	addps	%xmm13, %xmm0
2551	movss	36 * SIZE(BO), %xmm13
2552	mulps	%xmm10, %xmm13
2553	movss	-27 * SIZE(AO), %xmm10
2554	addps	%xmm13, %xmm1
2555	movss	40 * SIZE(BO), %xmm13
2556
/* k-step 5 */
2557	mulps	%xmm10, %xmm13
2558	addps	%xmm13, %xmm2
2559	movss	44 * SIZE(BO), %xmm13
2560	mulps	%xmm10, %xmm13
2561	movss	-26 * SIZE(AO), %xmm10
2562	addps	%xmm13, %xmm3
2563	movss	 96 * SIZE(BO), %xmm13
2564
/* k-step 6 */
2565	mulps	%xmm10, %xmm15
2566	addps	%xmm15, %xmm0
2567	movss	52 * SIZE(BO), %xmm15
2568	mulps	%xmm10, %xmm15
2569	movss	-25 * SIZE(AO), %xmm10
2570	addps	%xmm15, %xmm1
2571	movss	56 * SIZE(BO), %xmm15
2572
/* k-step 7; xmm10 is then reloaded for k-step 4 of the next pass */
2573	mulps	%xmm10, %xmm15
2574	addps	%xmm15, %xmm2
2575	movss	60 * SIZE(BO), %xmm15
2576	mulps	%xmm10, %xmm15
2577	movss	-20 * SIZE(AO), %xmm10
2578	addps	%xmm15, %xmm3
2579	movss	112 * SIZE(BO), %xmm15
2580
/* advance both panels by the eight k-steps just consumed */
2581	addq   $ 8 * SIZE, AO
2582	addq   $64 * SIZE, BO
2583	decq   %rax
2584	jne    .L92
2585	ALIGN_4
2586
2587.L95:
/*
 * k-remainder for the same tile as .L92: reload the k count (K, or KKK in
 * the TRMM build), fetch ALPHA into xmm15 for .L98, and run .L96 once per
 * leftover k-step (count & 7). BRANCH is a macro defined outside this chunk.
 */
2588#ifndef TRMMKERNEL
2589	movq	K, %rax
2590#else
2591	movq	KKK, %rax
2592#endif
2593	movaps	ALPHA, %xmm15
2594	andq	$7, %rax		# rax &= 7: k-steps left after the unrolled loop
2595	BRANCH
2596	je .L98
2597	ALIGN_4
2598
2599.L96:
/* One k-step: A element (xmm8) times the two BO elements -> xmm0, xmm1. */
2600	mulps	%xmm8, %xmm9
2601	addps	%xmm9, %xmm0
2602	movss	 4 * SIZE(BO), %xmm9
2603	mulps	%xmm8, %xmm9
2604	movss	-31 * SIZE(AO), %xmm8
2605	addps	%xmm9, %xmm1
2606	movss	 8 * SIZE(BO), %xmm9
2607
2608	addq	$1 * SIZE, AO		# AO += 1 element (one k-step)
2609	addq	$8 * SIZE, BO		# BO += 8 elements (one k-step)
2610	decq	%rax
2611	jg	.L96
2612	ALIGN_4
2613
/*
 * .L98: write-back of one element to each of the two columns CO1 and CO2.
 * The partial sums are folded (xmm0 += xmm2, xmm1 += xmm3) and scaled by
 * xmm15 (loaded from ALPHA at .L95).  In the non-TRMM build the existing
 * values at CO1/CO2 are added before the store; the TRMM build overwrites.
 */
2614.L98:
2615#ifndef TRMMKERNEL
2616	movss	0 * SIZE(CO1), %xmm8
2617	movss	0 * SIZE(CO2), %xmm10
2618#endif
2619
2620	addss	%xmm2, %xmm0
2621	addss	%xmm3, %xmm1
2622	mulss	%xmm15, %xmm0
2623	mulss	%xmm15, %xmm1
2624
2625#ifndef TRMMKERNEL
2626	addss	%xmm8,  %xmm0
2627	addss	%xmm10, %xmm1
2628#endif
2629
2630	movss	%xmm0, 0 * SIZE(CO1)
2631	movss	%xmm1, 0 * SIZE(CO2)
2632
/*
 * TRMM only: step AO and BO over the K - KKK iterations that were not
 * executed.  rax = 4 * (K - KKK); AO += rax bytes, BO += 8 * rax bytes.
 */
2633#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2634    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2635	movq	K, %rax
2636	subq	KKK, %rax
2637	leaq	(,%rax,    4), %rax
2638	leaq	(AO, %rax, 1), AO
2639	leaq	(BO, %rax, 8), BO
2640#endif
2641
/* TRMM with LEFT: KK grows by the tile height just processed (1). */
2642#if defined(TRMMKERNEL) && defined(LEFT)
2643	addq	$1, KK
2644#endif
2645	ALIGN_4
2646
/*
 * .L99: end of the two-column pass.
 * TRMM without LEFT: KK grows by the two columns just processed.  KK is
 * read and written as a 64-bit quantity everywhere else in this file
 * (movq %rax, KK / addq $1, KK), so the update uses addq as well; a
 * 32-bit addl would leave the upper half of KK untouched and lose any
 * carry or sign change across bit 31.
 * C then moves forward by 2 * LDC to the next column.
 */
2647.L99:
2648#if defined(TRMMKERNEL) && !defined(LEFT)
2649	addq	$2, KK
2650#endif
2651	leaq	(C, LDC, 2), C		# c += 2 * ldc
2652	ALIGN_4
2653
2654
/*
 * .L100/.L101: single remaining column.  If bit 0 of N is clear there is
 * nothing left to do and control goes to the epilogue at .L999.
 */
2655.L100:
2656	testq	$1, N
2657	je	.L999
2658
2659.L101:
/* TRMM with LEFT: KK restarts from OFFSET for this column. */
2660#if defined(TRMMKERNEL) && defined(LEFT)
2661	movq	OFFSET, %rax
2662	movq	%rax, KK
2663#endif
2664
2665/* Copying to Sub Buffer */
/* BO = address of BUFFER; rax = K >> 3 passes of the 8-element copy loop. */
2666	leaq	BUFFER, BO
2667
2668	movq	K, %rax
2669	sarq	$3, %rax
2670	jle	.L103
2671	ALIGN_4
2672
2673
/*
 * .L102: expand 8 consecutive elements of B into BO, each element
 * replicated into 4 consecutive slots (32 output elements per pass).
 * B advances by 8 elements and BO by 32 elements per pass; rax counts
 * the passes.
 *
 * Two CPU-specific bodies are provided.
 * NOTE(review): if none of the CPU macros tested below is defined the
 * loop body is empty and nothing is copied; presumably the build always
 * defines one of them - confirm.
 */
2674.L102:
/* SSE variant: scalar load, broadcast with shufps $0, 16-byte movaps store. */
2675#if defined(PENTIUM4) || defined(GENERIC)
2676	movss	 0 * SIZE(B), %xmm0
2677	movss	 1 * SIZE(B), %xmm1
2678	movss	 2 * SIZE(B), %xmm2
2679	movss	 3 * SIZE(B), %xmm3
2680	movss	 4 * SIZE(B), %xmm4
2681	movss	 5 * SIZE(B), %xmm5
2682	movss	 6 * SIZE(B), %xmm6
2683	movss	 7 * SIZE(B), %xmm7
2684
2685	PREFETCH	 32 * SIZE(B)
2686
2687	shufps	 $0, %xmm0, %xmm0
2688	shufps	 $0, %xmm1, %xmm1
2689	shufps	 $0, %xmm2, %xmm2
2690	shufps	 $0, %xmm3, %xmm3
2691	shufps	 $0, %xmm4, %xmm4
2692	shufps	 $0, %xmm5, %xmm5
2693	shufps	 $0, %xmm6, %xmm6
2694	shufps	 $0, %xmm7, %xmm7
2695
2696	movaps	%xmm0,  0 * SIZE(BO)
2697	movaps	%xmm1,  4 * SIZE(BO)
2698	movaps	%xmm2,  8 * SIZE(BO)
2699	movaps	%xmm3, 12 * SIZE(BO)
2700	movaps	%xmm4, 16 * SIZE(BO)
2701	movaps	%xmm5, 20 * SIZE(BO)
2702	movaps	%xmm6, 24 * SIZE(BO)
2703	movaps	%xmm7, 28 * SIZE(BO)
2704
2705	addq	$ 8 * SIZE, B
2706	addq	$32 * SIZE, BO
2707#endif
2708
/*
 * MMX variant: movd load, punpckldq duplicates the element inside the
 * 64-bit register, and two 8-byte stores write the 4 copies.
 */
2709#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2710	PREFETCH	 32 * SIZE(B)
2711
2712	movd	 0 * SIZE(B), %mm0
2713	movd	 1 * SIZE(B), %mm1
2714	movd	 2 * SIZE(B), %mm2
2715	movd	 3 * SIZE(B), %mm3
2716	movd	 4 * SIZE(B), %mm4
2717	movd	 5 * SIZE(B), %mm5
2718	movd	 6 * SIZE(B), %mm6
2719	movd	 7 * SIZE(B), %mm7
2720
2721	punpckldq %mm0, %mm0
2722	punpckldq %mm1, %mm1
2723	punpckldq %mm2, %mm2
2724	punpckldq %mm3, %mm3
2725	punpckldq %mm4, %mm4
2726	punpckldq %mm5, %mm5
2727	punpckldq %mm6, %mm6
2728	punpckldq %mm7, %mm7
2729
2730	movq	%mm0,  0 * SIZE(BO)
2731	movq	%mm0,  2 * SIZE(BO)
2732	movq	%mm1,  4 * SIZE(BO)
2733	movq	%mm1,  6 * SIZE(BO)
2734	movq	%mm2,  8 * SIZE(BO)
2735	movq	%mm2, 10 * SIZE(BO)
2736	movq	%mm3, 12 * SIZE(BO)
2737	movq	%mm3, 14 * SIZE(BO)
2738	movq	%mm4, 16 * SIZE(BO)
2739	movq	%mm4, 18 * SIZE(BO)
2740	movq	%mm5, 20 * SIZE(BO)
2741	movq	%mm5, 22 * SIZE(BO)
2742	movq	%mm6, 24 * SIZE(BO)
2743	movq	%mm6, 26 * SIZE(BO)
2744	movq	%mm7, 28 * SIZE(BO)
2745	movq	%mm7, 30 * SIZE(BO)
2746
2747	addq	$ 8 * SIZE, B
2748	addq	$32 * SIZE, BO
2749#endif
2750
2751	decq	%rax
2752	jne	.L102
2753	ALIGN_4
2754
/*
 * .L103/.L104: copy the remaining K & 7 elements of B one at a time,
 * using the same 4-way replication as .L102.  The masked count is never
 * negative, so the jle is taken only when it is zero.
 */
2755.L103:
2756	movq	K, %rax
2757	andq	$7, %rax
2758	BRANCH
2759	jle	.L110
2760	ALIGN_4
2761
/* One element per pass: B += 1 element, BO += 4 elements. */
2762.L104:
2763#if defined(PENTIUM4) || defined(GENERIC)
2764	movss	 0 * SIZE(B), %xmm0
2765	shufps	 $0, %xmm0, %xmm0
2766	movaps	%xmm0,  0 * SIZE(BO)
2767#endif
2768
2769#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2770	movd	 0 * SIZE(B), %mm0
2771	punpckldq %mm0, %mm0
2772	movq	%mm0,  0 * SIZE(BO)
2773	movq	%mm0,  2 * SIZE(BO)
2774#endif
2775
2776	addq	$ 1 * SIZE, B
2777	addq	$ 4 * SIZE, BO
2778	decq	%rax
2779	jne	.L104
2780	ALIGN_4
2781
/*
 * .L110: start the walk over M for the single column.
 * CO1 = C, AO = A, I = M >> 3 (number of 8-wide tiles).  With no full
 * 8-wide tile, control goes straight to the 4-wide case at .L120.
 */
2782.L110:
2783	movq	C, CO1			# coffset1 = c
2784	movq	A, AO		# aoffset = a
2785
2786	movq	M,  I
2787	sarq	$3, I	# i = (m >> 3)
2788	jle	.L120
2789	ALIGN_4
2790
/*
 * .L111: per-tile setup for one 8-wide tile of the single column.
 * BO is reset to the start of BUFFER.  In the TRMM configurations that
 * take the #else branch, both pointers first skip KK iterations:
 * rax = 8 * KK, AO += 4 * rax bytes, BO += 2 * rax bytes.
 */
2791.L111:
2792#if !defined(TRMMKERNEL) || \
2793	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2794	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2795
2796	leaq	BUFFER, BO
2797#else
2798	leaq	BUFFER, BO
2799	movq	KK, %rax
2800	leaq	(, %rax,   8), %rax
2801	leaq	(AO, %rax, 4), AO
2802	leaq	(BO, %rax, 2), BO
2803#endif
2804
/*
 * Preload the operands of the first unrolled pass: four A vectors
 * (k steps 0, 2, 4, 6) and four BO vectors.  .L112 itself reads only
 * xmm9 and xmm11 from the BO set; xmm15 is overwritten with ALPHA
 * at .L115.
 */
2805	movaps	-32 * SIZE(AO), %xmm8
2806	movaps	-16 * SIZE(AO), %xmm10
2807	movaps	  0 * SIZE(AO), %xmm12
2808	movaps	 16 * SIZE(AO), %xmm14
2809
2810	movaps	 0 * SIZE(BO), %xmm9
2811	movaps	16 * SIZE(BO), %xmm11
2812	movaps	32 * SIZE(BO), %xmm13
2813	movaps	48 * SIZE(BO), %xmm15
2814
/* Clear accumulators; the loops below accumulate into xmm0 and xmm4. */
2815	xorps	%xmm0, %xmm0
2816	xorps	%xmm1, %xmm1
2817
2818	PREFETCHW      7 * SIZE(CO1)
2819	xorps	%xmm4, %xmm4
2820	xorps	%xmm5, %xmm5
2821
/*
 * Iteration count: K for plain GEMM.  For TRMM it is either K - KK or
 * KK + 8 (LEFT) / KK + 1 (otherwise); the value is saved in KKK for the
 * tail loop and the pointer fix-up.  rax >> 3 = number of unrolled passes.
 */
2822#ifndef TRMMKERNEL
2823	movq	K, %rax
2824#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2825	movq	K, %rax
2826	subq	KK, %rax
2827	movq	%rax, KKK
2828#else
2829	movq	KK, %rax
2830#ifdef LEFT
2831	addq	$8, %rax
2832#else
2833	addq	$1, %rax
2834#endif
2835	movq	%rax, KKK
2836#endif
2837	sarq	$3, %rax
2838	je	.L115
2839	ALIGN_4
2840
/*
 * .L112: main loop of the 8-wide tile, 8 k steps per pass.
 * Each step multiplies two 4-element A vectors (8 consecutive elements
 * of AO) by one 4-element BO vector and accumulates the products into
 * xmm0 (first vector) and xmm4 (second vector).
 * Per pass AO advances 64 elements and BO 32 elements.  The loads from
 * 32/48/64/80(AO) and 32/48(BO) fetch operands for the next pass: after
 * the pointer bump they sit at -32/-16/0/16(AO) and 0/16(BO), matching
 * the preload done before the loop.
 */
2841.L112:
/* k step 0 (A at -32/-28, BO at 0) */
2842	mulps	%xmm9, %xmm8
2843#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2844	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
2845#endif
2846
2847	mulps	-28 * SIZE(AO), %xmm9
2848	addps	%xmm8, %xmm0
2849	movaps	-24 * SIZE(AO), %xmm8
2850	addps	%xmm9, %xmm4
2851	movaps	 4 * SIZE(BO), %xmm9
2852
/* k step 1 (A at -24/-20, BO at 4) */
2853	mulps	%xmm9, %xmm8
2854	mulps	-20 * SIZE(AO), %xmm9
2855	addps	%xmm8, %xmm0
2856	movaps	 32 * SIZE(AO), %xmm8
2857	addps	%xmm9, %xmm4
2858	movaps	 8 * SIZE(BO), %xmm9
2859
/* k step 2 (A at -16/-12, BO at 8) */
2860#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2861	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
2862#endif
2863	mulps	%xmm9, %xmm10
2864	mulps	-12 * SIZE(AO), %xmm9
2865	addps	%xmm10, %xmm0
2866	movaps	 -8 * SIZE(AO), %xmm10
2867	addps	%xmm9, %xmm4
2868	movaps	12 * SIZE(BO), %xmm9
2869
/* k step 3 (A at -8/-4, BO at 12) */
2870	mulps	%xmm9, %xmm10
2871	mulps	 -4 * SIZE(AO), %xmm9
2872	addps	%xmm10, %xmm0
2873	movaps	 48 * SIZE(AO), %xmm10
2874	addps	%xmm9, %xmm4
2875	movaps	32 * SIZE(BO), %xmm9
2876
/* k step 4 (A at 0/4, BO at 16) */
2877#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2878	PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO)
2879#endif
2880	mulps	%xmm11, %xmm12
2881	mulps	 4 * SIZE(AO), %xmm11
2882	addps	%xmm12, %xmm0
2883	movaps	 8 * SIZE(AO), %xmm12
2884	addps	%xmm11, %xmm4
2885	movaps	20 * SIZE(BO), %xmm11
2886
/* k step 5 (A at 8/12, BO at 20) */
2887	mulps	%xmm11, %xmm12
2888	mulps	12 * SIZE(AO), %xmm11
2889	addps	%xmm12, %xmm0
2890	movaps	64 * SIZE(AO), %xmm12
2891	addps	%xmm11, %xmm4
2892	movaps	24 * SIZE(BO), %xmm11
2893
/* k step 6 (A at 16/20, BO at 24) */
2894#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2895	PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO)
2896#endif
2897	mulps	%xmm11, %xmm14
2898	mulps	20 * SIZE(AO), %xmm11
2899	addps	%xmm14, %xmm0
2900	movaps	24 * SIZE(AO), %xmm14
2901	addps	%xmm11, %xmm4
2902	movaps	28 * SIZE(BO), %xmm11
2903
/* k step 7 (A at 24/28, BO at 28) */
2904	mulps	%xmm11, %xmm14
2905	mulps	28 * SIZE(AO), %xmm11
2906	addps	%xmm14, %xmm0
2907	movaps	80 * SIZE(AO), %xmm14
2908	addps	%xmm11, %xmm4
2909	movaps	48 * SIZE(BO), %xmm11
2910
2911	addq   $64 * SIZE, AO
2912	addq   $32 * SIZE, BO
2913	decq   %rax
2914	jne    .L112
2915	ALIGN_4
2916
/*
 * .L115/.L116: leftover k iterations of the 8-wide tile.
 * rax = (K, or KKK when TRMMKERNEL is defined) & 7; the trailing
 * "k & 1" remark is stale (the mask is 7).  xmm15 = ALPHA for .L118.
 */
2917.L115:
2918#ifndef TRMMKERNEL
2919	movq	K, %rax
2920#else
2921	movq	KKK, %rax
2922#endif
2923	movaps	ALPHA, %xmm15
2924	andq	$7, %rax		# if (k & 1)
2925	BRANCH
2926	je .L118
2927	ALIGN_4
2928
/*
 * One k step per pass: xmm0 += A[-32..-29] * BO vector,
 * xmm4 += A[-28..-25] * BO vector, then the next operands are loaded
 * from -24(AO) and 4(BO).  AO advances 8 elements and BO 4 elements per
 * pass (the trailing "+= 4" / "+= 8" remarks are swapped).
 */
2929.L116:
2930	mulps	%xmm9, %xmm8
2931	mulps	-28 * SIZE(AO), %xmm9
2932	addps	%xmm8, %xmm0
2933	movaps	-24 * SIZE(AO), %xmm8
2934	addps	%xmm9, %xmm4
2935	movaps	 4 * SIZE(BO), %xmm9
2936
2937	addq	$8 * SIZE, AO		# aoffset  += 4
2938	addq	$4 * SIZE, BO		# boffset1 += 8
2939	decq	%rax
2940	jg	.L116
2941	ALIGN_4
2942
/*
 * .L118: write-back of the 8-wide tile to CO1[0..7].
 * Non-TRMM builds load the existing 8 elements (as four 2-element
 * halves), and add them to the ALPHA-scaled accumulators; TRMM builds
 * store the scaled accumulators only.
 */
2943.L118:
2944#ifndef TRMMKERNEL
2945	movsd	0 * SIZE(CO1), %xmm8
2946	movhps	2 * SIZE(CO1), %xmm8
2947	movsd	4 * SIZE(CO1), %xmm9
2948	movhps	6 * SIZE(CO1), %xmm9
2949#endif
2950
2951	mulps	%xmm15, %xmm0
2952	mulps	%xmm15, %xmm4
2953#ifndef TRMMKERNEL
2954	addps	%xmm8,  %xmm0
2955	addps	%xmm9,  %xmm4
2956#endif
2957
2958	movlps	%xmm0, 0 * SIZE(CO1)
2959	movhps	%xmm0, 2 * SIZE(CO1)
2960	movlps	%xmm4, 4 * SIZE(CO1)
2961	movhps	%xmm4, 6 * SIZE(CO1)
2962
/*
 * TRMM only: skip the K - KKK iterations that were not executed.
 * rax = 8 * (K - KKK); AO += 4 * rax bytes, BO += 2 * rax bytes.
 */
2963#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2964    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2965	movq	K, %rax
2966	subq	KKK, %rax
2967	leaq	(,%rax,    8), %rax
2968	leaq	(AO, %rax, 4), AO
2969	leaq	(BO, %rax, 2), BO
2970#endif
2971
/* TRMM with LEFT: KK grows by the tile height (8). */
2972#if defined(TRMMKERNEL) && defined(LEFT)
2973	addq	$8, KK
2974#endif
2975
/*
 * CO1 moves on by 8 elements (the trailing "+= 4" remark is stale) and
 * the loop repeats for the next 8-wide tile while I > 0.
 */
2976	addq	$8 * SIZE, CO1		# coffset += 4
2977	decq	I			# i --
2978	jg	.L111
2979	ALIGN_4
2980
/*
 * .L120: 4-wide tile of the single column, taken when bit 2 of M is set.
 * AO is not reloaded here; it continues from wherever the preceding
 * code left it.  BO is reset to BUFFER.  In the TRMM configurations that
 * take the #else branch both pointers skip KK iterations first:
 * rax = 8 * KK, AO += 2 * rax bytes, BO += 2 * rax bytes.
 */
2981.L120:
2982	testq	$4, M
2983	je	.L130
2984
2985#if !defined(TRMMKERNEL) || \
2986	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2987	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2988
2989	leaq	BUFFER, BO
2990#else
2991	leaq	BUFFER, BO
2992	movq	KK, %rax
2993	leaq	(, %rax,   8), %rax
2994	leaq	(AO, %rax, 2), AO
2995	leaq	(BO, %rax, 2), BO
2996#endif
2997
/* Preload A vectors for k steps 0 and 4 and the matching BO vectors. */
2998	movaps	-32 * SIZE(AO), %xmm8
2999	movaps	-16 * SIZE(AO), %xmm10
3000
3001	movaps	 0 * SIZE(BO), %xmm9
3002	movaps	16 * SIZE(BO), %xmm11
3003
/* Four independent accumulators, summed together at .L128. */
3004	xorps	%xmm0, %xmm0
3005	xorps	%xmm1, %xmm1
3006	xorps	%xmm2, %xmm2
3007	xorps	%xmm3, %xmm3
3008
/*
 * Iteration count: K for GEMM; for TRMM either K - KK or KK + 4 (LEFT) /
 * KK + 1 (otherwise), saved in KKK.  rax >> 3 = number of unrolled passes.
 */
3009#ifndef TRMMKERNEL
3010	movq	K, %rax
3011#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3012	movq	K, %rax
3013	subq	KK, %rax
3014	movq	%rax, KKK
3015#else
3016	movq	KK, %rax
3017#ifdef LEFT
3018	addq	$4, %rax
3019#else
3020	addq	$1, %rax
3021#endif
3022	movq	%rax, KKK
3023#endif
3024	sarq	$3, %rax
3025	je	.L125
3026	ALIGN_4
3027
/*
 * .L122: main loop of the 4-wide tile, 8 k steps per pass.
 * Step s multiplies the A vector at (-32 + 4*s)(AO) by the BO vector at
 * (4*s)(BO).  The products go to the accumulators in rotation: steps
 * 0 and 4 into xmm0, 1 and 5 into xmm1, 2 and 6 into xmm2, 3 and 7 into
 * xmm3.  AO and BO both advance 32 elements per pass; the loads from
 * 0/16(AO) and 32/48(BO) are the preloads for the next pass.
 */
3028.L122:
/* k steps 0-3 (A register xmm8) */
3029	mulps	%xmm8, %xmm9
3030#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
3031	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
3032#endif
3033	movaps	-28 * SIZE(AO), %xmm8
3034	mulps	 4 * SIZE(BO), %xmm8
3035	addps	%xmm9, %xmm0
3036	movaps	32 * SIZE(BO), %xmm9
3037	addps	%xmm8, %xmm1
3038	movaps	-24 * SIZE(AO), %xmm8
3039	mulps	 8 * SIZE(BO), %xmm8
3040	addps	%xmm8, %xmm2
3041	movaps	-20 * SIZE(AO), %xmm8
3042	mulps	12 * SIZE(BO), %xmm8
3043	addps	%xmm8, %xmm3
3044	movaps	  0 * SIZE(AO), %xmm8
3045
/* k steps 4-7 (A register xmm10) */
3046#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
3047	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
3048#endif
3049	mulps	%xmm10, %xmm11
3050	movaps	-12 * SIZE(AO), %xmm10
3051	mulps	20 * SIZE(BO), %xmm10
3052	addps	%xmm11, %xmm0
3053	movaps	48 * SIZE(BO), %xmm11
3054	addps	%xmm10, %xmm1
3055	movaps	 -8 * SIZE(AO), %xmm10
3056	mulps	24 * SIZE(BO), %xmm10
3057	addps	%xmm10, %xmm2
3058	movaps	-4 * SIZE(AO), %xmm10
3059	mulps	28 * SIZE(BO), %xmm10
3060	addps	%xmm10, %xmm3
3061	movaps	16 * SIZE(AO), %xmm10
3062
3063	addq   $32 * SIZE, AO
3064	addq   $32 * SIZE, BO
3065	decq   %rax
3066	jne    .L122
3067	ALIGN_4
3068
/*
 * .L125/.L126: leftover k iterations of the 4-wide tile.
 * rax = (K, or KKK when TRMMKERNEL is defined) & 7; the trailing
 * "k & 1" remark is stale (the mask is 7).  xmm15 = ALPHA for .L128.
 */
3069.L125:
3070#ifndef TRMMKERNEL
3071	movq	K, %rax
3072#else
3073	movq	KKK, %rax
3074#endif
3075	movaps	ALPHA, %xmm15
3076	andq	$7, %rax		# if (k & 1)
3077	BRANCH
3078	je .L128
3079	ALIGN_4
3080
/*
 * One k step per pass, accumulated into xmm0 only.  AO and BO both
 * advance 4 elements (the trailing "boffset1 += 8" remark is stale).
 */
3081.L126:
3082	mulps	%xmm8, %xmm9
3083	movaps	-28 * SIZE(AO), %xmm8
3084	addps	%xmm9, %xmm0
3085	movaps	 4 * SIZE(BO), %xmm9
3086
3087	addq	$4 * SIZE, AO		# aoffset  += 4
3088	addq	$4 * SIZE, BO		# boffset1 += 8
3089	decq	%rax
3090	jg	.L126
3091	ALIGN_4
3092
/*
 * .L128: write-back of the 4-wide tile to CO1[0..3].
 * The four accumulators are reduced as (xmm0 + xmm1) + (xmm2 + xmm3),
 * scaled by xmm15 (ALPHA) and, in non-TRMM builds, added to the existing
 * contents of C before being stored.
 */
3093.L128:
3094#ifndef TRMMKERNEL
3095	movsd	0 * SIZE(CO1), %xmm8
3096	movhps	2 * SIZE(CO1), %xmm8
3097#endif
3098
3099	addps	%xmm1, %xmm0
3100	addps	%xmm3, %xmm2
3101	addps	%xmm2, %xmm0
3102
3103	mulps	%xmm15, %xmm0
3104#ifndef TRMMKERNEL
3105	addps	%xmm8,  %xmm0
3106#endif
3107
3108	movlps	%xmm0, 0 * SIZE(CO1)
3109	movhps	%xmm0, 2 * SIZE(CO1)
3110
/*
 * TRMM only: skip the K - KKK iterations that were not executed.
 * rax = 8 * (K - KKK); AO += 2 * rax bytes, BO += 2 * rax bytes.
 */
3111#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3112    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3113	movq	K, %rax
3114	subq	KKK, %rax
3115	leaq	(,%rax,    8), %rax
3116	leaq	(AO, %rax, 2), AO
3117	leaq	(BO, %rax, 2), BO
3118#endif
3119
/* TRMM with LEFT: KK grows by the tile height (4). */
3120#if defined(TRMMKERNEL) && defined(LEFT)
3121	addq	$4, KK
3122#endif
3123
3124	addq	$4 * SIZE, CO1		# coffset += 4
3125	ALIGN_4
3126
/*
 * .L130: 2-wide tile of the single column, taken when bit 1 of M is set.
 * BO is reset to BUFFER; AO continues from the previous tile.  In the
 * TRMM configurations that take the #else branch both pointers skip KK
 * iterations first: rax = 8 * KK, AO += rax bytes, BO += 2 * rax bytes.
 */
3127.L130:
3128	testq	$2, M
3129	je	.L140
3130
3131#if !defined(TRMMKERNEL) || \
3132	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3133	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3134
3135	leaq	BUFFER, BO
3136#else
3137	leaq	BUFFER, BO
3138	movq	KK, %rax
3139	leaq	(, %rax,   8), %rax
3140	leaq	(AO, %rax, 1), AO
3141	leaq	(BO, %rax, 2), BO
3142#endif
3143
/*
 * Preload A for k steps 0 and 4.
 * NOTE(review): these two loads are 16-byte movaps, while every later A
 * load of this tile (.L132/.L136) is an 8-byte movsd.  movaps faults on
 * an address that is not 16-byte aligned, and the TRMM adjustment above
 * moves AO in 8-byte units - confirm AO is always 16-byte aligned here.
 * Only the low two lanes of the result are stored at .L138.
 */
3144	movaps	-32 * SIZE(AO), %xmm8
3145	movaps	-24 * SIZE(AO), %xmm10
3146
3147	movaps	 0 * SIZE(BO), %xmm9
3148	movaps	16 * SIZE(BO), %xmm11
3149
/* xmm0 and xmm1 are the accumulators used below; xmm2/xmm3 are only cleared. */
3150	xorps	%xmm0, %xmm0
3151	xorps	%xmm1, %xmm1
3152	xorps	%xmm2, %xmm2
3153	xorps	%xmm3, %xmm3
3154
/*
 * Iteration count: K for GEMM; for TRMM either K - KK or KK + 2 (LEFT) /
 * KK + 1 (otherwise), saved in KKK.  rax >> 3 = number of unrolled passes.
 */
3155#ifndef TRMMKERNEL
3156	movq	K, %rax
3157#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3158	movq	K, %rax
3159	subq	KK, %rax
3160	movq	%rax, KKK
3161#else
3162	movq	KK, %rax
3163#ifdef LEFT
3164	addq	$2, %rax
3165#else
3166	addq	$1, %rax
3167#endif
3168	movq	%rax, KKK
3169#endif
3170	sarq	$3, %rax
3171	je	.L135
3172	ALIGN_4
3173
/*
 * .L132: main loop of the 2-wide tile, 8 k steps per pass.
 * Step s multiplies the A pair at (-32 + 2*s)(AO) by the BO vector at
 * (4*s)(BO); even steps accumulate into xmm0 and odd steps into xmm1.
 * AO advances 16 elements and BO 32 elements per pass.  The loads from
 * -16/-8(AO) and 32/48(BO) are the preloads for the next pass.
 */
3174.L132:
/* k step 0 */
3175	mulps	%xmm8, %xmm9
3176#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
3177	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
3178#endif
3179	movsd	-30 * SIZE(AO), %xmm8
3180	addps	%xmm9, %xmm0
3181	movaps	 4 * SIZE(BO), %xmm9
/* k step 1 */
3182	mulps	%xmm8, %xmm9
3183	movsd	-28 * SIZE(AO), %xmm8
3184	addps	%xmm9, %xmm1
3185	movaps	 8 * SIZE(BO), %xmm9
3186
/* k step 2 */
3187	mulps	%xmm8, %xmm9
3188	movsd	-26 * SIZE(AO), %xmm8
3189	addps	%xmm9, %xmm0
3190	movaps	12 * SIZE(BO), %xmm9
3191
/* k step 3 */
3192	mulps	%xmm8, %xmm9
3193	movsd	-16 * SIZE(AO), %xmm8
3194	addps	%xmm9, %xmm1
3195	movaps	32 * SIZE(BO), %xmm9
3196
/* k step 4 */
3197	mulps	%xmm10, %xmm11
3198	movsd	-22 * SIZE(AO), %xmm10
3199	addps	%xmm11, %xmm0
3200	movaps	20 * SIZE(BO), %xmm11
3201
/* k step 5 */
3202	mulps	%xmm10, %xmm11
3203	movsd	-20 * SIZE(AO), %xmm10
3204	addps	%xmm11, %xmm1
3205	movaps	24 * SIZE(BO), %xmm11
3206
/* k step 6 */
3207	mulps	%xmm10, %xmm11
3208	movsd	-18 * SIZE(AO), %xmm10
3209	addps	%xmm11, %xmm0
3210	movaps	28 * SIZE(BO), %xmm11
3211
/* k step 7 */
3212	mulps	%xmm10, %xmm11
3213	movsd	 -8 * SIZE(AO), %xmm10
3214	addps	%xmm11, %xmm1
3215	movaps	48 * SIZE(BO), %xmm11
3216
3217	addq   $16 * SIZE, AO
3218	addq   $32 * SIZE, BO
3219	decq   %rax
3220	jne    .L132
3221	ALIGN_4
3222
/*
 * .L135/.L136: leftover k iterations of the 2-wide tile.
 * rax = (K, or KKK when TRMMKERNEL is defined) & 7; the trailing
 * "k & 1" remark is stale (the mask is 7).  xmm15 = ALPHA for .L138.
 */
3223.L135:
3224#ifndef TRMMKERNEL
3225	movq	K, %rax
3226#else
3227	movq	KKK, %rax
3228#endif
3229	movaps	ALPHA, %xmm15
3230	andq	$7, %rax		# if (k & 1)
3231	BRANCH
3232	je .L138
3233	ALIGN_4
3234
/*
 * One k step per pass, accumulated into xmm0.  AO advances 2 elements
 * and BO 4 elements (the trailing "+= 4" / "+= 8" remarks are stale).
 */
3235.L136:
3236	mulps	%xmm8, %xmm9
3237	movsd	-30 * SIZE(AO), %xmm8
3238	addps	%xmm9, %xmm0
3239	movaps	 4 * SIZE(BO), %xmm9
3240
3241	addq	$2 * SIZE, AO		# aoffset  += 4
3242	addq	$4 * SIZE, BO		# boffset1 += 8
3243	decq	%rax
3244	jg	.L136
3245	ALIGN_4
3246
/*
 * .L138: write-back of the 2-wide tile to CO1[0..1].
 * xmm0 + xmm1 is scaled by xmm15 (ALPHA); non-TRMM builds add the two
 * existing C elements.  Only the low 8 bytes of xmm0 are stored.
 */
3247.L138:
3248	addps	%xmm1,  %xmm0
3249	mulps	%xmm15, %xmm0
3250
3251#ifndef TRMMKERNEL
/*
 * When movsd is a preprocessor macro, xmm8 is cleared before the load.
 * Presumably the macro remaps movsd to an instruction that leaves the
 * upper half of the destination untouched - confirm against common.h.
 */
3252#ifdef movsd
3253	xorps	%xmm8,  %xmm8
3254#endif
3255	movsd	0 * SIZE(CO1), %xmm8
3256	addps	%xmm8,  %xmm0
3257#endif
3258
3259	movlps	%xmm0, 0 * SIZE(CO1)
3260
/*
 * TRMM only: skip the K - KKK iterations that were not executed.
 * rax = 8 * (K - KKK); AO += rax bytes, BO += 2 * rax bytes.
 */
3261#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3262    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3263	movq	K, %rax
3264	subq	KKK, %rax
3265	leaq	(,%rax,    8), %rax
3266	leaq	(AO, %rax, 1), AO
3267	leaq	(BO, %rax, 2), BO
3268#endif
3269
/* TRMM with LEFT: KK grows by the tile height (2). */
3270#if defined(TRMMKERNEL) && defined(LEFT)
3271	addq	$2, KK
3272#endif
3273
/* CO1 moves on by 2 elements (the trailing "+= 4" remark is stale). */
3274	addq	$2 * SIZE, CO1		# coffset += 4
3275	ALIGN_4
3276
/*
 * .L140: final 1-wide tile of the single column, taken when bit 0 of M
 * is set; otherwise control goes to the epilogue at .L999.
 * BO is reset to BUFFER; AO continues from the previous tile.  In the
 * TRMM configurations that take the #else branch both pointers skip KK
 * iterations first: rax = 4 * KK, AO += rax bytes, BO += 4 * rax bytes.
 */
3277.L140:
3278	testq	$1, M
3279	je	.L999
3280
3281#if !defined(TRMMKERNEL) || \
3282	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3283	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3284
3285	leaq	BUFFER, BO
3286#else
3287	leaq	BUFFER, BO
3288	movq	KK, %rax
3289	leaq	(, %rax,   4), %rax
3290	leaq	(AO, %rax, 1), AO
3291	leaq	(BO, %rax, 4), BO
3292#endif
3293
/* Scalar preloads: A for k steps 0 and 4 and the matching BO values. */
3294	movss	-32 * SIZE(AO), %xmm8
3295	movss	-28 * SIZE(AO), %xmm10
3296
3297	movss	 0 * SIZE(BO), %xmm9
3298	movss	16 * SIZE(BO), %xmm11
3299
/* Four scalar accumulators, summed together at .L148. */
3300	xorps	%xmm0, %xmm0
3301	xorps	%xmm1, %xmm1
3302	xorps	%xmm2, %xmm2
3303	xorps	%xmm3, %xmm3
3304
/*
 * Iteration count: K for GEMM; for TRMM either K - KK or KK + 1, saved in
 * KKK (the LEFT and non-LEFT branches are identical for this tile).
 * rax >> 3 = number of unrolled passes.
 */
3305#ifndef TRMMKERNEL
3306	movq	K, %rax
3307#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3308	movq	K, %rax
3309	subq	KK, %rax
3310	movq	%rax, KKK
3311#else
3312	movq	KK, %rax
3313#ifdef LEFT
3314	addq	$1, %rax
3315#else
3316	addq	$1, %rax
3317#endif
3318	movq	%rax, KKK
3319#endif
3320	sarq	$3, %rax
3321	je	.L145
3322	ALIGN_4
3323
/*
 * .L142: main loop of the 1-wide tile, 8 scalar k steps per pass.
 * Step s multiplies the A element at (-32 + s)(AO) by the BO value at
 * (4*s)(BO).  The products go to the accumulators in rotation: steps
 * 0 and 4 into xmm0, 1 and 5 into xmm1, 2 and 6 into xmm2, 3 and 7 into
 * xmm3.  AO advances 8 elements and BO 32 elements per pass; the loads
 * from -24/-20(AO) and 32/48(BO) are the preloads for the next pass.
 */
3324.L142:
/* k steps 0-3 (A register xmm8) */
3325	mulss	%xmm8, %xmm9
3326#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
3327	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
3328#endif
3329	movss	-31 * SIZE(AO), %xmm8
3330	mulss	 4 * SIZE(BO), %xmm8
3331	addss	%xmm9, %xmm0
3332	movss	32 * SIZE(BO), %xmm9
3333	addss	%xmm8, %xmm1
3334	movss	-30 * SIZE(AO), %xmm8
3335	mulss	 8 * SIZE(BO), %xmm8
3336	addss	%xmm8, %xmm2
3337	movss	-29 * SIZE(AO), %xmm8
3338	mulss	12 * SIZE(BO), %xmm8
3339	addss	%xmm8, %xmm3
3340	movss	-24 * SIZE(AO), %xmm8
/* k steps 4-7 (A register xmm10) */
3341	mulss	%xmm10, %xmm11
3342	movss	-27 * SIZE(AO), %xmm10
3343	mulss	20 * SIZE(BO), %xmm10
3344	addss	%xmm11, %xmm0
3345	movss	48 * SIZE(BO), %xmm11
3346	addss	%xmm10, %xmm1
3347	movss	-26 * SIZE(AO), %xmm10
3348	mulss	24 * SIZE(BO), %xmm10
3349	addss	%xmm10, %xmm2
3350	movss	-25 * SIZE(AO), %xmm10
3351	mulss	28 * SIZE(BO), %xmm10
3352	addss	%xmm10, %xmm3
3353	movss	-20 * SIZE(AO), %xmm10
3354
3355	addq   $ 8 * SIZE, AO
3356	addq   $32 * SIZE, BO
3357	decq   %rax
3358	jne    .L142
3359	ALIGN_4
3360
/*
 * .L145/.L146: leftover k iterations of the 1-wide tile.
 * rax = (K, or KKK when TRMMKERNEL is defined) & 7; the trailing
 * "k & 1" remark is stale (the mask is 7).  Only the low element of
 * ALPHA is loaded into xmm15 here (movss), which is all .L148 uses.
 */
3361.L145:
3362#ifndef TRMMKERNEL
3363	movq	K, %rax
3364#else
3365	movq	KKK, %rax
3366#endif
3367	movss	ALPHA, %xmm15
3368	andq	$7, %rax		# if (k & 1)
3369	BRANCH
3370	je .L148
3371	ALIGN_4
3372
/*
 * One scalar k step per pass, accumulated into xmm0.
 * AO advances 1 element and BO 4 elements per pass.
 */
3373.L146:
3374	mulss	%xmm8, %xmm9
3375	movss	-31 * SIZE(AO), %xmm8
3376	addss	%xmm9, %xmm0
3377	movss	 4 * SIZE(BO), %xmm9
3378
3379	addq	$1 * SIZE, AO
3380	addq	$4 * SIZE, BO
3381	decq	%rax
3382	jg	.L146
3383	ALIGN_4
3384
/*
 * .L148: write-back of the 1-wide tile to CO1[0].
 * The accumulators are reduced as (xmm0 + xmm1) + (xmm2 + xmm3), scaled
 * by xmm15 (ALPHA) and, in non-TRMM builds, added to the existing C
 * element.  No pointer or KK update follows; control falls through to
 * the epilogue.
 */
3385.L148:
3386	addss	%xmm1, %xmm0
3387	addss	%xmm3, %xmm2
3388	addss	%xmm2, %xmm0
3389
3390	mulss	%xmm15, %xmm0
3391
3392#ifndef TRMMKERNEL
3393	movss	0 * SIZE(CO1), %xmm8
3394	addss	%xmm8,  %xmm0
3395#endif
3396	movss	%xmm0, 0 * SIZE(CO1)
3397	ALIGN_4
3398
/*
 * .L999: common exit.
 * rsp is reloaded from rbx (presumably the stack pointer saved by the
 * prologue, which is outside this view - confirm).  EMMS is a macro;
 * the OPTERON-family copy loops above use MMX registers, so it
 * presumably clears the MMX state before returning.
 */
3399.L999:
3400	movq	%rbx, %rsp
3401
3402	EMMS
3403
/* Reload the callee-saved integer registers from the save area. */
3404	movq	  0(%rsp), %rbx
3405	movq	  8(%rsp), %rbp
3406	movq	 16(%rsp), %r12
3407	movq	 24(%rsp), %r13
3408	movq	 32(%rsp), %r14
3409	movq	 40(%rsp), %r15
3410
/*
 * Windows ABI additionally reloads rdi, rsi and xmm6-xmm15.  movups is
 * used, so the xmm slots need no 16-byte alignment.
 */
3411#ifdef WINDOWS_ABI
3412	movq	 48(%rsp), %rdi
3413	movq	 56(%rsp), %rsi
3414	movups	 64(%rsp), %xmm6
3415	movups	 80(%rsp), %xmm7
3416	movups	 96(%rsp), %xmm8
3417	movups	112(%rsp), %xmm9
3418	movups	128(%rsp), %xmm10
3419	movups	144(%rsp), %xmm11
3420	movups	160(%rsp), %xmm12
3421	movups	176(%rsp), %xmm13
3422	movups	192(%rsp), %xmm14
3423	movups	208(%rsp), %xmm15
3424#endif
3425
/* Release the STACKSIZE-byte save area and return to the caller. */
3426	addq	$STACKSIZE, %rsp
3427	ret
3428
3429	EPILOGUE
3430