1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
42#define OLD_M	%rdi
43#define OLD_N	%rsi
44
45#define M	%r13
46#define N	%r14
47#define K	%rdx
48
49#define A	%rcx
50#define B	%r8
51#define C	%r9
52#define LDC	%r10
53#define I	%r11
54#define AO	%rdi
55#define BO	%rsi
56#define	CO1	%r15
57#define CO2	%rbp
58#define BB	%r12
59
60#ifndef WINDOWS_ABI
61
62#define STACKSIZE 64
63
64#define OLD_LDC		 8 + STACKSIZE(%rsp)
65#define OLD_OFFSET	16 + STACKSIZE(%rsp)
66
67#else
68
69#define STACKSIZE 256
70
71#define OLD_A		40 + STACKSIZE(%rsp)
72#define OLD_B		48 + STACKSIZE(%rsp)
73#define OLD_C		56 + STACKSIZE(%rsp)
74#define OLD_LDC		64 + STACKSIZE(%rsp)
75#define OLD_OFFSET	72 + STACKSIZE(%rsp)
76
77#endif
78
79#define ALPHA	  0(%rsp)
80#define J	 16(%rsp)
81#define OFFSET	 24(%rsp)
82#define KK	 32(%rsp)
83#define KKK	 40(%rsp)
84#define BUFFER	256(%rsp)
85
86#ifdef OPTERON
87#define movsd movlps
88#endif
89
90#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
91#define PREFETCH     prefetch
92#define PREFETCHW    prefetchw
93#define PREFETCHSIZE (16 * 9 + 8)
94#endif
95
96#if defined(GENERIC) || defined(NANO)
97#define PREFETCH     prefetcht0
98#define PREFETCHW    prefetcht0
99#define PREFETCHSIZE (16 * 5 + 8)
100#endif
101
102#define RPREFETCHSIZE (8 *  7 + 4)
103#define WPREFETCHSIZE (8 *  8 + 4)
104
105#ifndef GENERIC
106#define KERNEL1(xx) \
107	mulps	%xmm0, %xmm1 ;\
108	addps	%xmm1, %xmm8 ;\
109	movaps	-32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
110	mulps	%xmm0, %xmm3 ;\
111	addps	%xmm3, %xmm9 ;\
112	movaps	-28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
113	mulps	%xmm0, %xmm5 ;\
114	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
115	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
116	addps	%xmm5, %xmm10 ;\
117	movaps	-24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
118	addps	%xmm0, %xmm11 ;\
119	movaps	-16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
120
121#define KERNEL2(xx) \
122	mulps	%xmm2, %xmm1 ;\
123	addps	%xmm1, %xmm12 ;\
124	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
125	mulps	%xmm2, %xmm3 ;\
126	addps	%xmm3, %xmm13 ;\
127	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
128	mulps	%xmm2, %xmm5 ;\
129	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
130	addps	%xmm5, %xmm14 ;\
131	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
132	addps	%xmm2, %xmm15 ;\
133	movaps	-12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
134
135#define KERNEL3(xx) \
136	mulps	%xmm4, %xmm7 ;\
137	addps	%xmm7, %xmm8 ;\
138	movaps	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
139	mulps	%xmm4, %xmm3 ;\
140	addps	%xmm3, %xmm9 ;\
141	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
142	mulps	%xmm4, %xmm5 ;\
143	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
144	addps	%xmm5, %xmm10 ;\
145	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
146	addps	%xmm4, %xmm11 ;\
147	movaps	 -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
148
149#define KERNEL4(xx) \
150	mulps	%xmm6, %xmm7 ;\
151	addps	%xmm7, %xmm12 ;\
152	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
153	mulps	%xmm6, %xmm3 ;\
154	addps	%xmm3, %xmm13 ;\
155	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
156	mulps	%xmm6, %xmm5 ;\
157	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
158	addps	%xmm5, %xmm14 ;\
159	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
160 	PREFETCH	(PREFETCHSIZE     + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
161	addps	%xmm6, %xmm15 ;\
162	movaps	 -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
163
164#define KERNEL5(xx) \
165	mulps	%xmm0, %xmm1 ;\
166	addps	%xmm1, %xmm8 ;\
167	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
168	mulps	%xmm0, %xmm3 ;\
169	addps	%xmm3, %xmm9 ;\
170	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
171	mulps	%xmm0, %xmm5 ;\
172	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
173	addps	%xmm5, %xmm10 ;\
174	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
175	addps	%xmm0, %xmm11 ;\
176	movaps	  0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0
177
178#define KERNEL6(xx) \
179	mulps	%xmm2, %xmm1 ;\
180	addps	%xmm1, %xmm12 ;\
181	movaps	 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
182	mulps	%xmm2, %xmm3 ;\
183	addps	%xmm3, %xmm13 ;\
184	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
185	mulps	%xmm2, %xmm5 ;\
186	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
187	addps	%xmm5, %xmm14 ;\
188	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
189	addps	%xmm2, %xmm15 ;\
190	movaps	  4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2
191
192#define KERNEL7(xx) \
193	mulps	%xmm4, %xmm7 ;\
194	addps	%xmm7, %xmm8 ;\
195	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
196	mulps	%xmm4, %xmm3 ;\
197	addps	%xmm3, %xmm9 ;\
198	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
199	mulps	%xmm4, %xmm5 ;\
200	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
201	addps	%xmm5, %xmm10 ;\
202	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
203	addps	%xmm4, %xmm11 ;\
204	movaps	  8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4
205
206#define KERNEL8(xx) \
207	mulps	%xmm6, %xmm7 ;\
208	addps	%xmm7, %xmm12 ;\
209	movaps	 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
210	mulps	%xmm6, %xmm3 ;\
211	addps	%xmm3, %xmm13 ;\
212	movaps	 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
213	mulps	%xmm6, %xmm5 ;\
214	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
215	addps	%xmm5, %xmm14 ;\
216	movaps	 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
217	addps	%xmm6, %xmm15 ;\
218	movaps	 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6
219
220#else
221#define KERNEL1(xx) \
222	mulps	%xmm0, %xmm1 ;\
223	addps	%xmm1, %xmm8 ;\
224	movaps	-32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
225	mulps	%xmm0, %xmm3 ;\
226	addps	%xmm3, %xmm9 ;\
227	movaps	-28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
228	mulps	%xmm0, %xmm5 ;\
229	PREFETCH	(PREFETCHSIZE     +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
230	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
231	addps	%xmm5, %xmm10 ;\
232	movaps	-24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
233	addps	%xmm0, %xmm11 ;\
234	movaps	-16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
235
236#define KERNEL2(xx) \
237	mulps	%xmm2, %xmm1 ;\
238	addps	%xmm1, %xmm12 ;\
239	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
240	mulps	%xmm2, %xmm3 ;\
241	addps	%xmm3, %xmm13 ;\
242	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
243	mulps	%xmm2, %xmm5 ;\
244	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
245	addps	%xmm5, %xmm14 ;\
246	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
247	addps	%xmm2, %xmm15 ;\
248	movaps	-12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2 ;\
249
250#define KERNEL3(xx) \
251	mulps	%xmm4, %xmm7 ;\
252	addps	%xmm7, %xmm8 ;\
253	movaps	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
254	mulps	%xmm4, %xmm3 ;\
255	addps	%xmm3, %xmm9 ;\
256	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
257	mulps	%xmm4, %xmm5 ;\
258	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
259	addps	%xmm5, %xmm10 ;\
260	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
261	addps	%xmm4, %xmm11 ;\
262	movaps	 -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
263
264#define KERNEL4(xx) \
265	mulps	%xmm6, %xmm7 ;\
266	addps	%xmm7, %xmm12 ;\
267	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
268	mulps	%xmm6, %xmm3 ;\
269	addps	%xmm3, %xmm13 ;\
270	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
271	mulps	%xmm6, %xmm5 ;\
272	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
273	addps	%xmm5, %xmm14 ;\
274	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
275	addps	%xmm6, %xmm15 ;\
276	movaps	 -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
277
278#define KERNEL5(xx) \
279	mulps	%xmm0, %xmm1 ;\
280 	PREFETCH	(PREFETCHSIZE     + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\
281	addps	%xmm1, %xmm8 ;\
282	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
283	mulps	%xmm0, %xmm3 ;\
284	addps	%xmm3, %xmm9 ;\
285	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
286	mulps	%xmm0, %xmm5 ;\
287	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
288	addps	%xmm5, %xmm10 ;\
289	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
290	addps	%xmm0, %xmm11 ;\
291	movaps	  0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0
292
293#define KERNEL6(xx) \
294	mulps	%xmm2, %xmm1 ;\
295	addps	%xmm1, %xmm12 ;\
296	movaps	 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
297	mulps	%xmm2, %xmm3 ;\
298	addps	%xmm3, %xmm13 ;\
299	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
300	mulps	%xmm2, %xmm5 ;\
301	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
302	addps	%xmm5, %xmm14 ;\
303	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
304	addps	%xmm2, %xmm15 ;\
305	movaps	  4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2
306
307#define KERNEL7(xx) \
308	mulps	%xmm4, %xmm7 ;\
309	addps	%xmm7, %xmm8 ;\
310	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
311	mulps	%xmm4, %xmm3 ;\
312	addps	%xmm3, %xmm9 ;\
313	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
314	mulps	%xmm4, %xmm5 ;\
315	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
316	addps	%xmm5, %xmm10 ;\
317	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
318	addps	%xmm4, %xmm11 ;\
319	movaps	  8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4
320
321#define KERNEL8(xx) \
322	mulps	%xmm6, %xmm7 ;\
323	addps	%xmm7, %xmm12 ;\
324	movaps	 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
325	mulps	%xmm6, %xmm3 ;\
326	addps	%xmm3, %xmm13 ;\
327	movaps	 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
328	mulps	%xmm6, %xmm5 ;\
329	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
330	addps	%xmm5, %xmm14 ;\
331	movaps	 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
332	addps	%xmm6, %xmm15 ;\
333	movaps	 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
334
335#endif
336
337	PROLOGUE
338	PROFCODE
339
340	subq	$STACKSIZE, %rsp
341
342	movq	%rbx,  0(%rsp)
343	movq	%rbp,  8(%rsp)
344	movq	%r12, 16(%rsp)
345	movq	%r13, 24(%rsp)
346	movq	%r14, 32(%rsp)
347	movq	%r15, 40(%rsp)
348
349#ifdef WINDOWS_ABI
350	movq	%rdi,    48(%rsp)
351	movq	%rsi,    56(%rsp)
352	movups	%xmm6,   64(%rsp)
353	movups	%xmm7,   80(%rsp)
354	movups	%xmm8,   96(%rsp)
355	movups	%xmm9,  112(%rsp)
356	movups	%xmm10, 128(%rsp)
357	movups	%xmm11, 144(%rsp)
358	movups	%xmm12, 160(%rsp)
359	movups	%xmm13, 176(%rsp)
360	movups	%xmm14, 192(%rsp)
361	movups	%xmm15, 208(%rsp)
362
363	movq	ARG1,      OLD_M
364	movq	ARG2,      OLD_N
365	movq	ARG3,      K
366	movq	OLD_A,     A
367	movq	OLD_B,     B
368	movq	OLD_C,     C
369	movq	OLD_LDC,   LDC
370#ifdef TRMMKERNEL
371	movsd	OLD_OFFSET, %xmm4
372#endif
373	movaps	%xmm3, %xmm0
374
375#else
376	movq	OLD_LDC,   LDC
377#ifdef TRMMKERNEL
378	movsd	OLD_OFFSET, %xmm4
379#endif
380
381#endif
382
383	EMMS
384
385	movq	%rsp, %rbx	# save old stack
386	subq	$256 + LOCAL_BUFFER_SIZE, %rsp
387	andq	$-4096, %rsp	# align stack
388
389	STACK_TOUCHING
390
391	movq	OLD_M, M
392	movq	OLD_N, N
393
394	shufps	$0, %xmm0, %xmm0
395	movaps	%xmm0, ALPHA
396
397#ifdef TRMMKERNEL
398	movsd	%xmm4, OFFSET
399	movsd	%xmm4, KK
400#ifndef LEFT
401	negq	KK
402#endif
403#endif
404
405	subq	$-32 * SIZE, A
406
407	leaq	(, LDC, SIZE), LDC
408
409	movq	N,  J
410	sarq	$2, J		# j = (n >> 2)
411	jle	.L50
412
413.L01:
414#if defined(TRMMKERNEL) && defined(LEFT)
415	movq	OFFSET, %rax
416	movq	%rax, KK
417#endif
418
419/* Copying to Sub Buffer */
420	leaq	BUFFER, BO
421
422	movd	 0 * SIZE(B), %mm0
423
424	movq	K, %rax
425	sarq	$2, %rax
426	jle	.L03
427
428	addq	%rax, %rax
429	ALIGN_4
430
431.L02:
432	PREFETCH	 (RPREFETCHSIZE +  0)  * SIZE(B)
433
434	movd	 1 * SIZE(B), %mm1
435	movd	 2 * SIZE(B), %mm2
436	movd	 3 * SIZE(B), %mm3
437	movd	 4 * SIZE(B), %mm4
438	movd	 5 * SIZE(B), %mm5
439	movd	 6 * SIZE(B), %mm6
440	movd	 7 * SIZE(B), %mm7
441
442	PREFETCHW	 (WPREFETCHSIZE +  0) * SIZE(BO)
443
444	punpckldq %mm0, %mm0
445	movq	%mm0,  0 * SIZE(BO)
446	movq	%mm0,  2 * SIZE(BO)
447	punpckldq %mm1, %mm1
448	movd	 8 * SIZE(B), %mm0
449	movq	%mm1,  4 * SIZE(BO)
450	movq	%mm1,  6 * SIZE(BO)
451	punpckldq %mm2, %mm2
452	movq	%mm2,  8 * SIZE(BO)
453	movq	%mm2, 10 * SIZE(BO)
454	punpckldq %mm3, %mm3
455	movq	%mm3, 12 * SIZE(BO)
456	movq	%mm3, 14 * SIZE(BO)
457
458	PREFETCHW	 (WPREFETCHSIZE + 16) * SIZE(BO)
459
460	punpckldq %mm4, %mm4
461	movq	%mm4, 16 * SIZE(BO)
462	movq	%mm4, 18 * SIZE(BO)
463	punpckldq %mm5, %mm5
464	movq	%mm5, 20 * SIZE(BO)
465	movq	%mm5, 22 * SIZE(BO)
466	punpckldq %mm6, %mm6
467	movq	%mm6, 24 * SIZE(BO)
468	movq	%mm6, 26 * SIZE(BO)
469	punpckldq %mm7, %mm7
470	movq	%mm7, 28 * SIZE(BO)
471	movq	%mm7, 30 * SIZE(BO)
472
473
474	addq	$ 8 * SIZE, B
475	addq	$32 * SIZE, BO
476
477	decq	%rax
478	jne	.L02
479	ALIGN_4
480
481.L03:
482	movq	K, %rax
483	andq	$3, %rax
484	BRANCH
485	jle	.L10
486	ALIGN_4
487
488.L04:
489	movd	 0 * SIZE(B), %mm0
490	movd	 1 * SIZE(B), %mm1
491	movd	 2 * SIZE(B), %mm2
492	movd	 3 * SIZE(B), %mm3
493
494	punpckldq %mm0, %mm0
495	punpckldq %mm1, %mm1
496	punpckldq %mm2, %mm2
497	punpckldq %mm3, %mm3
498
499	movq	%mm0,  0 * SIZE(BO)
500	movq	%mm0,  2 * SIZE(BO)
501	movq	%mm1,  4 * SIZE(BO)
502	movq	%mm1,  6 * SIZE(BO)
503	movq	%mm2,  8 * SIZE(BO)
504	movq	%mm2, 10 * SIZE(BO)
505	movq	%mm3, 12 * SIZE(BO)
506	movq	%mm3, 14 * SIZE(BO)
507
508	addq	$ 4 * SIZE, B
509	addq	$16 * SIZE, BO
510	decq	%rax
511	jne	.L04
512	ALIGN_4
513
514.L10:
515	movq	C, CO1			# coffset1 = c
516	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
517	movq	A, AO		# aoffset = a
518
519	leaq	 (RPREFETCHSIZE +  0)  * SIZE(B), BB
520
521	movq	M,  I
522	sarq	$3, I	# i = (m >> 3)
523	jle	.L20
524	ALIGN_4
525
526.L11:
527#if !defined(TRMMKERNEL) || \
528	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
529	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
530
531	leaq	32 * SIZE + BUFFER, BO
532#else
533	leaq	32 * SIZE + BUFFER, BO
534	movq	KK, %rax
535	leaq	(, %rax,   8), %rax
536	leaq	(AO, %rax, 4), AO
537	leaq	(BO, %rax, 8), BO
538#endif
539
540	movaps	-32 * SIZE(AO), %xmm0
541	movaps	-32 * SIZE(BO), %xmm1
542	xorps	%xmm8, %xmm8
543	movaps	-28 * SIZE(AO), %xmm2
544	movaps	-28 * SIZE(BO), %xmm3
545	xorps	%xmm9, %xmm9
546	movaps	-24 * SIZE(AO), %xmm4
547	movaps	-24 * SIZE(BO), %xmm5
548	xorps	%xmm10, %xmm10
549	movaps	-20 * SIZE(AO), %xmm6
550	movaps	-16 * SIZE(BO), %xmm7
551	xorps	%xmm11, %xmm11
552
553	PREFETCHW      7 * SIZE(CO1)
554	xorps	%xmm12, %xmm12
555	PREFETCHW     15 * SIZE(CO2)
556	xorps	%xmm13, %xmm13
557	PREFETCHW      7 * SIZE(CO1, LDC, 2)
558	xorps	%xmm14, %xmm14
559	PREFETCHW     15 * SIZE(CO2, LDC, 2)
560	xorps	%xmm15, %xmm15
561	PREFETCH	 -32  * SIZE(BB)
562
563#ifndef TRMMKERNEL
564	movq	K, %rax
565#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
566	movq	K, %rax
567	subq	KK, %rax
568	movq	%rax, KKK
569#else
570	movq	KK, %rax
571#ifdef LEFT
572	addq	$8, %rax
573#else
574	addq	$4, %rax
575#endif
576	movq	%rax, KKK
577#endif
578#ifndef GENERIC
579	andq	$-8, %rax
580
581	leaq	(, %rax, 8), %rax
582	leaq	(AO, %rax, 4), AO
583	leaq	(BO, %rax, 8), BO
584	negq	%rax
585	NOBRANCH
586	je	.L15
587	ALIGN_3
588
589.L12:
590	KERNEL1(16 *  0)
591	KERNEL2(16 *  0)
592	KERNEL3(16 *  0)
593	KERNEL4(16 *  0)
594	KERNEL5(16 *  0)
595	KERNEL6(16 *  0)
596	KERNEL7(16 *  0)
597	KERNEL8(16 *  0)
598
599	KERNEL1(16 *  2)
600	KERNEL2(16 *  2)
601	KERNEL3(16 *  2)
602	KERNEL4(16 *  2)
603	KERNEL5(16 *  2)
604	KERNEL6(16 *  2)
605	KERNEL7(16 *  2)
606	KERNEL8(16 *  2)
607
608	addq	$16 * SIZE, %rax
609	NOBRANCH
610	je	.L15
611	KERNEL1(16 *  0)
612	KERNEL2(16 *  0)
613	KERNEL3(16 *  0)
614	KERNEL4(16 *  0)
615	KERNEL5(16 *  0)
616	KERNEL6(16 *  0)
617	KERNEL7(16 *  0)
618	KERNEL8(16 *  0)
619
620	KERNEL1(16 *  2)
621	KERNEL2(16 *  2)
622	KERNEL3(16 *  2)
623	KERNEL4(16 *  2)
624	KERNEL5(16 *  2)
625	KERNEL6(16 *  2)
626	KERNEL7(16 *  2)
627	KERNEL8(16 *  2)
628
629	addq	$16 * SIZE, %rax
630	NOBRANCH
631	je	.L15
632	KERNEL1(16 *  0)
633	KERNEL2(16 *  0)
634	KERNEL3(16 *  0)
635	KERNEL4(16 *  0)
636	KERNEL5(16 *  0)
637	KERNEL6(16 *  0)
638	KERNEL7(16 *  0)
639	KERNEL8(16 *  0)
640
641	KERNEL1(16 *  2)
642	KERNEL2(16 *  2)
643	KERNEL3(16 *  2)
644	KERNEL4(16 *  2)
645	KERNEL5(16 *  2)
646	KERNEL6(16 *  2)
647	KERNEL7(16 *  2)
648	KERNEL8(16 *  2)
649
650	addq	$16 * SIZE, %rax
651	NOBRANCH
652	je	.L15
653	KERNEL1(16 *  0)
654	KERNEL2(16 *  0)
655	KERNEL3(16 *  0)
656	KERNEL4(16 *  0)
657	KERNEL5(16 *  0)
658	KERNEL6(16 *  0)
659	KERNEL7(16 *  0)
660	KERNEL8(16 *  0)
661
662	KERNEL1(16 *  2)
663	KERNEL2(16 *  2)
664	KERNEL3(16 *  2)
665	KERNEL4(16 *  2)
666	KERNEL5(16 *  2)
667	KERNEL6(16 *  2)
668	KERNEL7(16 *  2)
669	KERNEL8(16 *  2)
670
671	addq	$16 * SIZE, %rax
672	NOBRANCH
673	je	.L15
674	KERNEL1(16 *  0)
675	KERNEL2(16 *  0)
676	KERNEL3(16 *  0)
677	KERNEL4(16 *  0)
678	KERNEL5(16 *  0)
679	KERNEL6(16 *  0)
680	KERNEL7(16 *  0)
681	KERNEL8(16 *  0)
682
683	KERNEL1(16 *  2)
684	KERNEL2(16 *  2)
685	KERNEL3(16 *  2)
686	KERNEL4(16 *  2)
687	KERNEL5(16 *  2)
688	KERNEL6(16 *  2)
689	KERNEL7(16 *  2)
690	KERNEL8(16 *  2)
691
692	addq	$16 * SIZE, %rax
693	NOBRANCH
694	je	.L15
695	KERNEL1(16 *  0)
696	KERNEL2(16 *  0)
697	KERNEL3(16 *  0)
698	KERNEL4(16 *  0)
699	KERNEL5(16 *  0)
700	KERNEL6(16 *  0)
701	KERNEL7(16 *  0)
702	KERNEL8(16 *  0)
703
704	KERNEL1(16 *  2)
705	KERNEL2(16 *  2)
706	KERNEL3(16 *  2)
707	KERNEL4(16 *  2)
708	KERNEL5(16 *  2)
709	KERNEL6(16 *  2)
710	KERNEL7(16 *  2)
711	KERNEL8(16 *  2)
712
713	addq	$16 * SIZE, %rax
714	NOBRANCH
715	je	.L15
716	KERNEL1(16 *  0)
717	KERNEL2(16 *  0)
718	KERNEL3(16 *  0)
719	KERNEL4(16 *  0)
720	KERNEL5(16 *  0)
721	KERNEL6(16 *  0)
722	KERNEL7(16 *  0)
723	KERNEL8(16 *  0)
724
725	KERNEL1(16 *  2)
726	KERNEL2(16 *  2)
727	KERNEL3(16 *  2)
728	KERNEL4(16 *  2)
729	KERNEL5(16 *  2)
730	KERNEL6(16 *  2)
731	KERNEL7(16 *  2)
732	KERNEL8(16 *  2)
733
734	addq	$16 * SIZE, %rax
735	NOBRANCH
736	je	.L15
737	KERNEL1(16 *  0)
738	KERNEL2(16 *  0)
739	KERNEL3(16 *  0)
740	KERNEL4(16 *  0)
741	KERNEL5(16 *  0)
742	KERNEL6(16 *  0)
743	KERNEL7(16 *  0)
744	KERNEL8(16 *  0)
745
746	KERNEL1(16 *  2)
747	KERNEL2(16 *  2)
748	KERNEL3(16 *  2)
749	KERNEL4(16 *  2)
750	KERNEL5(16 *  2)
751	KERNEL6(16 *  2)
752	KERNEL7(16 *  2)
753	KERNEL8(16 *  2)
754
755	addq	$16 * SIZE, %rax
756	BRANCH
757	jl	.L12
758	ALIGN_3
759
760.L15:
761	PREFETCH	  -16 * SIZE(BB)
762	subq		 $-16 * SIZE, BB
763
764#ifndef TRMMKERNEL
765	movq	K, %rax
766#else
767	movq	KKK, %rax
768#endif
769	testq	$4, %rax
770	je .L16
771	xorq	%rax, %rax
772	ALIGN_3
773
774	KERNEL1(16 *  0)
775	KERNEL2(16 *  0)
776	KERNEL3(16 *  0)
777	KERNEL4(16 *  0)
778	KERNEL5(16 *  0)
779	KERNEL6(16 *  0)
780	KERNEL7(16 *  0)
781	KERNEL8(16 *  0)
782
783	addq	$64 * SIZE, BO
784	addq	$32 * SIZE, AO
785	ALIGN_3
786#else
787	sarq	$2, %rax
788	NOBRANCH
789	jle	.L16
790	ALIGN_3
791
792.L12:
793	KERNEL1(16 *  0)
794	KERNEL2(16 *  0)
795	KERNEL3(16 *  0)
796	KERNEL4(16 *  0)
797	KERNEL5(16 *  0)
798	KERNEL6(16 *  0)
799	KERNEL7(16 *  0)
800	KERNEL8(16 *  0)
801
802	addq	$ 64 * SIZE, BO
803	subq	$-32 * SIZE, AO
804	decq	%rax
805	BRANCH
806	jg	.L12
807#endif
808
809.L16:
810	movaps	ALPHA, %xmm7
811
812#ifndef TRMMKERNEL
813	movq	K, %rax
814#else
815	movq	KKK, %rax
816#endif
817	andq	$3, %rax		# if (k & 1)
818	je .L18
819
820	leaq	(, %rax, 8), %rax
821	leaq	(AO, %rax, 4), AO
822	leaq	(BO, %rax, 8), BO
823	negq	%rax
824	ALIGN_4
825
826.L17:
827	mulps	%xmm0, %xmm1
828	addps	%xmm1, %xmm8
829	movaps	-28 * SIZE(BO, %rax, 8), %xmm1
830	mulps	%xmm0, %xmm1
831	addps	%xmm1, %xmm9
832	movaps	-24 * SIZE(BO, %rax, 8), %xmm1
833	mulps	%xmm0, %xmm1
834	mulps	-20 * SIZE(BO, %rax, 8), %xmm0
835	addps	%xmm1, %xmm10
836	movaps	-32 * SIZE(BO, %rax, 8), %xmm1
837	addps	%xmm0, %xmm11
838	movaps	-24 * SIZE(AO, %rax, 4), %xmm0
839	mulps	%xmm2, %xmm1
840	addps	%xmm1, %xmm12
841	movaps	-28 * SIZE(BO, %rax, 8), %xmm1
842	mulps	%xmm2, %xmm1
843	addps	%xmm1, %xmm13
844	movaps	-24 * SIZE(BO, %rax, 8), %xmm1
845	mulps	%xmm2, %xmm1
846	mulps	-20 * SIZE(BO, %rax, 8), %xmm2
847	addps	%xmm1, %xmm14
848	movaps	-16 * SIZE(BO, %rax, 8), %xmm1
849	addps	%xmm2, %xmm15
850	movaps	-20 * SIZE(AO, %rax, 4), %xmm2
851
852	addq	$SIZE * 2, %rax
853	jl	.L17
854	ALIGN_4
855
856.L18:
857#ifndef TRMMKERNEL
858	movsd	0 * SIZE(CO1), %xmm0
859	movhps	2 * SIZE(CO1), %xmm0
860	movsd	4 * SIZE(CO1), %xmm1
861	movhps	6 * SIZE(CO1), %xmm1
862
863	movsd	0 * SIZE(CO2), %xmm2
864	movhps	2 * SIZE(CO2), %xmm2
865	movsd	4 * SIZE(CO2), %xmm3
866	movhps	6 * SIZE(CO2), %xmm3
867#endif
868
869	mulps	%xmm7, %xmm8
870	mulps	%xmm7, %xmm9
871	mulps	%xmm7, %xmm10
872	mulps	%xmm7, %xmm11
873
874	mulps	%xmm7, %xmm12
875	mulps	%xmm7, %xmm13
876	mulps	%xmm7, %xmm14
877	mulps	%xmm7, %xmm15
878
879#ifndef TRMMKERNEL
880	movsd	0 * SIZE(CO1, LDC, 2), %xmm4
881	movhps	2 * SIZE(CO1, LDC, 2), %xmm4
882	movsd	4 * SIZE(CO1, LDC, 2), %xmm5
883	movhps	6 * SIZE(CO1, LDC, 2), %xmm5
884
885	movsd	0 * SIZE(CO2, LDC, 2), %xmm6
886	movhps	2 * SIZE(CO2, LDC, 2), %xmm6
887	movsd	4 * SIZE(CO2, LDC, 2), %xmm7
888	movhps	6 * SIZE(CO2, LDC, 2), %xmm7
889
890	addps	%xmm0, %xmm8
891	addps	%xmm1, %xmm12
892	addps	%xmm2, %xmm9
893	addps	%xmm3, %xmm13
894#endif
895
896	movlps	%xmm8,  0 * SIZE(CO1)
897	movhps	%xmm8,  2 * SIZE(CO1)
898	movlps	%xmm12, 4 * SIZE(CO1)
899	movhps	%xmm12, 6 * SIZE(CO1)
900
901	movlps	%xmm9,  0 * SIZE(CO2)
902	movhps	%xmm9,  2 * SIZE(CO2)
903	movlps	%xmm13, 4 * SIZE(CO2)
904	movhps	%xmm13, 6 * SIZE(CO2)
905
906#ifndef TRMMKERNEL
907	addps	%xmm4, %xmm10
908	addps	%xmm5, %xmm14
909	addps	%xmm6, %xmm11
910	addps	%xmm7, %xmm15
911#endif
912
913	movlps	%xmm10, 0 * SIZE(CO1, LDC, 2)
914	movhps	%xmm10, 2 * SIZE(CO1, LDC, 2)
915	movlps	%xmm14, 4 * SIZE(CO1, LDC, 2)
916	movhps	%xmm14, 6 * SIZE(CO1, LDC, 2)
917
918	movlps	%xmm11, 0 * SIZE(CO2, LDC, 2)
919	movhps	%xmm11, 2 * SIZE(CO2, LDC, 2)
920	movlps	%xmm15, 4 * SIZE(CO2, LDC, 2)
921	movhps	%xmm15, 6 * SIZE(CO2, LDC, 2)
922
923#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
924    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
925	movq	K, %rax
926	subq	KKK, %rax
927	leaq	(,%rax,    8), %rax
928	leaq	(AO, %rax, 4), AO
929	leaq	(BO, %rax, 8), BO
930#endif
931
932#if defined(TRMMKERNEL) && defined(LEFT)
933	addq	$8, KK
934#endif
935
936	addq	$8 * SIZE, CO1		# coffset += 4
937	addq	$8 * SIZE, CO2		# coffset += 4
938	decq	I			# i --
939	jg	.L11
940	ALIGN_4
941
942.L20:
943	testq	$4, M
944	je	.L30
945
946#if !defined(TRMMKERNEL) || \
947	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
948	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
949
950	leaq	BUFFER, BO
951#else
952	leaq	BUFFER, BO
953	movq	KK, %rax
954	leaq	(, %rax,   8), %rax
955	leaq	(AO, %rax, 2), AO
956	leaq	(BO, %rax, 8), BO
957#endif
958
959	movaps	-32 * SIZE(AO), %xmm8
960	movaps	-16 * SIZE(AO), %xmm10
961
962	movaps	  0 * SIZE(BO), %xmm9
963	movaps	 16 * SIZE(BO), %xmm11
964	movaps	 32 * SIZE(BO), %xmm13
965	movaps	 48 * SIZE(BO), %xmm15
966
967	xorps	%xmm0, %xmm0
968	xorps	%xmm1, %xmm1
969	xorps	%xmm2, %xmm2
970	xorps	%xmm3, %xmm3
971
972#ifndef TRMMKERNEL
973	movq	K, %rax
974#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
975	movq	K, %rax
976	subq	KK, %rax
977	movq	%rax, KKK
978#else
979	movq	KK, %rax
980#ifdef LEFT
981	addq	$4, %rax
982#else
983	addq	$4, %rax
984#endif
985	movq	%rax, KKK
986#endif
987	sarq	$3, %rax
988	je	.L25
989	ALIGN_4
990
991.L22:
992	mulps	%xmm8, %xmm9
993	addps	%xmm9, %xmm0
994#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
995	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
996#endif
997	movaps	 4 * SIZE(BO), %xmm9
998	mulps	%xmm8, %xmm9
999	addps	%xmm9, %xmm1
1000	movaps	 8 * SIZE(BO), %xmm9
1001	mulps	%xmm8, %xmm9
1002	mulps	12 * SIZE(BO), %xmm8
1003	addps	%xmm9, %xmm2
1004	movaps	64 * SIZE(BO), %xmm9
1005	addps	%xmm8, %xmm3
1006	movaps	-28 * SIZE(AO), %xmm8
1007
1008	mulps	%xmm8, %xmm11
1009	addps	%xmm11, %xmm0
1010	movaps	20 * SIZE(BO), %xmm11
1011	mulps	%xmm8, %xmm11
1012	addps	%xmm11, %xmm1
1013	movaps	24 * SIZE(BO), %xmm11
1014	mulps	%xmm8, %xmm11
1015	mulps	28 * SIZE(BO), %xmm8
1016	addps	%xmm11, %xmm2
1017	movaps	80 * SIZE(BO), %xmm11
1018	addps	%xmm8, %xmm3
1019	movaps	-24 * SIZE(AO), %xmm8
1020
1021	mulps	%xmm8, %xmm13
1022	addps	%xmm13, %xmm0
1023	movaps	36 * SIZE(BO), %xmm13
1024	mulps	%xmm8, %xmm13
1025	addps	%xmm13, %xmm1
1026	movaps	40 * SIZE(BO), %xmm13
1027	mulps	%xmm8, %xmm13
1028	mulps	44 * SIZE(BO), %xmm8
1029	addps	%xmm13, %xmm2
1030	movaps	96 * SIZE(BO), %xmm13
1031	addps	%xmm8, %xmm3
1032	movaps	-20 * SIZE(AO), %xmm8
1033
1034	mulps	%xmm8, %xmm15
1035	addps	%xmm15, %xmm0
1036	movaps	52 * SIZE(BO), %xmm15
1037	mulps	%xmm8, %xmm15
1038	addps	%xmm15, %xmm1
1039	movaps	56 * SIZE(BO), %xmm15
1040	mulps	%xmm8, %xmm15
1041	mulps	60 * SIZE(BO), %xmm8
1042	addps	%xmm15, %xmm2
1043	movaps	112 * SIZE(BO), %xmm15
1044	addps	%xmm8, %xmm3
1045	movaps	 0 * SIZE(AO), %xmm8
1046
1047#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1048	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
1049#endif
1050	mulps	%xmm10, %xmm9
1051	addps	%xmm9, %xmm0
1052	movaps	68 * SIZE(BO), %xmm9
1053	mulps	%xmm10, %xmm9
1054	addps	%xmm9, %xmm1
1055	movaps	72 * SIZE(BO), %xmm9
1056	mulps	%xmm10, %xmm9
1057	mulps	76 * SIZE(BO), %xmm10
1058	addps	%xmm9, %xmm2
1059	movaps	128 * SIZE(BO), %xmm9
1060	addps	%xmm10, %xmm3
1061	movaps	-12 * SIZE(AO), %xmm10
1062
1063	mulps	%xmm10, %xmm11
1064	addps	%xmm11, %xmm0
1065	movaps	84 * SIZE(BO), %xmm11
1066	mulps	%xmm10, %xmm11
1067	addps	%xmm11, %xmm1
1068	movaps	88 * SIZE(BO), %xmm11
1069	mulps	%xmm10, %xmm11
1070	mulps	92 * SIZE(BO), %xmm10
1071	addps	%xmm11, %xmm2
1072	movaps	144 * SIZE(BO), %xmm11
1073	addps	%xmm10, %xmm3
1074	movaps	-8 * SIZE(AO), %xmm10
1075
1076	mulps	%xmm10, %xmm13
1077	addps	%xmm13, %xmm0
1078	movaps	100 * SIZE(BO), %xmm13
1079	mulps	%xmm10, %xmm13
1080	addps	%xmm13, %xmm1
1081	movaps	104 * SIZE(BO), %xmm13
1082	mulps	%xmm10, %xmm13
1083	mulps	108 * SIZE(BO), %xmm10
1084	addps	%xmm13, %xmm2
1085	movaps	160 * SIZE(BO), %xmm13
1086	addps	%xmm10, %xmm3
1087	movaps	-4 * SIZE(AO), %xmm10
1088
1089	mulps	%xmm10, %xmm15
1090	addps	%xmm15, %xmm0
1091	movaps	116 * SIZE(BO), %xmm15
1092	mulps	%xmm10, %xmm15
1093	addps	%xmm15, %xmm1
1094	movaps	120 * SIZE(BO), %xmm15
1095	mulps	%xmm10, %xmm15
1096	mulps	124 * SIZE(BO), %xmm10
1097	addps	%xmm15, %xmm2
1098	movaps	176 * SIZE(BO), %xmm15
1099	addps	%xmm10, %xmm3
1100	movaps	16 * SIZE(AO), %xmm10
1101
1102	addq   $ 32 * SIZE, AO
1103	addq   $128 * SIZE, BO
1104	decq   %rax
1105	jne    .L22
1106	ALIGN_4
1107
1108.L25:
1109#ifndef TRMMKERNEL
1110	movq	K, %rax
1111#else
1112	movq	KKK, %rax
1113#endif
1114	movaps	ALPHA, %xmm15
1115	andq	$7, %rax		# if (k & 1)
1116	BRANCH
1117	je .L28
1118	ALIGN_4
1119
1120.L26:
1121	mulps	%xmm8, %xmm9
1122	addps	%xmm9, %xmm0
1123	movaps	 4 * SIZE(BO), %xmm9
1124	mulps	%xmm8, %xmm9
1125	addps	%xmm9, %xmm1
1126	movaps	 8 * SIZE(BO), %xmm9
1127	mulps	%xmm8, %xmm9
1128	mulps	12 * SIZE(BO), %xmm8
1129	addps	%xmm9, %xmm2
1130	movaps	16 * SIZE(BO), %xmm9
1131	addps	%xmm8, %xmm3
1132	movaps	-28 * SIZE(AO), %xmm8
1133
1134	addq	$ 4 * SIZE, AO		# aoffset  += 4
1135	addq	$16 * SIZE, BO		# boffset1 += 8
1136	decq	%rax
1137	jg	.L26
1138	ALIGN_4
1139
1140.L28:
1141	mulps	%xmm15, %xmm0
1142	mulps	%xmm15, %xmm1
1143 	mulps	%xmm15, %xmm2
1144	mulps	%xmm15, %xmm3
1145
1146#ifndef TRMMKERNEL
1147	movsd	0 * SIZE(CO1), %xmm8
1148	movhps	2 * SIZE(CO1), %xmm8
1149	movsd	0 * SIZE(CO2), %xmm10
1150	movhps	2 * SIZE(CO2), %xmm10
1151
1152	movsd	0 * SIZE(CO1, LDC, 2), %xmm12
1153	movhps	2 * SIZE(CO1, LDC, 2), %xmm12
1154	movsd	0 * SIZE(CO2, LDC, 2), %xmm14
1155	movhps	2 * SIZE(CO2, LDC, 2), %xmm14
1156
1157	addps	%xmm8,  %xmm0
1158	addps	%xmm10, %xmm1
1159	addps	%xmm12, %xmm2
1160	addps	%xmm14, %xmm3
1161#endif
1162
1163	movlps	%xmm0, 0 * SIZE(CO1)
1164	movhps	%xmm0, 2 * SIZE(CO1)
1165	movlps	%xmm1, 0 * SIZE(CO2)
1166	movhps	%xmm1, 2 * SIZE(CO2)
1167
1168	movlps	%xmm2, 0 * SIZE(CO1, LDC, 2)
1169	movhps	%xmm2, 2 * SIZE(CO1, LDC, 2)
1170	movlps	%xmm3, 0 * SIZE(CO2, LDC, 2)
1171	movhps	%xmm3, 2 * SIZE(CO2, LDC, 2)
1172
1173#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1174    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1175	movq	K, %rax
1176	subq	KKK, %rax
1177	leaq	(,%rax,    8), %rax
1178	leaq	(AO, %rax, 2), AO
1179	leaq	(BO, %rax, 8), BO
1180#endif
1181
1182#if defined(TRMMKERNEL) && defined(LEFT)
1183	addq	$4, KK
1184#endif
1185
1186	addq	$4 * SIZE, CO1		# coffset += 4
1187	addq	$4 * SIZE, CO2		# coffset += 4
1188	ALIGN_4
1189
1190.L30:
1191	testq	$2, M
1192	je	.L40
1193
1194#if !defined(TRMMKERNEL) || \
1195	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1196	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1197
1198	leaq	BUFFER, BO
1199#else
1200	leaq	BUFFER, BO
1201	movq	KK, %rax
1202	leaq	(, %rax,   8), %rax
1203	leaq	(AO, %rax, 1), AO
1204	leaq	(BO, %rax, 8), BO
1205#endif
1206
1207	movaps	-32 * SIZE(AO), %xmm8
1208	movaps	-24 * SIZE(AO), %xmm10
1209
1210	movaps	 0 * SIZE(BO), %xmm9
1211	movaps	16 * SIZE(BO), %xmm11
1212	movaps	32 * SIZE(BO), %xmm13
1213	movaps	48 * SIZE(BO), %xmm15
1214
1215	xorps	%xmm0, %xmm0
1216	xorps	%xmm1, %xmm1
1217	xorps	%xmm2, %xmm2
1218	xorps	%xmm3, %xmm3
1219
1220#ifndef TRMMKERNEL
1221	movq	K, %rax
1222#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1223	movq	K, %rax
1224	subq	KK, %rax
1225	movq	%rax, KKK
1226#else
1227	movq	KK, %rax
1228#ifdef LEFT
1229	addq	$2, %rax
1230#else
1231	addq	$4, %rax
1232#endif
1233	movq	%rax, KKK
1234#endif
1235	sarq	$3, %rax
1236	je	.L35
1237	ALIGN_4
1238
1239.L32:
1240	mulps	%xmm8, %xmm9
1241	addps	%xmm9, %xmm0
1242#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1243	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
1244#endif
1245	movaps	 4 * SIZE(BO), %xmm9
1246	mulps	%xmm8, %xmm9
1247	addps	%xmm9, %xmm1
1248	movaps	 8 * SIZE(BO), %xmm9
1249	mulps	%xmm8, %xmm9
1250	addps	%xmm9, %xmm2
1251	movaps	12 * SIZE(BO), %xmm9
1252	mulps	%xmm8, %xmm9
1253	movsd	-30 * SIZE(AO), %xmm8
1254	addps	%xmm9, %xmm3
1255	movaps	64 * SIZE(BO), %xmm9
1256
1257	mulps	%xmm8, %xmm11
1258	addps	%xmm11, %xmm0
1259	movaps	20 * SIZE(BO), %xmm11
1260	mulps	%xmm8, %xmm11
1261	addps	%xmm11, %xmm1
1262	movaps	24 * SIZE(BO), %xmm11
1263	mulps	%xmm8, %xmm11
1264	addps	%xmm11, %xmm2
1265	movaps	28 * SIZE(BO), %xmm11
1266	mulps	%xmm8, %xmm11
1267	movsd	-28 * SIZE(AO), %xmm8
1268	addps	%xmm11, %xmm3
1269	movaps	80 * SIZE(BO), %xmm11
1270
1271	mulps	%xmm8, %xmm13
1272	addps	%xmm13, %xmm0
1273	movaps	36 * SIZE(BO), %xmm13
1274	mulps	%xmm8, %xmm13
1275	addps	%xmm13, %xmm1
1276	movaps	40 * SIZE(BO), %xmm13
1277	mulps	%xmm8, %xmm13
1278	addps	%xmm13, %xmm2
1279	movaps	44 * SIZE(BO), %xmm13
1280	mulps	%xmm8, %xmm13
1281	movsd	-26 * SIZE(AO), %xmm8
1282	addps	%xmm13, %xmm3
1283	movaps	96 * SIZE(BO), %xmm13
1284
1285	mulps	%xmm8, %xmm15
1286	addps	%xmm15, %xmm0
1287	movaps	52 * SIZE(BO), %xmm15
1288	mulps	%xmm8, %xmm15
1289	addps	%xmm15, %xmm1
1290	movaps	56 * SIZE(BO), %xmm15
1291	mulps	%xmm8, %xmm15
1292	addps	%xmm15, %xmm2
1293	movaps	60 * SIZE(BO), %xmm15
1294	mulps	%xmm8, %xmm15
1295	movsd	-16 * SIZE(AO), %xmm8
1296	addps	%xmm15, %xmm3
1297	movaps	112 * SIZE(BO), %xmm15
1298
1299	mulps	%xmm10, %xmm9
1300	addps	%xmm9, %xmm0
1301	movaps	68 * SIZE(BO), %xmm9
1302	mulps	%xmm10, %xmm9
1303	addps	%xmm9, %xmm1
1304	movaps	72 * SIZE(BO), %xmm9
1305	mulps	%xmm10, %xmm9
1306	addps	%xmm9, %xmm2
1307	movaps	76 * SIZE(BO), %xmm9
1308	mulps	%xmm10, %xmm9
1309	movsd	-22 * SIZE(AO), %xmm10
1310	addps	%xmm9, %xmm3
1311	movaps	128 * SIZE(BO), %xmm9
1312
1313	mulps	%xmm10, %xmm11
1314	addps	%xmm11, %xmm0
1315	movaps	84 * SIZE(BO), %xmm11
1316	mulps	%xmm10, %xmm11
1317	addps	%xmm11, %xmm1
1318	movaps	88 * SIZE(BO), %xmm11
1319	mulps	%xmm10, %xmm11
1320	addps	%xmm11, %xmm2
1321	movaps	92 * SIZE(BO), %xmm11
1322	mulps	%xmm10, %xmm11
1323	movsd	-20 * SIZE(AO), %xmm10
1324	addps	%xmm11, %xmm3
1325	movaps	144 * SIZE(BO), %xmm11
1326
1327	mulps	%xmm10, %xmm13
1328	addps	%xmm13, %xmm0
1329	movaps	100 * SIZE(BO), %xmm13
1330	mulps	%xmm10, %xmm13
1331	addps	%xmm13, %xmm1
1332	movaps	104 * SIZE(BO), %xmm13
1333	mulps	%xmm10, %xmm13
1334	addps	%xmm13, %xmm2
1335	movaps	108 * SIZE(BO), %xmm13
1336	mulps	%xmm10, %xmm13
1337	movsd	-18 * SIZE(AO), %xmm10
1338	addps	%xmm13, %xmm3
1339	movaps	160 * SIZE(BO), %xmm13
1340
1341	mulps	%xmm10, %xmm15
1342	addps	%xmm15, %xmm0
1343	movaps	116 * SIZE(BO), %xmm15
1344	mulps	%xmm10, %xmm15
1345	addps	%xmm15, %xmm1
1346	movaps	120 * SIZE(BO), %xmm15
1347	mulps	%xmm10, %xmm15
1348	addps	%xmm15, %xmm2
1349	movaps	124 * SIZE(BO), %xmm15
1350	mulps	%xmm10, %xmm15
1351	movsd	 -8 * SIZE(AO), %xmm10
1352	addps	%xmm15, %xmm3
1353	movaps	176 * SIZE(BO), %xmm15
1354
1355	addq   $ 16 * SIZE, AO
1356	addq   $128 * SIZE, BO
1357	decq   %rax
1358	jne    .L32
1359	ALIGN_4
1360
1361.L35:
1362#ifndef TRMMKERNEL
1363	movq	K, %rax
1364#else
1365	movq	KKK, %rax
1366#endif
1367	movaps	ALPHA, %xmm15
1368	andq	$7, %rax		# if (k & 1)
1369	BRANCH
1370	je .L38
1371	ALIGN_4
1372
1373.L36:
1374	mulps	%xmm8, %xmm9
1375	addps	%xmm9, %xmm0
1376	movaps	  4 * SIZE(BO), %xmm9
1377	mulps	%xmm8, %xmm9
1378	addps	%xmm9, %xmm1
1379	movaps	  8 * SIZE(BO), %xmm9
1380	mulps	%xmm8, %xmm9
1381	addps	%xmm9, %xmm2
1382	movaps	 12 * SIZE(BO), %xmm9
1383	mulps	%xmm8, %xmm9
1384	movsd	-30 * SIZE(AO), %xmm8
1385	addps	%xmm9, %xmm3
1386	movaps	 16 * SIZE(BO), %xmm9
1387
1388	addq	$ 2 * SIZE, AO		# aoffset  += 4
1389	addq	$16 * SIZE, BO		# boffset1 += 8
1390	decq	%rax
1391	jg	.L36
1392	ALIGN_4
1393
1394.L38:
1395	mulps	%xmm15, %xmm0
1396	mulps	%xmm15, %xmm1
1397 	mulps	%xmm15, %xmm2
1398	mulps	%xmm15, %xmm3
1399
1400#ifndef TRMMKERNEL
1401#ifdef movsd
1402	xorps	%xmm8,  %xmm8
1403#endif
1404	movsd	0 * SIZE(CO1), %xmm8
1405#ifdef movsd
1406	xorps	%xmm10,  %xmm10
1407#endif
1408	movsd	0 * SIZE(CO2), %xmm10
1409#ifdef movsd
1410	xorps	%xmm12,  %xmm12
1411#endif
1412	movsd	0 * SIZE(CO1, LDC, 2), %xmm12
1413#ifdef movsd
1414	xorps	%xmm14,  %xmm14
1415#endif
1416	movsd	0 * SIZE(CO2, LDC, 2), %xmm14
1417
1418	addps	%xmm8,  %xmm0
1419	addps	%xmm10, %xmm1
1420	addps	%xmm12, %xmm2
1421	addps	%xmm14, %xmm3
1422#endif
1423
1424	movlps	%xmm0, 0 * SIZE(CO1)
1425	movlps	%xmm1, 0 * SIZE(CO2)
1426	movlps	%xmm2, 0 * SIZE(CO1, LDC, 2)
1427	movlps	%xmm3, 0 * SIZE(CO2, LDC, 2)
1428
1429#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1430    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1431	movq	K, %rax
1432	subq	KKK, %rax
1433	leaq	(,%rax,    8), %rax
1434	leaq	(AO, %rax, 1), AO
1435	leaq	(BO, %rax, 8), BO
1436#endif
1437
1438#if defined(TRMMKERNEL) && defined(LEFT)
1439	addq	$2, KK
1440#endif
1441
1442	addq	$2 * SIZE, CO1		# coffset += 4
1443	addq	$2 * SIZE, CO2		# coffset += 4
1444	ALIGN_4
1445
1446.L40:
1447	testq	$1, M
1448	je	.L49
1449
1450#if !defined(TRMMKERNEL) || \
1451	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1452	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1453
1454	leaq	BUFFER, BO
1455#else
1456	leaq	BUFFER, BO
1457	movq	KK, %rax
1458	leaq	(, %rax,   4), %rax
1459	leaq	(AO, %rax, 1), AO
1460	leaq	(BO, %rax, 8), BO
1461	leaq	(BO, %rax, 8), BO
1462#endif
1463
1464	movss	-32 * SIZE(AO), %xmm8
1465	movss	-28 * SIZE(AO), %xmm10
1466
1467	movss	 0 * SIZE(BO), %xmm9
1468	movss	16 * SIZE(BO), %xmm11
1469	movss	32 * SIZE(BO), %xmm13
1470	movss	48 * SIZE(BO), %xmm15
1471
1472	xorps	%xmm0, %xmm0
1473	xorps	%xmm1, %xmm1
1474	xorps	%xmm2, %xmm2
1475	xorps	%xmm3, %xmm3
1476
1477#ifndef TRMMKERNEL
1478	movq	K, %rax
1479#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1480	movq	K, %rax
1481	subq	KK, %rax
1482	movq	%rax, KKK
1483#else
1484	movq	KK, %rax
1485#ifdef LEFT
1486	addq	$1, %rax
1487#else
1488	addq	$4, %rax
1489#endif
1490	movq	%rax, KKK
1491#endif
1492	sarq	$3, %rax
1493	je	.L45
1494	ALIGN_4
1495
1496.L42:
1497	mulss	%xmm8, %xmm9
1498	addss	%xmm9, %xmm0
1499#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1500	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
1501#endif
1502	movss	 4 * SIZE(BO), %xmm9
1503	mulss	%xmm8, %xmm9
1504	addss	%xmm9, %xmm1
1505	movss	 8 * SIZE(BO), %xmm9
1506	mulss	%xmm8, %xmm9
1507	addss	%xmm9, %xmm2
1508	movss	12 * SIZE(BO), %xmm9
1509	mulss	%xmm8, %xmm9
1510	movss	-31 * SIZE(AO), %xmm8
1511	addss	%xmm9, %xmm3
1512	movss	64 * SIZE(BO), %xmm9
1513
1514	mulss	%xmm8, %xmm11
1515	addss	%xmm11, %xmm0
1516	movss	20 * SIZE(BO), %xmm11
1517	mulss	%xmm8, %xmm11
1518	addss	%xmm11, %xmm1
1519	movss	24 * SIZE(BO), %xmm11
1520	mulss	%xmm8, %xmm11
1521	addss	%xmm11, %xmm2
1522	movss	28 * SIZE(BO), %xmm11
1523	mulss	%xmm8, %xmm11
1524	movss	-30 * SIZE(AO), %xmm8
1525	addss	%xmm11, %xmm3
1526	movss	80 * SIZE(BO), %xmm11
1527
1528	mulss	%xmm8, %xmm13
1529	addss	%xmm13, %xmm0
1530	movss	36 * SIZE(BO), %xmm13
1531	mulss	%xmm8, %xmm13
1532	addss	%xmm13, %xmm1
1533	movss	40 * SIZE(BO), %xmm13
1534	mulss	%xmm8, %xmm13
1535	addss	%xmm13, %xmm2
1536	movss	44 * SIZE(BO), %xmm13
1537	mulss	%xmm8, %xmm13
1538	movss	-29 * SIZE(AO), %xmm8
1539	addss	%xmm13, %xmm3
1540	movss	96 * SIZE(BO), %xmm13
1541
1542	mulss	%xmm8, %xmm15
1543	addss	%xmm15, %xmm0
1544	movss	52 * SIZE(BO), %xmm15
1545	mulss	%xmm8, %xmm15
1546	addss	%xmm15, %xmm1
1547	movss	56 * SIZE(BO), %xmm15
1548	mulss	%xmm8, %xmm15
1549	addss	%xmm15, %xmm2
1550	movss	60 * SIZE(BO), %xmm15
1551	mulss	%xmm8, %xmm15
1552	movss	-24 * SIZE(AO), %xmm8
1553	addss	%xmm15, %xmm3
1554	movss	112 * SIZE(BO), %xmm15
1555
1556	mulss	%xmm10, %xmm9
1557	addss	%xmm9, %xmm0
1558	movss	68 * SIZE(BO), %xmm9
1559	mulss	%xmm10, %xmm9
1560	addss	%xmm9, %xmm1
1561	movss	72 * SIZE(BO), %xmm9
1562	mulss	%xmm10, %xmm9
1563	addss	%xmm9, %xmm2
1564	movss	76 * SIZE(BO), %xmm9
1565	mulss	%xmm10, %xmm9
1566	movss	-27 * SIZE(AO), %xmm10
1567	addss	%xmm9, %xmm3
1568	movss	128 * SIZE(BO), %xmm9
1569
1570	mulss	%xmm10, %xmm11
1571	addss	%xmm11, %xmm0
1572	movss	84 * SIZE(BO), %xmm11
1573	mulss	%xmm10, %xmm11
1574	addss	%xmm11, %xmm1
1575	movss	88 * SIZE(BO), %xmm11
1576	mulss	%xmm10, %xmm11
1577	addss	%xmm11, %xmm2
1578	movss	92 * SIZE(BO), %xmm11
1579	mulss	%xmm10, %xmm11
1580	movss	-26 * SIZE(AO), %xmm10
1581	addss	%xmm11, %xmm3
1582	movss	144 * SIZE(BO), %xmm11
1583
1584	mulss	%xmm10, %xmm13
1585	addss	%xmm13, %xmm0
1586	movss	100 * SIZE(BO), %xmm13
1587	mulss	%xmm10, %xmm13
1588	addss	%xmm13, %xmm1
1589	movss	104 * SIZE(BO), %xmm13
1590	mulss	%xmm10, %xmm13
1591	addss	%xmm13, %xmm2
1592	movss	108 * SIZE(BO), %xmm13
1593	mulss	%xmm10, %xmm13
1594	movss	-25 * SIZE(AO), %xmm10
1595	addss	%xmm13, %xmm3
1596	movss	160 * SIZE(BO), %xmm13
1597
1598	mulss	%xmm10, %xmm15
1599	addss	%xmm15, %xmm0
1600	movss	116 * SIZE(BO), %xmm15
1601	mulss	%xmm10, %xmm15
1602	addss	%xmm15, %xmm1
1603	movss	120 * SIZE(BO), %xmm15
1604	mulss	%xmm10, %xmm15
1605	addss	%xmm15, %xmm2
1606	movss	124 * SIZE(BO), %xmm15
1607	mulss	%xmm10, %xmm15
1608	movss	-20 * SIZE(AO), %xmm10
1609	addss	%xmm15, %xmm3
1610	movss	176 * SIZE(BO), %xmm15
1611
1612	addq   $  8 * SIZE, AO
1613	addq   $128 * SIZE, BO
1614	decq   %rax
1615	jne    .L42
1616	ALIGN_4
1617
1618.L45:
1619#ifndef TRMMKERNEL
1620	movq	K, %rax
1621#else
1622	movq	KKK, %rax
1623#endif
1624	movaps	ALPHA, %xmm15
1625	andq	$7, %rax		# if (k & 1)
1626	BRANCH
1627	je .L48
1628	ALIGN_4
1629
1630.L46:
1631	mulps	%xmm8, %xmm9
1632	addps	%xmm9, %xmm0
1633	movss	 4 * SIZE(BO), %xmm9
1634	mulps	%xmm8, %xmm9
1635	addps	%xmm9, %xmm1
1636	movss	 8 * SIZE(BO), %xmm9
1637	mulps	%xmm8, %xmm9
1638	addps	%xmm9, %xmm2
1639	movss	12 * SIZE(BO), %xmm9
1640	mulps	%xmm8, %xmm9
1641	movss	-31 * SIZE(AO), %xmm8
1642	addps	%xmm9, %xmm3
1643	movss	16 * SIZE(BO), %xmm9
1644
1645	addq	$ 1 * SIZE, AO		# aoffset  += 4
1646	addq	$16 * SIZE, BO		# boffset1 += 8
1647	decq	%rax
1648	jg	.L46
1649	ALIGN_4
1650
1651.L48:
1652	mulss	%xmm15, %xmm0
1653	mulss	%xmm15, %xmm1
1654 	mulss	%xmm15, %xmm2
1655	mulss	%xmm15, %xmm3
1656
1657#ifndef TRMMKERNEL
1658	movss	0 * SIZE(CO1), %xmm8
1659	movss	0 * SIZE(CO2), %xmm10
1660	movss	0 * SIZE(CO1, LDC, 2), %xmm12
1661	movss	0 * SIZE(CO2, LDC, 2), %xmm14
1662
1663	addss	%xmm8,  %xmm0
1664	addss	%xmm10, %xmm1
1665	addss	%xmm12, %xmm2
1666	addss	%xmm14, %xmm3
1667#endif
1668
1669	movss	%xmm0, 0 * SIZE(CO1)
1670	movss	%xmm1, 0 * SIZE(CO2)
1671	movss	%xmm2, 0 * SIZE(CO1, LDC, 2)
1672	movss	%xmm3, 0 * SIZE(CO2, LDC, 2)
1673
1674#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1675    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1676	movq	K, %rax
1677	subq	KKK, %rax
1678	leaq	(,%rax,    4), %rax
1679	leaq	(AO, %rax, 1), AO
1680	leaq	(BO, %rax, 8), BO
1681	leaq	(BO, %rax, 8), BO
1682#endif
1683
1684#if defined(TRMMKERNEL) && defined(LEFT)
1685	addq	$1, KK
1686#endif
1687	ALIGN_4
1688
1689.L49:
1690#if defined(TRMMKERNEL) && !defined(LEFT)
1691	addl	$4, KK
1692#endif
1693	leaq	(C, LDC, 4), C		# c += 4 * ldc
1694	decq	J			# j --
1695	jg	.L01
1696
1697.L50:
1698	testq	$2, N
1699	je	.L100
1700
1701.L51:
1702#if defined(TRMMKERNEL) && defined(LEFT)
1703	movq	OFFSET, %rax
1704	movq	%rax, KK
1705#endif
1706
1707/* Copying to Sub Buffer */
1708	leaq	BUFFER, BO
1709
1710	movq	K, %rax
1711	sarq	$2, %rax
1712	jle	.L53
1713	ALIGN_4
1714
1715.L52:
1716#if defined(PENTIUM4) || defined(GENERIC)
1717	movss	 0 * SIZE(B), %xmm0
1718	movss	 1 * SIZE(B), %xmm1
1719	movss	 2 * SIZE(B), %xmm2
1720	movss	 3 * SIZE(B), %xmm3
1721	movss	 4 * SIZE(B), %xmm4
1722	movss	 5 * SIZE(B), %xmm5
1723	movss	 6 * SIZE(B), %xmm6
1724	movss	 7 * SIZE(B), %xmm7
1725
1726	PREFETCH	 32 * SIZE(B)
1727
1728	shufps	 $0, %xmm0, %xmm0
1729	shufps	 $0, %xmm1, %xmm1
1730	shufps	 $0, %xmm2, %xmm2
1731	shufps	 $0, %xmm3, %xmm3
1732	shufps	 $0, %xmm4, %xmm4
1733	shufps	 $0, %xmm5, %xmm5
1734	shufps	 $0, %xmm6, %xmm6
1735	shufps	 $0, %xmm7, %xmm7
1736
1737	movaps	%xmm0,  0 * SIZE(BO)
1738	movaps	%xmm1,  4 * SIZE(BO)
1739	movaps	%xmm2,  8 * SIZE(BO)
1740	movaps	%xmm3, 12 * SIZE(BO)
1741	movaps	%xmm4, 16 * SIZE(BO)
1742	movaps	%xmm5, 20 * SIZE(BO)
1743	movaps	%xmm6, 24 * SIZE(BO)
1744	movaps	%xmm7, 28 * SIZE(BO)
1745
1746	addq	$ 8 * SIZE, B
1747	addq	$32 * SIZE, BO
1748#endif
1749
1750#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1751	PREFETCH	 32 * SIZE(B)
1752
1753	movd	 0 * SIZE(B), %mm0
1754	movd	 1 * SIZE(B), %mm1
1755	movd	 2 * SIZE(B), %mm2
1756	movd	 3 * SIZE(B), %mm3
1757	movd	 4 * SIZE(B), %mm4
1758	movd	 5 * SIZE(B), %mm5
1759	movd	 6 * SIZE(B), %mm6
1760	movd	 7 * SIZE(B), %mm7
1761
1762	punpckldq %mm0, %mm0
1763	punpckldq %mm1, %mm1
1764	punpckldq %mm2, %mm2
1765	punpckldq %mm3, %mm3
1766	punpckldq %mm4, %mm4
1767	punpckldq %mm5, %mm5
1768	punpckldq %mm6, %mm6
1769	punpckldq %mm7, %mm7
1770
1771	movq	%mm0,  0 * SIZE(BO)
1772	movq	%mm0,  2 * SIZE(BO)
1773	movq	%mm1,  4 * SIZE(BO)
1774	movq	%mm1,  6 * SIZE(BO)
1775	movq	%mm2,  8 * SIZE(BO)
1776	movq	%mm2, 10 * SIZE(BO)
1777	movq	%mm3, 12 * SIZE(BO)
1778	movq	%mm3, 14 * SIZE(BO)
1779	movq	%mm4, 16 * SIZE(BO)
1780	movq	%mm4, 18 * SIZE(BO)
1781	movq	%mm5, 20 * SIZE(BO)
1782	movq	%mm5, 22 * SIZE(BO)
1783	movq	%mm6, 24 * SIZE(BO)
1784	movq	%mm6, 26 * SIZE(BO)
1785	movq	%mm7, 28 * SIZE(BO)
1786	movq	%mm7, 30 * SIZE(BO)
1787
1788	addq	$ 8 * SIZE, B
1789	addq	$32 * SIZE, BO
1790#endif
1791
1792	decq	%rax
1793	jne	.L52
1794	ALIGN_4
1795
1796.L53:
1797	movq	K, %rax
1798	andq	$3, %rax
1799	BRANCH
1800	jle	.L60
1801	ALIGN_4
1802
1803.L54:
1804#if defined(PENTIUM4) || defined(GENERIC)
1805	movss	 0 * SIZE(B), %xmm0
1806	movss	 1 * SIZE(B), %xmm1
1807
1808	shufps	 $0, %xmm0, %xmm0
1809	shufps	 $0, %xmm1, %xmm1
1810
1811	movaps	%xmm0,  0 * SIZE(BO)
1812	movaps	%xmm1,  4 * SIZE(BO)
1813#endif
1814
1815#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1816	movd	 0 * SIZE(B), %mm0
1817	movd	 1 * SIZE(B), %mm1
1818
1819	punpckldq %mm0, %mm0
1820	punpckldq %mm1, %mm1
1821
1822	movq	%mm0,  0 * SIZE(BO)
1823	movq	%mm0,  2 * SIZE(BO)
1824	movq	%mm1,  4 * SIZE(BO)
1825	movq	%mm1,  6 * SIZE(BO)
1826#endif
1827
1828	addq	$ 2 * SIZE, B
1829	addq	$ 8 * SIZE, BO
1830	decq	%rax
1831	jne	.L54
1832	ALIGN_4
1833
1834.L60:
1835	movq	C, CO1			# coffset1 = c
1836	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
1837	movq	A, AO		# aoffset = a
1838
1839	movq	M,  I
1840	sarq	$3, I	# i = (m >> 3)
1841	jle	.L70
1842	ALIGN_4
1843
1844.L61:
1845#if !defined(TRMMKERNEL) || \
1846	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1847	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1848
1849	leaq	BUFFER, BO
1850#else
1851	leaq	BUFFER, BO
1852	movq	KK, %rax
1853	leaq	(, %rax,   8), %rax
1854	leaq	(AO, %rax, 4), AO
1855	leaq	(BO, %rax, 4), BO
1856#endif
1857
1858	movaps	-32 * SIZE(AO), %xmm8
1859	movaps	-16 * SIZE(AO), %xmm10
1860	movaps	  0 * SIZE(AO), %xmm12
1861	movaps	 16 * SIZE(AO), %xmm14
1862
1863	movaps	 0 * SIZE(BO), %xmm9
1864	movaps	16 * SIZE(BO), %xmm11
1865	movaps	32 * SIZE(BO), %xmm13
1866	movaps	48 * SIZE(BO), %xmm15
1867
1868	xorps	%xmm0, %xmm0
1869	xorps	%xmm1, %xmm1
1870
1871	PREFETCHW      7 * SIZE(CO1)
1872	xorps	%xmm4, %xmm4
1873	PREFETCHW      7 * SIZE(CO2)
1874	xorps	%xmm5, %xmm5
1875
1876#ifndef TRMMKERNEL
1877	movq	K, %rax
1878#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1879	movq	K, %rax
1880	subq	KK, %rax
1881	movq	%rax, KKK
1882#else
1883	movq	KK, %rax
1884#ifdef LEFT
1885	addq	$8, %rax
1886#else
1887	addq	$2, %rax
1888#endif
1889	movq	%rax, KKK
1890#endif
1891	sarq	$3, %rax
1892	je	.L65
1893	ALIGN_4
1894
1895.L62:
1896	mulps	%xmm8, %xmm9
1897#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1898	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
1899#endif
1900	mulps	 4 * SIZE(BO), %xmm8
1901	addps	%xmm9, %xmm0
1902	movaps	 0 * SIZE(BO), %xmm9
1903	addps	%xmm8, %xmm1
1904	movaps	-28 * SIZE(AO), %xmm8
1905	mulps	%xmm8, %xmm9
1906	mulps	 4 * SIZE(BO), %xmm8
1907	addps	%xmm9, %xmm4
1908	movaps	 8 * SIZE(BO), %xmm9
1909	addps	%xmm8, %xmm5
1910	movaps	-24 * SIZE(AO), %xmm8
1911
1912	mulps	%xmm8, %xmm9
1913	mulps	12 * SIZE(BO), %xmm8
1914	addps	%xmm9, %xmm0
1915	movaps	 8 * SIZE(BO), %xmm9
1916	addps	%xmm8, %xmm1
1917	movaps	-20 * SIZE(AO), %xmm8
1918	mulps	%xmm8, %xmm9
1919	mulps	12 * SIZE(BO), %xmm8
1920	addps	%xmm9, %xmm4
1921	movaps	64 * SIZE(BO), %xmm9
1922	addps	%xmm8, %xmm5
1923	movaps	32 * SIZE(AO), %xmm8
1924
1925#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1926	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
1927#endif
1928	mulps	%xmm10, %xmm11
1929	mulps	20 * SIZE(BO), %xmm10
1930	addps	%xmm11, %xmm0
1931	movaps	16 * SIZE(BO), %xmm11
1932	addps	%xmm10, %xmm1
1933	movaps	-12 * SIZE(AO), %xmm10
1934	mulps	%xmm10, %xmm11
1935	mulps	20 * SIZE(BO), %xmm10
1936	addps	%xmm11, %xmm4
1937	movaps	24 * SIZE(BO), %xmm11
1938	addps	%xmm10, %xmm5
1939	movaps	 -8 * SIZE(AO), %xmm10
1940
1941	mulps	%xmm10, %xmm11
1942	mulps	28 * SIZE(BO), %xmm10
1943	addps	%xmm11, %xmm0
1944	movaps	24 * SIZE(BO), %xmm11
1945	addps	%xmm10, %xmm1
1946	movaps	-4 * SIZE(AO), %xmm10
1947	mulps	%xmm10, %xmm11
1948	mulps	28 * SIZE(BO), %xmm10
1949	addps	%xmm11, %xmm4
1950	movaps	80 * SIZE(BO), %xmm11
1951	addps	%xmm10, %xmm5
1952	movaps	48 * SIZE(AO), %xmm10
1953
1954#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1955	PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO)
1956#endif
1957	mulps	%xmm12, %xmm13
1958	mulps	36 * SIZE(BO), %xmm12
1959	addps	%xmm13, %xmm0
1960	movaps	32 * SIZE(BO), %xmm13
1961	addps	%xmm12, %xmm1
1962	movaps	 4 * SIZE(AO), %xmm12
1963	mulps	%xmm12, %xmm13
1964	mulps	36 * SIZE(BO), %xmm12
1965	addps	%xmm13, %xmm4
1966	movaps	40 * SIZE(BO), %xmm13
1967	addps	%xmm12, %xmm5
1968	movaps	 8 * SIZE(AO), %xmm12
1969
1970	mulps	%xmm12, %xmm13
1971	mulps	44 * SIZE(BO), %xmm12
1972	addps	%xmm13, %xmm0
1973	movaps	40 * SIZE(BO), %xmm13
1974	addps	%xmm12, %xmm1
1975	movaps	12 * SIZE(AO), %xmm12
1976	mulps	%xmm12, %xmm13
1977	mulps	44 * SIZE(BO), %xmm12
1978	addps	%xmm13, %xmm4
1979	movaps	96 * SIZE(BO), %xmm13
1980	addps	%xmm12, %xmm5
1981	movaps	64 * SIZE(AO), %xmm12
1982
1983#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
1984	PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO)
1985#endif
1986	mulps	%xmm14, %xmm15
1987	mulps	52 * SIZE(BO), %xmm14
1988	addps	%xmm15, %xmm0
1989	movaps	48 * SIZE(BO), %xmm15
1990	addps	%xmm14, %xmm1
1991	movaps	20 * SIZE(AO), %xmm14
1992	mulps	%xmm14, %xmm15
1993	mulps	52 * SIZE(BO), %xmm14
1994	addps	%xmm15, %xmm4
1995	movaps	56 * SIZE(BO), %xmm15
1996	addps	%xmm14, %xmm5
1997	movaps	24 * SIZE(AO), %xmm14
1998
1999	mulps	%xmm14, %xmm15
2000	mulps	60 * SIZE(BO), %xmm14
2001	addps	%xmm15, %xmm0
2002	movaps	56 * SIZE(BO), %xmm15
2003	addps	%xmm14, %xmm1
2004	movaps	28 * SIZE(AO), %xmm14
2005	mulps	%xmm14, %xmm15
2006	mulps	60 * SIZE(BO), %xmm14
2007	addps	%xmm15, %xmm4
2008	movaps	112 * SIZE(BO), %xmm15
2009	addps	%xmm14, %xmm5
2010	movaps	80 * SIZE(AO), %xmm14
2011
2012	addq   $64 * SIZE, AO
2013	addq   $64 * SIZE, BO
2014	decq   %rax
2015	jne    .L62
2016	ALIGN_4
2017
2018.L65:
2019#ifndef TRMMKERNEL
2020	movq	K, %rax
2021#else
2022	movq	KKK, %rax
2023#endif
2024	movaps	ALPHA, %xmm15
2025	andq	$7, %rax		# if (k & 1)
2026	BRANCH
2027	je .L68
2028	ALIGN_4
2029
2030.L66:
2031	mulps	%xmm8, %xmm9
2032	mulps	 4 * SIZE(BO), %xmm8
2033	addps	%xmm9, %xmm0
2034	movaps	 0 * SIZE(BO), %xmm9
2035	addps	%xmm8, %xmm1
2036	movaps	-28 * SIZE(AO), %xmm8
2037	mulps	%xmm8, %xmm9
2038	mulps	 4 * SIZE(BO), %xmm8
2039	addps	%xmm9, %xmm4
2040	movaps	 8 * SIZE(BO), %xmm9
2041	addps	%xmm8, %xmm5
2042	movaps	-24 * SIZE(AO), %xmm8
2043
2044	addq	$8 * SIZE, AO		# aoffset  += 4
2045	addq	$8 * SIZE, BO		# boffset1 += 8
2046	decq	%rax
2047	jg	.L66
2048	ALIGN_4
2049
2050.L68:
2051#ifndef TRMMKERNEL
2052	movsd	0 * SIZE(CO1), %xmm8
2053	movhps	2 * SIZE(CO1), %xmm8
2054	movsd	4 * SIZE(CO1), %xmm9
2055	movhps	6 * SIZE(CO1), %xmm9
2056
2057	movsd	0 * SIZE(CO2), %xmm10
2058	movhps	2 * SIZE(CO2), %xmm10
2059	movsd	4 * SIZE(CO2), %xmm11
2060	movhps	6 * SIZE(CO2), %xmm11
2061#endif
2062
2063	mulps	%xmm15, %xmm0
2064	mulps	%xmm15, %xmm4
2065	mulps	%xmm15, %xmm1
2066	mulps	%xmm15, %xmm5
2067
2068#ifndef TRMMKERNEL
2069	addps	%xmm8,  %xmm0
2070	addps	%xmm9,  %xmm4
2071	addps	%xmm10, %xmm1
2072	addps	%xmm11, %xmm5
2073#endif
2074
2075	movlps	%xmm0, 0 * SIZE(CO1)
2076	movhps	%xmm0, 2 * SIZE(CO1)
2077	movlps	%xmm4, 4 * SIZE(CO1)
2078	movhps	%xmm4, 6 * SIZE(CO1)
2079
2080	movlps	%xmm1, 0 * SIZE(CO2)
2081	movhps	%xmm1, 2 * SIZE(CO2)
2082	movlps	%xmm5, 4 * SIZE(CO2)
2083	movhps	%xmm5, 6 * SIZE(CO2)
2084
2085#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2086    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2087	movq	K, %rax
2088	subq	KKK, %rax
2089	leaq	(,%rax,    8), %rax
2090	leaq	(AO, %rax, 4), AO
2091	leaq	(BO, %rax, 4), BO
2092#endif
2093
2094#if defined(TRMMKERNEL) && defined(LEFT)
2095	addq	$8, KK
2096#endif
2097
2098	addq	$8 * SIZE, CO1		# coffset += 4
2099	addq	$8 * SIZE, CO2		# coffset += 4
2100	decq	I			# i --
2101	jg	.L61
2102	ALIGN_4
2103
2104.L70:
2105	testq	$4, M
2106	je	.L80
2107
2108
2109#if !defined(TRMMKERNEL) || \
2110	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2111	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2112
2113	leaq	BUFFER, BO
2114#else
2115	leaq	BUFFER, BO
2116	movq	KK, %rax
2117	leaq	(, %rax,   8), %rax
2118	leaq	(AO, %rax, 2), AO
2119	leaq	(BO, %rax, 4), BO
2120#endif
2121
2122	movaps	-32 * SIZE(AO), %xmm8
2123	movaps	-16 * SIZE(AO), %xmm10
2124
2125	movaps	 0 * SIZE(BO), %xmm9
2126	movaps	16 * SIZE(BO), %xmm11
2127	movaps	32 * SIZE(BO), %xmm13
2128	movaps	48 * SIZE(BO), %xmm15
2129
2130	xorps	%xmm0, %xmm0
2131	xorps	%xmm1, %xmm1
2132	xorps	%xmm2, %xmm2
2133	xorps	%xmm3, %xmm3
2134
2135#ifndef TRMMKERNEL
2136	movq	K, %rax
2137#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2138	movq	K, %rax
2139	subq	KK, %rax
2140	movq	%rax, KKK
2141#else
2142	movq	KK, %rax
2143#ifdef LEFT
2144	addq	$4, %rax
2145#else
2146	addq	$2, %rax
2147#endif
2148	movq	%rax, KKK
2149#endif
2150	sarq	$3, %rax
2151	je	.L75
2152	ALIGN_4
2153
2154.L72:
2155	mulps	%xmm8, %xmm9
2156#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2157	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
2158#endif
2159
2160	mulps	 4 * SIZE(BO), %xmm8
2161	addps	%xmm9, %xmm0
2162	movaps	 8 * SIZE(BO), %xmm9
2163	addps	%xmm8, %xmm1
2164	movaps	-28 * SIZE(AO), %xmm8
2165
2166	mulps	%xmm8, %xmm9
2167	mulps	12 * SIZE(BO), %xmm8
2168	addps	%xmm9, %xmm2
2169	movaps	64 * SIZE(BO), %xmm9
2170	addps	%xmm8, %xmm3
2171	movaps	-24 * SIZE(AO), %xmm8
2172
2173	mulps	%xmm8, %xmm11
2174	mulps	20 * SIZE(BO), %xmm8
2175	addps	%xmm11, %xmm0
2176	movaps	24 * SIZE(BO), %xmm11
2177	addps	%xmm8, %xmm1
2178	movaps	-20 * SIZE(AO), %xmm8
2179
2180	mulps	%xmm8, %xmm11
2181	mulps	28 * SIZE(BO), %xmm8
2182	addps	%xmm11, %xmm2
2183	movaps	80 * SIZE(BO), %xmm11
2184	addps	%xmm8, %xmm3
2185	movaps	 0 * SIZE(AO), %xmm8
2186
2187	mulps	%xmm10, %xmm13
2188	mulps	36 * SIZE(BO), %xmm10
2189	addps	%xmm13, %xmm0
2190	movaps	40 * SIZE(BO), %xmm13
2191	addps	%xmm10, %xmm1
2192	movaps	-12 * SIZE(AO), %xmm10
2193
2194	mulps	%xmm10, %xmm13
2195	mulps	44 * SIZE(BO), %xmm10
2196	addps	%xmm13, %xmm2
2197	movaps	96 * SIZE(BO), %xmm13
2198	addps	%xmm10, %xmm3
2199	movaps	 -8 * SIZE(AO), %xmm10
2200
2201	mulps	%xmm10, %xmm15
2202	mulps	52 * SIZE(BO), %xmm10
2203	addps	%xmm15, %xmm0
2204	movaps	56 * SIZE(BO), %xmm15
2205	addps	%xmm10, %xmm1
2206	movaps	 -4 * SIZE(AO), %xmm10
2207
2208	mulps	%xmm10, %xmm15
2209	mulps	60 * SIZE(BO), %xmm10
2210	addps	%xmm15, %xmm2
2211	movaps	112 * SIZE(BO), %xmm15
2212	addps	%xmm10, %xmm3
2213	movaps	16 * SIZE(AO), %xmm10
2214
2215	addq   $32 * SIZE, AO
2216	addq   $64 * SIZE, BO
2217	decq   %rax
2218	jne    .L72
2219	ALIGN_4
2220
2221.L75:
2222#ifndef TRMMKERNEL
2223	movq	K, %rax
2224#else
2225	movq	KKK, %rax
2226#endif
2227	movaps	ALPHA, %xmm15
2228	andq	$7, %rax		# if (k & 1)
2229	BRANCH
2230	je .L78
2231	ALIGN_4
2232
2233.L76:
2234	mulps	%xmm8, %xmm9
2235	mulps	 4 * SIZE(BO), %xmm8
2236	addps	%xmm9, %xmm0
2237	movaps	 8 * SIZE(BO), %xmm9
2238	addps	%xmm8, %xmm1
2239	movaps	-28 * SIZE(AO), %xmm8
2240
2241	addq	$4 * SIZE, AO		# aoffset  += 4
2242	addq	$8 * SIZE, BO		# boffset1 += 8
2243	decq	%rax
2244	jg	.L76
2245	ALIGN_4
2246
2247.L78:
2248#ifndef TRMMKERNEL
2249	movsd	0 * SIZE(CO1), %xmm8
2250	movhps	2 * SIZE(CO1), %xmm8
2251	movsd	0 * SIZE(CO2), %xmm10
2252	movhps	2 * SIZE(CO2), %xmm10
2253#endif
2254
2255	addps	%xmm2, %xmm0
2256	addps	%xmm3, %xmm1
2257
2258	mulps	%xmm15, %xmm0
2259	mulps	%xmm15, %xmm1
2260
2261#ifndef TRMMKERNEL
2262	addps	%xmm8,  %xmm0
2263	addps	%xmm10, %xmm1
2264#endif
2265
2266	movlps	%xmm0, 0 * SIZE(CO1)
2267	movhps	%xmm0, 2 * SIZE(CO1)
2268	movlps	%xmm1, 0 * SIZE(CO2)
2269	movhps	%xmm1, 2 * SIZE(CO2)
2270
2271#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2272    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2273	movq	K, %rax
2274	subq	KKK, %rax
2275	leaq	(,%rax,    8), %rax
2276	leaq	(AO, %rax, 2), AO
2277	leaq	(BO, %rax, 4), BO
2278#endif
2279
2280#if defined(TRMMKERNEL) && defined(LEFT)
2281	addq	$4, KK
2282#endif
2283
2284	addq	$4 * SIZE, CO1		# coffset += 4
2285	addq	$4 * SIZE, CO2		# coffset += 4
2286	ALIGN_4
2287
2288.L80:
2289	testq	$2, M
2290	je	.L90
2291
2292#if !defined(TRMMKERNEL) || \
2293	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2294	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2295
2296	leaq	BUFFER, BO
2297#else
2298	leaq	BUFFER, BO
2299	movq	KK, %rax
2300	leaq	(, %rax,   8), %rax
2301	leaq	(AO, %rax, 1), AO
2302	leaq	(BO, %rax, 4), BO
2303#endif
2304
2305	movaps	-32 * SIZE(AO), %xmm8
2306	movaps	-24 * SIZE(AO), %xmm10
2307
2308	movaps	 0 * SIZE(BO), %xmm9
2309	movaps	16 * SIZE(BO), %xmm11
2310	movaps	32 * SIZE(BO), %xmm13
2311	movaps	48 * SIZE(BO), %xmm15
2312
2313	xorps	%xmm0, %xmm0
2314	xorps	%xmm1, %xmm1
2315	xorps	%xmm2, %xmm2
2316	xorps	%xmm3, %xmm3
2317
2318#ifndef TRMMKERNEL
2319	movq	K, %rax
2320#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2321	movq	K, %rax
2322	subq	KK, %rax
2323	movq	%rax, KKK
2324#else
2325	movq	KK, %rax
2326#ifdef LEFT
2327	addq	$2, %rax
2328#else
2329	addq	$2, %rax
2330#endif
2331	movq	%rax, KKK
2332#endif
2333	sarq	$3, %rax
2334	je	.L85
2335	ALIGN_4
2336
2337.L82:
2338	mulps	%xmm8, %xmm9
2339	addps	%xmm9, %xmm0
2340#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2341	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
2342#endif
2343	movaps	 4 * SIZE(BO), %xmm9
2344	mulps	%xmm8, %xmm9
2345	movsd	-30 * SIZE(AO), %xmm8
2346	addps	%xmm9, %xmm1
2347	movaps	 8 * SIZE(BO), %xmm9
2348
2349	mulps	%xmm8, %xmm9
2350	addps	%xmm9, %xmm2
2351	movaps	12 * SIZE(BO), %xmm9
2352	mulps	%xmm8, %xmm9
2353	movsd	-28 * SIZE(AO), %xmm8
2354	addps	%xmm9, %xmm3
2355	movaps	64 * SIZE(BO), %xmm9
2356
2357	mulps	%xmm8, %xmm11
2358	addps	%xmm11, %xmm0
2359	movaps	20 * SIZE(BO), %xmm11
2360	mulps	%xmm8, %xmm11
2361	movsd	-26 * SIZE(AO), %xmm8
2362	addps	%xmm11, %xmm1
2363	movaps	24 * SIZE(BO), %xmm11
2364
2365	mulps	%xmm8, %xmm11
2366	addps	%xmm11, %xmm2
2367	movaps	28 * SIZE(BO), %xmm11
2368	mulps	%xmm8, %xmm11
2369	movsd	-16 * SIZE(AO), %xmm8
2370	addps	%xmm11, %xmm3
2371	movaps	 80 * SIZE(BO), %xmm11
2372
2373	mulps	%xmm10, %xmm13
2374	addps	%xmm13, %xmm0
2375	movaps	36 * SIZE(BO), %xmm13
2376	mulps	%xmm10, %xmm13
2377	movsd	-22 * SIZE(AO), %xmm10
2378	addps	%xmm13, %xmm1
2379	movaps	40 * SIZE(BO), %xmm13
2380
2381	mulps	%xmm10, %xmm13
2382	addps	%xmm13, %xmm2
2383	movaps	44 * SIZE(BO), %xmm13
2384	mulps	%xmm10, %xmm13
2385	movsd	-20 * SIZE(AO), %xmm10
2386	addps	%xmm13, %xmm3
2387	movaps	 96 * SIZE(BO), %xmm13
2388
2389	mulps	%xmm10, %xmm15
2390	addps	%xmm15, %xmm0
2391	movaps	52 * SIZE(BO), %xmm15
2392	mulps	%xmm10, %xmm15
2393	movsd	-18 * SIZE(AO), %xmm10
2394	addps	%xmm15, %xmm1
2395	movaps	56 * SIZE(BO), %xmm15
2396
2397	mulps	%xmm10, %xmm15
2398	addps	%xmm15, %xmm2
2399	movaps	60 * SIZE(BO), %xmm15
2400	mulps	%xmm10, %xmm15
2401	movsd	-8 * SIZE(AO), %xmm10
2402	addps	%xmm15, %xmm3
2403	movaps	112 * SIZE(BO), %xmm15
2404
2405	addq   $16 * SIZE, AO
2406	addq   $64 * SIZE, BO
2407	decq   %rax
2408	jne    .L82
2409	ALIGN_4
2410
2411.L85:
2412#ifndef TRMMKERNEL
2413	movq	K, %rax
2414#else
2415	movq	KKK, %rax
2416#endif
2417	movaps	ALPHA, %xmm15
2418	andq	$7, %rax		# if (k & 1)
2419	BRANCH
2420	je .L88
2421	ALIGN_4
2422
2423.L86:
2424	mulps	%xmm8, %xmm9
2425	addps	%xmm9, %xmm0
2426	movaps	  4 * SIZE(BO), %xmm9
2427	mulps	%xmm8, %xmm9
2428	movsd	-30 * SIZE(AO), %xmm8
2429	addps	%xmm9, %xmm1
2430	movaps	  8 * SIZE(BO), %xmm9
2431
2432	addq	$2 * SIZE, AO		# aoffset  += 4
2433	addq	$8 * SIZE, BO		# boffset1 += 8
2434	decq	%rax
2435	jg	.L86
2436	ALIGN_4
2437
2438.L88:
2439#ifndef TRMMKERNEL
2440#ifdef movsd
2441	xorps	%xmm8,  %xmm8
2442#endif
2443	movsd	0 * SIZE(CO1), %xmm8
2444#ifdef movsd
2445	xorps	%xmm10, %xmm10
2446#endif
2447	movsd	0 * SIZE(CO2), %xmm10
2448#endif
2449
2450	addps	%xmm2, %xmm0
2451	addps	%xmm3, %xmm1
2452
2453	mulps	%xmm15, %xmm0
2454	mulps	%xmm15, %xmm1
2455
2456#ifndef TRMMKERNEL
2457	addps	%xmm8,  %xmm0
2458	addps	%xmm10, %xmm1
2459#endif
2460
2461	movlps	%xmm0, 0 * SIZE(CO1)
2462	movlps	%xmm1, 0 * SIZE(CO2)
2463
2464#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2465    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2466	movq	K, %rax
2467	subq	KKK, %rax
2468	leaq	(,%rax,    8), %rax
2469	leaq	(AO, %rax, 1), AO
2470	leaq	(BO, %rax, 4), BO
2471#endif
2472
2473#if defined(TRMMKERNEL) && defined(LEFT)
2474	addq	$2, KK
2475#endif
2476
2477	addq	$2 * SIZE, CO1		# coffset += 4
2478	addq	$2 * SIZE, CO2		# coffset += 4
2479	ALIGN_4
2480
2481.L90:
2482	testq	$1, M
2483	je	.L99
2484
2485#if !defined(TRMMKERNEL) || \
2486	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2487	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2488
2489	leaq	BUFFER, BO
2490#else
2491	leaq	BUFFER, BO
2492	movq	KK, %rax
2493	leaq	(, %rax,   4), %rax
2494	leaq	(AO, %rax, 1), AO
2495	leaq	(BO, %rax, 8), BO
2496#endif
2497
2498	movss	-32 * SIZE(AO), %xmm8
2499	movss	-28 * SIZE(AO), %xmm10
2500
2501	movss	 0 * SIZE(BO), %xmm9
2502	movss	16 * SIZE(BO), %xmm11
2503	movss	32 * SIZE(BO), %xmm13
2504	movss	48 * SIZE(BO), %xmm15
2505
2506	xorps	%xmm0, %xmm0
2507	xorps	%xmm1, %xmm1
2508	xorps	%xmm2, %xmm2
2509	xorps	%xmm3, %xmm3
2510
2511#ifndef TRMMKERNEL
2512	movq	K, %rax
2513#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2514	movq	K, %rax
2515	subq	KK, %rax
2516	movq	%rax, KKK
2517#else
2518	movq	KK, %rax
2519#ifdef LEFT
2520	addq	$1, %rax
2521#else
2522	addq	$2, %rax
2523#endif
2524	movq	%rax, KKK
2525#endif
2526	sarq	$3, %rax
2527	je	.L95
2528	ALIGN_4
2529
2530.L92:
2531	mulps	%xmm8, %xmm9
2532	addps	%xmm9, %xmm0
2533#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2534	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
2535#endif
2536	movss	 4 * SIZE(BO), %xmm9
2537	mulps	%xmm8, %xmm9
2538	movss	-31 * SIZE(AO), %xmm8
2539	addps	%xmm9, %xmm1
2540	movss	 8 * SIZE(BO), %xmm9
2541
2542	mulps	%xmm8, %xmm9
2543	addps	%xmm9, %xmm2
2544	movss	12 * SIZE(BO), %xmm9
2545	mulps	%xmm8, %xmm9
2546	movss	-30 * SIZE(AO), %xmm8
2547	addps	%xmm9, %xmm3
2548	movss	64 * SIZE(BO), %xmm9
2549
2550	mulps	%xmm8, %xmm11
2551	addps	%xmm11, %xmm0
2552	movss	20 * SIZE(BO), %xmm11
2553	mulps	%xmm8, %xmm11
2554	movss	-29 * SIZE(AO), %xmm8
2555	addps	%xmm11, %xmm1
2556	movss	24 * SIZE(BO), %xmm11
2557
2558	mulps	%xmm8, %xmm11
2559	addps	%xmm11, %xmm2
2560	movss	28 * SIZE(BO), %xmm11
2561	mulps	%xmm8, %xmm11
2562	movss	-24 * SIZE(AO), %xmm8
2563	addps	%xmm11, %xmm3
2564	movss	 80 * SIZE(BO), %xmm11
2565
2566	mulps	%xmm10, %xmm13
2567	addps	%xmm13, %xmm0
2568	movss	36 * SIZE(BO), %xmm13
2569	mulps	%xmm10, %xmm13
2570	movss	-27 * SIZE(AO), %xmm10
2571	addps	%xmm13, %xmm1
2572	movss	40 * SIZE(BO), %xmm13
2573
2574	mulps	%xmm10, %xmm13
2575	addps	%xmm13, %xmm2
2576	movss	44 * SIZE(BO), %xmm13
2577	mulps	%xmm10, %xmm13
2578	movss	-26 * SIZE(AO), %xmm10
2579	addps	%xmm13, %xmm3
2580	movss	 96 * SIZE(BO), %xmm13
2581
2582	mulps	%xmm10, %xmm15
2583	addps	%xmm15, %xmm0
2584	movss	52 * SIZE(BO), %xmm15
2585	mulps	%xmm10, %xmm15
2586	movss	-25 * SIZE(AO), %xmm10
2587	addps	%xmm15, %xmm1
2588	movss	56 * SIZE(BO), %xmm15
2589
2590	mulps	%xmm10, %xmm15
2591	addps	%xmm15, %xmm2
2592	movss	60 * SIZE(BO), %xmm15
2593	mulps	%xmm10, %xmm15
2594	movss	-20 * SIZE(AO), %xmm10
2595	addps	%xmm15, %xmm3
2596	movss	112 * SIZE(BO), %xmm15
2597
2598	addq   $ 8 * SIZE, AO
2599	addq   $64 * SIZE, BO
2600	decq   %rax
2601	jne    .L92
2602	ALIGN_4
2603
2604.L95:
2605#ifndef TRMMKERNEL
2606	movq	K, %rax
2607#else
2608	movq	KKK, %rax
2609#endif
2610	movaps	ALPHA, %xmm15
2611	andq	$7, %rax		# if (k & 1)
2612	BRANCH
2613	je .L98
2614	ALIGN_4
2615
2616.L96:
2617	mulps	%xmm8, %xmm9
2618	addps	%xmm9, %xmm0
2619	movss	 4 * SIZE(BO), %xmm9
2620	mulps	%xmm8, %xmm9
2621	movss	-31 * SIZE(AO), %xmm8
2622	addps	%xmm9, %xmm1
2623	movss	 8 * SIZE(BO), %xmm9
2624
2625	addq	$1 * SIZE, AO		# aoffset  += 4
2626	addq	$8 * SIZE, BO		# boffset1 += 8
2627	decq	%rax
2628	jg	.L96
2629	ALIGN_4
2630
2631.L98:
2632#ifndef TRMMKERNEL
2633	movss	0 * SIZE(CO1), %xmm8
2634	movss	0 * SIZE(CO2), %xmm10
2635#endif
2636
2637	addss	%xmm2, %xmm0
2638	addss	%xmm3, %xmm1
2639	mulss	%xmm15, %xmm0
2640	mulss	%xmm15, %xmm1
2641
2642#ifndef TRMMKERNEL
2643	addss	%xmm8,  %xmm0
2644	addss	%xmm10, %xmm1
2645#endif
2646
2647	movss	%xmm0, 0 * SIZE(CO1)
2648	movss	%xmm1, 0 * SIZE(CO2)
2649
2650#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2651    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2652	movq	K, %rax
2653	subq	KKK, %rax
2654	leaq	(,%rax,    4), %rax
2655	leaq	(AO, %rax, 1), AO
2656	leaq	(BO, %rax, 8), BO
2657#endif
2658
2659#if defined(TRMMKERNEL) && defined(LEFT)
2660	addq	$1, KK
2661#endif
2662	ALIGN_4
2663
2664.L99:
2665#if defined(TRMMKERNEL) && !defined(LEFT)
2666	addl	$2, KK
2667#endif
2668	leaq	(C, LDC, 2), C		# c += 4 * ldc
2669	ALIGN_4
2670
2671
2672.L100:
2673	testq	$1, N
2674	je	.L999
2675
2676.L101:
2677#if defined(TRMMKERNEL) && defined(LEFT)
2678	movq	OFFSET, %rax
2679	movq	%rax, KK
2680#endif
2681
2682/* Copying to Sub Buffer */
2683	leaq	BUFFER, BO
2684
2685	movq	K, %rax
2686	sarq	$3, %rax
2687	jle	.L103
2688	ALIGN_4
2689
2690
2691.L102:
2692#if defined(PENTIUM4) || defined(GENERIC)
2693	movss	 0 * SIZE(B), %xmm0
2694	movss	 1 * SIZE(B), %xmm1
2695	movss	 2 * SIZE(B), %xmm2
2696	movss	 3 * SIZE(B), %xmm3
2697	movss	 4 * SIZE(B), %xmm4
2698	movss	 5 * SIZE(B), %xmm5
2699	movss	 6 * SIZE(B), %xmm6
2700	movss	 7 * SIZE(B), %xmm7
2701
2702	PREFETCH	 32 * SIZE(B)
2703
2704	shufps	 $0, %xmm0, %xmm0
2705	shufps	 $0, %xmm1, %xmm1
2706	shufps	 $0, %xmm2, %xmm2
2707	shufps	 $0, %xmm3, %xmm3
2708	shufps	 $0, %xmm4, %xmm4
2709	shufps	 $0, %xmm5, %xmm5
2710	shufps	 $0, %xmm6, %xmm6
2711	shufps	 $0, %xmm7, %xmm7
2712
2713	movaps	%xmm0,  0 * SIZE(BO)
2714	movaps	%xmm1,  4 * SIZE(BO)
2715	movaps	%xmm2,  8 * SIZE(BO)
2716	movaps	%xmm3, 12 * SIZE(BO)
2717	movaps	%xmm4, 16 * SIZE(BO)
2718	movaps	%xmm5, 20 * SIZE(BO)
2719	movaps	%xmm6, 24 * SIZE(BO)
2720	movaps	%xmm7, 28 * SIZE(BO)
2721
2722	addq	$ 8 * SIZE, B
2723	addq	$32 * SIZE, BO
2724#endif
2725
2726#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2727	PREFETCH	 32 * SIZE(B)
2728
2729	movd	 0 * SIZE(B), %mm0
2730	movd	 1 * SIZE(B), %mm1
2731	movd	 2 * SIZE(B), %mm2
2732	movd	 3 * SIZE(B), %mm3
2733	movd	 4 * SIZE(B), %mm4
2734	movd	 5 * SIZE(B), %mm5
2735	movd	 6 * SIZE(B), %mm6
2736	movd	 7 * SIZE(B), %mm7
2737
2738	punpckldq %mm0, %mm0
2739	punpckldq %mm1, %mm1
2740	punpckldq %mm2, %mm2
2741	punpckldq %mm3, %mm3
2742	punpckldq %mm4, %mm4
2743	punpckldq %mm5, %mm5
2744	punpckldq %mm6, %mm6
2745	punpckldq %mm7, %mm7
2746
2747	movq	%mm0,  0 * SIZE(BO)
2748	movq	%mm0,  2 * SIZE(BO)
2749	movq	%mm1,  4 * SIZE(BO)
2750	movq	%mm1,  6 * SIZE(BO)
2751	movq	%mm2,  8 * SIZE(BO)
2752	movq	%mm2, 10 * SIZE(BO)
2753	movq	%mm3, 12 * SIZE(BO)
2754	movq	%mm3, 14 * SIZE(BO)
2755	movq	%mm4, 16 * SIZE(BO)
2756	movq	%mm4, 18 * SIZE(BO)
2757	movq	%mm5, 20 * SIZE(BO)
2758	movq	%mm5, 22 * SIZE(BO)
2759	movq	%mm6, 24 * SIZE(BO)
2760	movq	%mm6, 26 * SIZE(BO)
2761	movq	%mm7, 28 * SIZE(BO)
2762	movq	%mm7, 30 * SIZE(BO)
2763
2764	addq	$ 8 * SIZE, B
2765	addq	$32 * SIZE, BO
2766#endif
2767
2768	decq	%rax
2769	jne	.L102
2770	ALIGN_4
2771
2772.L103:
2773	movq	K, %rax
2774	andq	$7, %rax
2775	BRANCH
2776	jle	.L110
2777	ALIGN_4
2778
2779.L104:
2780#if defined(PENTIUM4) || defined(GENERIC)
2781	movss	 0 * SIZE(B), %xmm0
2782	shufps	 $0, %xmm0, %xmm0
2783	movaps	%xmm0,  0 * SIZE(BO)
2784#endif
2785
2786#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2787	movd	 0 * SIZE(B), %mm0
2788	punpckldq %mm0, %mm0
2789	movq	%mm0,  0 * SIZE(BO)
2790	movq	%mm0,  2 * SIZE(BO)
2791#endif
2792
2793	addq	$ 1 * SIZE, B
2794	addq	$ 4 * SIZE, BO
2795	decq	%rax
2796	jne	.L104
2797	ALIGN_4
2798
2799.L110:
2800	movq	C, CO1			# coffset1 = c
2801	movq	A, AO		# aoffset = a
2802
2803	movq	M,  I
2804	sarq	$3, I	# i = (m >> 3)
2805	jle	.L120
2806	ALIGN_4
2807
2808.L111:
2809#if !defined(TRMMKERNEL) || \
2810	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2811	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2812
2813	leaq	BUFFER, BO
2814#else
2815	leaq	BUFFER, BO
2816	movq	KK, %rax
2817	leaq	(, %rax,   8), %rax
2818	leaq	(AO, %rax, 4), AO
2819	leaq	(BO, %rax, 2), BO
2820#endif
2821
2822	movaps	-32 * SIZE(AO), %xmm8
2823	movaps	-16 * SIZE(AO), %xmm10
2824	movaps	  0 * SIZE(AO), %xmm12
2825	movaps	 16 * SIZE(AO), %xmm14
2826
2827	movaps	 0 * SIZE(BO), %xmm9
2828	movaps	16 * SIZE(BO), %xmm11
2829	movaps	32 * SIZE(BO), %xmm13
2830	movaps	48 * SIZE(BO), %xmm15
2831
2832	xorps	%xmm0, %xmm0
2833	xorps	%xmm1, %xmm1
2834
2835	PREFETCHW      7 * SIZE(CO1)
2836	xorps	%xmm4, %xmm4
2837	xorps	%xmm5, %xmm5
2838
2839#ifndef TRMMKERNEL
2840	movq	K, %rax
2841#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
2842	movq	K, %rax
2843	subq	KK, %rax
2844	movq	%rax, KKK
2845#else
2846	movq	KK, %rax
2847#ifdef LEFT
2848	addq	$8, %rax
2849#else
2850	addq	$1, %rax
2851#endif
2852	movq	%rax, KKK
2853#endif
2854	sarq	$3, %rax
2855	je	.L115
2856	ALIGN_4
2857
2858.L112:
2859	mulps	%xmm9, %xmm8
2860#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2861	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
2862#endif
2863
2864	mulps	-28 * SIZE(AO), %xmm9
2865	addps	%xmm8, %xmm0
2866	movaps	-24 * SIZE(AO), %xmm8
2867	addps	%xmm9, %xmm4
2868	movaps	 4 * SIZE(BO), %xmm9
2869
2870	mulps	%xmm9, %xmm8
2871	mulps	-20 * SIZE(AO), %xmm9
2872	addps	%xmm8, %xmm0
2873	movaps	 32 * SIZE(AO), %xmm8
2874	addps	%xmm9, %xmm4
2875	movaps	 8 * SIZE(BO), %xmm9
2876
2877#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2878	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
2879#endif
2880	mulps	%xmm9, %xmm10
2881	mulps	-12 * SIZE(AO), %xmm9
2882	addps	%xmm10, %xmm0
2883	movaps	 -8 * SIZE(AO), %xmm10
2884	addps	%xmm9, %xmm4
2885	movaps	12 * SIZE(BO), %xmm9
2886
2887	mulps	%xmm9, %xmm10
2888	mulps	 -4 * SIZE(AO), %xmm9
2889	addps	%xmm10, %xmm0
2890	movaps	 48 * SIZE(AO), %xmm10
2891	addps	%xmm9, %xmm4
2892	movaps	32 * SIZE(BO), %xmm9
2893
2894#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2895	PREFETCH	(PREFETCHSIZE + 32) * SIZE(AO)
2896#endif
2897	mulps	%xmm11, %xmm12
2898	mulps	 4 * SIZE(AO), %xmm11
2899	addps	%xmm12, %xmm0
2900	movaps	 8 * SIZE(AO), %xmm12
2901	addps	%xmm11, %xmm4
2902	movaps	20 * SIZE(BO), %xmm11
2903
2904	mulps	%xmm11, %xmm12
2905	mulps	12 * SIZE(AO), %xmm11
2906	addps	%xmm12, %xmm0
2907	movaps	64 * SIZE(AO), %xmm12
2908	addps	%xmm11, %xmm4
2909	movaps	24 * SIZE(BO), %xmm11
2910
2911#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
2912	PREFETCH	(PREFETCHSIZE + 48) * SIZE(AO)
2913#endif
2914	mulps	%xmm11, %xmm14
2915	mulps	20 * SIZE(AO), %xmm11
2916	addps	%xmm14, %xmm0
2917	movaps	24 * SIZE(AO), %xmm14
2918	addps	%xmm11, %xmm4
2919	movaps	28 * SIZE(BO), %xmm11
2920
2921	mulps	%xmm11, %xmm14
2922	mulps	28 * SIZE(AO), %xmm11
2923	addps	%xmm14, %xmm0
2924	movaps	80 * SIZE(AO), %xmm14
2925	addps	%xmm11, %xmm4
2926	movaps	48 * SIZE(BO), %xmm11
2927
2928	addq   $64 * SIZE, AO
2929	addq   $32 * SIZE, BO
2930	decq   %rax
2931	jne    .L112
2932	ALIGN_4
2933
2934.L115:
2935#ifndef TRMMKERNEL
2936	movq	K, %rax
2937#else
2938	movq	KKK, %rax
2939#endif
2940	movaps	ALPHA, %xmm15
2941	andq	$7, %rax		# if (k & 1)
2942	BRANCH
2943	je .L118
2944	ALIGN_4
2945
2946.L116:
2947	mulps	%xmm9, %xmm8
2948	mulps	-28 * SIZE(AO), %xmm9
2949	addps	%xmm8, %xmm0
2950	movaps	-24 * SIZE(AO), %xmm8
2951	addps	%xmm9, %xmm4
2952	movaps	 4 * SIZE(BO), %xmm9
2953
2954	addq	$8 * SIZE, AO		# aoffset  += 4
2955	addq	$4 * SIZE, BO		# boffset1 += 8
2956	decq	%rax
2957	jg	.L116
2958	ALIGN_4
2959
2960.L118:
2961#ifndef TRMMKERNEL
2962	movsd	0 * SIZE(CO1), %xmm8
2963	movhps	2 * SIZE(CO1), %xmm8
2964	movsd	4 * SIZE(CO1), %xmm9
2965	movhps	6 * SIZE(CO1), %xmm9
2966#endif
2967
2968	mulps	%xmm15, %xmm0
2969	mulps	%xmm15, %xmm4
2970#ifndef TRMMKERNEL
2971	addps	%xmm8,  %xmm0
2972	addps	%xmm9,  %xmm4
2973#endif
2974
2975	movlps	%xmm0, 0 * SIZE(CO1)
2976	movhps	%xmm0, 2 * SIZE(CO1)
2977	movlps	%xmm4, 4 * SIZE(CO1)
2978	movhps	%xmm4, 6 * SIZE(CO1)
2979
2980#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
2981    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
2982	movq	K, %rax
2983	subq	KKK, %rax
2984	leaq	(,%rax,    8), %rax
2985	leaq	(AO, %rax, 4), AO
2986	leaq	(BO, %rax, 2), BO
2987#endif
2988
2989#if defined(TRMMKERNEL) && defined(LEFT)
2990	addq	$8, KK
2991#endif
2992
2993	addq	$8 * SIZE, CO1		# coffset += 4
2994	decq	I			# i --
2995	jg	.L111
2996	ALIGN_4
2997
2998.L120:
2999	testq	$4, M
3000	je	.L130
3001
3002#if !defined(TRMMKERNEL) || \
3003	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3004	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3005
3006	leaq	BUFFER, BO
3007#else
3008	leaq	BUFFER, BO
3009	movq	KK, %rax
3010	leaq	(, %rax,   8), %rax
3011	leaq	(AO, %rax, 2), AO
3012	leaq	(BO, %rax, 2), BO
3013#endif
3014
3015	movaps	-32 * SIZE(AO), %xmm8
3016	movaps	-16 * SIZE(AO), %xmm10
3017
3018	movaps	 0 * SIZE(BO), %xmm9
3019	movaps	16 * SIZE(BO), %xmm11
3020
3021	xorps	%xmm0, %xmm0
3022	xorps	%xmm1, %xmm1
3023	xorps	%xmm2, %xmm2
3024	xorps	%xmm3, %xmm3
3025
3026#ifndef TRMMKERNEL
3027	movq	K, %rax
3028#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3029	movq	K, %rax
3030	subq	KK, %rax
3031	movq	%rax, KKK
3032#else
3033	movq	KK, %rax
3034#ifdef LEFT
3035	addq	$4, %rax
3036#else
3037	addq	$1, %rax
3038#endif
3039	movq	%rax, KKK
3040#endif
3041	sarq	$3, %rax
3042	je	.L125
3043	ALIGN_4
3044
3045.L122:
3046	mulps	%xmm8, %xmm9
3047#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
3048	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
3049#endif
3050	movaps	-28 * SIZE(AO), %xmm8
3051	mulps	 4 * SIZE(BO), %xmm8
3052	addps	%xmm9, %xmm0
3053	movaps	32 * SIZE(BO), %xmm9
3054	addps	%xmm8, %xmm1
3055	movaps	-24 * SIZE(AO), %xmm8
3056	mulps	 8 * SIZE(BO), %xmm8
3057	addps	%xmm8, %xmm2
3058	movaps	-20 * SIZE(AO), %xmm8
3059	mulps	12 * SIZE(BO), %xmm8
3060	addps	%xmm8, %xmm3
3061	movaps	  0 * SIZE(AO), %xmm8
3062
3063#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
3064	PREFETCH	(PREFETCHSIZE + 16) * SIZE(AO)
3065#endif
3066	mulps	%xmm10, %xmm11
3067	movaps	-12 * SIZE(AO), %xmm10
3068	mulps	20 * SIZE(BO), %xmm10
3069	addps	%xmm11, %xmm0
3070	movaps	48 * SIZE(BO), %xmm11
3071	addps	%xmm10, %xmm1
3072	movaps	 -8 * SIZE(AO), %xmm10
3073	mulps	24 * SIZE(BO), %xmm10
3074	addps	%xmm10, %xmm2
3075	movaps	-4 * SIZE(AO), %xmm10
3076	mulps	28 * SIZE(BO), %xmm10
3077	addps	%xmm10, %xmm3
3078	movaps	16 * SIZE(AO), %xmm10
3079
3080	addq   $32 * SIZE, AO
3081	addq   $32 * SIZE, BO
3082	decq   %rax
3083	jne    .L122
3084	ALIGN_4
3085
3086.L125:
3087#ifndef TRMMKERNEL
3088	movq	K, %rax
3089#else
3090	movq	KKK, %rax
3091#endif
3092	movaps	ALPHA, %xmm15
3093	andq	$7, %rax		# if (k & 1)
3094	BRANCH
3095	je .L128
3096	ALIGN_4
3097
3098.L126:
3099	mulps	%xmm8, %xmm9
3100	movaps	-28 * SIZE(AO), %xmm8
3101	addps	%xmm9, %xmm0
3102	movaps	 4 * SIZE(BO), %xmm9
3103
3104	addq	$4 * SIZE, AO		# aoffset  += 4
3105	addq	$4 * SIZE, BO		# boffset1 += 8
3106	decq	%rax
3107	jg	.L126
3108	ALIGN_4
3109
3110.L128:
3111#ifndef TRMMKERNEL
3112	movsd	0 * SIZE(CO1), %xmm8
3113	movhps	2 * SIZE(CO1), %xmm8
3114#endif
3115
3116	addps	%xmm1, %xmm0
3117	addps	%xmm3, %xmm2
3118	addps	%xmm2, %xmm0
3119
3120	mulps	%xmm15, %xmm0
3121#ifndef TRMMKERNEL
3122	addps	%xmm8,  %xmm0
3123#endif
3124
3125	movlps	%xmm0, 0 * SIZE(CO1)
3126	movhps	%xmm0, 2 * SIZE(CO1)
3127
3128#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3129    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3130	movq	K, %rax
3131	subq	KKK, %rax
3132	leaq	(,%rax,    8), %rax
3133	leaq	(AO, %rax, 2), AO
3134	leaq	(BO, %rax, 2), BO
3135#endif
3136
3137#if defined(TRMMKERNEL) && defined(LEFT)
3138	addq	$4, KK
3139#endif
3140
3141	addq	$4 * SIZE, CO1		# coffset += 4
3142	ALIGN_4
3143
3144.L130:
3145	testq	$2, M
3146	je	.L140
3147
3148#if !defined(TRMMKERNEL) || \
3149	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3150	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3151
3152	leaq	BUFFER, BO
3153#else
3154	leaq	BUFFER, BO
3155	movq	KK, %rax
3156	leaq	(, %rax,   8), %rax
3157	leaq	(AO, %rax, 1), AO
3158	leaq	(BO, %rax, 2), BO
3159#endif
3160
3161	movaps	-32 * SIZE(AO), %xmm8
3162	movaps	-24 * SIZE(AO), %xmm10
3163
3164	movaps	 0 * SIZE(BO), %xmm9
3165	movaps	16 * SIZE(BO), %xmm11
3166
3167	xorps	%xmm0, %xmm0
3168	xorps	%xmm1, %xmm1
3169	xorps	%xmm2, %xmm2
3170	xorps	%xmm3, %xmm3
3171
3172#ifndef TRMMKERNEL
3173	movq	K, %rax
3174#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3175	movq	K, %rax
3176	subq	KK, %rax
3177	movq	%rax, KKK
3178#else
3179	movq	KK, %rax
3180#ifdef LEFT
3181	addq	$2, %rax
3182#else
3183	addq	$1, %rax
3184#endif
3185	movq	%rax, KKK
3186#endif
3187	sarq	$3, %rax
3188	je	.L135
3189	ALIGN_4
3190
3191.L132:
3192	mulps	%xmm8, %xmm9
3193#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
3194	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
3195#endif
3196	movsd	-30 * SIZE(AO), %xmm8
3197	addps	%xmm9, %xmm0
3198	movaps	 4 * SIZE(BO), %xmm9
3199	mulps	%xmm8, %xmm9
3200	movsd	-28 * SIZE(AO), %xmm8
3201	addps	%xmm9, %xmm1
3202	movaps	 8 * SIZE(BO), %xmm9
3203
3204	mulps	%xmm8, %xmm9
3205	movsd	-26 * SIZE(AO), %xmm8
3206	addps	%xmm9, %xmm0
3207	movaps	12 * SIZE(BO), %xmm9
3208
3209	mulps	%xmm8, %xmm9
3210	movsd	-16 * SIZE(AO), %xmm8
3211	addps	%xmm9, %xmm1
3212	movaps	32 * SIZE(BO), %xmm9
3213
3214	mulps	%xmm10, %xmm11
3215	movsd	-22 * SIZE(AO), %xmm10
3216	addps	%xmm11, %xmm0
3217	movaps	20 * SIZE(BO), %xmm11
3218
3219	mulps	%xmm10, %xmm11
3220	movsd	-20 * SIZE(AO), %xmm10
3221	addps	%xmm11, %xmm1
3222	movaps	24 * SIZE(BO), %xmm11
3223
3224	mulps	%xmm10, %xmm11
3225	movsd	-18 * SIZE(AO), %xmm10
3226	addps	%xmm11, %xmm0
3227	movaps	28 * SIZE(BO), %xmm11
3228
3229	mulps	%xmm10, %xmm11
3230	movsd	 -8 * SIZE(AO), %xmm10
3231	addps	%xmm11, %xmm1
3232	movaps	48 * SIZE(BO), %xmm11
3233
3234	addq   $16 * SIZE, AO
3235	addq   $32 * SIZE, BO
3236	decq   %rax
3237	jne    .L132
3238	ALIGN_4
3239
3240.L135:
3241#ifndef TRMMKERNEL
3242	movq	K, %rax
3243#else
3244	movq	KKK, %rax
3245#endif
3246	movaps	ALPHA, %xmm15
3247	andq	$7, %rax		# if (k & 1)
3248	BRANCH
3249	je .L138
3250	ALIGN_4
3251
3252.L136:
3253	mulps	%xmm8, %xmm9
3254	movsd	-30 * SIZE(AO), %xmm8
3255	addps	%xmm9, %xmm0
3256	movaps	 4 * SIZE(BO), %xmm9
3257
3258	addq	$2 * SIZE, AO		# aoffset  += 4
3259	addq	$4 * SIZE, BO		# boffset1 += 8
3260	decq	%rax
3261	jg	.L136
3262	ALIGN_4
3263
3264.L138:
3265	addps	%xmm1,  %xmm0
3266	mulps	%xmm15, %xmm0
3267
3268#ifndef TRMMKERNEL
3269#ifdef movsd
3270	xorps	%xmm8,  %xmm8
3271#endif
3272	movsd	0 * SIZE(CO1), %xmm8
3273	addps	%xmm8,  %xmm0
3274#endif
3275
3276	movlps	%xmm0, 0 * SIZE(CO1)
3277
3278#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3279    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3280	movq	K, %rax
3281	subq	KKK, %rax
3282	leaq	(,%rax,    8), %rax
3283	leaq	(AO, %rax, 1), AO
3284	leaq	(BO, %rax, 2), BO
3285#endif
3286
3287#if defined(TRMMKERNEL) && defined(LEFT)
3288	addq	$2, KK
3289#endif
3290
3291	addq	$2 * SIZE, CO1		# coffset += 4
3292	ALIGN_4
3293
3294.L140:
3295	testq	$1, M
3296	je	.L999
3297
3298#if !defined(TRMMKERNEL) || \
3299	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
3300	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
3301
3302	leaq	BUFFER, BO
3303#else
3304	leaq	BUFFER, BO
3305	movq	KK, %rax
3306	leaq	(, %rax,   4), %rax
3307	leaq	(AO, %rax, 1), AO
3308	leaq	(BO, %rax, 4), BO
3309#endif
3310
3311	movss	-32 * SIZE(AO), %xmm8
3312	movss	-28 * SIZE(AO), %xmm10
3313
3314	movss	 0 * SIZE(BO), %xmm9
3315	movss	16 * SIZE(BO), %xmm11
3316
3317	xorps	%xmm0, %xmm0
3318	xorps	%xmm1, %xmm1
3319	xorps	%xmm2, %xmm2
3320	xorps	%xmm3, %xmm3
3321
3322#ifndef TRMMKERNEL
3323	movq	K, %rax
3324#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
3325	movq	K, %rax
3326	subq	KK, %rax
3327	movq	%rax, KKK
3328#else
3329	movq	KK, %rax
3330#ifdef LEFT
3331	addq	$1, %rax
3332#else
3333	addq	$1, %rax
3334#endif
3335	movq	%rax, KKK
3336#endif
3337	sarq	$3, %rax
3338	je	.L145
3339	ALIGN_4
3340
3341.L142:
3342	mulss	%xmm8, %xmm9
3343#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
3344	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
3345#endif
3346	movss	-31 * SIZE(AO), %xmm8
3347	mulss	 4 * SIZE(BO), %xmm8
3348	addss	%xmm9, %xmm0
3349	movss	32 * SIZE(BO), %xmm9
3350	addss	%xmm8, %xmm1
3351	movss	-30 * SIZE(AO), %xmm8
3352	mulss	 8 * SIZE(BO), %xmm8
3353	addss	%xmm8, %xmm2
3354	movss	-29 * SIZE(AO), %xmm8
3355	mulss	12 * SIZE(BO), %xmm8
3356	addss	%xmm8, %xmm3
3357	movss	-24 * SIZE(AO), %xmm8
3358	mulss	%xmm10, %xmm11
3359	movss	-27 * SIZE(AO), %xmm10
3360	mulss	20 * SIZE(BO), %xmm10
3361	addss	%xmm11, %xmm0
3362	movss	48 * SIZE(BO), %xmm11
3363	addss	%xmm10, %xmm1
3364	movss	-26 * SIZE(AO), %xmm10
3365	mulss	24 * SIZE(BO), %xmm10
3366	addss	%xmm10, %xmm2
3367	movss	-25 * SIZE(AO), %xmm10
3368	mulss	28 * SIZE(BO), %xmm10
3369	addss	%xmm10, %xmm3
3370	movss	-20 * SIZE(AO), %xmm10
3371
3372	addq   $ 8 * SIZE, AO
3373	addq   $32 * SIZE, BO
3374	decq   %rax
3375	jne    .L142
3376	ALIGN_4
3377
3378.L145:
3379#ifndef TRMMKERNEL
3380	movq	K, %rax
3381#else
3382	movq	KKK, %rax
3383#endif
3384	movss	ALPHA, %xmm15
3385	andq	$7, %rax		# if (k & 1)
3386	BRANCH
3387	je .L148
3388	ALIGN_4
3389
3390.L146:
3391	mulss	%xmm8, %xmm9
3392	movss	-31 * SIZE(AO), %xmm8
3393	addss	%xmm9, %xmm0
3394	movss	 4 * SIZE(BO), %xmm9
3395
3396	addq	$1 * SIZE, AO
3397	addq	$4 * SIZE, BO
3398	decq	%rax
3399	jg	.L146
3400	ALIGN_4
3401
3402.L148:
3403	addss	%xmm1, %xmm0
3404	addss	%xmm3, %xmm2
3405	addss	%xmm2, %xmm0
3406
3407	mulss	%xmm15, %xmm0
3408
3409#ifndef TRMMKERNEL
3410	movss	0 * SIZE(CO1), %xmm8
3411	addss	%xmm8,  %xmm0
3412#endif
3413	movss	%xmm0, 0 * SIZE(CO1)
3414	ALIGN_4
3415
3416.L999:
3417	movq	%rbx, %rsp
3418
3419	EMMS
3420
3421	movq	  0(%rsp), %rbx
3422	movq	  8(%rsp), %rbp
3423	movq	 16(%rsp), %r12
3424	movq	 24(%rsp), %r13
3425	movq	 32(%rsp), %r14
3426	movq	 40(%rsp), %r15
3427
3428#ifdef WINDOWS_ABI
3429	movq	 48(%rsp), %rdi
3430	movq	 56(%rsp), %rsi
3431	movups	 64(%rsp), %xmm6
3432	movups	 80(%rsp), %xmm7
3433	movups	 96(%rsp), %xmm8
3434	movups	112(%rsp), %xmm9
3435	movups	128(%rsp), %xmm10
3436	movups	144(%rsp), %xmm11
3437	movups	160(%rsp), %xmm12
3438	movups	176(%rsp), %xmm13
3439	movups	192(%rsp), %xmm14
3440	movups	208(%rsp), %xmm15
3441#endif
3442
3443	addq	$STACKSIZE, %rsp
3444	ret
3445
3446	EPILOGUE
3447