1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
#define ASSEMBLER
#include "common.h"

/* Bytes pushed by the prologue (ebp/edi/esi/ebx = 4 registers * 4 bytes).
   Incoming cdecl arguments sit above these saves plus the return address. */
#define STACK	16

/* Incoming arguments, addressed through %esi, which the prologue loads
   with the caller's %esp BEFORE the frame is carved and realigned.
   Layout is the standard 32-bit cdecl argument area. */
#define OLD_M	 4 + STACK(%esi)
#define OLD_N	 8 + STACK(%esi)
#define OLD_K	12 + STACK(%esi)
#define OLD_ALPHA_R	16 + STACK(%esi)
#define OLD_ALPHA_I	20 + STACK(%esi)
#define OLD_A	24 + STACK(%esi)
#define OLD_B	28 + STACK(%esi)
#define OLD_C	32 + STACK(%esi)
#define OLD_LDC	36 + STACK(%esi)

/* Locals in the realigned scratch frame, addressed through the new %esp.
   ALPHA holds the packed [alpha_r, alpha_i, alpha_r, alpha_i] vector
   built in the prologue; OLD_STACK preserves the caller's %esp so the
   epilogue can restore it.  BUFFER (frame + 128) is the staging area
   into which panels of B are copied with each complex element
   duplicated (see the .L02/.L42 packing loops); LOCAL_BUFFER_SIZE
   bytes are reserved for it in the prologue. */
#define ALPHA	 0(%esp)
#define K	16(%esp)
#define N	20(%esp)
#define M	24(%esp)
#define A	28(%esp)
#define C	32(%esp)
#define J	36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET  44(%esp)
#define KK	48(%esp)
#define KKK	52(%esp)
#define BUFFER 128(%esp)
49
/* Per-target software-prefetch selection.  All supported targets here use
   prefetcht0 with the same distance; the blocks are kept separate so the
   distance can be tuned per CPU.
   NOTE(review): PREFETCH/PREFETCHSIZE are undefined when none of these
   CPU macros is set -- presumably the build system always defines exactly
   one for this kernel; confirm before adding new targets. */
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH     prefetcht0
#define PREFETCHSIZE   96
#endif

#ifdef PENTIUM4
#define PREFETCH     prefetcht0
#define PREFETCHSIZE   96
#endif

#ifdef PENTIUMM
#define PREFETCH     prefetcht0
#define PREFETCHSIZE   96
#endif

/* Register roles inside the microkernels:
   AA  -> current position in packed A panel
   BB  -> current position in packed/duplicated B buffer
   LDC -> leading dimension of C, scaled to bytes in the prologue */
#define AA	%edx
#define	BB	%ecx
#define LDC	%ebp
68
/* KERNEL1..KERNEL8: one fully unrolled block of 8 k-iterations for the
   4x4 (4 complex rows x 4 columns) microkernel.
   Register contract across the 8 macros:
     %xmm0, %xmm1 : alternating A vectors (4 floats = 2 complex elements),
                    each macro ends by preloading the A vector a later
                    macro will consume;
     %xmm2, %xmm3 : alternating B broadcast values, loaded with the SSE3
                    movsldup (duplicate even lanes) / movshdup (duplicate
                    odd lanes) pair so real and imaginary parts of B are
                    multiplied separately;
     %xmm4..%xmm7 : the four accumulators, one per C column.
   `address` is a byte-free element offset; AA advances 1x and BB 2x per
   unit because B was stored duplicated.  KERNEL1 also issues the software
   prefetch on A for the whole group.  The macros are `;`-separated
   single-statement bodies; comments are deliberately kept OUTSIDE the
   #define bodies so the expanded code is unchanged. */

/* k+0, columns 0/1 (accumulators xmm4..xmm7), prefetches A. */
#define KERNEL1(address) \
	mulps	%xmm0, %xmm2; \
	PREFETCH  (PREFETCHSIZE +  0) * SIZE + (address) * SIZE(AA); \
	addps	%xmm2, %xmm4; \
	movshdup  0 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm5; \
	movsldup  4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm6; \
	movshdup  4 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	movaps	  4 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \
	addps	%xmm2, %xmm7; \
	movsldup  8 * SIZE + 2 * (address) * SIZE(BB), %xmm2

/* k+1; tail preloads the B value KERNEL5 will use (xmm2, offset 32). */
#define KERNEL2(address) \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm4; \
	movshdup  8 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm5; \
	movsldup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm6; \
	movshdup 12 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	movaps	  8 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \
	addps	%xmm2, %xmm7; \
	movsldup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2

/* k+2: same pattern on the xmm3 B stream. */
#define KERNEL3(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movshdup 16 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movsldup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm6; \
	movshdup 20 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	movaps	 12 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \
	addps	%xmm3, %xmm7; \
	movsldup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3

/* k+3; tail preloads A for the NEXT unrolled group (offset 32). */
#define KERNEL4(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movshdup 24 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movsldup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm6; \
	movshdup 28 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	movaps	 32 * SIZE + 1 * (address) * SIZE(AA), %xmm0; \
	addps	%xmm3, %xmm7; \
	movsldup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3

/* k+4: switches to the xmm1 A stream; does NOT end with a B preload
   (KERNEL6 loads its own first B value instead). */
#define KERNEL5(address) \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movshdup 32 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movsldup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm6; \
	movshdup 36 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	movaps	 20 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \
	addps	%xmm2, %xmm7

/* k+5: begins with its own B load (see KERNEL5 note). */
#define KERNEL6(address) \
	movsldup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movshdup 40 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movsldup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm6; \
	movshdup 44 * SIZE + 2 * (address) * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	movaps	 24 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \
	addps	%xmm2, %xmm7; \
	movsldup 64 * SIZE + 2 * (address) * SIZE(BB), %xmm2

/* k+6. */
#define KERNEL7(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movshdup 48 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movsldup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm6; \
	movshdup 52 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	movaps	 28 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \
	addps	%xmm3, %xmm7; \
	movsldup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3

/* k+7; tails preload xmm1/xmm3 for the next unrolled group. */
#define KERNEL8(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movshdup 56 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movsldup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm6; \
	movshdup 60 * SIZE + 2 * (address) * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	movaps	 48 * SIZE + 1 * (address) * SIZE(AA), %xmm1; \
	addps	%xmm3, %xmm7; \
	movsldup 80 * SIZE + 2 * (address) * SIZE(BB), %xmm3
189
190
	PROLOGUE

	/* Save callee-saved registers (i386 cdecl: ebp, edi, esi, ebx). */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	/* Carve an aligned scratch frame.  %esi keeps the caller's %esp so
	   the OLD_* argument macros (and the epilogue) still work after the
	   realignment below. */
	movl	%esp, %esi	# save old stack
	subl	$128 + LOCAL_BUFFER_SIZE, %esp
	movl	OLD_M, %ebx
	andl	$-1024, %esp	# align stack

	STACK_TOUCHING

	/* Pull remaining arguments while %esi still points at them. */
	movl	OLD_N, %eax
	movl	OLD_K, %ecx
	movl	OLD_A, %edx
	movss	OLD_ALPHA_R,  %xmm0
	movss	OLD_ALPHA_I,  %xmm1

	/* Stash sizes/pointers into the new frame's local slots. */
	movl	%ebx, M
	movl	%eax, N
	movl	%ecx, K
	movl	%edx, A
	movl	%esi, OLD_STACK

	movl	OLD_B, %edi
	movl	OLD_C, %ebx

	/* Pack alpha as [alpha_r, alpha_i, alpha_r, alpha_i] for the
	   complex writeback (multiplied against pshufd-expanded sums). */
	unpcklps %xmm1, %xmm0
	movlhps	 %xmm0, %xmm0

	movaps	 %xmm0, ALPHA

	movl	%ebx, C
	movl	OLD_LDC, LDC
#ifdef TRMMKERNEL
	/* NOTE(review): %xmm4 is stored to OFFSET/KK here without any
	   visible load of an offset argument into %xmm4 -- presumably an
	   OLD_OFFSET load is expected for TRMM builds; confirm against the
	   full file / sibling kernels before enabling TRMMKERNEL. */
	movss	%xmm4, OFFSET
	movss	%xmm4, KK
#ifndef LEFT
	negl	KK
#endif
#endif

	/* Scale ldc from complex elements to bytes. */
	sall	$ZBASE_SHIFT, LDC

	/* J = N / 4 panels of four columns; skip to .L40 if none. */
	sarl	$2, %eax
	movl	%eax, J
	jle	.L40
242
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

/* Copying to Sub Buffer */
	/* Pack the current 4-column panel of B (at %edi) into BUFFER:
	   each complex element (2 floats) is duplicated into 4 floats via
	   movddup, so the microkernel can use full-width mulps against it.
	   Main loop handles 4 k-steps (16 source floats -> 32 packed
	   floats) per iteration. */
	leal	BUFFER, %ecx

	movl	K, %eax
	sarl	$2, %eax
	jle	.L05
	ALIGN_4

.L02:
	movddup	 0 * SIZE(%edi), %xmm0
	movddup	 2 * SIZE(%edi), %xmm1
	movddup	 4 * SIZE(%edi), %xmm2
	movddup	 6 * SIZE(%edi), %xmm3
	movddup	 8 * SIZE(%edi), %xmm4
	movddup	10 * SIZE(%edi), %xmm5
	movddup	12 * SIZE(%edi), %xmm6
	movddup	14 * SIZE(%edi), %xmm7

	movaps	%xmm0,  0 * SIZE(%ecx)
	movaps	%xmm1,  4 * SIZE(%ecx)
	movaps	%xmm2,  8 * SIZE(%ecx)
	movaps	%xmm3, 12 * SIZE(%ecx)
	movaps	%xmm4, 16 * SIZE(%ecx)
	movaps	%xmm5, 20 * SIZE(%ecx)
	movaps	%xmm6, 24 * SIZE(%ecx)
	movaps	%xmm7, 28 * SIZE(%ecx)

#	prefetcht1	128 * SIZE(%ecx)
	prefetcht0	112 * SIZE(%edi)

	addl	$16 * SIZE, %edi
	addl	$32 * SIZE, %ecx
	decl	%eax
	jne	.L02
	ALIGN_2

.L05:
	/* Remainder: pack the last K % 4 k-steps one at a time
	   (4 source floats -> 8 packed floats per step). */
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L10
	ALIGN_2

.L06:
	movddup	 0 * SIZE(%edi), %xmm0
	movddup	 2 * SIZE(%edi), %xmm1

	movaps	%xmm0,  0 * SIZE(%ecx)
	movaps	%xmm1,  4 * SIZE(%ecx)

	addl	$4 * SIZE, %edi
	addl	$8 * SIZE, %ecx
	decl	%eax
	jne	.L06
	ALIGN_4
304
305.L10:
306	movl	C, %esi		# coffset = c
307	movl	A, %edx		# aoffset = a
308	movl	M,  %ebx
309	sarl	$2, %ebx	# i = (m >> 2)
310	jle	.L20
311	ALIGN_4
312
313.L11:
314#if !defined(TRMMKERNEL) || \
315	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
316	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
317
318	leal	BUFFER, BB	# boffset1 = boffset
319#else
320	leal	BUFFER, BB	# boffset1 = boffset
321	movl	KK, %eax
322	leal	(, %eax,   8), %eax
323	leal	(AA, %eax, 2), AA
324	leal	(BB, %eax, 4), BB
325#endif
326
327	movaps	 0 * SIZE(AA), %xmm0
328	pxor	%xmm4, %xmm4
329	movaps	16 * SIZE(AA), %xmm1
330	pxor	%xmm5, %xmm5
331	movsldup  0 * SIZE(BB), %xmm2
332	pxor	%xmm6, %xmm6
333	movsldup 16 * SIZE(BB), %xmm3
334	pxor	%xmm7, %xmm7
335
336	leal	(LDC, LDC, 2), %eax
337
338	prefetchnta	4 * SIZE(%esi)
339	prefetchnta	4 * SIZE(%esi, LDC)
340	prefetchnta	4 * SIZE(%esi, LDC, 2)
341	prefetchnta	4 * SIZE(%esi, %eax)
342
343#ifndef TRMMKERNEL
344	movl	K, %eax
345#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
346	movl	K, %eax
347	subl	KK, %eax
348	movl	%eax, KKK
349#else
350	movl	KK, %eax
351#ifdef LEFT
352	addl	$4, %eax
353#else
354	addl	$4, %eax
355#endif
356	movl	%eax, KKK
357#endif
358
359#if 1
360	andl	$-8, %eax
361	sall	$4, %eax
362	je	.L15
363.L1X:
364	KERNEL1(32 *  0)
365	KERNEL2(32 *  0)
366	KERNEL3(32 *  0)
367	KERNEL4(32 *  0)
368	KERNEL5(32 *  0)
369	KERNEL6(32 *  0)
370	KERNEL7(32 *  0)
371	KERNEL8(32 *  0)
372	cmpl	$128 *  1, %eax
373	jle	.L12
374	KERNEL1(32 *  1)
375	KERNEL2(32 *  1)
376	KERNEL3(32 *  1)
377	KERNEL4(32 *  1)
378	KERNEL5(32 *  1)
379	KERNEL6(32 *  1)
380	KERNEL7(32 *  1)
381	KERNEL8(32 *  1)
382	cmpl	$128 *  2, %eax
383	jle	.L12
384	KERNEL1(32 *  2)
385	KERNEL2(32 *  2)
386	KERNEL3(32 *  2)
387	KERNEL4(32 *  2)
388	KERNEL5(32 *  2)
389	KERNEL6(32 *  2)
390	KERNEL7(32 *  2)
391	KERNEL8(32 *  2)
392	cmpl	$128 *  3, %eax
393	jle	.L12
394	KERNEL1(32 *  3)
395	KERNEL2(32 *  3)
396	KERNEL3(32 *  3)
397	KERNEL4(32 *  3)
398	KERNEL5(32 *  3)
399	KERNEL6(32 *  3)
400	KERNEL7(32 *  3)
401	KERNEL8(32 *  3)
402	cmpl	$128 *  4, %eax
403	jle	.L12
404	KERNEL1(32 *  4)
405	KERNEL2(32 *  4)
406	KERNEL3(32 *  4)
407	KERNEL4(32 *  4)
408	KERNEL5(32 *  4)
409	KERNEL6(32 *  4)
410	KERNEL7(32 *  4)
411	KERNEL8(32 *  4)
412	cmpl	$128 *  5, %eax
413	jle	.L12
414	KERNEL1(32 *  5)
415	KERNEL2(32 *  5)
416	KERNEL3(32 *  5)
417	KERNEL4(32 *  5)
418	KERNEL5(32 *  5)
419	KERNEL6(32 *  5)
420	KERNEL7(32 *  5)
421	KERNEL8(32 *  5)
422	cmpl	$128 *  6, %eax
423	jle	.L12
424	KERNEL1(32 *  6)
425	KERNEL2(32 *  6)
426	KERNEL3(32 *  6)
427	KERNEL4(32 *  6)
428	KERNEL5(32 *  6)
429	KERNEL6(32 *  6)
430	KERNEL7(32 *  6)
431	KERNEL8(32 *  6)
432	cmpl	$128 *  7, %eax
433	jle	.L12
434	KERNEL1(32 *  7)
435	KERNEL2(32 *  7)
436	KERNEL3(32 *  7)
437	KERNEL4(32 *  7)
438	KERNEL5(32 *  7)
439	KERNEL6(32 *  7)
440	KERNEL7(32 *  7)
441	KERNEL8(32 *  7)
442#if 1
443	cmpl	$128 *  8, %eax
444	jle	.L12
445	KERNEL1(32 *  8)
446	KERNEL2(32 *  8)
447	KERNEL3(32 *  8)
448	KERNEL4(32 *  8)
449	KERNEL5(32 *  8)
450	KERNEL6(32 *  8)
451	KERNEL7(32 *  8)
452	KERNEL8(32 *  8)
453	cmpl	$128 *  9, %eax
454	jle	.L12
455	KERNEL1(32 *  9)
456	KERNEL2(32 *  9)
457	KERNEL3(32 *  9)
458	KERNEL4(32 *  9)
459	KERNEL5(32 *  9)
460	KERNEL6(32 *  9)
461	KERNEL7(32 *  9)
462	KERNEL8(32 *  9)
463	cmpl	$128 * 10, %eax
464	jle	.L12
465	KERNEL1(32 * 10)
466	KERNEL2(32 * 10)
467	KERNEL3(32 * 10)
468	KERNEL4(32 * 10)
469	KERNEL5(32 * 10)
470	KERNEL6(32 * 10)
471	KERNEL7(32 * 10)
472	KERNEL8(32 * 10)
473	cmpl	$128 * 11, %eax
474	jle	.L12
475	KERNEL1(32 * 11)
476	KERNEL2(32 * 11)
477	KERNEL3(32 * 11)
478	KERNEL4(32 * 11)
479	KERNEL5(32 * 11)
480	KERNEL6(32 * 11)
481	KERNEL7(32 * 11)
482	KERNEL8(32 * 11)
483	cmpl	$128 * 12, %eax
484	jle	.L12
485	KERNEL1(32 * 12)
486	KERNEL2(32 * 12)
487	KERNEL3(32 * 12)
488	KERNEL4(32 * 12)
489	KERNEL5(32 * 12)
490	KERNEL6(32 * 12)
491	KERNEL7(32 * 12)
492	KERNEL8(32 * 12)
493	cmpl	$128 * 13, %eax
494	jle	.L12
495	KERNEL1(32 * 13)
496	KERNEL2(32 * 13)
497	KERNEL3(32 * 13)
498	KERNEL4(32 * 13)
499	KERNEL5(32 * 13)
500	KERNEL6(32 * 13)
501	KERNEL7(32 * 13)
502	KERNEL8(32 * 13)
503	cmpl	$128 * 14, %eax
504	jle	.L12
505	KERNEL1(32 * 14)
506	KERNEL2(32 * 14)
507	KERNEL3(32 * 14)
508	KERNEL4(32 * 14)
509	KERNEL5(32 * 14)
510	KERNEL6(32 * 14)
511	KERNEL7(32 * 14)
512	KERNEL8(32 * 14)
513	cmpl	$128 * 15, %eax
514	jle	.L12
515	KERNEL1(32 * 15)
516	KERNEL2(32 * 15)
517	KERNEL3(32 * 15)
518	KERNEL4(32 * 15)
519	KERNEL5(32 * 15)
520	KERNEL6(32 * 15)
521	KERNEL7(32 * 15)
522	KERNEL8(32 * 15)
523#else
524	addl	$128 * 4  * SIZE, BB
525	addl	$128 * 2  * SIZE, AA
526	subl	$128 * 8, %eax
527	jg	.L1X
528	jmp	.L15
529#endif
530
531.L12:
532	leal	(AA, %eax, 1), AA
533	leal	(BB, %eax, 2), BB
534	ALIGN_4
535#else
536	sarl	$3, %eax
537	je	.L15
538	ALIGN_4
539
540.L12:
541	KERNEL1(32 *  7)
542	KERNEL2(32 *  7)
543	KERNEL3(32 *  7)
544	KERNEL4(32 *  7)
545	KERNEL5(32 *  7)
546	KERNEL6(32 *  7)
547	KERNEL7(32 *  7)
548	KERNEL8(32 *  7)
549
550	addl   $32 * SIZE, AA
551	addl   $64 * SIZE, BB
552	decl   %eax
553	jne    .L12
554	ALIGN_4
555#endif
556
.L15:
	/* k-remainder for the 4x4 microkernel: handle the last K % 8
	   (or KKK % 8 under TRMM) iterations one k-step at a time.
	   Also preload ALPHA into %xmm3 for the .L18 writeback. */
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA,  %xmm3
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L18
	ALIGN_4

.L16:
	/* One k-step: same movsldup/movshdup broadcast pattern as the
	   KERNEL* macros, accumulating into xmm4..xmm7. */
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movshdup  0 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm5
	movsldup  4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm6
	movshdup  4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movaps	  4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movsldup  8 * SIZE(BB), %xmm2

	addl	$4 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L16
	ALIGN_4
589
590.L18:
591	leal	(LDC, LDC, 2), %eax
592
593	movsd	 0 * SIZE(%esi), %xmm0
594	movhps	 2 * SIZE(%esi), %xmm0
595	movsd	 4 * SIZE(%esi), %xmm1
596	movhps	 6 * SIZE(%esi), %xmm1
597
598	pshufd	$0x50, %xmm4,  %xmm2
599	pshufd	$0xfa, %xmm4,  %xmm4
600
601	mulps	%xmm3, %xmm2
602	mulps	%xmm3, %xmm4
603
604	addps	%xmm2, %xmm0
605	addps	%xmm4, %xmm1
606
607	movlps	%xmm0,   0 * SIZE(%esi)
608	movhps	%xmm0,   2 * SIZE(%esi)
609	movlps	%xmm1,   4 * SIZE(%esi)
610	movhps	%xmm1,   6 * SIZE(%esi)
611
612	movsd	 0 * SIZE(%esi, LDC), %xmm0
613	movhps	 2 * SIZE(%esi, LDC), %xmm0
614	movsd	 4 * SIZE(%esi, LDC), %xmm1
615	movhps	 6 * SIZE(%esi, LDC), %xmm1
616
617	pshufd	$0x50, %xmm5,  %xmm2
618	pshufd	$0xfa, %xmm5,  %xmm5
619
620	mulps	%xmm3, %xmm2
621	mulps	%xmm3, %xmm5
622
623	addps	%xmm2, %xmm0
624	addps	%xmm5, %xmm1
625
626	movlps	%xmm0,   0 * SIZE(%esi, LDC)
627	movhps	%xmm0,   2 * SIZE(%esi, LDC)
628	movlps	%xmm1,   4 * SIZE(%esi, LDC)
629	movhps	%xmm1,   6 * SIZE(%esi, LDC)
630
631	movsd	 0 * SIZE(%esi, LDC, 2), %xmm0
632	movhps	 2 * SIZE(%esi, LDC, 2), %xmm0
633	movsd	 4 * SIZE(%esi, LDC, 2), %xmm1
634	movhps	 6 * SIZE(%esi, LDC, 2), %xmm1
635
636	pshufd	$0x50, %xmm6,  %xmm2
637	pshufd	$0xfa, %xmm6,  %xmm6
638
639	mulps	%xmm3, %xmm2
640	mulps	%xmm3, %xmm6
641
642	addps	%xmm2, %xmm0
643	addps	%xmm6, %xmm1
644
645	movlps	%xmm0,   0 * SIZE(%esi, LDC, 2)
646	movhps	%xmm0,   2 * SIZE(%esi, LDC, 2)
647	movlps	%xmm1,   4 * SIZE(%esi, LDC, 2)
648	movhps	%xmm1,   6 * SIZE(%esi, LDC, 2)
649
650	movsd	 0 * SIZE(%esi, %eax), %xmm0
651	movhps	 2 * SIZE(%esi, %eax), %xmm0
652	movsd	 4 * SIZE(%esi, %eax), %xmm1
653	movhps	 6 * SIZE(%esi, %eax), %xmm1
654
655	pshufd	$0x50, %xmm7,  %xmm2
656	pshufd	$0xfa, %xmm7,  %xmm7
657
658	mulps	%xmm3, %xmm2
659	mulps	%xmm3, %xmm7
660
661	addps	%xmm2, %xmm0
662	addps	%xmm7, %xmm1
663
664	movlps	%xmm0,   0 * SIZE(%esi, %eax)
665	movhps	%xmm0,   2 * SIZE(%esi, %eax)
666	movlps	%xmm1,   4 * SIZE(%esi, %eax)
667	movhps	%xmm1,   6 * SIZE(%esi, %eax)
668
669	addl	$8 * SIZE, %esi		# coffset += 2
670	decl	%ebx			# i --
671	jg	.L11
672	ALIGN_4
673
674.L20:
675	testl	$2, M
676	je	.L30
677
678#if !defined(TRMMKERNEL) || \
679	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
680	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
681
682	leal	BUFFER, BB	# boffset1 = boffset
683#else
684	leal	BUFFER, BB	# boffset1 = boffset
685	movl	KK, %eax
686	leal	(, %eax,   8), %eax
687	leal	(AA, %eax, 1), AA
688	leal	(BB, %eax, 4), BB
689#endif
690
691	movddup	  0 * SIZE(AA), %xmm0
692	pxor	%xmm4, %xmm4
693	movddup	  8 * SIZE(AA), %xmm1
694	pxor	%xmm5, %xmm5
695	movsd     0 * SIZE(BB), %xmm2
696	movsd    16 * SIZE(BB), %xmm3
697
698#ifndef TRMMKERNEL
699	movl	K, %eax
700#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
701	movl	K, %eax
702	subl	KK, %eax
703	movl	%eax, KKK
704#else
705	movl	KK, %eax
706#ifdef LEFT
707	addl	$2, %eax
708#else
709	addl	$4, %eax
710#endif
711	movl	%eax, KKK
712#endif
713	sarl	$3, %eax
714	je	.L25
715	ALIGN_4
716
717.L22:
718	shufps	$0x50, %xmm2, %xmm2
719	mulps	%xmm0, %xmm2
720	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
721	addps	%xmm2, %xmm4
722	movsd     4 * SIZE(BB), %xmm2
723	shufps	$0x50, %xmm2, %xmm2
724	mulps	%xmm0, %xmm2
725	movddup	  2 * SIZE(AA), %xmm0
726	addps	%xmm2, %xmm5
727	movsd     8 * SIZE(BB), %xmm2
728	shufps	$0x50, %xmm2, %xmm2
729	mulps	%xmm0, %xmm2
730	addps	%xmm2, %xmm4
731	movsd    12 * SIZE(BB), %xmm2
732	shufps	$0x50, %xmm2, %xmm2
733	mulps	%xmm0, %xmm2
734	movddup	  4 * SIZE(AA), %xmm0
735	addps	%xmm2, %xmm5
736	movsd    32 * SIZE(BB), %xmm2
737	shufps	$0x50, %xmm3, %xmm3
738	mulps	%xmm0, %xmm3
739	addps	%xmm3, %xmm4
740	movsd    20 * SIZE(BB), %xmm3
741	shufps	$0x50, %xmm3, %xmm3
742	mulps	%xmm0, %xmm3
743	movddup	  6 * SIZE(AA), %xmm0
744	addps	%xmm3, %xmm5
745	movsd    24 * SIZE(BB), %xmm3
746	shufps	$0x50, %xmm3, %xmm3
747	mulps	%xmm0, %xmm3
748	addps	%xmm3, %xmm4
749	movsd    28 * SIZE(BB), %xmm3
750	shufps	$0x50, %xmm3, %xmm3
751	mulps	%xmm0, %xmm3
752	movddup	 16 * SIZE(AA), %xmm0
753	addps	%xmm3, %xmm5
754	movsd    48 * SIZE(BB), %xmm3
755	shufps	$0x50, %xmm2, %xmm2
756	mulps	%xmm1, %xmm2
757	addps	%xmm2, %xmm4
758	movsd    36 * SIZE(BB), %xmm2
759	shufps	$0x50, %xmm2, %xmm2
760	mulps	%xmm1, %xmm2
761	movddup	 10 * SIZE(AA), %xmm1
762	addps	%xmm2, %xmm5
763	movsd    40 * SIZE(BB), %xmm2
764	shufps	$0x50, %xmm2, %xmm2
765	mulps	%xmm1, %xmm2
766	addps	%xmm2, %xmm4
767	movsd    44 * SIZE(BB), %xmm2
768	shufps	$0x50, %xmm2, %xmm2
769	mulps	%xmm1, %xmm2
770	movddup	 12 * SIZE(AA), %xmm1
771	addps	%xmm2, %xmm5
772	movsd    64 * SIZE(BB), %xmm2
773	shufps	$0x50, %xmm3, %xmm3
774	mulps	%xmm1, %xmm3
775	addps	%xmm3, %xmm4
776	movsd    52 * SIZE(BB), %xmm3
777	shufps	$0x50, %xmm3, %xmm3
778	mulps	%xmm1, %xmm3
779	movddup	 14 * SIZE(AA), %xmm1
780	addps	%xmm3, %xmm5
781	movsd    56 * SIZE(BB), %xmm3
782	shufps	$0x50, %xmm3, %xmm3
783	mulps	%xmm1, %xmm3
784	addps	%xmm3, %xmm4
785	movsd    60 * SIZE(BB), %xmm3
786	shufps	$0x50, %xmm3, %xmm3
787	mulps	%xmm1, %xmm3
788	movddup	 24 * SIZE(AA), %xmm1
789	addps	%xmm3, %xmm5
790	movsd    80 * SIZE(BB), %xmm3
791
792	addl	$16 * SIZE, AA
793	addl	$64 * SIZE, BB
794	decl   %eax
795	jne    .L22
796	ALIGN_4
797
798.L25:
799#ifndef TRMMKERNEL
800	movl	K, %eax
801#else
802	movl	KKK, %eax
803#endif
804	movaps	ALPHA,  %xmm3
805	andl	$7, %eax		# if (k & 1)
806	BRANCH
807	je .L28
808	ALIGN_4
809
810.L26:
811	shufps	$0x50, %xmm2, %xmm2
812	mulps	%xmm0, %xmm2
813	addps	%xmm2, %xmm4
814	movsd     4 * SIZE(BB), %xmm2
815	shufps	$0x50, %xmm2, %xmm2
816	mulps	%xmm0, %xmm2
817 	movddup	  2 * SIZE(AA), %xmm0
818	addps	%xmm2, %xmm5
819	movsd     8 * SIZE(BB), %xmm2
820
821	addl	$2 * SIZE, AA
822	addl	$8 * SIZE, BB
823	decl	%eax
824	jg	.L26
825	ALIGN_4
826
827.L28:
828	leal	(LDC, LDC, 2), %eax
829
830	movsd	 0 * SIZE(%esi), %xmm0
831	movhps	 2 * SIZE(%esi), %xmm0
832	movsd	 0 * SIZE(%esi, LDC), %xmm1
833	movhps	 2 * SIZE(%esi, LDC), %xmm1
834
835	pshufd	$0x50, %xmm4,  %xmm2
836	pshufd	$0xfa, %xmm4,  %xmm4
837
838	mulps	%xmm3, %xmm2
839	mulps	%xmm3, %xmm4
840
841	addps	%xmm2, %xmm0
842	addps	%xmm4, %xmm1
843
844	movlps	%xmm0,   0 * SIZE(%esi)
845	movhps	%xmm0,   2 * SIZE(%esi)
846	movlps	%xmm1,   0 * SIZE(%esi, LDC)
847	movhps	%xmm1,   2 * SIZE(%esi, LDC)
848
849	movsd	 0 * SIZE(%esi, LDC, 2), %xmm0
850	movhps	 2 * SIZE(%esi, LDC, 2), %xmm0
851	movsd	 0 * SIZE(%esi, %eax), %xmm1
852	movhps	 2 * SIZE(%esi, %eax), %xmm1
853
854	pshufd	$0x50, %xmm5,  %xmm2
855	pshufd	$0xfa, %xmm5,  %xmm5
856
857	mulps	%xmm3, %xmm2
858	mulps	%xmm3, %xmm5
859
860	addps	%xmm2, %xmm0
861	addps	%xmm5, %xmm1
862
863	movlps	%xmm0,   0 * SIZE(%esi, LDC, 2)
864	movhps	%xmm0,   2 * SIZE(%esi, LDC, 2)
865	movlps	%xmm1,   0 * SIZE(%esi, %eax)
866	movhps	%xmm1,   2 * SIZE(%esi, %eax)
867
868	addl	$4 * SIZE, %esi		# coffset += 2
869	ALIGN_4
870
871.L30:
872	testl	$1, M
873	je	.L39
874
875#if !defined(TRMMKERNEL) || \
876	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
877	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
878
879	leal	BUFFER, BB	# boffset1 = boffset
880#else
881	leal	BUFFER, BB	# boffset1 = boffset
882	movl	KK, %eax
883	leal	(, %eax,   4), %eax
884	leal	(AA, %eax, 1), AA
885	leal	(BB, %eax, 8), BB
886#endif
887
888	movss	 0 * SIZE(AA), %xmm0
889	pxor	%xmm4, %xmm4
890	movss	 4 * SIZE(AA), %xmm1
891	pxor	%xmm5, %xmm5
892	movsd	 0 * SIZE(BB), %xmm2
893	movsd	16 * SIZE(BB), %xmm3
894
895#ifndef TRMMKERNEL
896	movl	K, %eax
897#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
898	movl	K, %eax
899	subl	KK, %eax
900	movl	%eax, KKK
901#else
902	movl	KK, %eax
903#ifdef LEFT
904	addl	$1, %eax
905#else
906	addl	$4, %eax
907#endif
908	movl	%eax, KKK
909#endif
910	sarl	$3, %eax
911	je	.L35
912	ALIGN_4
913
914.L32:
915	shufps	$0, %xmm0, %xmm0
916	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
917	movhps	 4 * SIZE(BB), %xmm2
918	mulps	%xmm0, %xmm2
919	movss	 1 * SIZE(AA), %xmm0
920	addps	%xmm2, %xmm4
921	movsd	 8 * SIZE(BB), %xmm2
922	shufps	$0, %xmm0, %xmm0
923	movhps	12 * SIZE(BB), %xmm2
924	mulps	%xmm0, %xmm2
925	movss	 2 * SIZE(AA), %xmm0
926	addps	%xmm2, %xmm5
927	movhps	20 * SIZE(BB), %xmm3
928	shufps	$0, %xmm0, %xmm0
929	movsd	32 * SIZE(BB), %xmm2
930	mulps	%xmm0, %xmm3
931	movss	 3 * SIZE(AA), %xmm0
932	addps	%xmm3, %xmm4
933	movsd	24 * SIZE(BB), %xmm3
934	shufps	$0, %xmm0, %xmm0
935	movhps	28 * SIZE(BB), %xmm3
936	mulps	%xmm0, %xmm3
937	movss	 8 * SIZE(AA), %xmm0
938	addps	%xmm3, %xmm5
939	movsd	48 * SIZE(BB), %xmm3
940	shufps	$0, %xmm1, %xmm1
941	movhps	36 * SIZE(BB), %xmm2
942	mulps	%xmm1, %xmm2
943	movss	 5 * SIZE(AA), %xmm1
944	addps	%xmm2, %xmm4
945	movsd	40 * SIZE(BB), %xmm2
946	shufps	$0, %xmm1, %xmm1
947	movhps	44 * SIZE(BB), %xmm2
948	mulps	%xmm1, %xmm2
949	movss	 6 * SIZE(AA), %xmm1
950	addps	%xmm2, %xmm5
951	movsd	64 * SIZE(BB), %xmm2
952	shufps	$0, %xmm1, %xmm1
953	movhps	52 * SIZE(BB), %xmm3
954	mulps	%xmm1, %xmm3
955	movss	 7 * SIZE(AA), %xmm1
956	addps	%xmm3, %xmm4
957	movsd	56 * SIZE(BB), %xmm3
958	shufps	$0, %xmm1, %xmm1
959	movhps	60 * SIZE(BB), %xmm3
960	mulps	%xmm1, %xmm3
961	movss	12 * SIZE(AA), %xmm1
962	addps	%xmm3, %xmm5
963	movsd	80 * SIZE(BB), %xmm3
964
965	addl	$ 8 * SIZE, AA
966	addl	$64 * SIZE, BB
967	decl   %eax
968	jne    .L32
969	ALIGN_4
970
971.L35:
972#ifndef TRMMKERNEL
973	movl	K, %eax
974#else
975	movl	KKK, %eax
976#endif
977	movaps	ALPHA,  %xmm3
978	andl	$7, %eax		# if (k & 1)
979	BRANCH
980	je .L38
981	ALIGN_4
982
983.L36:
984	shufps	$0, %xmm0, %xmm0
985	movhps	 4 * SIZE(BB), %xmm2
986	mulps	%xmm0, %xmm2
987	movss	 1 * SIZE(AA), %xmm0
988	addps	%xmm2, %xmm4
989	movsd	 8 * SIZE(BB), %xmm2
990
991	addl	$1 * SIZE, AA
992	addl	$8 * SIZE, BB
993	decl	%eax
994	jg	.L36
995	ALIGN_4
996
997.L38:
998	leal	(LDC, LDC, 2), %eax
999
1000	addps	%xmm5, %xmm4
1001
1002	movsd	 (%esi), %xmm0
1003	movhps	 (%esi, LDC), %xmm0
1004	movsd	 (%esi, LDC, 2), %xmm1
1005	movhps	 (%esi, %eax), %xmm1
1006
1007	pshufd	$0x50, %xmm4,  %xmm2
1008	pshufd	$0xfa, %xmm4,  %xmm4
1009
1010	mulps	%xmm3, %xmm2
1011	mulps	%xmm3, %xmm4
1012
1013	addps	%xmm2, %xmm0
1014	addps	%xmm4, %xmm1
1015
1016	movlps	%xmm0,   (%esi)
1017	movhps	%xmm0,   (%esi, LDC)
1018	movlps	%xmm1,   (%esi, LDC, 2)
1019	movhps	%xmm1,   (%esi, %eax)
1020	ALIGN_4
1021
1022.L39:
1023#if defined(TRMMKERNEL) && !defined(LEFT)
1024	addl	$4, KK
1025#endif
1026
1027	leal	(, LDC, 4), %eax
1028	addl	%eax, C			# c += 4 * ldc
1029	decl	J			# j --
1030	jg	.L01
1031	ALIGN_4
1032
1033.L40:
1034	testl	$2, N
1035	je	.L80
1036
1037#if defined(TRMMKERNEL) && defined(LEFT)
1038	movl	OFFSET, %eax
1039	movl	%eax, KK
1040#endif
1041
1042	movl	K, %eax
1043	leal	BUFFER, %ecx
1044	sarl	$3, %eax
1045	jle	.L45
1046	ALIGN_4
1047
1048.L42:
1049	movddup	 0 * SIZE(%edi), %xmm0
1050	movddup	 2 * SIZE(%edi), %xmm1
1051	movddup	 4 * SIZE(%edi), %xmm2
1052	movddup	 6 * SIZE(%edi), %xmm3
1053	movddup	 8 * SIZE(%edi), %xmm4
1054	movddup	10 * SIZE(%edi), %xmm5
1055	movddup	12 * SIZE(%edi), %xmm6
1056	movddup	14 * SIZE(%edi), %xmm7
1057
1058	movaps	%xmm0,  0 * SIZE(%ecx)
1059	movaps	%xmm1,  4 * SIZE(%ecx)
1060	movaps	%xmm2,  8 * SIZE(%ecx)
1061	movaps	%xmm3, 12 * SIZE(%ecx)
1062	movaps	%xmm4, 16 * SIZE(%ecx)
1063	movaps	%xmm5, 20 * SIZE(%ecx)
1064	movaps	%xmm6, 24 * SIZE(%ecx)
1065	movaps	%xmm7, 28 * SIZE(%ecx)
1066
1067#	prefetcht1	128 * SIZE(%ecx)
1068	prefetcht0	112 * SIZE(%edi)
1069
1070	addl	$16 * SIZE, %edi
1071	addl	$32 * SIZE, %ecx
1072	decl	%eax
1073	jne	.L42
1074	ALIGN_4
1075
1076.L45:
1077	movl	K, %eax
1078	andl	$7, %eax
1079	BRANCH
1080	jle	.L50
1081	ALIGN_4
1082
1083.L46:
1084	movddup	 0 * SIZE(%edi), %xmm0
1085	movaps	%xmm0,  0 * SIZE(%ecx)
1086
1087	addl	$2 * SIZE, %edi
1088	addl	$4 * SIZE, %ecx
1089	decl	%eax
1090	jne	.L46
1091	ALIGN_4
1092
1093.L50:
1094	movl	C, %esi		# coffset = c
1095	movl	A, %edx		# aoffset = a
1096	movl	M,  %ebx
1097	sarl	$2, %ebx	# i = (m >> 2)
1098	jle	.L60
1099	ALIGN_4
1100
1101.L51:
1102#if !defined(TRMMKERNEL) || \
1103	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
1104	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
1105
1106	leal	BUFFER, BB	# boffset1 = boffset
1107#else
1108	leal	BUFFER, BB	# boffset1 = boffset
1109	movl	KK, %eax
1110	leal	(, %eax,   8), %eax
1111	leal	(AA, %eax, 2), AA
1112	leal	(BB, %eax, 2), BB
1113#endif
1114
1115	movaps	  0 * SIZE(AA), %xmm0
1116	pxor	%xmm4, %xmm4
1117	movaps	 16 * SIZE(AA), %xmm1
1118	pxor	%xmm5, %xmm5
1119	movsldup  0 * SIZE(BB), %xmm2
1120	pxor	%xmm6, %xmm6
1121	movsldup 16 * SIZE(BB), %xmm3
1122	pxor	%xmm7, %xmm7
1123
1124	prefetcht2	4 * SIZE(%esi)
1125	prefetcht2	4 * SIZE(%esi, LDC)
1126
1127#ifndef TRMMKERNEL
1128	movl	K, %eax
1129#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
1130	movl	K, %eax
1131	subl	KK, %eax
1132	movl	%eax, KKK
1133#else
1134	movl	KK, %eax
1135#ifdef LEFT
1136	addl	$4, %eax
1137#else
1138	addl	$2, %eax
1139#endif
1140	movl	%eax, KKK
1141#endif
1142	sarl	$3, %eax
1143	je	.L55
1144	ALIGN_4
1145
1146.L52:
1147	mulps	%xmm0, %xmm2
1148	addps	%xmm2, %xmm4
1149	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
1150	movshdup  0 * SIZE(BB), %xmm2
1151	mulps	%xmm0, %xmm2
1152	movaps	  4 * SIZE(AA), %xmm0
1153	addps	%xmm2, %xmm5
1154	movsldup  4 * SIZE(BB), %xmm2
1155	mulps	%xmm0, %xmm2
1156	addps	%xmm2, %xmm4
1157	movshdup  4 * SIZE(BB), %xmm2
1158	mulps	%xmm0, %xmm2
1159	movaps	  8 * SIZE(AA), %xmm0
1160	addps	%xmm2, %xmm5
1161	movsldup  8 * SIZE(BB), %xmm2
1162	mulps	%xmm0, %xmm2
1163	addps	%xmm2, %xmm4
1164	movshdup  8 * SIZE(BB), %xmm2
1165	mulps	%xmm0, %xmm2
1166	movaps	 12 * SIZE(AA), %xmm0
1167	addps	%xmm2, %xmm5
1168	movsldup 12 * SIZE(BB), %xmm2
1169	mulps	%xmm0, %xmm2
1170	addps	%xmm2, %xmm4
1171	movshdup 12 * SIZE(BB), %xmm2
1172	mulps	%xmm0, %xmm2
1173	movaps	 32 * SIZE(AA), %xmm0
1174	addps	%xmm2, %xmm5
1175	movsldup 32 * SIZE(BB), %xmm2
1176	mulps	%xmm1, %xmm3
1177	addps	%xmm3, %xmm4
1178	movshdup 16 * SIZE(BB), %xmm3
1179	mulps	%xmm1, %xmm3
1180	movaps	 20 * SIZE(AA), %xmm1
1181	addps	%xmm3, %xmm5
1182	movsldup 20 * SIZE(BB), %xmm3
1183	mulps	%xmm1, %xmm3
1184	addps	%xmm3, %xmm4
1185	movshdup 20 * SIZE(BB), %xmm3
1186	mulps	%xmm1, %xmm3
1187	movaps	 24 * SIZE(AA), %xmm1
1188	addps	%xmm3, %xmm5
1189	movsldup 24 * SIZE(BB), %xmm3
1190	mulps	%xmm1, %xmm3
1191	addps	%xmm3, %xmm4
1192	movshdup 24 * SIZE(BB), %xmm3
1193	mulps	%xmm1, %xmm3
1194	movaps	 28 * SIZE(AA), %xmm1
1195	addps	%xmm3, %xmm5
1196	movsldup 28 * SIZE(BB), %xmm3
1197	mulps	%xmm1, %xmm3
1198	addps	%xmm3, %xmm4
1199	movshdup 28 * SIZE(BB), %xmm3
1200	mulps	%xmm1, %xmm3
1201	movaps	 48 * SIZE(AA), %xmm1
1202	addps	%xmm3, %xmm5
1203	movsldup 48 * SIZE(BB), %xmm3
1204
1205	addl	$32 * SIZE, AA
1206	addl	$32 * SIZE, BB
1207	decl   %eax
1208	jne    .L52
1209	ALIGN_4
1210
1211.L55:
1212#ifndef TRMMKERNEL
1213	movl	K, %eax
1214#else
1215	movl	KKK, %eax
1216#endif
1217	movaps	ALPHA,  %xmm3
1218	andl	$7, %eax		# if (k & 1)
1219	BRANCH
1220	je .L58
1221	ALIGN_4
1222
1223.L56:
1224	mulps	%xmm0, %xmm2
1225	addps	%xmm2, %xmm4
1226	movshdup  0 * SIZE(BB), %xmm2
1227	mulps	%xmm0, %xmm2
1228	movaps	  4 * SIZE(AA), %xmm0
1229	addps	%xmm2, %xmm5
1230	movsldup  4 * SIZE(BB), %xmm2
1231
1232	addl	$4 * SIZE, AA
1233	addl	$4 * SIZE, BB
1234	decl	%eax
1235	jg	.L56
1236	ALIGN_4
1237
1238.L58:
1239	movsd	 0 * SIZE(%esi), %xmm0
1240	movhps	 2 * SIZE(%esi), %xmm0
1241	movsd	 4 * SIZE(%esi), %xmm1
1242	movhps	 6 * SIZE(%esi), %xmm1
1243
1244	pshufd	$0x50, %xmm4,  %xmm2
1245	pshufd	$0xfa, %xmm4,  %xmm4
1246
1247	mulps	%xmm3, %xmm2
1248	mulps	%xmm3, %xmm4
1249
1250	addps	%xmm2, %xmm0
1251	addps	%xmm4, %xmm1
1252
1253	movlps	%xmm0,   0 * SIZE(%esi)
1254	movhps	%xmm0,   2 * SIZE(%esi)
1255	movlps	%xmm1,   4 * SIZE(%esi)
1256	movhps	%xmm1,   6 * SIZE(%esi)
1257
1258	movsd	 0 * SIZE(%esi, LDC), %xmm0
1259	movhps	 2 * SIZE(%esi, LDC), %xmm0
1260	movsd	 4 * SIZE(%esi, LDC), %xmm1
1261	movhps	 6 * SIZE(%esi, LDC), %xmm1
1262
1263	pshufd	$0x50, %xmm5,  %xmm2
1264	pshufd	$0xfa, %xmm5,  %xmm5
1265
1266	mulps	%xmm3, %xmm2
1267	mulps	%xmm3, %xmm5
1268
1269	addps	%xmm2, %xmm0
1270	addps	%xmm5, %xmm1
1271
1272	movlps	%xmm0,   0 * SIZE(%esi, LDC)
1273	movhps	%xmm0,   2 * SIZE(%esi, LDC)
1274	movlps	%xmm1,   4 * SIZE(%esi, LDC)
1275	movhps	%xmm1,   6 * SIZE(%esi, LDC)
1276
1277	addl	$8 * SIZE, %esi		# coffset += 2
1278	decl	%ebx			# i --
1279	jg	.L51
1280	ALIGN_4
1281
.L60:
	# Setup for the M % 4 >= 2 slice of the current 2-column panel.
	testl	$2, M
	je	.L70

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax,   8), %eax	# skip KK k-iterations: A uses 2 floats/k
	leal	(AA, %eax, 1), AA	#   (KK*8 bytes), buffered B uses 4
	leal	(BB, %eax, 2), BB	#   floats/k (KK*16 bytes)
#endif

	movddup	  0 * SIZE(AA), %xmm0	# [a0,a1,a0,a1]: two A values duplicated
	pxor	%xmm4, %xmm4		# accumulator (even k-steps)
	movddup	  8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5		# accumulator (odd k-steps); folded at .L68
	movsd     0 * SIZE(BB), %xmm2	# first two buffered B values
	movsd    16 * SIZE(BB), %xmm3

	# NOTE(review): a dead "leal (LDC, LDC, 2), %eax" was removed here;
	# %eax is unconditionally reloaded below before any use, in every
	# preprocessor branch (leftover from a wider-N kernel variant).

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax		# trailing part of K for this TRMM case
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax		# M block size = 2
#else
	addl	$2, %eax		# N block size = 2
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax		# main loop below is unrolled 8x in k
	je	.L65
	ALIGN_4
1326
.L62:
	# Main k-loop (unrolled 8x) for the M%4>=2 slice of the 2-column panel.
	# Per k-step: xmm0/xmm1 = two A values duplicated ([a0,a1,a0,a1] via
	# movddup), xmm2/xmm3 = two buffered B values expanded to [b0,b0,b1,b1]
	# (shufps $0x50), so a single mulps forms all four M=2 x N=2 products;
	# sums alternate between xmm4 and xmm5 to break the dependency chain.
	shufps	$0x50, %xmm2, %xmm2
	mulps	%xmm0, %xmm2
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
	movddup	  2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movsd     4 * SIZE(BB), %xmm2
	shufps	$0x50, %xmm2, %xmm2
	mulps	%xmm0, %xmm2
	movddup	  4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movsd     8 * SIZE(BB), %xmm2
	shufps	$0x50, %xmm2, %xmm2
	mulps	%xmm0, %xmm2
	movddup	  6 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movsd    12 * SIZE(BB), %xmm2
	shufps	$0x50, %xmm2, %xmm2
	mulps	%xmm0, %xmm2
	movddup	 16 * SIZE(AA), %xmm0	# preload for the next iteration
	addps	%xmm2, %xmm5
	movsd    32 * SIZE(BB), %xmm2	# preload for the next iteration
	shufps	$0x50, %xmm3, %xmm3
	mulps	%xmm1, %xmm3
	movddup	 10 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movsd    20 * SIZE(BB), %xmm3
	shufps	$0x50, %xmm3, %xmm3
	mulps	%xmm1, %xmm3
	movddup	 12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movsd    24 * SIZE(BB), %xmm3
	shufps	$0x50, %xmm3, %xmm3
	mulps	%xmm1, %xmm3
	movddup	 14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movsd    28 * SIZE(BB), %xmm3
	shufps	$0x50, %xmm3, %xmm3
	mulps	%xmm1, %xmm3
	movddup	 24 * SIZE(AA), %xmm1	# preload for the next iteration
	addps	%xmm3, %xmm5
	movsd    48 * SIZE(BB), %xmm3	# preload for the next iteration

	addl	$16 * SIZE, AA		# 8 k-steps * 2 floats of A
	addl	$32 * SIZE, BB		# 8 k-steps * 4 floats of buffered B
	decl   %eax
	jne    .L62
	ALIGN_4
1375
.L65:
	# Remainder: handle the k % 8 leftover iterations one k-step at a time.
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA,  %xmm3		# alpha as a duplicated [ar, ai] pair
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L68
	ALIGN_4

.L66:
	# One k-step: same expand/multiply/accumulate as in .L62.
	shufps	$0x50, %xmm2, %xmm2
	mulps	%xmm0, %xmm2
	movddup	  2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movsd     4 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L66
	ALIGN_4

.L68:
	# Write back: xmm4 = [col0 elem0, col0 elem1, col1 elem0, col1 elem1].
	# Duplicate each result (pshufd), scale by the alpha pair, add into C.
	addps	%xmm5, %xmm4		# fold the two accumulators

	movsd	 0 * SIZE(%esi), %xmm0		# column-0 slice of C
	movhps	 2 * SIZE(%esi), %xmm0
	movsd	 0 * SIZE(%esi, LDC), %xmm1	# column-1 slice of C
	movhps	 2 * SIZE(%esi, LDC), %xmm1

	pshufd	$0x50, %xmm4,  %xmm2	# column-0 results duplicated
	pshufd	$0xfa, %xmm4,  %xmm4	# column-1 results duplicated

	mulps	%xmm3, %xmm2
	mulps	%xmm3, %xmm4

	addps	%xmm2, %xmm0
	addps	%xmm4, %xmm1

	movlps	%xmm0,   0 * SIZE(%esi)
	movhps	%xmm0,   2 * SIZE(%esi)
	movlps	%xmm1,   0 * SIZE(%esi, LDC)
	movhps	%xmm1,   2 * SIZE(%esi, LDC)

	addl	$4 * SIZE, %esi		# coffset += 4 floats (2 duplicated results)
	ALIGN_4
1425
.L70:
	# Setup for the final M % 2 == 1 row of the current 2-column panel.
	testl	$1, M
	je	.L79

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax,   4), %eax	# skip KK k-iterations: A uses 1 float/k
	leal	(AA, %eax, 1), AA	#   (KK*4 bytes), buffered B uses 4
	leal	(BB, %eax, 4), BB	#   floats/k (KK*16 bytes)
#endif

	movss	 0 * SIZE(AA), %xmm0	# current A scalar
	pxor	%xmm4, %xmm4		# accumulator (even k-steps)
	movss	 4 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5		# accumulator (odd k-steps); folded at .L78
	movsd	 0 * SIZE(BB), %xmm2	# first two buffered B values
	movsd	16 * SIZE(BB), %xmm3

	# NOTE(review): a dead "leal (LDC, LDC, 2), %eax" was removed here;
	# %eax is unconditionally reloaded below before any use, in every
	# preprocessor branch (leftover from a wider-N kernel variant).

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax		# trailing part of K for this TRMM case
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax		# M block size = 1
#else
	addl	$2, %eax		# N block size = 2
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax		# main loop below is unrolled 8x in k
	je	.L75
	ALIGN_4
1470
.L72:
	# Main k-loop (unrolled 8x) for the M%2==1 row of the 2-column panel.
	# The A scalar is broadcast to all lanes (shufps $0); B supplies two
	# values per k, so only the low lanes of the accumulators matter.
	shufps	$0, %xmm0, %xmm0
	mulps	%xmm0, %xmm2
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
	movss	 1 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	shufps	$0, %xmm0, %xmm0
	movsd	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movss	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	shufps	$0, %xmm0, %xmm0
	movsd	 8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movss	 3 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	shufps	$0, %xmm0, %xmm0
	movsd	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movss	 8 * SIZE(AA), %xmm0	# preload for the next iteration
	addps	%xmm2, %xmm5
	movsd	32 * SIZE(BB), %xmm2	# preload for the next iteration
	shufps	$0, %xmm1, %xmm1
	mulps	%xmm1, %xmm3
	movss	 5 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	shufps	$0, %xmm1, %xmm1
	movsd	20 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movss	 6 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	shufps	$0, %xmm1, %xmm1
	movsd	24 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movss	 7 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	shufps	$0, %xmm1, %xmm1
	movsd	28 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movss	12 * SIZE(AA), %xmm1	# preload for the next iteration
	addps	%xmm3, %xmm5
	movsd	48 * SIZE(BB), %xmm3	# preload for the next iteration

	addl	$ 8 * SIZE, AA		# 8 k-steps * 1 float of A
	addl	$32 * SIZE, BB		# 8 k-steps * 4 floats of buffered B
	decl   %eax
	jne    .L72
	ALIGN_4
1519
.L75:
	# Remainder: handle the k % 8 leftover iterations one k-step at a time.
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA,  %xmm3		# alpha as a duplicated [ar, ai] pair
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L78
	ALIGN_4

.L76:
	# One k-step: broadcast A scalar, multiply by the two B values.
	shufps	$0, %xmm0, %xmm0
	mulps	%xmm0, %xmm2
	movss	 1 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movsd	 4 * SIZE(BB), %xmm2

	addl	$ 1 * SIZE, AA
	addl	$ 4 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4

.L78:
	# xmm4 low lanes = [result for col 0, result for col 1]; duplicate,
	# scale by the alpha pair, add into one slot of each C column.
	addps	%xmm5, %xmm4		# fold the two accumulators

	movsd	 (%esi), %xmm0		# slot from column 0
	movhps	 (%esi, LDC), %xmm0	# slot from column 1

	pshufd	$0x50, %xmm4,  %xmm2
	mulps	%xmm3, %xmm2
	addps	%xmm2, %xmm0

	movlps	%xmm0,   (%esi)
	movhps	%xmm0,   (%esi, LDC)
	ALIGN_4

.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK			# consumed 2 columns of B
#endif
	leal	(, LDC, 2), %eax
	addl	%eax, C			# C += 2 * LDC: next 2-column panel
	ALIGN_4
1566
.L80:
	# Final single-column panel (N odd).
	testl	$1, N
	je	.L999

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK		# reset the TRMM diagonal offset
#endif

	# Copy this B column (%edi) into BUFFER (%ecx), storing every value
	# twice so the kernels below can load it as a pre-duplicated pair.
	movl	K, %eax
	leal	BUFFER, %ecx
	sarl	$3, %eax		# copy loop unrolled 8x
	jle	.L85
	ALIGN_4

.L82:
	# Copy 8 B values -> 16 buffered floats (each value duplicated).
	movss	 0 * SIZE(%edi), %xmm0
	movss	 1 * SIZE(%edi), %xmm1
	movss	 2 * SIZE(%edi), %xmm2
	movss	 3 * SIZE(%edi), %xmm3
	movss	 4 * SIZE(%edi), %xmm4
	movss	 5 * SIZE(%edi), %xmm5
	movss	 6 * SIZE(%edi), %xmm6
	movss	 7 * SIZE(%edi), %xmm7

	movss	%xmm0,  0 * SIZE(%ecx)
	movss	%xmm0,  1 * SIZE(%ecx)
	movss	%xmm1,  2 * SIZE(%ecx)
	movss	%xmm1,  3 * SIZE(%ecx)
	movss	%xmm2,  4 * SIZE(%ecx)
	movss	%xmm2,  5 * SIZE(%ecx)
	movss	%xmm3,  6 * SIZE(%ecx)
	movss	%xmm3,  7 * SIZE(%ecx)
	movss	%xmm4,  8 * SIZE(%ecx)
	movss	%xmm4,  9 * SIZE(%ecx)
	movss	%xmm5, 10 * SIZE(%ecx)
	movss	%xmm5, 11 * SIZE(%ecx)
	movss	%xmm6, 12 * SIZE(%ecx)
	movss	%xmm6, 13 * SIZE(%ecx)
	movss	%xmm7, 14 * SIZE(%ecx)
	movss	%xmm7, 15 * SIZE(%ecx)

#	prefetcht1	128 * SIZE(%ecx)
	prefetcht0	112 * SIZE(%edi)

	addl	$ 8 * SIZE, %edi
	addl	$16 * SIZE, %ecx
	decl	%eax
	jne	.L82
	ALIGN_4

.L85:
	# Copy the k % 8 leftover B values, one at a time.
	movl	K, %eax
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	jle	.L90
	ALIGN_4

.L86:
	movss	 0 * SIZE(%edi), %xmm0
	movss	%xmm0,  0 * SIZE(%ecx)
	movss	%xmm0,  1 * SIZE(%ecx)

	addl	$1 * SIZE, %edi
	addl	$2 * SIZE, %ecx
	decl	%eax
	jne	.L86
	ALIGN_4

.L90:
	# Set up the M loop for this last column.
	movl	C, %esi		# coffset = c
	movl	A, %edx		# aoffset = a
	movl	M,  %ebx
	sarl	$2, %ebx	# i = (m >> 2): 4-row tiles first
	jle	.L100
	ALIGN_4
1643
.L91:
	# Setup for one 4-row tile of the single-column panel.
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax,   8), %eax	# skip KK k-iterations: A uses 4 floats/k
	leal	(AA, %eax, 2), AA	#   (KK*16 bytes), buffered B uses 2
	leal	(BB, %eax, 1), BB	#   floats/k (KK*8 bytes)
#endif

	movaps	 0 * SIZE(AA), %xmm0	# a[0..3] (aligned packed A)
	pxor	%xmm4, %xmm4		# accumulator (even k-steps)
	movddup  0 * SIZE(BB), %xmm2	# buffered [b,b] pair -> all four lanes
	pxor	%xmm5, %xmm5		# accumulator (odd k-steps)
	movaps	16 * SIZE(AA), %xmm1
	movddup  8 * SIZE(BB), %xmm3

#ifdef HAVE_3DNOW
	prefetchw	4 * SIZE(%esi)
#elif defined(HAVE_SSE) || defined(HAVE_SSE2)
	prefetcht2	4 * SIZE(%esi)
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax		# trailing part of K for this TRMM case
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax		# M block size = 4
#else
	addl	$1, %eax		# N block size = 1
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax		# main loop below is unrolled 8x in k
	je	.L95
	ALIGN_4
1689
.L92:
	# Main k-loop (unrolled 8x) for the 4-row, single-column tile.
	# xmm0/xmm1 = 4 packed A values (aligned loads); xmm2/xmm3 = one B
	# value broadcast to all lanes (movddup of the pre-duplicated pair);
	# sums alternate between xmm4 and xmm5 to break the dependency chain.
	mulps	%xmm0, %xmm2
	movaps	 4 * SIZE(AA), %xmm0
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
	addps	%xmm2, %xmm4
	movddup  2 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movaps	 8 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movddup  4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movaps	12 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movddup  6 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movaps	32 * SIZE(AA), %xmm0	# preload for the next iteration
	addps	%xmm2, %xmm5
	movddup 16 * SIZE(BB), %xmm2	# preload for the next iteration
	mulps	%xmm1, %xmm3
	movaps	20 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movddup 10 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movaps	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movddup 12 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movaps	28 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movddup 14 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movaps	48 * SIZE(AA), %xmm1	# preload for the next iteration
	addps	%xmm3, %xmm5
	movddup 24 * SIZE(BB), %xmm3	# preload for the next iteration

	addl	$32 * SIZE, AA		# 8 k-steps * 4 floats of A
	addl	$16 * SIZE, BB		# 8 k-steps * 2 floats of buffered B
	decl   %eax
	jne    .L92
	ALIGN_4
1730
.L95:
	# Remainder: handle the k % 8 leftover iterations one k-step at a time.
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA,  %xmm3		# alpha as a duplicated [ar, ai] pair
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L98
	ALIGN_4

.L96:
	# One k-step: 4 A values times the broadcast B value.
	mulps	%xmm0, %xmm2
	movaps	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movddup  2 * SIZE(BB), %xmm2

	addl	$4 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L96
	ALIGN_4

.L98:
	# xmm4 = 4 results for this column; duplicate each (pshufd), scale
	# by the alpha pair, and add into 8 floats of C.
	addps	%xmm5, %xmm4		# fold the two accumulators

	movsd	 0 * SIZE(%esi), %xmm0
	movhps	 2 * SIZE(%esi), %xmm0
	movsd	 4 * SIZE(%esi), %xmm1
	movhps	 6 * SIZE(%esi), %xmm1

	pshufd	$0x50, %xmm4,  %xmm2	# low two results duplicated
	pshufd	$0xfa, %xmm4,  %xmm4	# high two results duplicated

	mulps	%xmm3, %xmm2
	mulps	%xmm3, %xmm4

	addps	%xmm2, %xmm0
	addps	%xmm4, %xmm1

	movlps	%xmm0,   0 * SIZE(%esi)
	movhps	%xmm0,   2 * SIZE(%esi)
	movlps	%xmm1,   4 * SIZE(%esi)
	movhps	%xmm1,   6 * SIZE(%esi)

	addl	$8 * SIZE, %esi		# coffset += 8 floats (4 duplicated results)
	decl	%ebx			# i --
	jg	.L91
	ALIGN_4
1781
.L100:
	# Setup for the M % 4 >= 2 slice of the single-column panel.
	testl	$2, M
	je	.L110

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax,   8), %eax	# skip KK k-iterations: A uses 2 floats/k
	leal	(AA, %eax, 1), AA	#   (KK*8 bytes), buffered B uses 2
	leal	(BB, %eax, 1), BB	#   floats/k (KK*8 bytes)
#endif

	pxor	%xmm4, %xmm4		# accumulator (even k-steps)
	pxor	%xmm5, %xmm5		# accumulator (odd k-steps); folded at .L108
	pxor	%xmm6, %xmm6		# zeroed but not used on this path
	pxor	%xmm7, %xmm7		# zeroed but not used on this path

	movsd	  0 * SIZE(AA), %xmm0	# a[0..1]
	movsd     0 * SIZE(BB), %xmm2	# [b,b] (pre-duplicated buffer pair)
	movsd	  8 * SIZE(AA), %xmm1
	movsd     8 * SIZE(BB), %xmm3

	# NOTE(review): a dead "leal (LDC, LDC, 2), %eax" was removed here;
	# %eax is unconditionally reloaded below before any use, in every
	# preprocessor branch (leftover from a wider-N kernel variant).

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax		# trailing part of K for this TRMM case
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax		# M block size = 2
#else
	addl	$1, %eax		# N block size = 1
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax		# main loop below is unrolled 8x in k
	je	.L105
	ALIGN_4
1829
.L102:
	# Main k-loop (unrolled 8x) for the M%4>=2 slice, single column.
	# movsd loads zero the upper lanes, so the full-width mulps/addps
	# produce only the two meaningful low-lane products per k-step.
	mulps	%xmm0, %xmm2
 	movsd	  2 * SIZE(AA), %xmm0
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
	addps	%xmm2, %xmm4
	movsd     2 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
 	movsd	  4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movsd     4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
 	movsd	  6 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movsd     6 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
 	movsd	 16 * SIZE(AA), %xmm0	# preload for the next iteration
	addps	%xmm2, %xmm5
	movsd    16 * SIZE(BB), %xmm2	# preload for the next iteration
	mulps	%xmm1, %xmm3
 	movsd	 10 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movsd    10 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
 	movsd	 12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movsd    12 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
 	movsd	 14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movsd    14 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
 	movsd	 24 * SIZE(AA), %xmm1	# preload for the next iteration
	addps	%xmm3, %xmm5
	movsd    24 * SIZE(BB), %xmm3	# preload for the next iteration

	addl	$16 * SIZE, AA		# 8 k-steps * 2 floats of A
	addl	$16 * SIZE, BB		# 8 k-steps * 2 floats of buffered B
	decl   %eax
	jne    .L102
	ALIGN_4
1870
.L105:
	# Remainder: handle the k % 8 leftover iterations one k-step at a time.
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA,  %xmm3		# alpha as a duplicated [ar, ai] pair
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L108
	ALIGN_4

.L106:
	# One k-step: two A values times the buffered B pair.
	mulps	%xmm0, %xmm2
 	movsd	  2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movsd     2 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L106
	ALIGN_4

.L108:
	# Fold accumulators; the movhlps fold adds the upper lanes (kept zero
	# by the movsd loads above), leaving the two results in the low lanes.
	# Duplicate each, scale by the alpha pair, add into 4 floats of C.
	addps	%xmm5, %xmm4
	movhlps	%xmm4, %xmm5
	addps	%xmm5, %xmm4

	movsd	 0 * SIZE(%esi), %xmm0
	movhps	 2 * SIZE(%esi), %xmm0

	pshufd	$0x50, %xmm4,  %xmm2
	mulps	%xmm3, %xmm2
	addps	%xmm2, %xmm0

	movlps	%xmm0,   0 * SIZE(%esi)
	movhps	%xmm0,   2 * SIZE(%esi)

	addl	$4 * SIZE, %esi		# coffset += 4 floats (2 duplicated results)
	ALIGN_4
1912
.L110:
	# Setup for the very last element: M % 2 == 1, single-column panel.
	testl	$1, M
	je	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax,   4), %eax	# skip KK k-iterations: A uses 1 float/k
	leal	(AA, %eax, 1), AA	#   (KK*4 bytes), buffered B uses 2
	leal	(BB, %eax, 2), BB	#   floats/k (KK*8 bytes)
#endif

	movss	  0 * SIZE(AA), %xmm0	# current A scalar
	pxor	%xmm4, %xmm4		# accumulator (even k-steps)
	movss     0 * SIZE(BB), %xmm2	# current B scalar (first copy of the pair)
	pxor	%xmm5, %xmm5		# accumulator (odd k-steps); folded at .L118
	movss	  4 * SIZE(AA), %xmm1
	movss     8 * SIZE(BB), %xmm3

	# NOTE(review): a dead "leal (LDC, LDC, 2), %eax" was removed here;
	# %eax is unconditionally reloaded below before any use, in every
	# preprocessor branch (leftover from a wider-N kernel variant).

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax		# trailing part of K for this TRMM case
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax		# M block size = 1
#else
	addl	$1, %eax		# N block size = 1
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax		# main loop below is unrolled 8x in k
	je	.L115
	ALIGN_4
1957
.L112:
	# Main k-loop (unrolled 8x) for the final 1x1 tile: plain scalar
	# multiply-accumulate, alternating xmm4/xmm5.  The buffered B stores
	# each value twice, so only every other slot is read (stride 2).
	mulss	%xmm0, %xmm2
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
 	movss	  1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss     2 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
 	movss	  2 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm5
	movss     4 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
 	movss	  3 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss     6 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
 	movss	  8 * SIZE(AA), %xmm0	# preload for the next iteration
	addss	%xmm2, %xmm5
	movss    16 * SIZE(BB), %xmm2	# preload for the next iteration
	mulss	%xmm1, %xmm3
 	movss	  5 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm4
	movss    10 * SIZE(BB), %xmm3
	mulss	%xmm1, %xmm3
 	movss	  6 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm5
	movss    12 * SIZE(BB), %xmm3
	mulss	%xmm1, %xmm3
 	movss	  7 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm4
	movss    14 * SIZE(BB), %xmm3
	mulss	%xmm1, %xmm3
 	movss	 12 * SIZE(AA), %xmm1	# preload for the next iteration
	addss	%xmm3, %xmm5
	movss    24 * SIZE(BB), %xmm3	# preload for the next iteration

	addl	$ 8 * SIZE, AA		# 8 k-steps * 1 float of A
	addl	$16 * SIZE, BB		# 8 k-steps * 2 floats of buffered B
	decl   %eax
	jne    .L112
	ALIGN_4
1998
.L115:
	# Remainder: handle the k % 8 leftover iterations one k-step at a time.
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA,  %xmm3		# alpha as a duplicated [ar, ai] pair
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L118
	ALIGN_4

.L116:
	# One scalar k-step.
	mulss	%xmm0, %xmm2
 	movss	  1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss     2 * SIZE(BB), %xmm2

	addl	$1 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L116
	ALIGN_4

.L118:
	# Single result: duplicate it (pshufd), scale by the alpha pair, and
	# add into the last two floats of C.
 	addss	%xmm5, %xmm4		# fold the two accumulators

	movsd	 (%esi), %xmm0

	pshufd	$0x50, %xmm4,  %xmm2
	mulps	%xmm3, %xmm2
	addps	%xmm2, %xmm0

	movlps	%xmm0,   (%esi)
	ALIGN_4
2034
.L999:
	# Function epilogue: restore the caller's stack pointer (OLD_STACK
	# holds the %esp saved by the prologue, defined earlier in the file)
	# and pop the callee-saved registers in reverse order of the pushes.
	movl	OLD_STACK, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
2044