/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	16

#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#define A	24 + STACK + ARGS(%esp)
#define ARG_B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define ARG_LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)

#define J	 0 + STACK(%esp)
#define BX	 4 + STACK(%esp)
#define KK	 8 + STACK(%esp)
#define KKK	12 + STACK(%esp)

#ifdef PENTIUM4
#define PREFETCH_R   (8 * 4)
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef PENTIUMM
#define PREFETCH_R   (8 * 4)
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

#define AA	%edx
#define BB	%ecx
#define LDC	%ebp
#define B	%edi
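
/* AA and BB step through the A and B panels of the current block, B marks */
/* the start of the current B panel, and LDC is the row stride of C        */
/* (scaled from elements to bytes in the prologue below).                  */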

#define KERNEL1(address) \
	mulpd	 %xmm0, %xmm2; \
	PREFETCH  (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
	addpd	 %xmm2, %xmm4; \
	movddup	 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm0, %xmm2; \
	addpd	 %xmm2, %xmm5; \
	movddup	 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm0, %xmm2; \
	addpd	 %xmm2, %xmm6; \
	movddup	 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm0, %xmm2; \
	movapd	 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
	addpd	 %xmm2, %xmm7; \
	movddup	 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2

#define KERNEL2(address) \
	mulpd	 %xmm0, %xmm2; \
	addpd	 %xmm2, %xmm4; \
	movddup	 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm0, %xmm2; \
	addpd	 %xmm2, %xmm5; \
	movddup	 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm0, %xmm2; \
	addpd	 %xmm2, %xmm6; \
	movddup	 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm0, %xmm2; \
	movapd	 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
	addpd	 %xmm2, %xmm7; \
	movddup	16 * SIZE + (address) * 2 * SIZE(BB), %xmm2

#define KERNEL3(address) \
	mulpd	 %xmm0, %xmm3; \
	addpd	 %xmm3, %xmm4; \
	movddup	 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm0, %xmm3; \
	addpd	 %xmm3, %xmm5; \
	movddup	10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm0, %xmm3; \
	addpd	 %xmm3, %xmm6; \
	movddup	11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm0, %xmm3; \
	movapd	 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
	addpd	 %xmm3, %xmm7; \
	movddup	12 * SIZE + (address) * 2 * SIZE(BB), %xmm3

#define KERNEL4(address) \
	mulpd	 %xmm0, %xmm3; \
	addpd	 %xmm3, %xmm4; \
	movddup	13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm0, %xmm3; \
	addpd	 %xmm3, %xmm5; \
	movddup	14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm0, %xmm3; \
	addpd	 %xmm3, %xmm6; \
	movddup	15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm0, %xmm3; \
	movapd	16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \
	addpd	 %xmm3, %xmm7; \
	movddup	24 * SIZE + (address) * 2 * SIZE(BB), %xmm3

#define KERNEL5(address) \
	mulpd	 %xmm1, %xmm2; \
	addpd	 %xmm2, %xmm4; \
	movddup	17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm1, %xmm2; \
	addpd	 %xmm2, %xmm5; \
	movddup	18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm1, %xmm2; \
	addpd	 %xmm2, %xmm6; \
	movddup	19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm1, %xmm2; \
	movapd	10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
	addpd	 %xmm2, %xmm7

#define KERNEL6(address) \
	movddup	20 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm1, %xmm2; \
	addpd	 %xmm2, %xmm4; \
	movddup	21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm1, %xmm2; \
	addpd	 %xmm2, %xmm5; \
	movddup	22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm1, %xmm2; \
	addpd	 %xmm2, %xmm6; \
	movddup	23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \
	mulpd	 %xmm1, %xmm2; \
	movapd	12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
	addpd	 %xmm2, %xmm7; \
	movddup	32 * SIZE + (address) * 2 * SIZE(BB), %xmm2

#define KERNEL7(address) \
	mulpd	 %xmm1, %xmm3; \
	addpd	 %xmm3, %xmm4; \
	movddup	25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm1, %xmm3; \
	addpd	 %xmm3, %xmm5; \
	movddup	26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm1, %xmm3; \
	addpd	 %xmm3, %xmm6; \
	movddup	27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm1, %xmm3; \
	movapd	14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
	addpd	 %xmm3, %xmm7; \
	movddup	28 * SIZE + (address) * 2 * SIZE(BB), %xmm3

#define KERNEL8(address) \
	mulpd	 %xmm1, %xmm3; \
	addpd	 %xmm3, %xmm4; \
	movddup	29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm1, %xmm3; \
	addpd	 %xmm3, %xmm5; \
	movddup	30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm1, %xmm3; \
	addpd	 %xmm3, %xmm6; \
	movddup	31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \
	mulpd	 %xmm1, %xmm3; \
	movapd	24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \
	addpd	 %xmm3, %xmm7; \
	movddup	40 * SIZE + (address) * 2 * SIZE(BB), %xmm3

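/* KERNEL1 through KERNEL8 above form one 8-way unrolled step of the inner   */
/* product for the 2x4 block: each macro multiplies a pair of A values held  */
/* in xmm0 or xmm1 by four broadcast B values and accumulates one column of  */
/* the block in each of xmm4 to xmm7.                                        */
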
	PROLOGUE

	subl	$ARGS, %esp

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_B,   B
	movl	ARG_LDC, LDC

#ifdef TRMMKERNEL
	movl	OFFSET, %eax
#ifndef LEFT
	negl	%eax
#endif
	movl	%eax, KK
#endif

	leal	(, LDC, SIZE), LDC

	movl	N,  %eax
	sarl	$2, %eax
	movl	%eax, J
	jle	.L30
	ALIGN_2

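/* .L10: loop over panels of C that are four columns wide (j = n >> 2). */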
.L10:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	K, %eax
	sall	$BASE_SHIFT + 2, %eax
	leal	(B, %eax), %eax
	movl	%eax, BX

	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a

	movl	M,  %ebx
	sarl	$1, %ebx	# i = (m >> 1)
	jle	.L20
	ALIGN_4

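/* .L11: compute one 2x4 block of C (two rows by four columns) for the */
/* current column panel.                                               */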
.L11:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 4), BB
#endif

	movl	BX, %eax
	prefetcht2  0 * SIZE(%eax)
	subl	$-4 * SIZE, BX

	movapd	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movddup	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movddup	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

	leal	(LDC, LDC, 2), %eax

#ifdef PENTIUM4
	prefetchnta	3 * SIZE(%esi)
	prefetchnta	3 * SIZE(%esi, LDC, 1)
	prefetchnta	3 * SIZE(%esi, LDC, 2)
	prefetchnta	3 * SIZE(%esi, %eax, 1)
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$4, %eax
#endif
	movl	%eax, KKK
#endif

#ifdef CORE_PRESCOTT
	andl	$-8, %eax
	sall	$4, %eax
	je	.L15

.L1X:
	KERNEL1(16  *  0)
	KERNEL2(16  *  0)
	KERNEL3(16  *  0)
	KERNEL4(16  *  0)
	KERNEL5(16  *  0)
	KERNEL6(16  *  0)
	KERNEL7(16  *  0)
	KERNEL8(16  *  0)
	cmpl	$128 *  1, %eax
	jle	.L12
	KERNEL1(16  *  1)
	KERNEL2(16  *  1)
	KERNEL3(16  *  1)
	KERNEL4(16  *  1)
	KERNEL5(16  *  1)
	KERNEL6(16  *  1)
	KERNEL7(16  *  1)
	KERNEL8(16  *  1)
	cmpl	$128 *  2, %eax
	jle	.L12
	KERNEL1(16  *  2)
	KERNEL2(16  *  2)
	KERNEL3(16  *  2)
	KERNEL4(16  *  2)
	KERNEL5(16  *  2)
	KERNEL6(16  *  2)
	KERNEL7(16  *  2)
	KERNEL8(16  *  2)
	cmpl	$128 *  3, %eax
	jle	.L12
	KERNEL1(16  *  3)
	KERNEL2(16  *  3)
	KERNEL3(16  *  3)
	KERNEL4(16  *  3)
	KERNEL5(16  *  3)
	KERNEL6(16  *  3)
	KERNEL7(16  *  3)
	KERNEL8(16  *  3)
	cmpl	$128 *  4, %eax
	jle	.L12
	KERNEL1(16  *  4)
	KERNEL2(16  *  4)
	KERNEL3(16  *  4)
	KERNEL4(16  *  4)
	KERNEL5(16  *  4)
	KERNEL6(16  *  4)
	KERNEL7(16  *  4)
	KERNEL8(16  *  4)
	cmpl	$128 *  5, %eax
	jle	.L12
	KERNEL1(16  *  5)
	KERNEL2(16  *  5)
	KERNEL3(16  *  5)
	KERNEL4(16  *  5)
	KERNEL5(16  *  5)
	KERNEL6(16  *  5)
	KERNEL7(16  *  5)
	KERNEL8(16  *  5)
	cmpl	$128 *  6, %eax
	jle	.L12
	KERNEL1(16  *  6)
	KERNEL2(16  *  6)
	KERNEL3(16  *  6)
	KERNEL4(16  *  6)
	KERNEL5(16  *  6)
	KERNEL6(16  *  6)
	KERNEL7(16  *  6)
	KERNEL8(16  *  6)
	cmpl	$128 *  7, %eax
	jle	.L12
	KERNEL1(16  *  7)
	KERNEL2(16  *  7)
	KERNEL3(16  *  7)
	KERNEL4(16  *  7)
	KERNEL5(16  *  7)
	KERNEL6(16  *  7)
	KERNEL7(16  *  7)
	KERNEL8(16  *  7)
#if 1
	cmpl	$128 *  8, %eax
	jle	.L12
	KERNEL1(16  *  8)
	KERNEL2(16  *  8)
	KERNEL3(16  *  8)
	KERNEL4(16  *  8)
	KERNEL5(16  *  8)
	KERNEL6(16  *  8)
	KERNEL7(16  *  8)
	KERNEL8(16  *  8)
	cmpl	$128 *  9, %eax
	jle	.L12
	KERNEL1(16  *  9)
	KERNEL2(16  *  9)
	KERNEL3(16  *  9)
	KERNEL4(16  *  9)
	KERNEL5(16  *  9)
	KERNEL6(16  *  9)
	KERNEL7(16  *  9)
	KERNEL8(16  *  9)
	cmpl	$128 * 10, %eax
	jle	.L12
	KERNEL1(16  * 10)
	KERNEL2(16  * 10)
	KERNEL3(16  * 10)
	KERNEL4(16  * 10)
	KERNEL5(16  * 10)
	KERNEL6(16  * 10)
	KERNEL7(16  * 10)
	KERNEL8(16  * 10)
	cmpl	$128 * 11, %eax
	jle	.L12
	KERNEL1(16  * 11)
	KERNEL2(16  * 11)
	KERNEL3(16  * 11)
	KERNEL4(16  * 11)
	KERNEL5(16  * 11)
	KERNEL6(16  * 11)
	KERNEL7(16  * 11)
	KERNEL8(16  * 11)
	cmpl	$128 * 12, %eax
	jle	.L12
	KERNEL1(16  * 12)
	KERNEL2(16  * 12)
	KERNEL3(16  * 12)
	KERNEL4(16  * 12)
	KERNEL5(16  * 12)
	KERNEL6(16  * 12)
	KERNEL7(16  * 12)
	KERNEL8(16  * 12)
	cmpl	$128 * 13, %eax
	jle	.L12
	KERNEL1(16  * 13)
	KERNEL2(16  * 13)
	KERNEL3(16  * 13)
	KERNEL4(16  * 13)
	KERNEL5(16  * 13)
	KERNEL6(16  * 13)
	KERNEL7(16  * 13)
	KERNEL8(16  * 13)
	cmpl	$128 * 14, %eax
	jle	.L12
	KERNEL1(16  * 14)
	KERNEL2(16  * 14)
	KERNEL3(16  * 14)
	KERNEL4(16  * 14)
	KERNEL5(16  * 14)
	KERNEL6(16  * 14)
	KERNEL7(16  * 14)
	KERNEL8(16  * 14)
	cmpl	$128 * 15, %eax
	jle	.L12
	KERNEL1(16  * 15)
	KERNEL2(16  * 15)
	KERNEL3(16  * 15)
	KERNEL4(16  * 15)
	KERNEL5(16  * 15)
	KERNEL6(16  * 15)
	KERNEL7(16  * 15)
	KERNEL8(16  * 15)
#else
	addl	$32 * 4  * SIZE, AA
	addl	$32 * 8  * SIZE, BB
	subl	$128 * 8, %eax
	jg	.L1X
#endif

.L12:
	leal	(AA, %eax, 1), AA	# * 16
	leal	(BB, %eax, 2), BB	# * 64

#else

	sarl	$3, %eax
	je	.L15
	ALIGN_4

.L12:
	mulpd	 %xmm0, %xmm2
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	addpd	 %xmm2, %xmm4
	movddup	 1 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 3 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	 4 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 5 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	 6 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 7 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 4 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	16 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	 9 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm5
	movddup	10 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	11 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	movapd	 6 * SIZE(AA), %xmm0
	addpd	 %xmm3, %xmm7
	movddup	12 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	13 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm5
	movddup	14 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	15 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	movapd	16 * SIZE(AA), %xmm0
	addpd	 %xmm3, %xmm7
	movddup	24 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	17 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	18 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	19 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	movapd	10 * SIZE(AA), %xmm1
	addpd	 %xmm2, %xmm7
	movddup	20 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	21 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	22 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	23 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	movapd	12 * SIZE(AA), %xmm1
	addpd	 %xmm2, %xmm7
	movddup	32 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	25 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm5
	movddup	26 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	27 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	14 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm7
	movddup	28 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	29 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm5
	movddup	30 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	31 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	24 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm7
	movddup	40 * SIZE(BB), %xmm3

	addl   $32 * SIZE, BB
	addl   $16 * SIZE, AA
	decl   %eax
	jne    .L12
	ALIGN_4
#endif

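/* .L15: process the K iterations left over from the unrolled loop above, */
/* then load alpha for the final scaling.                                 */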
.L15:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movddup	ALPHA,  %xmm3
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je .L18
	ALIGN_3

.L16:
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 1 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 3 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	 4 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L16
	ALIGN_4

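/* .L18: scale the accumulators by alpha, add the existing C values (unless  */
/* building the TRMM kernel), and store the 2x4 block.  If both the C        */
/* pointer and LDC are 16-byte aligned the movapd path is used; otherwise    */
/* control branches to .L18x, which uses movsd/movhpd pairs.                 */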
.L18:
	SHUFPD_2 %xmm0, %xmm0
	SHUFPD_2 %xmm1, %xmm1
	SHUFPD_2 %xmm2, %xmm2
	SHUFPD_2 %xmm3, %xmm3

	mulpd	%xmm3, %xmm4
	mulpd	%xmm3, %xmm5
	mulpd	%xmm3, %xmm6
	mulpd	%xmm3, %xmm7

	movl	%esi, %eax
	orl	LDC,  %eax
	testl	$15,  %eax
	NOBRANCH
	jne	.L18x

	leal	(LDC, LDC, 2), %eax

#ifndef TRMMKERNEL
	movapd	0 * SIZE(%esi), %xmm0
	movapd	0 * SIZE(%esi, LDC, 1), %xmm1
	movapd	0 * SIZE(%esi, LDC, 2), %xmm2
	movapd	0 * SIZE(%esi, %eax, 1), %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm5
	addpd	%xmm2, %xmm6
	addpd	%xmm3, %xmm7
#endif

	movapd	%xmm4, 0 * SIZE(%esi)
	movapd	%xmm5, 0 * SIZE(%esi, LDC, 1)
	movapd	%xmm6, 0 * SIZE(%esi, LDC, 2)
	movapd	%xmm7, 0 * SIZE(%esi, %eax, 1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, %esi		# coffset += 2
	decl	%ebx			# i --
	jg	.L11
	jmp	.L20
	ALIGN_4

.L18x:
	leal	(LDC, LDC, 2), %eax

#ifndef TRMMKERNEL
	movsd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0
	movsd	0 * SIZE(%esi, LDC, 1), %xmm1
	movhpd	1 * SIZE(%esi, LDC, 1), %xmm1
	movsd	0 * SIZE(%esi, LDC, 2), %xmm2
	movhpd	1 * SIZE(%esi, LDC, 2), %xmm2
	movsd	0 * SIZE(%esi, %eax, 1), %xmm3
	movhpd	1 * SIZE(%esi, %eax, 1), %xmm3

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm5
	addpd	%xmm2, %xmm6
	addpd	%xmm3, %xmm7
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)
	movsd	%xmm5, 0 * SIZE(%esi, LDC, 1)
	movhpd	%xmm5, 1 * SIZE(%esi, LDC, 1)
	movsd	%xmm6, 0 * SIZE(%esi, LDC, 2)
	movhpd	%xmm6, 1 * SIZE(%esi, LDC, 2)
	movsd	%xmm7, 0 * SIZE(%esi, %eax, 1)
	movhpd	%xmm7, 1 * SIZE(%esi, %eax, 1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, %esi		# coffset += 2
	decl	%ebx			# i --
	jg	.L11
	ALIGN_3

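/* .L20: when M is odd, compute the remaining single row against the */
/* current four-column panel (a 1x4 block).                          */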
.L20:
	movl	M,  %ebx
	testl	$1, %ebx	# if (m & 1)
	jle	.L29


#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 4), BB
#endif

	movddup	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movddup	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movapd	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$4, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$4, %eax
	je	.L25
	ALIGN_4

.L22:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm0, %xmm2
	mulpd	 2 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm4
	movapd	 4 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movddup	 1 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm6
	movapd	16 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm7
	movddup	 2 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	mulpd	10 * SIZE(BB), %xmm0
	addpd	 %xmm3, %xmm4
	movapd	12 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm5
	movddup	 3 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	mulpd	14 * SIZE(BB), %xmm0
	addpd	 %xmm3, %xmm6
	movapd	24 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm7
	movddup	 4 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	18 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm4
	movapd	20 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movddup	 5 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	22 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm6
	movapd	32 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm7
	movddup	 6 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	mulpd	26 * SIZE(BB), %xmm0
	addpd	 %xmm3, %xmm4
	movapd	28 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm5
	movddup	 7 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	mulpd	30 * SIZE(BB), %xmm0
	addpd	 %xmm3, %xmm6
	movapd	40 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm7
	movddup	16 * SIZE(AA), %xmm0
	mulpd	 %xmm1, %xmm2
	mulpd	34 * SIZE(BB), %xmm1
	addpd	 %xmm2, %xmm4
	movapd	36 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm5
	movddup	 9 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm2
	mulpd	38 * SIZE(BB), %xmm1
	addpd	 %xmm2, %xmm6
	movapd	48 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm7
	movddup	10 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	mulpd	42 * SIZE(BB), %xmm1
	addpd	 %xmm3, %xmm4
	movapd	44 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm5
	movddup	11 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	mulpd	46 * SIZE(BB), %xmm1
	addpd	 %xmm3, %xmm6
	movapd	56 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm7
	movddup	12 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm2
	mulpd	50 * SIZE(BB), %xmm1
	addpd	 %xmm2, %xmm4
	movapd	52 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm5
	movddup	13 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm2
	mulpd	54 * SIZE(BB), %xmm1
	addpd	 %xmm2, %xmm6
	movapd	64 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm7
	movddup	14 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	mulpd	58 * SIZE(BB), %xmm1
	addpd	 %xmm3, %xmm4
	movapd	60 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm5
	movddup	15 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	mulpd	62 * SIZE(BB), %xmm1
	addpd	 %xmm3, %xmm6
	movapd	72 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm7
	movddup	24 * SIZE(AA), %xmm1

	addl   $16 * SIZE, AA
	addl   $64 * SIZE, BB
	decl   %eax
	jne    .L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movddup	ALPHA,  %xmm3
	andl	$15, %eax		# if (k & 15)
	BRANCH
	je .L28

.L26:
	mulpd	 %xmm0, %xmm2
	mulpd	 2 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm4
	movapd	 4 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movddup	 1 * SIZE(AA), %xmm0

	addl	$1 * SIZE, AA
	addl	$4 * SIZE, BB

	decl	%eax
	jg	.L26
	ALIGN_4

.L28:
	leal	(%esi, LDC, 1), %eax

	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

	mulpd	%xmm3, %xmm4
	mulpd	%xmm3, %xmm5

#ifndef TRMMKERNEL

#ifdef PENTIUM4
	SHUFPD_2 %xmm0, %xmm0
	SHUFPD_2 %xmm1, %xmm1
#endif

	movsd	0 * SIZE(%esi), %xmm0
	movhpd	0 * SIZE(%eax), %xmm0
	movsd	0 * SIZE(%esi, LDC, 2), %xmm1
	movhpd	0 * SIZE(%eax, LDC, 2), %xmm1

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm5
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 0 * SIZE(%eax)
	movsd	%xmm5, 0 * SIZE(%esi, LDC, 2)
	movhpd	%xmm5, 0 * SIZE(%eax, LDC, 2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif
	ALIGN_4

.L29:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$4, KK
#endif

	leal	(, LDC, 4), %eax
	movl	BB, B
	addl	%eax, C			# c += 4 * ldc
	decl	J			# j --
	jg	.L10
	ALIGN_4

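/* .L30: when N has bit 1 set, compute the remaining two columns  */
/* (2x2 blocks over M, with a 1x2 tail for an odd row).           */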
.L30:
	testl	$2, N
	je	.L60

	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  %ebx
	sarl	$1, %ebx	# i = (m >> 1)
	jle	.L50
	ALIGN_4

.L41:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 2), BB
#endif

	movapd	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movddup	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movddup	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifdef HAVE_3DNOW
	prefetchw 2 * SIZE(%esi)
	prefetchw 2 * SIZE(%esi, LDC)
#endif

#ifdef PENTIUM4
	prefetchnta	3 * SIZE(%esi)
	prefetchnta	3 * SIZE(%esi, LDC)
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L45
	ALIGN_4

.L42:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 1 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 3 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 4 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	 4 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 5 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 6 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm5
	movddup	 6 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 7 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	16 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	16 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	 9 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	10 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm5
	movddup	10 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	11 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	12 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm7
	movddup	12 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	13 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	14 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm5
	movddup	14 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	15 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	24 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm7
	movddup	24 * SIZE(BB), %xmm3

	addl   $16 * SIZE, AA
	addl   $16 * SIZE, BB
	decl   %eax
	jne    .L42
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movddup	ALPHA,  %xmm3
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je .L48
	ALIGN_3

.L46:
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 1 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L46
	ALIGN_4

.L48:
	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

	mulpd	%xmm3, %xmm4
	mulpd	%xmm3, %xmm5

#ifndef TRMMKERNEL
#ifdef PENTIUM4
	SHUFPD_2 %xmm0, %xmm0
	SHUFPD_2 %xmm1, %xmm1
#endif

	movsd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0
	movsd	0 * SIZE(%esi, LDC, 1), %xmm1
	movhpd	1 * SIZE(%esi, LDC, 1), %xmm1

	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm5
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)
	movsd	%xmm5, 0 * SIZE(%esi, LDC, 1)
	movhpd	%xmm5, 1 * SIZE(%esi, LDC, 1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, %esi		# coffset += 2
	decl	%ebx			# i --
	jg	.L41
	ALIGN_4

.L50:
	movl	M,  %ebx
	testl	$1, %ebx	# if (m & 1)
	jle	.L59

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 2), BB
#endif

	movddup	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movddup	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movapd	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$4, %eax
	je	.L55
	ALIGN_4

.L52:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm0, %xmm2
	movddup	 1 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm4
	mulpd	 2 * SIZE(BB), %xmm0
	movapd	16 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movddup	 2 * SIZE(AA), %xmm0
	mulpd	 4 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm6
	movddup	 3 * SIZE(AA), %xmm0
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm7
	movddup	 4 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	movddup	 5 * SIZE(AA), %xmm0
	addpd	 %xmm3, %xmm4
	mulpd	10 * SIZE(BB), %xmm0
	movapd	24 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm5
	movddup	 6 * SIZE(AA), %xmm0
	mulpd	12 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm6
	movddup	 7 * SIZE(AA), %xmm0
	mulpd	14 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm7
	movddup	16 * SIZE(AA), %xmm0
	mulpd	 %xmm1, %xmm2
	movddup	 9 * SIZE(AA), %xmm1
	addpd	 %xmm2, %xmm4
	mulpd	18 * SIZE(BB), %xmm1
	movapd	32 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm5
	movddup	10 * SIZE(AA), %xmm1
	mulpd	20 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm6
	movddup	11 * SIZE(AA), %xmm1
	mulpd	22 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm7
	movddup	12 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	movddup	13 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm4
	mulpd	26 * SIZE(BB), %xmm1
	movapd	40 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm5
	movddup	14 * SIZE(AA), %xmm1
	mulpd	28 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm6
	movddup	15 * SIZE(AA), %xmm1
	mulpd	30 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm7
	movddup	24 * SIZE(AA), %xmm1

	addl   $16 * SIZE, AA
	addl   $32 * SIZE, BB
	decl   %eax
	jne    .L52
	ALIGN_4

.L55:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movddup	ALPHA,  %xmm3
	andl	$15, %eax		# if (k & 15)
	BRANCH
	je .L58

.L56:
	mulpd	 %xmm0, %xmm2
	movddup	 1 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm4
	movapd	 2 * SIZE(BB), %xmm2

	addl	$1 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L56
	ALIGN_4

.L58:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

	mulpd	%xmm3, %xmm4

#ifndef TRMMKERNEL
#ifdef PENTIUM4
	SHUFPD_2 %xmm0, %xmm0
#endif

	movsd	0 * SIZE(%esi), %xmm0
	movhpd	0 * SIZE(%esi, LDC, 1), %xmm0

	addpd	%xmm0, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 0 * SIZE(%esi, LDC, 1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif
	ALIGN_4

.L59:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	leal	(, LDC, 2), %eax
	movl	BB, B
	addl	%eax, C			# c += 2 * ldc
	ALIGN_4

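/* .L60: when N is odd, compute the last single column            */
/* (2x1 blocks over M, with a 1x1 tail for an odd row).           */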
.L60:
	testl	$1, N
	je	.L999

	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  %ebx
	sarl	$1, %ebx	# i = (m >> 1)
	jle	.L80
	ALIGN_4

.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 1), BB
#endif

	movapd	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movddup	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movddup	 4 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifdef PENTIUM4
	prefetchnta 3 * SIZE(%esi)
#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L75
	ALIGN_4

.L72:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm2, %xmm0
	movddup	 1 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm4
	movapd	16 * SIZE(AA), %xmm0
	mulpd	 2 * SIZE(AA), %xmm2
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2
	mulpd	 4 * SIZE(AA), %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 3 * SIZE(BB), %xmm2
	mulpd	 6 * SIZE(AA), %xmm2
	addpd	 %xmm2, %xmm7
	movddup	 8 * SIZE(BB), %xmm2
	mulpd	 %xmm3, %xmm1
	movddup	 5 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm4
	movapd	24 * SIZE(AA), %xmm1
	mulpd	10 * SIZE(AA), %xmm3
	addpd	 %xmm3, %xmm5
	movddup	 6 * SIZE(BB), %xmm3
	mulpd	12 * SIZE(AA), %xmm3
	addpd	 %xmm3, %xmm6
	movddup	 7 * SIZE(BB), %xmm3
	mulpd	14 * SIZE(AA), %xmm3
	addpd	 %xmm3, %xmm7
	movddup	12 * SIZE(BB), %xmm3

	addl   $16 * SIZE, AA
	addl   $ 8 * SIZE, BB
	decl   %eax
	jne    .L72
	ALIGN_4

.L75:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movddup	ALPHA,  %xmm3
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je .L78
	ALIGN_3

.L76:
	mulpd	 %xmm2, %xmm0
	movddup	 1 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm4
	movapd	 2 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$1 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4

.L78:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

	mulpd	%xmm3, %xmm4

#ifndef TRMMKERNEL
#ifdef PENTIUM4
	SHUFPD_2 %xmm0, %xmm0
#endif

	movsd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0

	addpd	%xmm0, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 1), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, %esi		# coffset += 2
	decl	%ebx			# i --
	jg	.L71
	ALIGN_4

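/* .L80: last element of C when both M and N are odd; the dot product over K */
/* is accumulated two elements at a time and reduced with haddpd, with a     */
/* scalar loop for the K remainder.                                          */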
.L80:
	movl	M,  %ebx
	testl	$1, %ebx	# if (m & 1)
	jle	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), BB
#endif

	movapd	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movapd	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$4, %eax
	je	.L85
	ALIGN_4

.L82:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm4
	mulpd	 2 * SIZE(BB), %xmm0
	movapd	16 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movapd	 4 * SIZE(AA), %xmm0
	mulpd	 4 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm6
	movapd	 6 * SIZE(AA), %xmm0
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm7
	movapd	16 * SIZE(AA), %xmm0
	mulpd	 %xmm1, %xmm3
	movapd	10 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm4
	mulpd	10 * SIZE(BB), %xmm1
	movapd	24 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm5
	movapd	12 * SIZE(AA), %xmm1
	mulpd	12 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm6
	movapd	14 * SIZE(AA), %xmm1
	mulpd	14 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm7
	movapd	24 * SIZE(AA), %xmm1

	addl   $16 * SIZE, AA
	addl   $16 * SIZE, BB
	decl   %eax
	jne    .L82
	ALIGN_4

.L85:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movddup	ALPHA,  %xmm3
	andl	$15, %eax		# if (k & 15)
	BRANCH
	je .L88

.L86:
	mulsd	 %xmm0, %xmm2
	movsd	 1 * SIZE(AA), %xmm0
	addsd	 %xmm2, %xmm4
	movsd	 1 * SIZE(BB), %xmm2

	addl	$1 * SIZE, AA
	addl	$1 * SIZE, BB
	decl	%eax
	jg	.L86
	ALIGN_4

.L88:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

	haddpd	%xmm4, %xmm4

	mulsd	%xmm3, %xmm4

#ifndef TRMMKERNEL
#ifdef PENTIUM4
	SHUFPD_2 %xmm0, %xmm0
#endif

	movsd	0 * SIZE(%esi), %xmm0

	addsd	%xmm0, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	ALIGN_4

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE