/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#if !defined(HAVE_SSE) || !defined(HAVE_MMX)
#error  You have to check your configuration.
#endif

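/* Incoming arguments, addressed off the caller's frame through %esi  */
/* (which preserves the old %esp): m, n, k, alpha (real and           */
/* imaginary parts), A, B, C, ldc, and the TRMM offset.               */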
#define STACK	16
#define ARGS	 0

#define STACK_M	 4 + STACK + ARGS(%esi)
#define STACK_N	 8 + STACK + ARGS(%esi)
#define STACK_K	12 + STACK + ARGS(%esi)
#define STACK_ALPHA_R	16 + STACK + ARGS(%esi)
#define STACK_ALPHA_I	20 + STACK + ARGS(%esi)
#define STACK_A	24 + STACK + ARGS(%esi)
#define STACK_B	28 + STACK + ARGS(%esi)
#define STACK_C	32 + STACK + ARGS(%esi)
#define STACK_LDC	36 + STACK + ARGS(%esi)
#define STACK_OFFT	40 + STACK + ARGS(%esi)

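/* Local scratch in the realigned frame: POSINV holds the sign-mask   */
/* pattern used for conjugation, ALPHA_R/ALPHA_I hold alpha broadcast */
/* for the complex write-back, and BUFFER receives the packed copy of */
/* one column of B.                                                   */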
#define POSINV	 0(%esp)
#define ALPHA_R	16(%esp)
#define ALPHA_I	32(%esp)
#define K	48(%esp)
#define N	52(%esp)
#define M	56(%esp)
#define A	60(%esp)
#define C	64(%esp)
#define J	68(%esp)
#define OLD_STACK 72(%esp)
#define TEMP    76(%esp)
#define OFFSET  80(%esp)
#define KK	84(%esp)
#define KKK	88(%esp)
#define BUFFER 128(%esp)

#define B	%edi
#define LDC	%ebp

#define STACK_ALIGN	4096
#define STACK_OFFSET	1024

#define AA	%edx
#define BB	%ecx

#if !defined(HAVE_SSE2) || defined(OPTERON)
#define movsd	movlps
#endif

#ifdef HAVE_SSE2
#define xorps	pxor
#endif

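/* KERNEL1..KERNEL8: one software-pipelined k-step each of the 4x1    */
/* micro kernel.  xmm0/xmm1 carry four complex elements of A (eight   */
/* floats); the packed B buffer supplies the real and imaginary       */
/* parts of one complex element of B, each already broadcast four     */
/* wide.  xmm4/xmm6 accumulate the a*b_real products and xmm5/xmm7    */
/* the a*b_imag products, with the loads for the following step       */
/* interleaved into the arithmetic.  One pass through KERNEL1..8      */
/* covers eight k-iterations: 64 floats of A and 64 floats of packed  */
/* B.  The macros never advance AA/BB themselves.                     */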
#define KERNEL1(address) \
	mulps	%xmm0, %xmm2; \
	mulps	 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm4; \
	movaps	 0 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm5; \
	movaps	 4 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
	mulps	%xmm0, %xmm2; \
	mulps	 4 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	 8 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL2(address) \
	mulps	%xmm0, %xmm2; \
	mulps	12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm4; \
	movaps	 8 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm5; \
	movaps	12 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
	mulps	%xmm0, %xmm2; \
	mulps	12 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	32 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL3(address) \
	mulps	%xmm1, %xmm3; \
	mulps	20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm4; \
	movaps	16 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm5; \
	movaps	20 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
	mulps	%xmm1, %xmm3; \
	mulps	20 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	24 * SIZE + (address) * SIZE * 2(AA), %xmm1

#define KERNEL4(address) \
	mulps	%xmm1, %xmm3; \
	mulps	28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm4; \
	movaps	24 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm5; \
	movaps	28 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
	mulps	%xmm1, %xmm3; \
	mulps	28 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	48 * SIZE + (address) * SIZE * 2(AA), %xmm1

#define KERNEL5(address) \
	mulps	%xmm0, %xmm2; \
	mulps	36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm4; \
	movaps	32 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm5; \
	movaps	36 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
	mulps	%xmm0, %xmm2; \
	mulps	36 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	40 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL6(address) \
	mulps	%xmm0, %xmm2; \
	mulps	44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm4; \
	movaps	40 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm5; \
	movaps	44 * SIZE + (address) * SIZE * 2(AA), %xmm0; \
	mulps	%xmm0, %xmm2; \
	mulps	44 * SIZE + (address) * SIZE * 2(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	64 * SIZE + (address) * SIZE * 2(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	64 * SIZE + (address) * SIZE * 2(AA), %xmm0

#define KERNEL7(address) \
	mulps	%xmm1, %xmm3; \
	mulps	52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm4; \
	movaps	48 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm5; \
	movaps	52 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
	mulps	%xmm1, %xmm3; \
	mulps	52 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	56 * SIZE + (address) * SIZE * 2(AA), %xmm1

#define KERNEL8(address) \
	mulps	%xmm1, %xmm3; \
	mulps	60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm4; \
	movaps	56 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm5; \
	movaps	60 * SIZE + (address) * SIZE * 2(AA), %xmm1; \
	mulps	%xmm1, %xmm3; \
	mulps	60 * SIZE + (address) * SIZE * 2(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	80 * SIZE + (address) * SIZE * 2(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	80 * SIZE + (address) * SIZE * 2(AA), %xmm1

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	EMMS

	movl	%esp, %esi	# save old stack

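/* Carve out an aligned local frame: reserve the scratch area plus    */
/* slack, round %esp down to a STACK_ALIGN (4096 byte) boundary,      */
/* then step back up by STACK_OFFSET so the locals sit at a fixed     */
/* offset within the page (presumably to sidestep cache-set aliasing  */
/* against the caller's frame).                                       */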
	subl	$128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl	$-STACK_ALIGN, %esp	# align stack
	addl	$STACK_OFFSET, %esp

	STACK_TOUCHING

	movd	STACK_M, %mm0
	movl	STACK_N, %eax
	movd	STACK_K, %mm1
	movd	STACK_A, %mm2
	movl	STACK_B, B
	movd	STACK_C, %mm3
	movl	STACK_LDC, LDC
#ifdef TRMMKERNEL
	movd	STACK_OFFT, %mm4
#endif

	movd	%mm1, K
	movd	%mm0, M
	movl	%eax, N
	movd	%mm2, A
	movd	%mm3, C
	movl	%esi, OLD_STACK
#ifdef TRMMKERNEL
	movd	%mm4, OFFSET
	movd	%mm4, KK
#ifndef LEFT
	negl	KK
#endif
#endif

	leal	(, LDC, SIZE * 2), LDC

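/* LDC is now scaled to bytes (two floats per complex element).       */
/* Next, build the 0x80000000 sign mask in xmm7 (via an SSE2          */
/* compare/shift, or from memory when only SSE is available) and      */
/* spread alpha for the complex write-back: ALPHA_R = {ar,ar,ar,ar}   */
/* and ALPHA_I = {-ai, ai, -ai, ai}.  POSINV gets the mask in either  */
/* the even or the odd lanes depending on which operand is            */
/* conjugated.                                                        */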
	movss	STACK_ALPHA_R, %xmm0
	movss	STACK_ALPHA_I, %xmm1

#ifdef HAVE_SSE2
	pxor	%xmm7, %xmm7
	cmpeqps	%xmm7, %xmm7
	pslld	$31, %xmm7	# Generate mask
#else
	movl	$0x80000000, TEMP
	movss	TEMP, %xmm7
	shufps	$0, %xmm7, %xmm7
#endif
	xorps	%xmm2, %xmm2

	shufps	$0, %xmm0, %xmm0

	movaps	 %xmm0,  0 + ALPHA_R
	movss	 %xmm1,  4 + ALPHA_I
	movss	 %xmm1, 12 + ALPHA_I
	xorps	 %xmm7, %xmm1
	movss	 %xmm1,  0 + ALPHA_I
	movss	 %xmm1,  8 + ALPHA_I

#if   defined(NN) || defined(NT) || defined(NR) || defined(NC) || \
      defined(TN) || defined(TT) || defined(TR) || defined(TC)
	movss	  %xmm7,   0 + POSINV
	movss	  %xmm2,   4 + POSINV
	movss	  %xmm7,   8 + POSINV
	movss	  %xmm2,  12 + POSINV
#else
	movss	  %xmm2,   0 + POSINV
	movss	  %xmm7,   4 + POSINV
	movss	  %xmm2,   8 + POSINV
	movss	  %xmm7,  12 + POSINV
#endif

	movl	%eax, J			# j = n
	testl	%eax, %eax
	jle	.L999

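/* Outer loop over the columns of C (j = n..1).  Each pass first      */
/* packs one column of B into BUFFER: every real/imaginary scalar is  */
/* broadcast four wide and the conjugation sign is folded in up       */
/* front, so the inner kernels need no shuffles on B.                 */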
.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, BB
	movaps	POSINV, %xmm7

	movl	K, %eax
	sarl	$2, %eax
	jle	.L03

.L02:
	movss	0 * SIZE(B), %xmm0
	movss	1 * SIZE(B), %xmm1
	movss	2 * SIZE(B), %xmm2
	movss	3 * SIZE(B), %xmm3

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC)  || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
	xorps	 %xmm7, %xmm1
	xorps	 %xmm7, %xmm3
#else
	xorps	 %xmm7, %xmm0
	xorps	 %xmm7, %xmm2
#endif

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)

	movss	4 * SIZE(B), %xmm0
	movss	5 * SIZE(B), %xmm1
	movss	6 * SIZE(B), %xmm2
	movss	7 * SIZE(B), %xmm3

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3

#if defined(NN) || defined(NT) || defined(NR) || defined(NC)  || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
	xorps	 %xmm7, %xmm1
	xorps	 %xmm7, %xmm3
#else
	xorps	 %xmm7, %xmm0
	xorps	 %xmm7, %xmm2
#endif

	movaps	%xmm0, 16 * SIZE(BB)
	movaps	%xmm1, 20 * SIZE(BB)
	movaps	%xmm2, 24 * SIZE(BB)
	movaps	%xmm3, 28 * SIZE(BB)

	prefetcht0	 104 * SIZE(B)

	addl	$ 8 * SIZE, B
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L02

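/* Tail of the packing loop: the same broadcast-and-flip, one         */
/* complex element of B (two scalars, eight packed floats) per        */
/* iteration for the k % 4 leftovers.                                 */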
.L03:
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L05

.L04:
	movss	0 * SIZE(B), %xmm0
	movss	1 * SIZE(B), %xmm1

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1

#if defined(NN) || defined(NT) || defined(NR) || defined(NC)  || \
    defined(TN) || defined(TT) || defined(TR) || defined(TC)
	xorps	 %xmm7, %xmm1
#else
	xorps	 %xmm7, %xmm0
#endif

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$8 * SIZE, BB
	decl	%eax
	jne	.L04
	ALIGN_4

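/* Sweep down the column: %esi walks C, AA walks A, and %ebx counts   */
/* blocks of four rows (m >> 2); the m & 2 and m & 1 leftovers are    */
/* handled at .L50 and .L70.                                          */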
.L05:
	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a
	movl	M,  %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L50
	ALIGN_4

.L10:

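/* PENTIUM4 path: the k loop is expanded through the KERNEL1..8       */
/* macros with an explicit jump-out ladder instead of a counted       */
/* loop, so up to 64 k-iterations run branch-free per pass.           */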
#ifdef PENTIUM4

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB /* packed B holds 8 floats per k: re and im each broadcast 4 wide */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

	prefetchnta    8 * SIZE(%esi)

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	andl	$-8, %eax
	je	.L12
	sall	$3, %eax

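/* %eax now counts the unrolled k-steps times eight; each ladder      */
/* compare against 64 * n decides how far to run before falling       */
/* through to .L11, where AA and BB are stepped past the floats the   */
/* chain consumed (the macros themselves never move the pointers).    */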
.L1X:
	KERNEL1(32  *  0)
	KERNEL2(32  *  0)
	KERNEL3(32  *  0)
	KERNEL4(32  *  0)
	KERNEL5(32  *  0)
	KERNEL6(32  *  0)
	KERNEL7(32  *  0)
	KERNEL8(32  *  0)
	cmpl	$64 *  1, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32  *  1)
	KERNEL2(32  *  1)
	KERNEL3(32  *  1)
	KERNEL4(32  *  1)
	KERNEL5(32  *  1)
	KERNEL6(32  *  1)
	KERNEL7(32  *  1)
	KERNEL8(32  *  1)
	cmpl	$64 *  2, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32  *  2)
	KERNEL2(32  *  2)
	KERNEL3(32  *  2)
	KERNEL4(32  *  2)
	KERNEL5(32  *  2)
	KERNEL6(32  *  2)
	KERNEL7(32  *  2)
	KERNEL8(32  *  2)
	cmpl	$64 *  3, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32  *  3)
	KERNEL2(32  *  3)
	KERNEL3(32  *  3)
	KERNEL4(32  *  3)
	KERNEL5(32  *  3)
	KERNEL6(32  *  3)
	KERNEL7(32  *  3)
	KERNEL8(32  *  3)
	cmpl	$64 *  4, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32  *  4)
	KERNEL2(32  *  4)
	KERNEL3(32  *  4)
	KERNEL4(32  *  4)
	KERNEL5(32  *  4)
	KERNEL6(32  *  4)
	KERNEL7(32  *  4)
	KERNEL8(32  *  4)
	cmpl	$64 *  5, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32  *  5)
	KERNEL2(32  *  5)
	KERNEL3(32  *  5)
	KERNEL4(32  *  5)
	KERNEL5(32  *  5)
	KERNEL6(32  *  5)
	KERNEL7(32  *  5)
	KERNEL8(32  *  5)
	cmpl	$64 *  6, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32  *  6)
	KERNEL2(32  *  6)
	KERNEL3(32  *  6)
	KERNEL4(32  *  6)
	KERNEL5(32  *  6)
	KERNEL6(32  *  6)
	KERNEL7(32  *  6)
	KERNEL8(32  *  6)
	cmpl	$64 *  7, %eax
	NOBRANCH
	jle	.L11
	KERNEL1(32  *  7)
	KERNEL2(32  *  7)
	KERNEL3(32  *  7)
	KERNEL4(32  *  7)
	KERNEL5(32  *  7)
	KERNEL6(32  *  7)
	KERNEL7(32  *  7)
	KERNEL8(32  *  7)

	addl	$128 * 4  * SIZE, AA
	addl	$128 * 4  * SIZE, BB
	subl	$ 64 * 8, %eax
	BRANCH
	jg	.L1X

.L11:
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB

#else

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB /* packed B holds 8 floats per k: re and im each broadcast 4 wide */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	prefetcht0   8 * SIZE(%esi)
	je	.L12
	ALIGN_4

#define PREFETCHSIZE 48

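/* Non-PENTIUM4 path: a conventional counted loop, eight k-steps per  */
/* iteration (64 floats of A and of packed B), with prefetches of A   */
/* interleaved on CORE_KATMAI.                                        */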
.L11:
#ifdef CORE_KATMAI
	prefetcht0	PREFETCHSIZE * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 0 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	16 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	16 * SIZE(AA), %xmm0

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 8) * SIZE(AA)
#endif

	mulps	%xmm1, %xmm3
	mulps	12 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	 8 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	12 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	12 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 16) * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	mulps	20 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	16 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	20 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	20 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 24) * SIZE(AA)
#endif

	mulps	%xmm1, %xmm3
	mulps	28 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	24 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	28 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	40 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	40 * SIZE(AA), %xmm1

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 32) * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	mulps	36 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	36 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	36 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	48 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	48 * SIZE(AA), %xmm0

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 40) * SIZE(AA)
#endif

	mulps	%xmm1, %xmm3
	mulps	44 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	40 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	44 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	44 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	56 * SIZE(AA), %xmm1

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 48) * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	mulps	52 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	48 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	52 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	52 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	64 * SIZE(AA), %xmm0

#ifdef CORE_KATMAI
	prefetcht0	(PREFETCHSIZE + 56) * SIZE(AA)
#endif

	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	56 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	60 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	72 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	72 * SIZE(AA), %xmm1

	addl	$64 * SIZE, BB
	addl	$64 * SIZE, AA
	decl	%eax
	jne	.L11
#endif


.L12:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA_R, %xmm1
	movaps	ALPHA_I, %xmm3
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L14

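/* Remainder loop: one k-step per iteration for the 4x1 block, same   */
/* multiply-accumulate pattern as the unrolled kernels.               */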
.L13:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 0 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm6
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	 8 * SIZE(AA), %xmm0

	addl	$8 * SIZE, AA		# aoffset  += 8
	addl	$8 * SIZE, BB		# boffset1 += 8

	decl	%eax
	jg	.L13

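/* Combine and write back.  xmm4/xmm6 hold the a*b_real sums and      */
/* xmm5/xmm7 the a*b_imag sums with the conjugation sign already      */
/* packed in; swapping the real/imag lanes of the latter and adding   */
/* or subtracting (per variant) yields the complex products.  The     */
/* alpha scaling is then a complex multiply, acc * ar plus            */
/* swap(acc) * {-ai, ai}, after which C is accumulated (skipped for   */
/* TRMM, which overwrites).                                           */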
.L14:
	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm5, %xmm4
	subps	%xmm7, %xmm6
#else
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
#endif

	movaps	%xmm4, %xmm5
	movaps	%xmm6, %xmm7

	shufps	$0xb1, %xmm4, %xmm4
	shufps	$0xb1, %xmm6, %xmm6

	mulps	%xmm1, %xmm5
	mulps	%xmm3, %xmm4
	mulps	%xmm1, %xmm7
	mulps	%xmm3, %xmm6

	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6

	shufps	$0xe4, %xmm4, %xmm4
	shufps	$0xe4, %xmm6, %xmm6

#ifndef TRMMKERNEL
	movsd	0 * SIZE(%esi), %xmm0
	movhps	2 * SIZE(%esi), %xmm0
	movsd	4 * SIZE(%esi), %xmm2
	movhps	6 * SIZE(%esi), %xmm2

	addps	%xmm0, %xmm4
	addps	%xmm2, %xmm6
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	movhps	%xmm4, 2 * SIZE(%esi)
	movsd	%xmm6, 4 * SIZE(%esi)
	movhps	%xmm6, 6 * SIZE(%esi)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$4, KK
#endif

	addl	$8 * SIZE, %esi		# coffset += 4 complex elements
	decl	%ebx			# i --
	jg	.L10
	ALIGN_2

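/* m & 2 leftover: a 2x1 block.  Same scheme with half the A          */
/* traffic; xmm4..xmm7 still split the partial sums and are folded    */
/* together before the write-back.                                    */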
.L50:
	movl	M,  %ebx
	testl	$2, %ebx
	jle	.L70


#if (L1_DATA_LINESIZE == 64)

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB /* packed B advances twice as fast as A here: 8 floats per k versus 4 */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L52
	ALIGN_4

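/* 2x1 inner loop, eight k-steps per iteration (this variant is       */
/* scheduled for a 64-byte L1 line; the #else branch below carries    */
/* the same loop with a shorter pipelining distance).                 */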
.L51:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm6
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	 8 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	20 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movaps	12 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	28 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	48 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm2
	mulps	36 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	40 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	20 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm2
	mulps	44 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm6
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	52 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm6
	movaps	80 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	48 * SIZE(AA), %xmm1

	addl	$32 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L51
	ALIGN_2

#else

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB /* packed B advances twice as fast as A here: 8 floats per k versus 4 */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L52
	ALIGN_4


.L51:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	16 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movaps	16 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm2
	mulps	20 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	12 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	28 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm6
	movaps	40 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1
	mulps	%xmm0, %xmm2
	mulps	36 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	48 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	20 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	44 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movaps	 32 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm2
	mulps	52 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	 64 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm6
	movaps	 72 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	 40 * SIZE(AA), %xmm1

	addl	$32 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L51
#endif

.L52:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA_R, %xmm1
	movaps	ALPHA_I, %xmm3
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L54

.L53:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA		# aoffset  += 4
	addl	$8 * SIZE, BB		# boffset1 += 8
	decl	%eax
	jg	.L53

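/* Fold the split accumulators together, then the same swap/combine   */
/* and complex alpha scaling as the 4x1 write-back, for one pair of   */
/* complex elements of C.                                             */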
.L54:
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

	shufps	$0xb1, %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm5, %xmm4
#else
	addps	%xmm5, %xmm4
#endif

	movaps	%xmm4, %xmm5

	shufps	$0xb1, %xmm4, %xmm4

	mulps	%xmm1, %xmm5
	mulps	%xmm3, %xmm4

	addps	%xmm5, %xmm4

#ifndef TRMMKERNEL
	movsd	0 * SIZE(%esi), %xmm0
	movhps	2 * SIZE(%esi), %xmm0

	addps	%xmm0, %xmm4
#endif

	movlps	%xmm4, 0 * SIZE(%esi)
	movhps	%xmm4, 2 * SIZE(%esi)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif
	addl	$4 * SIZE, %esi		# coffset += 2 complex elements
	ALIGN_2

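/* m & 1 leftover: a 1x1 block.  movsd loads just one complex         */
/* element (two floats); when movsd is aliased to movlps above, the   */
/* register must be zeroed first because movlps leaves the upper      */
/* half of the destination unchanged -- hence the #ifdef movsd        */
/* guards around the xorps.                                           */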
.L70:
	testl	$1, %ebx
	jle	.L99


#if (L1_DATA_LINESIZE == 64)

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 16 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB /* packed B advances four times as fast as A here: 8 floats per k versus 2 */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$1, %eax
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L72
	ALIGN_4

.L71:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	36 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	40 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm6
	movaps	44 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	64 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	52 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	56 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm6
	movaps	60 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	80 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L71
	ALIGN_2

#else
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
	movaps	 0 * SIZE + BUFFER, %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE + BUFFER, %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#else

	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB /* packed B advances four times as fast as A here: 8 floats per k versus 2 */

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#endif


#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
	addl	$1, %eax
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L72
	ALIGN_4

.L71:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	16 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	12 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	20 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	32 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	40 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	36 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	48 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	44 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	56 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	52 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	64 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	60 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	72 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA
	addl	$64 * SIZE, BB
	decl	%eax
	jne	.L71
	ALIGN_2
#endif

.L72:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movaps	ALPHA_R, %xmm1
	movaps	ALPHA_I, %xmm3
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L74

.L73:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA		# aoffset  += 2
	addl	$8 * SIZE, BB		# boffset1 += 8
	decl	%eax
	jg	.L73

.L74:
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

	shufps	$0xb1, %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subps	%xmm5, %xmm4
#else
	addps	%xmm5, %xmm4
#endif

	movaps	%xmm4, %xmm5

	shufps	$0xb1, %xmm4, %xmm4

	mulps	%xmm1, %xmm5
	mulps	%xmm3, %xmm4

	addps	%xmm5, %xmm4

#ifndef TRMMKERNEL
#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	0 * SIZE(%esi), %xmm0

	addps	%xmm0, %xmm4
#endif

	movlps	%xmm4, 0 * SIZE(%esi)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	ALIGN_2

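/* End of one column: bump KK for the right-side TRMM case, advance   */
/* C by ldc (already scaled to bytes), and loop until all n columns   */
/* are done.                                                          */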
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$1, KK
#endif

	addl	LDC, C			# c += ldc
	decl	J			# j --
	jg	.L01
	ALIGN_2

.L999:
	movl	OLD_STACK, %esp

	EMMS

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
