1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Stack-frame geometry: 4 pushed callee-saved registers = 16 bytes
   (STACK); no extra caller argument spill (ARGS). */
#define STACK	16
#define ARGS	 0

/* Incoming cdecl arguments, addressed through %esi, which holds the
   caller's %esp (saved in the prologue before the stack is realigned). */
#define STACK_M	 4 + STACK + ARGS(%esi)
#define STACK_N	 8 + STACK + ARGS(%esi)
#define STACK_K	12 + STACK + ARGS(%esi)
#define STACK_ALPHA	16 + STACK + ARGS(%esi)
#define STACK_A	20 + STACK + ARGS(%esi)
#define STACK_B	24 + STACK + ARGS(%esi)
#define STACK_C	28 + STACK + ARGS(%esi)
#define STACK_LDC	32 + STACK + ARGS(%esi)
#define STACK_OFFT	36 + STACK + ARGS(%esi)

/* Locals kept on the new, page-aligned stack (addressed via %esp). */
#define ALPHA	 0(%esp)
#define K	16(%esp)
#define N	20(%esp)
#define M	24(%esp)
#define A	28(%esp)
#define C	32(%esp)
#define J	36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET  44(%esp)
#define KK	48(%esp)
#define KKK	52(%esp)
/* Packed copy of B (each scalar broadcast to 4 lanes) starts here. */
#define BUFFER 512(%esp)

/* Prefetch distances, in units of SIZE, used by the packing loop. */
#define PREFETCH_R    (8 * 16 + 0)
#define PREFETCH_W    (PREFETCH_R * 2)

#define PREFETCHSIZE  (8 * 16 + 4)
#define PREFETCH     prefetcht0

/* Register roles used throughout the kernel. */
#define AA	%edx	/* cursor into A */
#define BB	%ecx	/* cursor into packed B (BUFFER) */
#define LDC	%ebp	/* column stride of C, scaled to bytes below */
#define B	%edi	/* cursor into original B */
#define C1	%esi	/* current C column pointer (reuses %esi after setup) */
#define I	%ebx	/* row-block loop counter */
80
81
	PROLOGUE

/* Save the cdecl callee-saved registers this kernel clobbers. */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	%esp, %esi	# save old stack

/* Carve out the local area plus the B packing buffer, then align the
   new stack pointer down to a 4096-byte (page) boundary. */
	subl	$512 + LOCAL_BUFFER_SIZE, %esp
	andl	$-4096, %esp	# align stack

        STACK_TOUCHING

/* Fetch the scalar arguments from the caller's frame (via %esi). */
	movl	STACK_M, %ebx
	movl	STACK_N, %eax
	movl	STACK_K, %ecx
	movl	STACK_A, %edx
	movss	STACK_ALPHA,  %xmm3
#ifdef TRMMKERNEL
	movd	STACK_OFFT, %mm4
#endif

/* Stash them in the local frame; keep the old %esp for the epilogue. */
	movl	%ebx, M
	movl	%eax, N
	movl	%ecx, K
	movl	%edx, A
	movl	%esi, OLD_STACK
#ifdef TRMMKERNEL
	movd	%mm4, OFFSET
	movd	%mm4, KK
#ifndef LEFT
	negl	KK	/* right-side TRMM starts with a negative offset */
#endif
#endif

	shufps	$0, %xmm3, %xmm3	/* broadcast alpha to all 4 lanes */

	movl	STACK_B, B
	movl	STACK_C, %ebx

	movaps	 %xmm3, ALPHA
	movl	%ebx, C
	movl	STACK_LDC, LDC

/* Bias A and B by -32*SIZE so inner loops can use the short negative
   displacements seen below. */
	subl	$-32 * SIZE, A
	subl	$-32 * SIZE, B

	leal	(, LDC, SIZE), LDC	/* LDC now in bytes */

/* J = N / 2: sweep C two columns at a time; odd column handled at .L50. */
	sarl	$1, %eax
	movl	%eax, J
	jle	.L50
	ALIGN_4
138
.L01:
/* Per column-pair iteration: repack B into BUFFER, broadcasting every
   scalar of B to a full 4-lane vector so the microkernels can use
   aligned movaps loads instead of per-element shuffles. */
	leal	32 * SIZE + BUFFER, BB

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK	/* restart the diagonal offset for each column pair */
#endif

/* Main packing loop: 4 k-steps (8 scalars = 4k x 2n) per iteration. */
	movl	K, %eax
	sarl	$2, %eax
	jle	.L05
	ALIGN_4

.L02:
	prefetcht0	(PREFETCH_R + 0) * SIZE(B)
	movss	-32 * SIZE(B), %xmm0
	movss	-31 * SIZE(B), %xmm1
	movss	-30 * SIZE(B), %xmm2
	movss	-29 * SIZE(B), %xmm3
	movss	-28 * SIZE(B), %xmm4
	movss	-27 * SIZE(B), %xmm5
	movss	-26 * SIZE(B), %xmm6
	movss	-25 * SIZE(B), %xmm7

	prefetcht0	(PREFETCH_W +  0) * SIZE(BB)
/* Broadcast each scalar across its vector. */
	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3
	shufps	$0, %xmm4, %xmm4
	shufps	$0, %xmm5, %xmm5
	shufps	$0, %xmm6, %xmm6
	shufps	$0, %xmm7, %xmm7

	prefetcht0	(PREFETCH_W + 16) * SIZE(BB)
	movaps	%xmm0,  -32 * SIZE(BB)
	movaps	%xmm1,  -28 * SIZE(BB)
	movaps	%xmm2,  -24 * SIZE(BB)
	movaps	%xmm3,  -20 * SIZE(BB)
	movaps	%xmm4,  -16 * SIZE(BB)
	movaps	%xmm5,  -12 * SIZE(BB)
	movaps	%xmm6,   -8 * SIZE(BB)
	movaps	%xmm7,   -4 * SIZE(BB)

	addl	$  8 * SIZE, B
	subl	$-32 * SIZE, BB
	decl	%eax
	jne	.L02
	ALIGN_4

.L05:
/* Tail: K mod 4 k-steps, 2 scalars (one per column) each. */
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L10
	ALIGN_4

.L06:
	movss	-32 * SIZE(B), %xmm0
	movss	-31 * SIZE(B), %xmm1

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1

	movaps	%xmm0,  -32 * SIZE(BB)
	movaps	%xmm1,  -28 * SIZE(BB)
	addl	$2 * SIZE, B
	addl	$8 * SIZE, BB
	decl	%eax
	jne	.L06
	ALIGN_4

.L10:
/* Row sweep for this column pair: I = M/8 blocks of 8 rows; the
   4-, 2- and 1-row tails follow at .L20 / .L30 / .L40. */
	movl	C, C1
	movl	A, AA
	movl	M,  I
	sarl	$3, I
	jle	.L20
	ALIGN_4
218
.L11:
/* ---- 8x2 tile: C[i..i+7, j..j+1] (+)= alpha * A * B -----------------
   Accumulators: xmm4 = col j rows 0-3, xmm6 = col j rows 4-7,
                 xmm5 = col j+1 rows 0-3, xmm7 = col j+1 rows 4-7
   (see the stores at .L18).  BB walks the broadcast-packed B. */
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	32 * SIZE + BUFFER, BB
#else
/* TRMM: skip the first KK k-steps of this A panel and of packed B
   (8 floats of A and 8 packed floats of B per k-step). */
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB /* because it's doubled */
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movapd	-16 * SIZE(AA), %xmm3	/* NOTE(review): movapd on float data; same effect as movaps */
	pxor	%xmm6, %xmm6
	prefetcht0	7 * SIZE(C1)
	pxor	%xmm7, %xmm7
	prefetcht0	7 * SIZE(C1, LDC)

/* Trip count: full K for plain GEMM, the clipped KKK for TRMM. */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$8, %eax	/* MR = 8 */
#else
	addl	$2, %eax	/* NR = 2 */
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L15
	ALIGN_4

/* Main k-loop, unrolled 8 deep; loads for the next step are hoisted
   between the multiply/accumulate pairs. */
.L12:
	movaps	%xmm1,  %xmm2
	mulps	%xmm0,  %xmm1
	addps	%xmm1,  %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm1,  %xmm0
	addps	%xmm0,  %xmm5
	movaps	-28 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm2
	mulps	%xmm0,  %xmm1
	movaps	-24 * SIZE(AA), %xmm0
	addps	%xmm2,  %xmm6
	addps	%xmm1,  %xmm7

	movaps	-24 * SIZE(BB), %xmm1
	movaps	%xmm1,  %xmm2
	mulps	%xmm0,  %xmm1
	addps	%xmm1,  %xmm4
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm1,  %xmm0
	addps	%xmm0,  %xmm5
	movaps	-20 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm2
	mulps	%xmm0,  %xmm1
	movaps	  0 * SIZE(AA), %xmm0
	addps	%xmm2,  %xmm6
	addps	%xmm1,  %xmm7

	movaps	-16 * SIZE(BB), %xmm1
	movaps	%xmm1,  %xmm2
	mulps	%xmm3,  %xmm1
	addps	%xmm1,  %xmm4
	movaps	-12 * SIZE(BB), %xmm1
	mulps	%xmm1,  %xmm3
	addps	%xmm3,  %xmm5
	movaps	-12 * SIZE(AA), %xmm3
	mulps	%xmm3,  %xmm2
	mulps	%xmm3,  %xmm1
	movaps	 -8 * SIZE(AA), %xmm3
	addps	%xmm2,  %xmm6
	addps	%xmm1,  %xmm7

	movaps	 -8 * SIZE(BB), %xmm1
	movaps	%xmm1,  %xmm2
	mulps	%xmm3,  %xmm1
	addps	%xmm1,  %xmm4
	movaps	 -4 * SIZE(BB), %xmm1
	mulps	%xmm1,  %xmm3
	addps	%xmm3,  %xmm5
	movaps	 -4 * SIZE(AA), %xmm3
	mulps	%xmm3,  %xmm2
	mulps	%xmm3,  %xmm1
	movaps	 16 * SIZE(AA), %xmm3
	addps	%xmm2,  %xmm6
	addps	%xmm1,  %xmm7
	movaps	  0 * SIZE(BB), %xmm1

	movaps	%xmm1,  %xmm2
	mulps	%xmm0,  %xmm1
	addps	%xmm1,  %xmm4
	movaps	  4 * SIZE(BB), %xmm1
	mulps	%xmm1,  %xmm0
	addps	%xmm0,  %xmm5
	movaps	  4 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm2
	mulps	%xmm0,  %xmm1
	movaps	  8 * SIZE(AA), %xmm0
	addps	%xmm2,  %xmm6
	addps	%xmm1,  %xmm7

	movaps	  8 * SIZE(BB), %xmm1
	movaps	%xmm1,  %xmm2
	mulps	%xmm0,  %xmm1
	addps	%xmm1,  %xmm4
	movaps	 12 * SIZE(BB), %xmm1
	mulps	%xmm1,  %xmm0
	addps	%xmm0,  %xmm5
	movaps	 12 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm2
	mulps	%xmm0,  %xmm1
	movaps	 32 * SIZE(AA), %xmm0
	addps	%xmm2,  %xmm6
	addps	%xmm1,  %xmm7

	movaps	 16 * SIZE(BB), %xmm1
	movaps	%xmm1,  %xmm2
	mulps	%xmm3,  %xmm1
	addps	%xmm1,  %xmm4
	movaps	 20 * SIZE(BB), %xmm1
	mulps	%xmm1,  %xmm3
	addps	%xmm3,  %xmm5
	movaps	 20 * SIZE(AA), %xmm3
	mulps	%xmm3,  %xmm2
	mulps	%xmm3,  %xmm1
	addps	%xmm2,  %xmm6
	movaps	 24 * SIZE(AA), %xmm3
	addps	%xmm1,  %xmm7

	movaps	 24 * SIZE(BB), %xmm1
	movaps	%xmm1,  %xmm2
	mulps	%xmm3,  %xmm1
	addps	%xmm1,  %xmm4
	movaps	 28 * SIZE(BB), %xmm1
	mulps	%xmm1,  %xmm3
	addps	%xmm3,  %xmm5
	movaps	 28 * SIZE(AA), %xmm3
	mulps	%xmm3,  %xmm2
	mulps	%xmm3,  %xmm1
	subl   $-64 * SIZE, BB
	movaps	 48 * SIZE(AA), %xmm3
	subl   $-64 * SIZE, AA
	addps	%xmm2,  %xmm6
	addps	%xmm1,  %xmm7
	movaps	-32 * SIZE(BB), %xmm1

	decl   %eax
	jne    .L12
	ALIGN_4

.L15:
/* K mod 8 remainder, one k-step per iteration. */
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je .L18
	ALIGN_4

.L16:
	movaps	%xmm1,  %xmm2
	mulps	%xmm0,  %xmm1
	addps	%xmm1,  %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm1,  %xmm0
	addps	%xmm0,  %xmm5
	movaps	-28 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm2
	mulps	%xmm0,  %xmm1
	movaps	-24 * SIZE(AA), %xmm0
	addps	%xmm2,  %xmm6
	addps	%xmm1,  %xmm7
	movaps	-24 * SIZE(BB), %xmm1

	addl	$8 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L16
	ALIGN_4

.L18:
/* Write-back: scale by alpha; plain GEMM also accumulates the existing
   C tile, loaded as unaligned 64-bit halves (movsd/movhps). */
	movaps	ALPHA,  %xmm3

	mulps	%xmm3, %xmm4
	mulps	%xmm3, %xmm5
	mulps	%xmm3, %xmm6
	mulps	%xmm3, %xmm7

#ifndef TRMMKERNEL
	movsd	0 * SIZE(C1), %xmm0
	movhps	2 * SIZE(C1), %xmm0
	movsd	4 * SIZE(C1), %xmm2
	movhps	6 * SIZE(C1), %xmm2

	movsd	0 * SIZE(C1, LDC), %xmm1
	movhps	2 * SIZE(C1, LDC), %xmm1
	movsd	4 * SIZE(C1, LDC), %xmm3
	movhps	6 * SIZE(C1, LDC), %xmm3

	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm5
	addps	%xmm2, %xmm6
	addps	%xmm3, %xmm7
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movhps	%xmm4, 2 * SIZE(C1)
	movsd	%xmm6, 4 * SIZE(C1)
	movhps	%xmm6, 6 * SIZE(C1)

	movsd	%xmm5, 0 * SIZE(C1, LDC)
	movhps	%xmm5, 2 * SIZE(C1, LDC)
	movsd	%xmm7, 4 * SIZE(C1, LDC)
	movhps	%xmm7, 6 * SIZE(C1, LDC)

/* TRMM: advance AA/BB over the (K - KKK) k-steps not consumed above. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$8, KK	/* consumed 8 rows of the triangular factor */
#endif

	addl	$8 * SIZE, C1
	decl	I
	jg	.L11
	ALIGN_4
464
.L20:
/* ---- 4x2 tile (M & 4): xmm4/xmm5 accumulate columns j / j+1;
   xmm6/xmm7 are secondary accumulators folded in at .L28. */
	movl	M,  I
	testl	$4, I
	jle	.L30

.L21:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	32 * SIZE + BUFFER, BB
#else
/* TRMM: skip the first KK k-steps (4 floats of A per step here). */
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB /* because it's doubled */
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movaps	-16 * SIZE(AA), %xmm2
	pxor	%xmm6, %xmm6
	movaps	-16 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

/* Trip count: K for GEMM, clipped KKK for TRMM. */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax	/* MR = 4 */
#else
	addl	$2, %eax	/* NR = 2 */
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L25
	ALIGN_4

/* Main k-loop, unrolled 8 deep. */
.L22:
	mulps	%xmm0,  %xmm1
	mulps	-28 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm4
	movaps	-24 * SIZE(BB), %xmm1
	addps	%xmm0, %xmm5
	movaps	-28 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm1
	mulps	-20 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm6
	movaps	  0 * SIZE(BB), %xmm1
	addps	%xmm0, %xmm7
	movaps	-24 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm3
	mulps	-12 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	 -8 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movaps	-20 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm3
	mulps	 -4 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	 16 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movaps	  0 * SIZE(AA), %xmm0
	mulps	%xmm2,  %xmm1
	mulps	  4 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm4
	movaps	  8 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm5
	movaps	-12 * SIZE(AA), %xmm2
	mulps	%xmm2,  %xmm1
	mulps	 12 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm6
	movaps	 32 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm7
	movaps	 -8 * SIZE(AA), %xmm2
	mulps	%xmm2,  %xmm3
	mulps	 20 * SIZE(BB), %xmm2
	addps	%xmm3, %xmm4
	movaps	 24 * SIZE(BB), %xmm3
	addps	%xmm2, %xmm5
	movaps	 -4 * SIZE(AA), %xmm2
	mulps	%xmm2,  %xmm3
	mulps	 28 * SIZE(BB), %xmm2
	addps	%xmm3, %xmm6
	movaps	 48 * SIZE(BB), %xmm3
	addps	%xmm2, %xmm7
	movaps	 16 * SIZE(AA), %xmm2

	subl   $-32 * SIZE, AA
	addl   $ 64 * SIZE, BB
	decl   %eax
	jne    .L22
	ALIGN_4

.L25:
	movaps	ALPHA,  %xmm3

/* K mod 8 remainder, one k-step per iteration. */
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je .L28
	ALIGN_4

.L26:
	mulps	%xmm0,  %xmm1
	mulps	-28 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm4
	movaps	-24 * SIZE(BB), %xmm1
	addps	%xmm0, %xmm5
	movaps	-28 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L26
	ALIGN_4

.L28:
/* Fold secondary accumulators, scale by alpha, update 4x2 patch of C. */
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

	mulps	%xmm3, %xmm4
	mulps	%xmm3, %xmm5

#ifndef TRMMKERNEL
	movsd	0 * SIZE(C1), %xmm0
	movhps	2 * SIZE(C1), %xmm0

	movsd	0 * SIZE(C1, LDC), %xmm1
	movhps	2 * SIZE(C1, LDC), %xmm1

	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm5
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movhps	%xmm4, 2 * SIZE(C1)
	movsd	%xmm5, 0 * SIZE(C1, LDC)
	movhps	%xmm5, 2 * SIZE(C1, LDC)

/* TRMM: advance AA/BB over the untouched tail of the panel. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$4, KK
#endif

	addl	$4 * SIZE, C1
	ALIGN_4
632
.L30:
/* ---- 2x2 tile (M & 2): scalar-pair (movsd) loads of A; xmm4/xmm5
   accumulate columns j / j+1, xmm6/xmm7 folded in at .L38. */
	movl	M,  I
	testl	$2, I
	jle	.L40

.L31:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	32 * SIZE + BUFFER, BB
#else
/* TRMM: skip the first KK k-steps (2 floats of A per step here). */
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB /* because it's doubled */
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movsd	-24 * SIZE(AA), %xmm2
	pxor	%xmm6, %xmm6
	movsd	-16 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

/* Trip count: K for GEMM, clipped KKK for TRMM. */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax	/* MR = 2 */
#else
	addl	$2, %eax	/* NR = 2 */
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L35
	ALIGN_4

/* Main k-loop, unrolled 8 deep. */
.L32:
	mulps	%xmm0,  %xmm1
	mulps	-28 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm4
 	movsd	-24 * SIZE(BB), %xmm1
	addps	%xmm0, %xmm5
	movsd	-30 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm1
	mulps	-20 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm6
	movsd	  0 * SIZE(BB), %xmm1
	addps	%xmm0, %xmm7
	movsd	-28 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm3
	mulps	-12 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movsd	 -8 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movsd	-26 * SIZE(AA), %xmm0
	mulps	%xmm0,  %xmm3
	mulps	 -4 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movsd	 16 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movsd	-16 * SIZE(AA), %xmm0
	mulps	%xmm2,  %xmm1
	mulps	  4 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm4
	movsd	  8 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm5
	movsd	-22 * SIZE(AA), %xmm2
	mulps	%xmm2,  %xmm1
	mulps	 12 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm6
	movsd	 32 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm7
	movsd	-20 * SIZE(AA), %xmm2
	mulps	%xmm2,  %xmm3
	mulps	 20 * SIZE(BB), %xmm2
	addps	%xmm3, %xmm4
	movsd	 24 * SIZE(BB), %xmm3
	addps	%xmm2, %xmm5
	movsd	-18 * SIZE(AA), %xmm2
	mulps	%xmm2,  %xmm3
	mulps	 28 * SIZE(BB), %xmm2
	addps	%xmm3, %xmm6
	movsd	 48 * SIZE(BB), %xmm3
	addps	%xmm2, %xmm7
	movsd	 -8 * SIZE(AA), %xmm2

	subl   $-16 * SIZE, AA
	addl   $ 64 * SIZE, BB
	decl   %eax
	jne    .L32
	ALIGN_4

.L35:
	movsd	ALPHA,  %xmm3

/* K mod 8 remainder, one k-step per iteration. */
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je .L38
	ALIGN_4

.L36:
	mulps	%xmm0,  %xmm1
	mulps	-28 * SIZE(BB), %xmm0
	addps	%xmm1, %xmm4
	movsd	-24 * SIZE(BB), %xmm1
	addps	%xmm0, %xmm5
	movsd	-30 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L36
	ALIGN_4

.L38:
/* Fold, scale by alpha, update a 2x2 patch of C (64-bit stores). */
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

	mulps	%xmm3, %xmm4
	mulps	%xmm3, %xmm5

#ifndef TRMMKERNEL
	movsd	0 * SIZE(C1), %xmm0
	movsd	0 * SIZE(C1, LDC), %xmm1

	addps	%xmm0, %xmm4
	addps	%xmm1, %xmm5
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movsd	%xmm5, 0 * SIZE(C1, LDC)

/* TRMM: advance AA/BB over the untouched tail of the panel. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK
#endif

	addl	$2 * SIZE, C1
	ALIGN_4
795
.L40:
/* ---- 1x2 tile (M & 1): pure scalar (ss) arithmetic; xmm4/xmm5
   accumulate columns j / j+1, xmm6/xmm7 folded in at .L48. */
	movl	M,  I
	testl	$1, I
	jle	.L49

.L41:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	32 * SIZE + BUFFER, BB
#else
/* TRMM: skip the first KK k-steps (1 float of A per step here). */
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   4), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 8), BB /* because it's doubled */
#endif

	movss	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movss	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movss	-28 * SIZE(AA), %xmm2
	pxor	%xmm6, %xmm6
	movss	-16 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

/* Trip count: K for GEMM, clipped KKK for TRMM. */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax	/* MR = 1 */
#else
	addl	$2, %eax	/* NR = 2 */
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L45
	ALIGN_4

/* Main k-loop, unrolled 8 deep. */
.L42:
	mulss	%xmm0,  %xmm1
	mulss	-28 * SIZE(BB), %xmm0
	addss	%xmm1, %xmm4
 	movss	-24 * SIZE(BB), %xmm1
	addss	%xmm0, %xmm5
	movss	-31 * SIZE(AA), %xmm0
	mulss	%xmm0,  %xmm1
	mulss	-20 * SIZE(BB), %xmm0
	addss	%xmm1, %xmm6
	movss	  0 * SIZE(BB), %xmm1
	addss	%xmm0, %xmm7
	movss	-30 * SIZE(AA), %xmm0
	mulss	%xmm0,  %xmm3
	mulss	-12 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm4
	movss	 -8 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm5
	movss	-29 * SIZE(AA), %xmm0
	mulss	%xmm0,  %xmm3
	mulss	 -4 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm6
	movss	 16 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm7
	movss	-24 * SIZE(AA), %xmm0
	mulss	%xmm2,  %xmm1
	mulss	  4 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm4
	movss	  8 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm5
	movss	-27 * SIZE(AA), %xmm2
	mulss	%xmm2,  %xmm1
	mulss	 12 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm6
	movss	 32 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm7
	movss	-26 * SIZE(AA), %xmm2
	mulss	%xmm2,  %xmm3
	mulss	 20 * SIZE(BB), %xmm2
	addss	%xmm3, %xmm4
	movss	 24 * SIZE(BB), %xmm3
	addss	%xmm2, %xmm5
	movss	-25 * SIZE(AA), %xmm2
	mulss	%xmm2,  %xmm3
	mulss	 28 * SIZE(BB), %xmm2
	addss	%xmm3, %xmm6
	movss	 48 * SIZE(BB), %xmm3
	addss	%xmm2, %xmm7
	movss	-20 * SIZE(AA), %xmm2

	subl   $-8 * SIZE, AA
	addl   $64 * SIZE, BB
	decl   %eax
	jne    .L42
	ALIGN_4

.L45:
	movss	ALPHA,  %xmm3

/* K mod 8 remainder, one k-step per iteration. */
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je .L48
	ALIGN_4

.L46:
	mulss	%xmm0,  %xmm1
	mulss	-28 * SIZE(BB), %xmm0
	addss	%xmm1, %xmm4
	movss	-24 * SIZE(BB), %xmm1
	addss	%xmm0, %xmm5
	movss	-31 * SIZE(AA), %xmm0

	addl	$1 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L46
	ALIGN_4

.L48:
/* Fold, scale by alpha, update the two scalar C entries. */
	addss	%xmm6, %xmm4
	addss	%xmm7, %xmm5

	mulss	%xmm3, %xmm4
	mulss	%xmm3, %xmm5

#ifndef TRMMKERNEL
	movss	0 * SIZE(C1), %xmm0
	movss	0 * SIZE(C1, LDC), %xmm1

	addss	%xmm0, %xmm4
	addss	%xmm1, %xmm5
#endif

	movss	%xmm4, 0 * SIZE(C1)
	movss	%xmm5, 0 * SIZE(C1, LDC)

/* TRMM: advance AA/BB over the untouched tail of the panel. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    4), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif
	ALIGN_4

.L49:
/* End of this column pair: right-side TRMM consumes 2 columns;
   advance C by two columns and loop. */
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	leal	(, LDC, 2), %eax
	addl	%eax, C
	decl	J
	jg	.L01
	ALIGN_4
967
.L50:
/* ---- Remainder column (N odd) ------------------------------------- */
	movl	N, %eax
	testl	$1, %eax
	jle	.L999

	ALIGN_4

.L51:
/* Repack the single remaining column of B, broadcasting each scalar
   to 4 lanes; main loop handles 8 k-steps per iteration. */
	leal	32 * SIZE + BUFFER, BB

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	K, %eax
	sarl	$3, %eax
	jle	.L55
	ALIGN_4

.L52:
	movss	-32 * SIZE(B), %xmm0
	movss	-31 * SIZE(B), %xmm1
	movss	-30 * SIZE(B), %xmm2
	movss	-29 * SIZE(B), %xmm3
	movss	-28 * SIZE(B), %xmm4
	movss	-27 * SIZE(B), %xmm5
	movss	-26 * SIZE(B), %xmm6
	movss	-25 * SIZE(B), %xmm7

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3
	shufps	$0, %xmm4, %xmm4
	shufps	$0, %xmm5, %xmm5
	shufps	$0, %xmm6, %xmm6
	shufps	$0, %xmm7, %xmm7

	movaps	%xmm0,  -32 * SIZE(BB)
	movaps	%xmm1,  -28 * SIZE(BB)
	movaps	%xmm2,  -24 * SIZE(BB)
	movaps	%xmm3,  -20 * SIZE(BB)
	movaps	%xmm4,  -16 * SIZE(BB)
	movaps	%xmm5,  -12 * SIZE(BB)
	movaps	%xmm6,   -8 * SIZE(BB)
	movaps	%xmm7,   -4 * SIZE(BB)

	addl	$  8 * SIZE, B
	subl	$-32 * SIZE, BB
	decl	%eax
	jne	.L52
	ALIGN_4

.L55:
/* Tail: K mod 8 k-steps, one scalar each. */
	movl	K, %eax
	andl	$7, %eax
	BRANCH
	jle	.L60
	ALIGN_4

.L56:
	movss	-32 * SIZE(B), %xmm0
	shufps	$0, %xmm0, %xmm0
	movaps	%xmm0,  -32 * SIZE(BB)

	addl	$1 * SIZE, B
	addl	$4 * SIZE, BB
	decl	%eax
	jne	.L56
	ALIGN_4

.L60:
/* Row sweep for the last column: I = M/8; tails at .L70/.L80/.L90. */
	movl	C, C1
	movl	A, AA
	movl	M,  I
	sarl	$3, I
	jle	.L70
	ALIGN_4
1046
.L61:
/* ---- 8x1 tile: xmm4 accumulates rows 0-3, xmm6 rows 4-7; xmm5/xmm7
   are secondary accumulators folded in at .L68. */
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	32 * SIZE + BUFFER, BB
#else
/* TRMM: skip the first KK k-steps (8 floats of A, 4 packed of B). */
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 2), BB /* because it's doubled */
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movaps	-16 * SIZE(AA), %xmm2
	pxor	%xmm6, %xmm6
	movaps	-16 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

	prefetcht0	3 * SIZE(C1)

/* Trip count: K for GEMM, clipped KKK for TRMM. */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$8, %eax	/* MR = 8 */
#else
	addl	$1, %eax	/* NR = 1 */
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L65
	ALIGN_4

/* Main k-loop, unrolled 8 deep. */
.L62:
	mulps	%xmm1, %xmm0
	mulps	-28 * SIZE(AA), %xmm1
	addps	%xmm0, %xmm4
	movaps	-24 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm6
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm0
	mulps	-20 * SIZE(AA), %xmm1
	addps	%xmm0, %xmm5
	movaps	  0 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm7
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm2
	mulps	-12 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm4
	movaps	 -8 * SIZE(AA), %xmm2
	addps	%xmm1, %xmm6
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm1, %xmm2
	mulps	 -4 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	 16 * SIZE(AA), %xmm2
	addps	%xmm1, %xmm7
	movaps	  0 * SIZE(BB), %xmm1
	mulps	%xmm3, %xmm0
	mulps	  4 * SIZE(AA), %xmm3
	addps	%xmm0, %xmm4
	movaps	  8 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm6
	movaps	-12 * SIZE(BB), %xmm3
	mulps	%xmm3, %xmm0
	mulps	 12 * SIZE(AA), %xmm3
	addps	%xmm0, %xmm5
	movaps	 32 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm7
	movaps	 -8 * SIZE(BB), %xmm3
	mulps	%xmm3, %xmm2
	mulps	 20 * SIZE(AA), %xmm3
	addps	%xmm2, %xmm4
	movaps	 24 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm6
	movaps	 -4 * SIZE(BB), %xmm3
	mulps	%xmm3, %xmm2
	mulps	 28 * SIZE(AA), %xmm3
	addps	%xmm2, %xmm5
	movaps	 48 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm7
	movaps	 16 * SIZE(BB), %xmm3

	addl   $ 64 * SIZE, AA
	subl   $-32 * SIZE, BB
	decl   %eax
	jne    .L62
	ALIGN_4

.L65:
	movaps	ALPHA,  %xmm3

/* K mod 8 remainder, one k-step per iteration. */
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je .L68
	ALIGN_4

.L66:
	mulps	%xmm1, %xmm0
	mulps	-28 * SIZE(AA), %xmm1
	addps	%xmm0, %xmm4
	movaps	-24 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm6
	movaps	-28 * SIZE(BB), %xmm1

	addl	$8 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L66
	ALIGN_4

.L68:
/* Fold, scale by alpha, update 8 entries of the single C column. */
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6

	mulps	%xmm3, %xmm4
	mulps	%xmm3, %xmm6

#ifndef TRMMKERNEL
	movsd	0 * SIZE(C1), %xmm0
	movhps	2 * SIZE(C1), %xmm0
	movsd	4 * SIZE(C1), %xmm2
	movhps	6 * SIZE(C1), %xmm2

	addps	%xmm0, %xmm4
	addps	%xmm2, %xmm6
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movhps	%xmm4, 2 * SIZE(C1)
	movsd	%xmm6, 4 * SIZE(C1)
	movhps	%xmm6, 6 * SIZE(C1)

/* TRMM: advance AA/BB over the untouched tail of the panel. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    8), %eax
	leal	(AA, %eax, 4), AA
	leal	(BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$8, KK
#endif

	addl	$8 * SIZE, C1
	decl	I
	jg	.L61
	ALIGN_4
1212
.L70:
/* ---- 4x1 tile (M & 4): xmm4 is the accumulator, xmm5 secondary,
   folded at .L78. */
	movl	M,  I
	testl	$4, I
	jle	.L80

.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	32 * SIZE + BUFFER, BB
#else
/* TRMM: skip the first KK k-steps (4 floats of A per step here). */
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB /* because it's doubled */
#endif

	movaps	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movaps	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movaps	-16 * SIZE(AA), %xmm2
	movaps	-16 * SIZE(BB), %xmm3

/* Trip count: K for GEMM, clipped KKK for TRMM. */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$4, %eax	/* MR = 4 */
#else
	addl	$1, %eax	/* NR = 1 */
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L75
	ALIGN_4

/* Main k-loop, unrolled 8 deep. */
.L72:
	mulps	%xmm0,  %xmm1
	movaps	-28 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1
	mulps	%xmm0,  %xmm1
	movaps	-24 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm5
	movaps	-24 * SIZE(BB), %xmm1
	mulps	%xmm0,  %xmm1
	movaps	-20 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movaps	-20 * SIZE(BB), %xmm1
	mulps	%xmm0,  %xmm1
	movaps	  0 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm5
	movaps	  0 * SIZE(BB), %xmm1
	mulps	%xmm2,  %xmm3
	movaps	-12 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm4
	movaps	-12 * SIZE(BB), %xmm3
	mulps	%xmm2,  %xmm3
	movaps	 -8 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm5
	movaps	 -8 * SIZE(BB), %xmm3
	mulps	%xmm2,  %xmm3
	movaps	 -4 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm4
	movaps	 -4 * SIZE(BB), %xmm3
	mulps	%xmm2,  %xmm3
	movaps	 16 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm5
	movaps	 16 * SIZE(BB), %xmm3

	subl   $-32 * SIZE, AA
	subl   $-32 * SIZE, BB
	decl   %eax
	jne    .L72
	ALIGN_4

.L75:
	movaps	ALPHA,  %xmm3

/* K mod 8 remainder, one k-step per iteration. */
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax
	BRANCH
	je .L78
	ALIGN_4

.L76:
	mulps	%xmm0,  %xmm1
	movaps	-28 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movaps	-28 * SIZE(BB), %xmm1

	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4

.L78:
/* Fold, scale by alpha, update 4 entries of the C column. */
	addps	%xmm5, %xmm4
	mulps	%xmm3, %xmm4

#ifndef TRMMKERNEL
	movsd	0 * SIZE(C1), %xmm0
	movhps	2 * SIZE(C1), %xmm0

	addps	%xmm0, %xmm4
#endif

	movsd	%xmm4, 0 * SIZE(C1)
	movhps	%xmm4, 2 * SIZE(C1)

/* TRMM: advance AA/BB over the untouched tail of the panel. */
#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    8), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$4, KK
#endif

	addl	$4 * SIZE, C1
	ALIGN_4
1351
.L80:
/* ---- 2x1 tile (M & 2): scalar-pair (movsd) loads; xmm4 accumulates,
   xmm5 secondary.  The K-remainder and write-back follow at .L85. */
	movl	M,  I
	testl	$2, I
	jle	.L90

.L81:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	32 * SIZE + BUFFER, BB
#else
/* TRMM: skip the first KK k-steps (2 floats of A per step here). */
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB /* because it's doubled */
#endif

	movsd	-32 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movsd	-32 * SIZE(BB), %xmm1
	pxor	%xmm5, %xmm5
	movsd	-16 * SIZE(BB), %xmm3
	movsd	-24 * SIZE(AA), %xmm2

/* Trip count: K for GEMM, clipped KKK for TRMM. */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax	/* MR = 2 */
#else
	addl	$1, %eax	/* NR = 1 */
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L85
	ALIGN_4

/* Main k-loop, unrolled 8 deep. */
.L82:
	mulps	%xmm0,  %xmm1
	movsd	-30 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-28 * SIZE(BB), %xmm1
	mulps	%xmm0,  %xmm1
	movsd	-28 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm5
	movsd	-24 * SIZE(BB), %xmm1
	mulps	%xmm0,  %xmm1
	movsd	-26 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-20 * SIZE(BB), %xmm1
	mulps	%xmm0,  %xmm1
	movsd	-16 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm5
	movsd	 -0 * SIZE(BB), %xmm1
	mulps	%xmm2,  %xmm3
	movsd	-22 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm4
	movsd	-12 * SIZE(BB), %xmm3
	mulps	%xmm2,  %xmm3
	movsd	-20 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm5
	movsd	 -8 * SIZE(BB), %xmm3
	mulps	%xmm2,  %xmm3
	movsd	-18 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm4
	movsd	 -4 * SIZE(BB), %xmm3
	mulps	%xmm2,  %xmm3
	movsd	 -8 * SIZE(AA), %xmm2
	addps	%xmm3, %xmm5
	movsd	 16 * SIZE(BB), %xmm3

	subl   $-16 * SIZE, AA
	subl   $-32 * SIZE, BB
	decl   %eax
	jne    .L82
	ALIGN_4
1435
/* K tail, alpha scaling and write-back for the 2-row x 1-column block. */
.L85:
	movsd	ALPHA,  %xmm3	/* xmm3 = alpha */

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax	/* TRMM: effective K for this block */
#endif
	andl	$7, %eax	/* steps left over from the 8-way unroll */
	BRANCH
	je .L88
	ALIGN_4

/* one K step per pass: 2 A values times the buffered B value */
.L86:
	mulps	%xmm0,  %xmm1
	movsd	-30 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm4
	movsd	-28 * SIZE(BB), %xmm1

	addl	$2 * SIZE, AA	/* A: 2 values per K step */
	addl	$4 * SIZE, BB	/* B buffer: 4 slots per K step */
	decl	%eax
	jg	.L86
	ALIGN_4

.L88:
	addps	%xmm5, %xmm4	/* fold the two accumulators */
	mulps	%xmm3, %xmm4	/* scale by alpha */

#ifndef TRMMKERNEL
	movsd	0 * SIZE(C1), %xmm0	/* GEMM: C += alpha * AB */
	addps	%xmm0, %xmm4
#endif
	movsd	%xmm4, 0 * SIZE(C1)	/* store the 2 results */

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* TRMM: advance AA/BB past the K - KKK steps this block skipped */
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax,    8), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK	/* advance the diagonal offset by the M unroll (2) */
#endif
	addl	$2 * SIZE, C1	/* move C past the 2 stored elements */
	ALIGN_4
1485
/* Dispatch: if M is odd, one final row remains for this column. */
.L90:
	movl	M,  I
	testl	$1, I
	jle	.L99	/* M fully processed: advance to the next column */
1490
/* 1-row x 1-column block: scalar (movss/mulss) setup and main K loop. */
.L91:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	leal	32 * SIZE + BUFFER, BB	/* expanded B panel lives in BUFFER */
#else
	/* TRMM offset start: eax = KK * SIZE; A uses 1 value per K step,
	   the B buffer 4 slots per step. */
	leal	32 * SIZE + BUFFER, BB
	movl	KK, %eax
	leal	(, %eax,   4), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB /* because it's doubled */
#endif

	movss	-32 * SIZE(AA), %xmm0	/* first A value */
	pxor	%xmm4, %xmm4	/* accumulator 1 */
	movss	-32 * SIZE(BB), %xmm1	/* first B value */
	pxor	%xmm5, %xmm5	/* accumulator 2 */
	movss	-16 * SIZE(BB), %xmm3	/* B for the second half of the unroll */
	movss	-28 * SIZE(AA), %xmm2	/* A for the second half of the unroll */

#ifndef TRMMKERNEL
	movl	K, %eax	/* plain GEMM: full K */
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax	/* TRMM: only K - KK steps contribute */
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax	/* KK + M unroll (1) */
#else
	addl	$1, %eax	/* KK + N unroll (1); both branches add 1 here */
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax	/* number of 8-way unrolled passes */
	je	.L95
	ALIGN_4

/* 8 scalar K steps per pass, alternating accumulation into xmm4/xmm5 */
.L92:
	mulss	%xmm0,  %xmm1
	movss	-31 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm4
	movss	-28 * SIZE(BB), %xmm1
	mulss	%xmm0,  %xmm1
	movss	-30 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm5
	movss	-24 * SIZE(BB), %xmm1
	mulss	%xmm0,  %xmm1
	movss	-29 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm4
	movss	-20 * SIZE(BB), %xmm1
	mulss	%xmm0,  %xmm1
	movss	-24 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm5
	movss	 -0 * SIZE(BB), %xmm1
	mulss	%xmm2,  %xmm3
	movss	-27 * SIZE(AA), %xmm2
	addss	%xmm3, %xmm4
	movss	-12 * SIZE(BB), %xmm3
	mulss	%xmm2,  %xmm3
	movss	-26 * SIZE(AA), %xmm2
	addss	%xmm3, %xmm5
	movss	 -8 * SIZE(BB), %xmm3
	mulss	%xmm2,  %xmm3
	movss	-25 * SIZE(AA), %xmm2
	addss	%xmm3, %xmm4
	movss	 -4 * SIZE(BB), %xmm3
	mulss	%xmm2,  %xmm3
	movss	-20 * SIZE(AA), %xmm2
	addss	%xmm3, %xmm5
	movss	 16 * SIZE(BB), %xmm3	/* preload for the next pass */

	subl   $ -8 * SIZE, AA	/* AA += 8*SIZE (8 steps x 1 value) */
	subl   $-32 * SIZE, BB	/* BB += 32*SIZE (8 steps x 4 buffer slots) */
	decl   %eax
	jne    .L92
	ALIGN_4
1569
/* K tail, alpha scaling and store of the single remaining C element. */
.L95:
	movss	ALPHA,  %xmm3	/* xmm3 = alpha */

#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax	/* TRMM: effective K for this block */
#endif
	andl	$7, %eax	/* steps left over from the 8-way unroll */
	BRANCH
	je .L98
	ALIGN_4

/* one scalar K step per pass */
.L96:
	mulss	%xmm0,  %xmm1
	movss	-31 * SIZE(AA), %xmm0
	addss	%xmm1, %xmm4
	movss	-28 * SIZE(BB), %xmm1

	addl	$1 * SIZE, AA	/* A: 1 value per K step */
	addl	$4 * SIZE, BB	/* B buffer: 4 slots per K step */
	decl	%eax
	jg	.L96
	ALIGN_4

.L98:
	addss	%xmm5, %xmm4	/* fold the two accumulators */
	mulss	%xmm3, %xmm4	/* scale by alpha */

#ifndef TRMMKERNEL
	movss	0 * SIZE(C1), %xmm0	/* GEMM: C += alpha * AB */
	addss	%xmm0, %xmm4
#endif
	movss	%xmm4, 0 * SIZE(C1)	/* store the last element; no pointer
					   fix-up follows (last M block) */
	ALIGN_4
1605
/* Column done: advance the C pointer one column and fall into the exit. */
.L99:
	addl	LDC, C	/* C += ldc (next output column) */
	ALIGN_4


/* Function epilogue: undo the prologue and return. */
.L999:
	movl	OLD_STACK, %esp	/* restore the caller's stack pointer */

	EMMS	/* clear MMX state before returning to FP code */

	popl	%ebx	/* restore i386 callee-saved registers */
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
1623