1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
42#define STACK	16
43#define ARGS	 0
44
45#define OLD_M	 4 + STACK + ARGS(%esi)
46#define OLD_N	 8 + STACK + ARGS(%esi)
47#define OLD_K	12 + STACK + ARGS(%esi)
48#define OLD_ALPHA_R	16 + STACK + ARGS(%esi)
49#define OLD_ALPHA_I	20 + STACK + ARGS(%esi)
50#define OLD_A	24 + STACK + ARGS(%esi)
51#define OLD_B	28 + STACK + ARGS(%esi)
52#define OLD_C	32 + STACK + ARGS(%esi)
53#define OLD_LDC	36 + STACK + ARGS(%esi)
54#define OLD_OFFSET 40 + STACK + ARGS(%esi)
55
56#define GAMMA_R  0(%esp)
57#define GAMMA_I  8(%esp)
58#define ALPHA	16(%esp)
59#define K	24(%esp)
60#define N	28(%esp)
61#define M	32(%esp)
62#define A	36(%esp)
63#define C	40(%esp)
64#define J	44(%esp)
65#define OLD_STACK 48(%esp)
66#define OFFSET	52(%esp)
67#define KK	56(%esp)
68#define KKK	60(%esp)
69#define BUFFER 128(%esp)
70
71#define AA	%edx
72#define BB	%ecx
73
74#define PREFETCHSIZE (16 * 2 + 6)
75
76#define AOFFSET -32
77#define BOFFSET 128
78
79/*
80
81  A hint of scheduling is received from following URL
82
83https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11
84
85*/
86
87	PROLOGUE
88
89	pushl	%ebp
90	pushl	%edi
91	pushl	%esi
92	pushl	%ebx
93
94	PROFCODE
95
96	movl	%esp, %esi	# save old stack
97	subl	$128 + LOCAL_BUFFER_SIZE, %esp
98	movl	OLD_M, %ebx
99	andl	$-1024, %esp	# align stack
100
101	STACK_TOUCHING
102
103	movl	OLD_N, %eax
104	movl	OLD_K, %ecx
105	movl	OLD_A, %edx
106
107	movl	%ebx, M
108	movl	%eax, N
109	movl	%ecx, K
110	subl	$AOFFSET * SIZE, %edx
111	movl	%edx, A
112	movl	%esi, OLD_STACK
113
114	testl	%ebx, %ebx
115	jle	.L999
116
117	movl	OLD_B, %edi
118	movl	OLD_C, %ebx
119
120	EMMS
121
122	movd	OLD_ALPHA_R, %mm0
123	movd	OLD_ALPHA_I, %mm1
124
125	movd	%mm0, 0 + ALPHA
126	movd	%mm1, 4 + ALPHA
127
128#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
129	movl	 $0x3f800000,  0 + GAMMA_R
130	movl	 $0x3f800000,  4 + GAMMA_R
131	movl	 $0xbf800000,  0 + GAMMA_I
132	movl	 $0x3f800000,  4 + GAMMA_I
133#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
134	movl	 $0x3f800000,  0 + GAMMA_R
135	movl	 $0x3f800000,  4 + GAMMA_R
136	movl	 $0x3f800000,  0 + GAMMA_I
137	movl	 $0xbf800000,  4 + GAMMA_I
138#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
139	movl	 $0x3f800000,  0 + GAMMA_R
140	movl	 $0xbF800000,  4 + GAMMA_R
141	movl	 $0x3f800000,  0 + GAMMA_I
142	movl	 $0x3F800000,  4 + GAMMA_I
143#else
144	movl	 $0x3f800000,  0 + GAMMA_R
145	movl	 $0xbf800000,  4 + GAMMA_R
146	movl	 $0xbf800000,  0 + GAMMA_I
147	movl	 $0xbf800000,  4 + GAMMA_I
148#endif
149	movl	%ebx, C
150	movl	OLD_LDC, %ebp
151	leal	(, %ebp, SIZE * 2), %ebp
152
153#ifdef TRMMKERNEL
154	movl	OLD_OFFSET, %eax
155	movl	%eax, OFFSET
156#ifndef LEFT
157	negl	%eax
158	movl	%eax, KK
159#endif
160#endif
161
162	movl	N, %eax
163	sarl	$1, %eax
164	movl	%eax, J			# j = n
165	jle	.L20
166	ALIGN_4
167
168.L01:
169/* Copying to Sub Buffer */
170	leal	BUFFER, BB
171
172#if defined(TRMMKERNEL) && defined(LEFT)
173	movl	OFFSET, %eax
174	movl	%eax, KK
175#endif
176
177	movl	K,  %eax
178	sarl	$2, %eax
179	jle	.L03
180	ALIGN_4
181
182.L02:
183	movd	 0 * SIZE(%edi), %mm0
184	movd	 1 * SIZE(%edi), %mm1
185	movd	 2 * SIZE(%edi), %mm2
186	movd	 3 * SIZE(%edi), %mm3
187	movd	 4 * SIZE(%edi), %mm4
188	movd	 5 * SIZE(%edi), %mm5
189	movd	 6 * SIZE(%edi), %mm6
190	movd	 7 * SIZE(%edi), %mm7
191
192	prefetchnta	72 * SIZE(%edi)
193
194	punpckldq %mm0, %mm0
195	punpckldq %mm1, %mm1
196	punpckldq %mm2, %mm2
197	punpckldq %mm3, %mm3
198	punpckldq %mm4, %mm4
199	punpckldq %mm5, %mm5
200	punpckldq %mm6, %mm6
201	punpckldq %mm7, %mm7
202
203	movq	%mm0,  0 * SIZE(BB)
204	movq	%mm1,  2 * SIZE(BB)
205	movq	%mm2,  4 * SIZE(BB)
206	movq	%mm3,  6 * SIZE(BB)
207	movq	%mm4,  8 * SIZE(BB)
208	movq	%mm5, 10 * SIZE(BB)
209	movq	%mm6, 12 * SIZE(BB)
210	movq	%mm7, 14 * SIZE(BB)
211
212	movd	 8 * SIZE(%edi), %mm0
213	movd	 9 * SIZE(%edi), %mm1
214	movd	10 * SIZE(%edi), %mm2
215	movd	11 * SIZE(%edi), %mm3
216	movd	12 * SIZE(%edi), %mm4
217	movd	13 * SIZE(%edi), %mm5
218	movd	14 * SIZE(%edi), %mm6
219	movd	15 * SIZE(%edi), %mm7
220
221	punpckldq %mm0, %mm0
222	punpckldq %mm1, %mm1
223	punpckldq %mm2, %mm2
224	punpckldq %mm3, %mm3
225	punpckldq %mm4, %mm4
226	punpckldq %mm5, %mm5
227	punpckldq %mm6, %mm6
228	punpckldq %mm7, %mm7
229
230	movq	%mm0, 16 * SIZE(BB)
231	movq	%mm1, 18 * SIZE(BB)
232	movq	%mm2, 20 * SIZE(BB)
233	movq	%mm3, 22 * SIZE(BB)
234	movq	%mm4, 24 * SIZE(BB)
235	movq	%mm5, 26 * SIZE(BB)
236	movq	%mm6, 28 * SIZE(BB)
237	movq	%mm7, 30 * SIZE(BB)
238
239	addl	$16 * SIZE, %edi
240	addl	$32 * SIZE, BB
241	decl	%eax
242	jne	.L02
243	ALIGN_4
244
245.L03:
246	movl	K, %eax
247	andl	$3, %eax
248	BRANCH
249	jle	.L10
250	ALIGN_4
251
252.L04:
253	movd	 0 * SIZE(%edi), %mm0
254	movd	 1 * SIZE(%edi), %mm1
255	movd	 2 * SIZE(%edi), %mm2
256	movd	 3 * SIZE(%edi), %mm3
257
258	punpckldq %mm0, %mm0
259	punpckldq %mm1, %mm1
260	punpckldq %mm2, %mm2
261	punpckldq %mm3, %mm3
262
263	movq	%mm0,  0 * SIZE(BB)
264	movq	%mm1,  2 * SIZE(BB)
265	movq	%mm2,  4 * SIZE(BB)
266	movq	%mm3,  6 * SIZE(BB)
267
268	addl	$4 * SIZE, %edi
269	addl	$8 * SIZE, BB
270	decl	%eax
271	jne	.L04
272	ALIGN_4
273
274.L10:
275	movl	C, %esi		# coffset = c
276	movl	A, AA		# aoffset = a
277	movl	M,  %ebx
278	ALIGN_4
279
280.L11:
281	leal	- BOFFSET * SIZE + BUFFER, BB
282
283#if !defined(TRMMKERNEL) || \
284	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
285	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
286#else
287	movl	KK,   %eax
288	leal	(, %eax, SIZE), %eax
289	leal	(AA, %eax, 2), AA
290	leal	(BB, %eax, 8), BB
291#endif
292
293	movq	        (  0 + AOFFSET) * SIZE(AA), %mm0
294	pxor	%mm4, %mm4
295	movq	        ( 16 + AOFFSET) * SIZE(AA), %mm1
296	pxor	%mm5, %mm5
297	PADDING movq	(  0 + BOFFSET) * SIZE(BB), %mm2
298	pxor	%mm6, %mm6
299	PADDING movq	( 16 + BOFFSET) * SIZE(BB), %mm3
300	pxor	%mm7, %mm7
301
302	prefetchw 2 * SIZE(%esi)
303	prefetchw 2 * SIZE(%esi, %ebp)
304
305#ifndef TRMMKERNEL
306	movl	K,  %eax
307#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
308	movl	K, %eax
309	subl	KK, %eax
310	movl	%eax, KKK
311#else
312	movl	KK, %eax
313#ifdef LEFT
314	addl	$1, %eax
315#else
316	addl	$2, %eax
317#endif
318	movl	%eax, KKK
319#endif
320	sarl	$4, %eax
321	je	.L15
322	ALIGN_4
323
324.L12:
325	pfmul	%mm0, %mm2
326	pfadd	%mm2, %mm4
327	PADDING movq	(  2 + BOFFSET) * SIZE(BB), %mm2
328
329	pfmul	%mm0, %mm2
330	pfadd	%mm2, %mm5
331	PADDING movq	(  4 + BOFFSET) * SIZE(BB), %mm2
332	pfmul	%mm0, %mm2
333	pfadd	%mm2, %mm6
334	PADDING prefetch	(PREFETCHSIZE +  0) * SIZE(AA)
335
336	PADDING movq	(  8 + BOFFSET) * SIZE(BB), %mm2
337	pfmul	        (  6 + BOFFSET) * SIZE(BB), %mm0
338	pfadd	%mm0, %mm7
339	movq	        (  2 + AOFFSET) * SIZE(AA), %mm0
340
341	pfmul	%mm0, %mm2
342	pfadd	%mm2, %mm4
343	PADDING movq	( 10 + BOFFSET) * SIZE(BB), %mm2
344	pfmul	%mm0, %mm2
345	pfadd	%mm2, %mm5
346	PADDING movq	( 12 + BOFFSET) * SIZE(BB), %mm2
347	pfmul	%mm0, %mm2
348	pfadd	%mm2, %mm6
349	PADDING movq	( 32 + BOFFSET) * SIZE(BB), %mm2
350	pfmul	        ( 14 + BOFFSET) * SIZE(BB), %mm0
351	pfadd	%mm0, %mm7
352	movq	        (  4 + AOFFSET) * SIZE(AA), %mm0
353
354	pfmul	%mm0, %mm3
355	pfadd	%mm3, %mm4
356	PADDING movq	( 18 + BOFFSET) * SIZE(BB), %mm3
357	pfmul	%mm0, %mm3
358	pfadd	%mm3, %mm5
359	PADDING movq	( 20 + BOFFSET) * SIZE(BB), %mm3
360	pfmul	%mm0, %mm3
361	pfadd	%mm3, %mm6
362	PADDING movq	( 24 + BOFFSET) * SIZE(BB), %mm3
363	pfmul	        ( 22 + BOFFSET) * SIZE(BB), %mm0
364	pfadd	%mm0, %mm7
365	movq	        (  6 + AOFFSET) * SIZE(AA), %mm0
366
367	pfmul	%mm0, %mm3
368	pfadd	%mm3, %mm4
369	PADDING movq	( 26 + BOFFSET) * SIZE(BB), %mm3
370	pfmul	%mm0, %mm3
371	pfadd	%mm3, %mm5
372	PADDING movq	( 28 + BOFFSET) * SIZE(BB), %mm3
373	pfmul	%mm0, %mm3
374	pfadd	%mm3, %mm6
375	PADDING movq	( 48 + BOFFSET) * SIZE(BB), %mm3
376	pfmul	        ( 30 + BOFFSET) * SIZE(BB), %mm0
377	pfadd	%mm0, %mm7
378	movq	        (  8 + AOFFSET) * SIZE(AA), %mm0
379
380	pfmul	%mm0, %mm2
381	pfadd	%mm2, %mm4
382	PADDING movq	( 34 + BOFFSET) * SIZE(BB), %mm2
383	pfmul	%mm0, %mm2
384	pfadd	%mm2, %mm5
385	PADDING movq	( 36 + BOFFSET) * SIZE(BB), %mm2
386	pfmul	%mm0, %mm2
387	pfadd	%mm2, %mm6
388	PADDING movq	( 40 + BOFFSET) * SIZE(BB), %mm2
389	pfmul	        ( 38 + BOFFSET) * SIZE(BB), %mm0
390	pfadd	%mm0, %mm7
391	movq	        ( 10 + AOFFSET) * SIZE(AA), %mm0
392
393	pfmul	%mm0, %mm2
394	pfadd	%mm2, %mm4
395	PADDING movq	( 42 + BOFFSET) * SIZE(BB), %mm2
396	pfmul	%mm0, %mm2
397	pfadd	%mm2, %mm5
398	PADDING movq	( 44 + BOFFSET) * SIZE(BB), %mm2
399	pfmul	%mm0, %mm2
400	pfadd	%mm2, %mm6
401	PADDING movq	( 64 + BOFFSET) * SIZE(BB), %mm2
402	pfmul	        ( 46 + BOFFSET) * SIZE(BB), %mm0
403	pfadd	%mm0, %mm7
404	movq	        ( 12 + AOFFSET) * SIZE(AA), %mm0
405
406	pfmul	%mm0, %mm3
407	pfadd	%mm3, %mm4
408	PADDING movq	( 50 + BOFFSET) * SIZE(BB), %mm3
409	pfmul	%mm0, %mm3
410	pfadd	%mm3, %mm5
411	PADDING movq	( 52 + BOFFSET) * SIZE(BB), %mm3
412	pfmul	%mm0, %mm3
413	pfadd	%mm3, %mm6
414	PADDING movq	( 56 + BOFFSET) * SIZE(BB), %mm3
415	pfmul	        ( 54 + BOFFSET) * SIZE(BB), %mm0
416	pfadd	%mm0, %mm7
417	movq	        ( 14 + AOFFSET) * SIZE(AA), %mm0
418
419	pfmul	%mm0, %mm3
420	pfadd	%mm3, %mm4
421	PADDING movq	( 58 + BOFFSET) * SIZE(BB), %mm3
422	pfmul	%mm0, %mm3
423	pfadd	%mm3, %mm5
424	PADDING movq	( 60 + BOFFSET) * SIZE(BB), %mm3
425	pfmul	%mm0, %mm3
426	pfadd	%mm3, %mm6
427	PADDING movq	( 80 + BOFFSET) * SIZE(BB), %mm3
428	pfmul	        ( 62 + BOFFSET) * SIZE(BB), %mm0
429	pfadd	%mm0, %mm7
430	movq	        ( 32 + AOFFSET) * SIZE(AA), %mm0
431
432	pfmul	%mm1, %mm2
433	pfadd	%mm2, %mm4
434	PADDING movq	( 66 + BOFFSET) * SIZE(BB), %mm2
435	pfmul	%mm1, %mm2
436	pfadd	%mm2, %mm5
437	PADDING movq	( 68 + BOFFSET) * SIZE(BB), %mm2
438	pfmul	%mm1, %mm2
439	pfadd	%mm2, %mm6
440	PADDING movq	( 72 + BOFFSET) * SIZE(BB), %mm2
441	pfmul	        ( 70 + BOFFSET) * SIZE(BB), %mm1
442	pfadd	%mm1, %mm7
443	movq	        ( 18 + AOFFSET) * SIZE(AA), %mm1
444
445	pfmul	%mm1, %mm2
446	pfadd	%mm2, %mm4
447	PADDING movq	( 74 + BOFFSET) * SIZE(BB), %mm2
448	pfmul	%mm1, %mm2
449	pfadd	%mm2, %mm5
450	PADDING movq	( 76 + BOFFSET) * SIZE(BB), %mm2
451	pfmul	%mm1, %mm2
452	pfadd	%mm2, %mm6
453	PADDING movq	( 96 + BOFFSET) * SIZE(BB), %mm2
454	pfmul	        ( 78 + BOFFSET) * SIZE(BB), %mm1
455	pfadd	%mm1, %mm7
456	movq	        ( 20 + AOFFSET) * SIZE(AA), %mm1
457
458	pfmul	%mm1, %mm3
459	pfadd	%mm3, %mm4
460	PADDING movq	( 82 + BOFFSET) * SIZE(BB), %mm3
461	pfmul	%mm1, %mm3
462	pfadd	%mm3, %mm5
463	PADDING movq	( 84 + BOFFSET) * SIZE(BB), %mm3
464	pfmul	%mm1, %mm3
465	pfadd	%mm3, %mm6
466	PADDING movq	( 88 + BOFFSET) * SIZE(BB), %mm3
467	pfmul	        ( 86 + BOFFSET) * SIZE(BB), %mm1
468	pfadd	%mm1, %mm7
469	movq	        ( 22 + AOFFSET) * SIZE(AA), %mm1
470
471	pfmul	%mm1, %mm3
472	pfadd	%mm3, %mm4
473	PADDING movq	( 90 + BOFFSET) * SIZE(BB), %mm3
474	pfmul	%mm1, %mm3
475	pfadd	%mm3, %mm5
476	PADDING movq	( 92 + BOFFSET) * SIZE(BB), %mm3
477	pfmul	%mm1, %mm3
478	pfadd	%mm3, %mm6
479	PADDING movq	(112 + BOFFSET) * SIZE(BB), %mm3
480	pfmul	        ( 94 + BOFFSET) * SIZE(BB), %mm1
481	pfadd	%mm1, %mm7
482	movq	        ( 24 + AOFFSET) * SIZE(AA), %mm1
483
484	pfmul	%mm1, %mm2
485	pfadd	%mm2, %mm4
486	PADDING movq	( 98 + BOFFSET) * SIZE(BB), %mm2
487	pfmul	%mm1, %mm2
488	pfadd	%mm2, %mm5
489	PADDING movq	(100 + BOFFSET) * SIZE(BB), %mm2
490	pfmul	%mm1, %mm2
491	pfadd	%mm2, %mm6
492	PADDING movq	(104 + BOFFSET) * SIZE(BB), %mm2
493	pfmul	        (102 + BOFFSET) * SIZE(BB), %mm1
494	pfadd	%mm1, %mm7
495	movq	        ( 26 + AOFFSET) * SIZE(AA), %mm1
496
497	pfmul	%mm1, %mm2
498	pfadd	%mm2, %mm4
499	PADDING movq	(106 + BOFFSET) * SIZE(BB), %mm2
500	pfmul	%mm1, %mm2
501	pfadd	%mm2, %mm5
502	PADDING movq	(108 + BOFFSET) * SIZE(BB), %mm2
503	pfmul	%mm1, %mm2
504	pfadd	%mm2, %mm6
505	PADDING movq	(128 + BOFFSET) * SIZE(BB), %mm2
506	pfmul	        (110 + BOFFSET) * SIZE(BB), %mm1
507	pfadd	%mm1, %mm7
508	movq	        ( 28 + AOFFSET) * SIZE(AA), %mm1
509
510	pfmul	%mm1, %mm3
511	pfadd	%mm3, %mm4
512	PADDING movq	(114 + BOFFSET) * SIZE(BB), %mm3
513	pfmul	%mm1, %mm3
514	pfadd	%mm3, %mm5
515	PADDING movq	(116 + BOFFSET) * SIZE(BB), %mm3
516	pfmul	%mm1, %mm3
517	pfadd	%mm3, %mm6
518	PADDING movq	(120 + BOFFSET) * SIZE(BB), %mm3
519	pfmul	        (118 + BOFFSET) * SIZE(BB), %mm1
520	pfadd	%mm1, %mm7
521	movq	        ( 30 + AOFFSET) * SIZE(AA), %mm1
522
523	pfmul	%mm1, %mm3
524	pfadd	%mm3, %mm4
525	PADDING movq	(122 + BOFFSET) * SIZE(BB), %mm3
526	pfmul	%mm1, %mm3
527	pfadd	%mm3, %mm5
528	PADDING movq	(124 + BOFFSET) * SIZE(BB), %mm3
529	pfmul	%mm1, %mm3
530	pfadd	%mm3, %mm6
531	PADDING movq	(144 + BOFFSET) * SIZE(BB), %mm3
532	pfmul	        (126 + BOFFSET) * SIZE(BB), %mm1
533	pfadd	%mm1, %mm7
534	movq	        ( 48 + AOFFSET) * SIZE(AA), %mm1
535
536	subl	$-32 * SIZE, AA
537	addl	$128 * SIZE, BB
538	decl	%eax
539	jne    .L12
540	ALIGN_3
541
542.L15:
543#ifndef TRMMKERNEL
544	movl	K, %eax
545#else
546	movl	KKK, %eax
547#endif
548	andl	$15, %eax		# if (k & 1)
549	BRANCH
550	je .L18
551	ALIGN_3
552
553.L16:
554	pfmul	%mm0, %mm2
555	pfadd	%mm2, %mm4
556	PADDING movq	(  2 + BOFFSET) * SIZE(BB), %mm2
557	pfmul	%mm0, %mm2
558	pfadd	%mm2, %mm5
559	PADDING movq	(  4 + BOFFSET) * SIZE(BB), %mm2
560	pfmul	%mm0, %mm2
561	pfadd	%mm2, %mm6
562	PADDING movq	(  8 + BOFFSET) * SIZE(BB), %mm2
563	pfmul	        (  6 + BOFFSET) * SIZE(BB), %mm0
564	pfadd	%mm0, %mm7
565	movq	        (  2 + AOFFSET) * SIZE(AA), %mm0
566
567	addl	$2 * SIZE, AA
568	addl	$8 * SIZE, BB
569	decl	%eax
570	jg	.L16
571	ALIGN_4
572
573.L18:
574	movq	GAMMA_R, %mm0
575	movq	GAMMA_I, %mm1
576	movq	ALPHA,   %mm2
577
578	pswapd	%mm5, %mm5
579	pswapd	%mm7, %mm7
580
581	pfmul	%mm0, %mm4
582	pfmul	%mm1, %mm5
583	pfmul	%mm0, %mm6
584	pfmul	%mm1, %mm7
585
586	pfadd	%mm5, %mm4
587	pfadd	%mm7, %mm6
588
589	pswapd	%mm4, %mm5
590	pswapd	%mm6, %mm7
591	pfmul	%mm2, %mm4
592	pfmul	%mm2, %mm6
593	pfmul	%mm2, %mm5
594	pfmul	%mm2, %mm7
595
596	pfpnacc	%mm5, %mm4
597	pfpnacc	%mm7, %mm6
598
599#ifndef TRMMKERNEL
600	pfadd	(%esi), %mm4
601	pfadd	(%esi, %ebp), %mm6
602#endif
603	movq	%mm4, (%esi)
604	movq	%mm6, (%esi, %ebp)
605
606#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
607    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
608	movl	K, %eax
609	subl	KKK, %eax
610	leal	(,%eax, SIZE), %eax
611	leal	(AA, %eax, 2), AA
612	leal	(BB, %eax, 8), BB
613#endif
614
615#if defined(TRMMKERNEL) && defined(LEFT)
616	addl	$1, KK
617#endif
618
619	addl	$2 * SIZE, %esi
620	decl	%ebx
621	jg	.L11
622	ALIGN_4
623
624.L19:
625#if defined(TRMMKERNEL) && !defined(LEFT)
626	addl	$2, KK
627#endif
628
629	leal	(, %ebp, 2), %eax
630	addl	%eax, C			# c += ldc
631	decl	J			# j --
632	jg	.L01
633	ALIGN_4
634
635.L20:
636	movl	N, %eax
637	andl	$1, %eax
638	jle	.L999
639	ALIGN_4
640
641.L21:
642/* Copying to Sub Buffer */
643	movl	K, %eax
644	leal	BUFFER, BB
645	sarl	$2, %eax
646	jle	.L25
647	ALIGN_4
648
649.L22:
650	movd	 0 * SIZE(%edi), %mm0
651	movd	 1 * SIZE(%edi), %mm1
652	movd	 2 * SIZE(%edi), %mm2
653	movd	 3 * SIZE(%edi), %mm3
654	movd	 4 * SIZE(%edi), %mm4
655	movd	 5 * SIZE(%edi), %mm5
656	movd	 6 * SIZE(%edi), %mm6
657	movd	 7 * SIZE(%edi), %mm7
658
659	prefetchnta	72 * SIZE(%edi)
660
661	punpckldq %mm0, %mm0
662	punpckldq %mm1, %mm1
663	punpckldq %mm2, %mm2
664	punpckldq %mm3, %mm3
665	punpckldq %mm4, %mm4
666	punpckldq %mm5, %mm5
667	punpckldq %mm6, %mm6
668	punpckldq %mm7, %mm7
669
670	movq	%mm0,  0 * SIZE(BB)
671	movq	%mm1,  2 * SIZE(BB)
672	movq	%mm2,  4 * SIZE(BB)
673	movq	%mm3,  6 * SIZE(BB)
674	movq	%mm4,  8 * SIZE(BB)
675	movq	%mm5, 10 * SIZE(BB)
676	movq	%mm6, 12 * SIZE(BB)
677	movq	%mm7, 14 * SIZE(BB)
678
679	addl	$ 8 * SIZE, %edi
680	addl	$16 * SIZE, BB
681	decl	%eax
682	jne	.L22
683	ALIGN_4
684
685.L25:
686	movl	K, %eax
687	andl	$3, %eax
688	BRANCH
689	jle	.L30
690	ALIGN_4
691
692.L26:
693	movd	 0 * SIZE(%edi), %mm0
694	movd	 1 * SIZE(%edi), %mm1
695
696	movd	%mm0,  0 * SIZE(BB)
697	movd	%mm0,  1 * SIZE(BB)
698	movd	%mm1,  2 * SIZE(BB)
699	movd	%mm1,  3 * SIZE(BB)
700
701	addl	$2 * SIZE, %edi
702	addl	$4 * SIZE, BB
703	decl	%eax
704	jne	.L26
705	ALIGN_4
706
707.L30:
708#if defined(TRMMKERNEL) && defined(LEFT)
709	movl	OFFSET, %eax
710	movl	%eax, KK
711#endif
712
713	movl	C, %esi		# coffset = c
714	movl	A, AA		# aoffset = a
715	movl	M,  %ebx
716	ALIGN_3
717
718.L31:
719	leal	- BOFFSET * SIZE + BUFFER, BB
720
721#if !defined(TRMMKERNEL) || \
722	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
723	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
724#else
725	movl	KK,   %eax
726	leal	(, %eax, SIZE), %eax
727	leal	(AA, %eax, 2), AA
728	leal	(BB, %eax, 4), BB
729#endif
730
731	movq	        (  0 + AOFFSET) * SIZE(AA), %mm0
732	pxor	%mm4, %mm4
733	movq	        ( 16 + AOFFSET) * SIZE(AA), %mm1
734	pxor	%mm5, %mm5
735	PADDING movq	(  0 + BOFFSET) * SIZE(BB), %mm2
736	pxor	%mm6, %mm6
737	PADDING movq	( 16 + BOFFSET) * SIZE(BB), %mm3
738	pxor	%mm7, %mm7
739
740	prefetchw 2 * SIZE(%esi)
741
742#ifndef TRMMKERNEL
743	movl	K,  %eax
744#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
745	movl	K, %eax
746	subl	KK, %eax
747	movl	%eax, KKK
748#else
749	movl	KK, %eax
750#ifdef LEFT
751	addl	$1, %eax
752#else
753	addl	$1, %eax
754#endif
755	movl	%eax, KKK
756#endif
757	sarl	$4, %eax
758	je	.L35
759	ALIGN_4
760
761.L32:
762	pfmul	%mm0, %mm2
763	pfadd	%mm2, %mm4
764	PADDING movq	(  4 + BOFFSET) * SIZE(BB), %mm2
765	pfmul	(  2 + BOFFSET) * SIZE(BB), %mm0
766	pfadd	%mm0, %mm5
767	movq	        (  2 + AOFFSET) * SIZE(AA), %mm0
768
769	PADDING prefetch	(PREFETCHSIZE +  0) * SIZE(AA)
770
771	pfmul	%mm0, %mm2
772	pfadd	%mm2, %mm6
773	PADDING movq	(  8 + BOFFSET) * SIZE(BB), %mm2
774	pfmul	(  6 + BOFFSET) * SIZE(BB), %mm0
775	pfadd	%mm0, %mm7
776	movq	        (  4 + AOFFSET) * SIZE(AA), %mm0
777
778	pfmul	%mm0, %mm2
779	pfadd	%mm2, %mm4
780	PADDING movq	( 12 + BOFFSET) * SIZE(BB), %mm2
781	pfmul	( 10 + BOFFSET) * SIZE(BB), %mm0
782	pfadd	%mm0, %mm5
783	movq	        (  6 + AOFFSET) * SIZE(AA), %mm0
784
785	pfmul	%mm0, %mm2
786	pfadd	%mm2, %mm6
787	PADDING movq	( 32 + BOFFSET) * SIZE(BB), %mm2
788	pfmul	( 14 + BOFFSET) * SIZE(BB), %mm0
789	pfadd	%mm0, %mm7
790	movq	        (  8 + AOFFSET) * SIZE(AA), %mm0
791
792	pfmul	%mm0, %mm3
793	pfadd	%mm3, %mm4
794	PADDING movq	( 20 + BOFFSET) * SIZE(BB), %mm3
795	pfmul	( 18 + BOFFSET) * SIZE(BB), %mm0
796	pfadd	%mm0, %mm5
797	movq	        ( 10 + AOFFSET) * SIZE(AA), %mm0
798
799	pfmul	%mm0, %mm3
800	pfadd	%mm3, %mm6
801	PADDING movq	( 24 + BOFFSET) * SIZE(BB), %mm3
802	pfmul	( 22 + BOFFSET) * SIZE(BB), %mm0
803	pfadd	%mm0, %mm7
804	movq	        ( 12 + AOFFSET) * SIZE(AA), %mm0
805
806	pfmul	%mm0, %mm3
807	pfadd	%mm3, %mm4
808	PADDING movq	( 28 + BOFFSET) * SIZE(BB), %mm3
809	pfmul	( 26 + BOFFSET) * SIZE(BB), %mm0
810	pfadd	%mm0, %mm5
811	movq	        ( 14 + AOFFSET) * SIZE(AA), %mm0
812
813	pfmul	%mm0, %mm3
814	pfadd	%mm3, %mm6
815	PADDING movq	( 48 + BOFFSET) * SIZE(BB), %mm3
816	pfmul	( 30 + BOFFSET) * SIZE(BB), %mm0
817	pfadd	%mm0, %mm7
818	movq	        ( 32 + AOFFSET) * SIZE(AA), %mm0
819
820	pfmul	%mm1, %mm2
821	pfadd	%mm2, %mm4
822	PADDING movq	( 36 + BOFFSET) * SIZE(BB), %mm2
823	pfmul	( 34 + BOFFSET) * SIZE(BB), %mm1
824	pfadd	%mm1, %mm5
825	movq	        ( 18 + AOFFSET) * SIZE(AA), %mm1
826
827	pfmul	%mm1, %mm2
828	pfadd	%mm2, %mm6
829	PADDING movq	( 40 + BOFFSET) * SIZE(BB), %mm2
830	pfmul	( 38 + BOFFSET) * SIZE(BB), %mm1
831	pfadd	%mm1, %mm7
832	movq	        ( 20 + AOFFSET) * SIZE(AA), %mm1
833
834	pfmul	%mm1, %mm2
835	pfadd	%mm2, %mm4
836	PADDING movq	( 44 + BOFFSET) * SIZE(BB), %mm2
837	pfmul	( 42 + BOFFSET) * SIZE(BB), %mm1
838	pfadd	%mm1, %mm5
839	movq	        ( 22 + AOFFSET) * SIZE(AA), %mm1
840
841	pfmul	%mm1, %mm2
842	pfadd	%mm2, %mm6
843	PADDING movq	( 64 + BOFFSET) * SIZE(BB), %mm2
844	pfmul	( 46 + BOFFSET) * SIZE(BB), %mm1
845	pfadd	%mm1, %mm7
846	movq	        ( 24 + AOFFSET) * SIZE(AA), %mm1
847
848	pfmul	%mm1, %mm3
849	pfadd	%mm3, %mm4
850	PADDING movq	( 52 + BOFFSET) * SIZE(BB), %mm3
851	pfmul	( 50 + BOFFSET) * SIZE(BB), %mm1
852	pfadd	%mm1, %mm5
853	movq	        ( 26 + AOFFSET) * SIZE(AA), %mm1
854
855	pfmul	%mm1, %mm3
856	pfadd	%mm3, %mm6
857	PADDING movq	( 56 + BOFFSET) * SIZE(BB), %mm3
858	pfmul	( 54 + BOFFSET) * SIZE(BB), %mm1
859	pfadd	%mm1, %mm7
860	movq	        ( 28 + AOFFSET) * SIZE(AA), %mm1
861
862	pfmul	%mm1, %mm3
863	pfadd	%mm3, %mm4
864	PADDING movq	( 60 + BOFFSET) * SIZE(BB), %mm3
865	pfmul	( 58 + BOFFSET) * SIZE(BB), %mm1
866	pfadd	%mm1, %mm5
867	movq	        ( 30 + AOFFSET) * SIZE(AA), %mm1
868
869	pfmul	%mm1, %mm3
870	pfadd	%mm3, %mm6
871	PADDING movq	( 80 + BOFFSET) * SIZE(BB), %mm3
872	pfmul	( 62 + BOFFSET) * SIZE(BB), %mm1
873	pfadd	%mm1, %mm7
874	movq	        ( 48 + AOFFSET) * SIZE(AA), %mm1
875
876	subl   $-32 * SIZE, AA
877	addl   $ 64 * SIZE, BB
878	decl   %eax
879	jne    .L32
880	ALIGN_3
881
882.L35:
883#ifndef TRMMKERNEL
884	movl	K, %eax
885#else
886	movl	KKK, %eax
887#endif
888	andl	$15, %eax		# if (k & 1)
889	BRANCH
890	je .L38
891	ALIGN_3
892
893.L36:
894	pfmul	%mm0, %mm2
895	pfadd	%mm2, %mm4
896	PADDING movq	(  4 + BOFFSET) * SIZE(BB), %mm2
897
898	pfmul	(  2 + BOFFSET) * SIZE(BB), %mm0
899	pfadd	%mm0, %mm5
900	movq	        (  2 + AOFFSET) * SIZE(AA), %mm0
901
902	addl	$2 * SIZE, AA
903	addl	$4 * SIZE, BB
904	decl	%eax
905	jg	.L36
906	ALIGN_4
907
908.L38:
909	pfadd	%mm6, %mm4
910	pfadd	%mm7, %mm5
911
912	movq	ALPHA,   %mm2
913	pswapd	%mm5, %mm5
914
915	pfmul	GAMMA_R, %mm4
916	pfmul	GAMMA_I, %mm5
917
918	pfadd	%mm5, %mm4
919
920	pswapd	%mm4, %mm5
921	pfmul	%mm2, %mm4
922	pfmul	%mm2, %mm5
923	pfpnacc	%mm5, %mm4
924
925#ifndef TRMMKERNEL
926	pfadd	0 * SIZE(%esi), %mm4
927#endif
928	movq	%mm4, 0 * SIZE(%esi)
929
930#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
931    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
932	movl	K, %eax
933	subl	KKK, %eax
934	leal	(,%eax, SIZE), %eax
935	leal	(AA, %eax, 2), AA
936	leal	(BB, %eax, 4), BB
937#endif
938
939#if defined(TRMMKERNEL) && defined(LEFT)
940	addl	$1, KK
941#endif
942
943	addl	$2 * SIZE, %esi		# coffset += 4
944	decl	%ebx			# i --
945	jg	.L31
946	ALIGN_4
947
948.L999:
949	EMMS
950
951	movl	OLD_STACK, %esp
952	popl	%ebx
953	popl	%esi
954	popl	%edi
955	popl	%ebp
956	ret
957
958	EPILOGUE
959