/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	 0

#define STACK_M	 4 + STACK + ARGS(%esi)
#define STACK_N	 8 + STACK + ARGS(%esi)
#define STACK_K	12 + STACK + ARGS(%esi)
#define STACK_ALPHA_R	16 + STACK + ARGS(%esi)
#define STACK_ALPHA_I	24 + STACK + ARGS(%esi)
#define STACK_A	32 + STACK + ARGS(%esi)
#define STACK_B	36 + STACK + ARGS(%esi)
#define STACK_C	40 + STACK + ARGS(%esi)
#define STACK_LDC	44 + STACK + ARGS(%esi)
#define STACK_OFFT	48 + STACK + ARGS(%esi)

#define POSINV	 0(%esp)
#define ALPHA_R	16(%esp)
#define ALPHA_I	32(%esp)
#define K	48(%esp)
#define N	52(%esp)
#define M	56(%esp)
#define A	60(%esp)
#define C	64(%esp)
#define J	68(%esp)
#define OLD_STACK 72(%esp)
#define OFFSET  76(%esp)
#define KK	80(%esp)
#define KKK	84(%esp)
#define BUFFER 128(%esp)

#define STACK_ALIGN	4096
#define STACK_OFFSET	1024

#if defined(OPTERON) || defined(BARCELONA)
#define PREFETCH	prefetch
#else
#define PREFETCH	prefetcht0	/* guard: KERNEL1/KERNEL5 and .L111 use PREFETCH unconditionally */
#endif

#define PREFETCHSIZE (8 * 10 + 4)

#define AA	%edx
#define BB	%ecx
#define LDC	%ebp
#define B	%edi

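/* KERNEL1..KERNEL8 below unroll eight k-iterations of the inner product.
   Each iteration multiplies one double-complex element of A (the pair
   [a_r, a_i] held in xmm0 or xmm1) against four duplicated B scalars
   (b0_r, b0_i, b1_r, b1_i) streamed from the packed buffer, accumulating
   into xmm4..xmm7.  A rough C sketch of one iteration for the
   non-conjugated (NN) case -- explanatory only, not part of the build:

       // xmm4 += [a_r, a_i] * b0_r;   xmm5 += [a_r, a_i] * b0_i;
       // xmm6 += [a_r, a_i] * b1_r;   xmm7 += [a_r, a_i] * b1_i;
       c00_r += a_r * b0_r - a_i * b0_i;  c00_i += a_i * b0_r + a_r * b0_i;
       c01_r += a_r * b1_r - a_i * b1_i;  c01_i += a_i * b1_r + a_r * b1_i;

   The subtraction/addition of the cross terms is deferred to the
   write-back code at .L14, which swaps and sign-flips xmm5/xmm7 once
   per block instead of once per iteration. */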
#define KERNEL1(address) \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm4; \
	PREFETCH (PREFETCHSIZE +  0) * SIZE + (address) * 1 * SIZE(AA); \
	movapd	 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm5; \
	movapd	 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	mulpd	 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd	%xmm2, %xmm6; \
	movapd	16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd	%xmm0, %xmm7; \
	movapd	 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movapd	10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm5; \
	movapd	12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	mulpd	14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd	%xmm3, %xmm6; \
	movapd	24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd	%xmm0, %xmm7; \
	movapd	 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm4; \
	movapd	18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	addpd	%xmm2, %xmm5; \
	movapd	20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm0, %xmm2; \
	mulpd	22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd	%xmm2, %xmm6; \
	movapd	32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd	%xmm0, %xmm7; \
	movapd	 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movapd	26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	addpd	%xmm3, %xmm5; \
	movapd	28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm0, %xmm3; \
	mulpd	30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd	%xmm3, %xmm6; \
	movapd	40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd	%xmm0, %xmm7; \
	movapd	16 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
	PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm4; \
	movapd	34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm5; \
	movapd	36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	mulpd	38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd	%xmm2, %xmm6; \
	movapd	48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd	%xmm1, %xmm7; \
	movapd	10 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movapd	42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm5; \
	movapd	44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	mulpd	46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd	%xmm3, %xmm6; \
	movapd	56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd	%xmm1, %xmm7; \
	movapd	12 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm4; \
	movapd	50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	addpd	%xmm2, %xmm5; \
	movapd	52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd	%xmm1, %xmm2; \
	mulpd	54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd	%xmm2, %xmm6; \
	movapd	64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd	%xmm1, %xmm7; \
	movapd	14 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm4; \
	movapd	58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	addpd	%xmm3, %xmm5; \
	movapd	60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd	%xmm1, %xmm3; \
	mulpd	62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd	%xmm3, %xmm6; \
	movapd	72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd	%xmm1, %xmm7; \
	movapd	24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

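/* Function body.  The BLAS level-3 kernel arguments
   (m, n, k, alpha_r, alpha_i, a, b, c, ldc[, offset]) are read from the
   caller's frame through %esi after the aligned local frame below
   (scratch slots plus the packed-B BUFFER) is carved out of the stack. */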
	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	EMMS

	movl	%esp, %esi	# save old stack

	subl	$128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl	$-STACK_ALIGN, %esp	# align stack
	addl	$STACK_OFFSET, %esp

	STACK_TOUCHING

	movl	STACK_M, %ebx
	movl	STACK_N, %eax
	movl	STACK_K, %ecx
	movl	STACK_A, %edx

	movl	%ebx, M
	movl	%eax, N
	movl	%ecx, K
	movl	%edx, A
	movl	%esi, OLD_STACK

	movl	STACK_B, B
	movl	STACK_C, %ebx
#ifdef TRMMKERNEL
	movss	STACK_OFFT, %xmm4
#endif

	movlpd	STACK_ALPHA_R, %xmm0
	movlpd	STACK_ALPHA_I, %xmm1

	pcmpeqb	%xmm7, %xmm7
	psllq	$63, %xmm7	# Generate mask
	pxor	%xmm2, %xmm2

	movlpd	 %xmm0, 0 + ALPHA_R
	movlpd	 %xmm0, 8 + ALPHA_R

	movlpd	 %xmm1, 8 + ALPHA_I
	xorpd	 %xmm7, %xmm1
	movlpd	 %xmm1, 0 + ALPHA_I

	movlpd	  %xmm2,  0 + POSINV
	movlpd	  %xmm7,  8 + POSINV

	movl	%ebx, C
	movl	STACK_LDC, LDC

#ifdef TRMMKERNEL
	movss	%xmm4, OFFSET
	movss	%xmm4, KK
#ifndef LEFT
	negl	KK
#endif
#endif

	sall	$ZBASE_SHIFT, LDC

	sarl	$1, %eax
	movl	%eax, J			# j = n / 2
	jle	.L100
	ALIGN_4

.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, %ecx

	movapd	POSINV, %xmm7

	movl	K, %eax
	sarl	$1, %eax
	jle	.L03
	ALIGN_4

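/* Pack two columns of B into BUFFER, duplicating every scalar so that a
   single movapd in the kernel yields the pair [b, b].  Each pass copies
   eight doubles, i.e. two k-iterations of the two-column panel. */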
.L02:
	prefetchnta	 56 * SIZE(B)

	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1
	movlpd	 2 * SIZE(B), %xmm2
	movlpd	 3 * SIZE(B), %xmm3
	movlpd	 4 * SIZE(B), %xmm4
	movlpd	 5 * SIZE(B), %xmm5
	movlpd	 6 * SIZE(B), %xmm6
	movlpd	 7 * SIZE(B), %xmm7

	movlpd	%xmm0,  0 * SIZE(BB)
	movlpd	%xmm0,  1 * SIZE(BB)
	movlpd	%xmm1,  2 * SIZE(BB)
	movlpd	%xmm1,  3 * SIZE(BB)
	movlpd	%xmm2,  4 * SIZE(BB)
	movlpd	%xmm2,  5 * SIZE(BB)
	movlpd	%xmm3,  6 * SIZE(BB)
	movlpd	%xmm3,  7 * SIZE(BB)
	movlpd	%xmm4,  8 * SIZE(BB)
	movlpd	%xmm4,  9 * SIZE(BB)
	movlpd	%xmm5, 10 * SIZE(BB)
	movlpd	%xmm5, 11 * SIZE(BB)
	movlpd	%xmm6, 12 * SIZE(BB)
	movlpd	%xmm6, 13 * SIZE(BB)
	movlpd	%xmm7, 14 * SIZE(BB)
	movlpd	%xmm7, 15 * SIZE(BB)

	addl	$  8 * SIZE, B
	subl	$-16 * SIZE, BB

	decl	%eax
	jne	.L02
	ALIGN_4

.L03:
	movl	K, %eax
	andl	$1, %eax
	BRANCH
	jle	.L05

	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1
	movlpd	 2 * SIZE(B), %xmm2
	movlpd	 3 * SIZE(B), %xmm3

	movlpd	%xmm0,  0 * SIZE(BB)
	movlpd	%xmm0,  1 * SIZE(BB)
	movlpd	%xmm1,  2 * SIZE(BB)
	movlpd	%xmm1,  3 * SIZE(BB)
	movlpd	%xmm2,  4 * SIZE(BB)
	movlpd	%xmm2,  5 * SIZE(BB)
	movlpd	%xmm3,  6 * SIZE(BB)
	movlpd	%xmm3,  7 * SIZE(BB)

	addl	$4 * SIZE, B
	ALIGN_4

.L05:
	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a
	movl	M,  %ebx
	testl	%ebx, %ebx
	jle	.L100
	ALIGN_4

.L10:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB	# boffset1 = boffset
#else
	leal	BUFFER, BB	# boffset1 = boffset
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 8), BB
#endif

	movapd	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movapd	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

	prefetchw 2 * SIZE(%esi)
	prefetchw 2 * SIZE(%esi, LDC)

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif

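/* Main 1x2 loop over k.  The preferred path (#if 1) unrolls the eight
   KERNEL steps up to eight more times; %eax holds k rounded down to a
   multiple of 8 and shifted left by 4, i.e. the A-side byte advance
   (one k-iteration moves AA by 2 * SIZE = 16 bytes).  .L12 then fixes
   up AA and BB when the chain exits early. */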
#if 1
	andl	$-8, %eax
	sall	$4, %eax
	je	.L15
.L1X:
	KERNEL1(16  *  0)
	KERNEL2(16  *  0)
	KERNEL3(16  *  0)
	KERNEL4(16  *  0)
	KERNEL5(16  *  0)
	KERNEL6(16  *  0)
	KERNEL7(16  *  0)
	KERNEL8(16  *  0)
	cmpl	$128 *  1, %eax
	jle	.L12
	KERNEL1(16  *  1)
	KERNEL2(16  *  1)
	KERNEL3(16  *  1)
	KERNEL4(16  *  1)
	KERNEL5(16  *  1)
	KERNEL6(16  *  1)
	KERNEL7(16  *  1)
	KERNEL8(16  *  1)
	cmpl	$128 *  2, %eax
	jle	.L12
	KERNEL1(16  *  2)
	KERNEL2(16  *  2)
	KERNEL3(16  *  2)
	KERNEL4(16  *  2)
	KERNEL5(16  *  2)
	KERNEL6(16  *  2)
	KERNEL7(16  *  2)
	KERNEL8(16  *  2)
	cmpl	$128 *  3, %eax
	jle	.L12
	KERNEL1(16  *  3)
	KERNEL2(16  *  3)
	KERNEL3(16  *  3)
	KERNEL4(16  *  3)
	KERNEL5(16  *  3)
	KERNEL6(16  *  3)
	KERNEL7(16  *  3)
	KERNEL8(16  *  3)
	cmpl	$128 *  4, %eax
	jle	.L12
	KERNEL1(16  *  4)
	KERNEL2(16  *  4)
	KERNEL3(16  *  4)
	KERNEL4(16  *  4)
	KERNEL5(16  *  4)
	KERNEL6(16  *  4)
	KERNEL7(16  *  4)
	KERNEL8(16  *  4)
	cmpl	$128 *  5, %eax
	jle	.L12
	KERNEL1(16  *  5)
	KERNEL2(16  *  5)
	KERNEL3(16  *  5)
	KERNEL4(16  *  5)
	KERNEL5(16  *  5)
	KERNEL6(16  *  5)
	KERNEL7(16  *  5)
	KERNEL8(16  *  5)
	cmpl	$128 *  6, %eax
	jle	.L12
	KERNEL1(16  *  6)
	KERNEL2(16  *  6)
	KERNEL3(16  *  6)
	KERNEL4(16  *  6)
	KERNEL5(16  *  6)
	KERNEL6(16  *  6)
	KERNEL7(16  *  6)
	KERNEL8(16  *  6)
	cmpl	$128 *  7, %eax
	jle	.L12
	KERNEL1(16  *  7)
	KERNEL2(16  *  7)
	KERNEL3(16  *  7)
	KERNEL4(16  *  7)
	KERNEL5(16  *  7)
	KERNEL6(16  *  7)
	KERNEL7(16  *  7)
	KERNEL8(16  *  7)

	addl	$128 * 4  * SIZE, BB
	addl	$128 * 1  * SIZE, AA
	subl	$128 * 8, %eax
	jg	.L1X
	jmp	.L15

.L12:
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 4), BB
	ALIGN_4
#else

	sarl	$3, %eax
	je	.L15
	ALIGN_4

.L11:
	KERNEL1(16  *  0)
	KERNEL2(16  *  0)
	KERNEL3(16  *  0)
	KERNEL4(16  *  0)
	KERNEL5(16  *  0)
	KERNEL6(16  *  0)
	KERNEL7(16  *  0)
	KERNEL8(16  *  0)

	addl   $64 * SIZE, BB
	addl   $16 * SIZE, AA
	decl   %eax
	jne    .L11
	ALIGN_4
#endif

.L15:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L14
	ALIGN_4

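/* k % 8 tail: one k-iteration of the 1x2 block per pass. */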
.L13:
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movapd	 2 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm5
	movapd	 4 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm6
	movapd	 8 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm7
	movapd	 2 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L13
	ALIGN_4

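/* Write-back for one 1x2 block.  xmm4/xmm6 hold the [a_r*b_r, a_i*b_r]
   sums and xmm5/xmm7 the [a_r*b_i, a_i*b_i] sums; swap the halves of
   xmm5/xmm7, flip one sign lane with POSINV according to the
   conjugation variant, combine, multiply by alpha as a complex number,
   and (in the non-TRMM case) add the existing C entries. */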
.L14:
	movapd	POSINV,  %xmm1
	movapd	ALPHA_R, %xmm2
	movapd	ALPHA_I, %xmm3

	SHUFPD_1 %xmm5, %xmm5
	SHUFPD_1 %xmm7, %xmm7


#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm1, %xmm5
	xorpd	%xmm1, %xmm7
#else
	xorpd	%xmm1, %xmm4
	xorpd	%xmm1, %xmm6
#endif

#ifndef TRMMKERNEL
	movlpd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0
	movlpd	0 * SIZE(%esi, LDC), %xmm1
	movhpd	1 * SIZE(%esi, LDC), %xmm1
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm5, %xmm4
	subpd	%xmm7, %xmm6
#else
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
#endif

	pshufd	$0x4e, %xmm4, %xmm5
	pshufd	$0x4e, %xmm6, %xmm7

	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm5
	mulpd	%xmm2, %xmm6
	mulpd	%xmm3, %xmm7

	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm4
	addpd	%xmm1, %xmm6
#endif

	movlpd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)
	movlpd	%xmm6, 0 * SIZE(%esi, LDC)
	movhpd	%xmm6, 1 * SIZE(%esi, LDC)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$2 * SIZE, %esi		# coffset += 2 * SIZE
	decl	%ebx			# i --
	jg	.L10
	ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	leal	(, LDC, 2), %eax
	addl	%eax, C			# c += 2 * ldc
	decl	J			# j --
	jg	.L01
	ALIGN_4

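/* .L100 handles the trailing single column when n is odd, using a 1x1
   micro-kernel that follows the same pack/compute/write-back pattern
   as the two-column code above. */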
.L100:
	movl	N, %eax
	andl	$1, %eax
	jle	.L500
	ALIGN_4

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, %ecx
	movapd	POSINV, %xmm7

	movl	K, %eax
	sarl	$2, %eax
	jle	.L103
	ALIGN_4

.L102:
	prefetchnta	 56 * SIZE(B)

	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1
	movlpd	 2 * SIZE(B), %xmm2
	movlpd	 3 * SIZE(B), %xmm3
	movlpd	 4 * SIZE(B), %xmm4
	movlpd	 5 * SIZE(B), %xmm5
	movlpd	 6 * SIZE(B), %xmm6
	movlpd	 7 * SIZE(B), %xmm7

	movlpd	%xmm0,  0 * SIZE(BB)
	movlpd	%xmm0,  1 * SIZE(BB)
	movlpd	%xmm1,  2 * SIZE(BB)
	movlpd	%xmm1,  3 * SIZE(BB)
	movlpd	%xmm2,  4 * SIZE(BB)
	movlpd	%xmm2,  5 * SIZE(BB)
	movlpd	%xmm3,  6 * SIZE(BB)
	movlpd	%xmm3,  7 * SIZE(BB)
	movlpd	%xmm4,  8 * SIZE(BB)
	movlpd	%xmm4,  9 * SIZE(BB)
	movlpd	%xmm5, 10 * SIZE(BB)
	movlpd	%xmm5, 11 * SIZE(BB)
	movlpd	%xmm6, 12 * SIZE(BB)
	movlpd	%xmm6, 13 * SIZE(BB)
	movlpd	%xmm7, 14 * SIZE(BB)
	movlpd	%xmm7, 15 * SIZE(BB)

	addl	$  8 * SIZE, B
	subl	$-16 * SIZE, %ecx
	decl	%eax
	jne	.L102
	ALIGN_4

.L103:
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L105
	ALIGN_4

.L104:
	movlpd	 0 * SIZE(B), %xmm0
	movlpd	 1 * SIZE(B), %xmm1

	movlpd	%xmm0,  0 * SIZE(BB)
	movlpd	%xmm0,  1 * SIZE(BB)
	movlpd	%xmm1,  2 * SIZE(BB)
	movlpd	%xmm1,  3 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$4 * SIZE, %ecx
	decl	%eax
	jne	.L104
	ALIGN_4

.L105:
	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a
	movl	M,  %ebx
	testl	%ebx, %ebx
	jle	.L500
	ALIGN_4

.L110:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leal	BUFFER, BB
#else
	leal	BUFFER, BB
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

	movapd	 0 * SIZE(AA), %xmm0
	movapd	 8 * SIZE(AA), %xmm1
	movapd	 0 * SIZE(BB), %xmm2
	movapd	 8 * SIZE(BB), %xmm3

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax
	je	.L112
	ALIGN_4

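/* Unrolled-by-8 inner loop for the single-column case: four
   k-iterations each against xmm0 and xmm1, accumulating the real-part
   products in xmm4/xmm6 and the imaginary-part products in xmm5/xmm7. */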
.L111:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	%xmm0, %xmm2
	mulpd	 2 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm4
	movapd	 4 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm5
	movapd	 2 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm2
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm6
	movapd	16 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm7
	movapd	 4 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	10 * SIZE(BB), %xmm0
	addpd	%xmm3, %xmm4
	movapd	12 * SIZE(BB), %xmm3
	addpd	%xmm0, %xmm5
	movapd	 6 * SIZE(AA), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	14 * SIZE(BB), %xmm0
	addpd	%xmm3, %xmm6
	movapd	24 * SIZE(BB), %xmm3
	addpd	%xmm0, %xmm7
	movapd	16 * SIZE(AA), %xmm0
	mulpd	%xmm1, %xmm2
	mulpd	18 * SIZE(BB), %xmm1
	addpd	%xmm2, %xmm4
	movapd	20 * SIZE(BB), %xmm2
	addpd	%xmm1, %xmm5
	movapd	10 * SIZE(AA), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	22 * SIZE(BB), %xmm1
	addpd	%xmm2, %xmm6
	movapd	32 * SIZE(BB), %xmm2
	addpd	%xmm1, %xmm7
	movapd	12 * SIZE(AA), %xmm1
	mulpd	%xmm1, %xmm3
	mulpd	26 * SIZE(BB), %xmm1
	addpd	%xmm3, %xmm4
	movapd	28 * SIZE(BB), %xmm3
	addpd	%xmm1, %xmm5
	movapd	14 * SIZE(AA), %xmm1
	mulpd	%xmm1, %xmm3
	mulpd	30 * SIZE(BB), %xmm1
	addpd	%xmm3, %xmm6
	movapd	40 * SIZE(BB), %xmm3
	addpd	%xmm1, %xmm7
	movapd	24 * SIZE(AA), %xmm1

	addl   $16 * SIZE, AA
	addl   $32 * SIZE, BB
	decl   %eax
	jne    .L111
	ALIGN_4

.L112:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH
	je .L114
	ALIGN_4

.L113:
	mulpd	%xmm0, %xmm2
	mulpd	 2 * SIZE(BB), %xmm0
	addpd	%xmm2, %xmm4
	movapd	 4 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm5
	movapd	 2 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L113
	ALIGN_4

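/* Write-back for one 1x1 block: fold the two accumulator pairs
   together, then apply the same swap/sign-flip/alpha sequence as the
   two-column path. */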
.L114:
	movapd	POSINV,  %xmm1
	movapd	ALPHA_R, %xmm2
	movapd	ALPHA_I, %xmm3

	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

	SHUFPD_1 %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm1, %xmm5
#else
	xorpd	%xmm1, %xmm4
#endif

#ifndef TRMMKERNEL
	movlpd	0 * SIZE(%esi), %xmm0
	movhpd	1 * SIZE(%esi), %xmm0
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm5, %xmm4
#else
	addpd	%xmm5, %xmm4
#endif

	pshufd	$0x4e, %xmm4, %xmm5

	mulpd	%xmm2, %xmm4
	mulpd	%xmm3, %xmm5

	addpd	%xmm5, %xmm4

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm4
#endif

	movlpd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$2 * SIZE, %esi		# coffset += 2 * SIZE
	decl	%ebx			# i --
	jg	.L110
	ALIGN_4

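/* Epilogue: restore the caller's stack pointer and saved registers. */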
.L500:
	movl	OLD_STACK, %esp

	EMMS

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
