1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* Bytes pushed on entry before arguments are read: four saved
   callee-saved registers (ebp/edi/esi/ebx) = 16.  Arguments are
   addressed relative to the caller's %esp, preserved in %esi.  */
#define STACK	16

/* Incoming stack arguments (i386 cdecl).  The slot at 16 + STACK is
   skipped -- presumably the scalar alpha, which this triangular-solve
   kernel does not use; TODO confirm against the generic interface.  */
#define OLD_M	 4 + STACK(%esi)
#define OLD_N	 8 + STACK(%esi)
#define OLD_K	12 + STACK(%esi)
#define OLD_A	20 + STACK(%esi)
#define OLD_B	24 + STACK(%esi)
#define OLD_C	28 + STACK(%esi)
#define OLD_LDC	32 + STACK(%esi)
#define STACK_OFFT	36 + STACK(%esi)

/* Locals inside the freshly built, 1024-byte-aligned frame
   (%esp-relative; see the subl/andl pair in the prologue).  */
#define K	16(%esp)
#define N	20(%esp)
#define M	24(%esp)
#define A	28(%esp)
#define C	32(%esp)
#define J	36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET  44(%esp)
#define KK	48(%esp)
#define KKK	52(%esp)
#define AORIG	56(%esp)
#define BORIG	60(%esp)
/* Scratch area where a panel of B is stored expanded: each element
   broadcast to a full 4-float vector (see the pshufd copy loops).  */
#define BUFFER 128(%esp)
66
/* Per-microarchitecture prefetch instruction and lookahead distance
   (in elements), selected at build time.  */
#if defined(OPTERON) || defined(BARCELONA)
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
#endif

#if defined(PENTIUM4) || defined(PENTIUMM)
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE   96
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE   96
#endif

/* Fixed register roles used throughout the kernel.  */
#define B	%edi
#define AA	%edx
#define	BB	%ecx
#define LDC	%ebp
#define CO1	%esi

/* Replace movsd with movlps on Opteron (same 64-bit move, cheaper
   there) and whenever SSE2 is unavailable -- NOTE(review): movsd is
   an SSE2 instruction, movlps is SSE1.  */
#if defined(OPTERON) || !defined(HAVE_SSE2)
#define movsd	movlps
#endif

/* With SSE2, zero registers in the integer domain via pxor.  */
#ifdef HAVE_SSE2
#define	xorps	pxor
#endif
98
/* KERNEL1..KERNEL8: the eight steps of an 8x-unrolled 4x4 rank-1
   update.  %xmm0/%xmm1 carry 4 packed A values; BB points at B stored
   expanded 4x (each scalar pre-broadcast to a vector), hence the
   (address) * 4 scaling on BB versus * 1 on AA.  Each step issues four
   multiplies and accumulates one result per B column into %xmm4..%xmm7,
   then reloads data for a later step.  Step 1 also prefetches ahead
   in A.  */
#define KERNEL1(address) \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	PREFETCH (PREFETCHSIZE +  0) * SIZE + (address) * 1 * SIZE(AA); \
	addps	%xmm2, %xmm5; \
	movaps	 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	mulps	12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
113
/* Unroll step 2: same 4-column update as KERNEL1 but consuming the
   %xmm3 B stream; preloads the step-6 B vector and next A quad.  */
#define KERNEL2(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	mulps	28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm3, %xmm6; \
	movaps	48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm0, %xmm7; \
	movaps	 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
127
/* Unroll step 3: %xmm2 B stream, B offsets 32..47; preloads the
   step-5 B vector (offset 64) and the A quad for step 4.  */
#define KERNEL3(address) \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	mulps	44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
141
/* Unroll step 4: %xmm3 B stream, B offsets 48..63.  Reloads %xmm0
   from A offset 32 -- the first quad of the NEXT 8-iteration body
   (steps 5-8 below run on %xmm1 instead).  */
#define KERNEL4(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	mulps	60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm3, %xmm6; \
	movaps	80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm0, %xmm7; \
	movaps	 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
155
/* Unroll step 5: switches to the %xmm1 A quad (loaded from A offset
   16 before the loop); %xmm2 B stream, B offsets 64..79.  */
#define KERNEL5(address) \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	mulps	76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm2, %xmm6; \
	movaps	 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm1, %xmm7; \
	movaps	20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
169
/* Unroll step 6: %xmm1 A quad, %xmm3 B stream, B offsets 80..95.  */
#define KERNEL6(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	mulps	92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
183
/* Unroll step 7: %xmm1 A quad, %xmm2 B stream, B offsets 96..111;
   preloads the next body's first B vector (offset 128).  */
#define KERNEL7(address) \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	mulps	108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm2, %xmm6; \
	movaps	128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm1, %xmm7; \
	movaps	28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
197
/* Unroll step 8: final step; B offsets 112..127, preloads the next
   body's %xmm1 (A offset 48) and %xmm3 (B offset 144).  The loop body
   then advances AA by 32 and BB by 128 elements.  */
#define KERNEL8(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	mulps	124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
211
	PROLOGUE

	/* Save the i386 callee-saved registers this kernel reuses.  */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	%esp, %esi	/* keep caller %esp: arguments are read via OLD_* */

	/* Build a 1024-byte-aligned frame holding the locals and the
	   expanded-B BUFFER.  */
	subl	$128 + LOCAL_BUFFER_SIZE, %esp
	andl	$-1024, %esp

	STACK_TOUCHING

	/* Copy the stack arguments into frame locals.  */
	movl	OLD_M, %ebx
	movl	OLD_N, %eax
	movl	OLD_K, %ecx
	movl	OLD_A, %edx

	movl	%ebx, M
	movl	%eax, N
	movl	%ecx, K
	movl	%edx, A
	movl	%esi, OLD_STACK
	movss	STACK_OFFT, %xmm4	/* diagonal offset argument */

	movl	OLD_B, B
	movl	OLD_C, %ebx

	movl	%ebx, C
	movl	OLD_LDC, LDC

	movss	%xmm4, OFFSET
	movss	%xmm4, KK	/* KK tracks the current solve offset */

	leal	(, LDC, SIZE), LDC	/* LDC in bytes from here on */

#ifdef LN
       /* Lower/left variant walks A and C backwards: advance C past
          row M and A past the whole M x K panel.  */
       movl	M, %eax
       leal	(, %eax, SIZE), %eax
       addl	%eax, C
       imull	K, %eax
       addl	%eax, A
#endif

#ifdef RT
       /* Right-transposed variant walks B and C backwards: advance B
          past the K x N panel and C past all N columns.  */
       movl	N, %eax
       leal	(, %eax, SIZE), %eax
       imull	K, %eax
       addl	%eax, B
       movl	N, %eax
       imull	LDC, %eax
       addl	%eax, C
#endif

#ifdef RN
	negl	KK
#endif

#ifdef RT
       movl	N, %eax
       subl	OFFSET, %eax
       movl	%eax, KK
#endif

	/* j = n >> 2: loop over column panels of width 4 (remainder
	   panels are handled from .L40 onward).  */
	movl	N, %eax
	sarl	$2, %eax
	movl	%eax, J
	jle	.L40
283
284.L01:
285#ifdef LN
286	movl	OFFSET, %eax
287	addl	M, %eax
288	movl	%eax, KK
289#endif
290
291	leal	BUFFER, %ecx
292
293#ifdef RT
294       movl	K, %eax
295       sall	$2 + BASE_SHIFT, %eax
296       subl	%eax, B
297#endif
298
299#if defined(LN) || defined(RT)
300	movl	KK, %eax
301	movl	B, BORIG
302        sall	$2 + BASE_SHIFT, %eax
303	leal	(B,  %eax, 1), B
304	leal	(BB, %eax, 4), BB
305#endif
306
307#ifdef LT
308	movl	OFFSET, %eax
309	movl	%eax, KK
310#endif
311
312#if defined(LT) || defined(RN)
313	movl	KK, %eax
314#else
315	movl	K, %eax
316	subl	KK, %eax
317#endif
318	sarl	$1, %eax
319	jle	.L05
320	ALIGN_4
321
322.L02:
323	movaps	 0 * SIZE(B), %xmm3
324	movaps	 4 * SIZE(B), %xmm7
325
326	pshufd	 $0x00, %xmm3, %xmm0
327	pshufd	 $0x55, %xmm3, %xmm1
328	pshufd	 $0xaa, %xmm3, %xmm2
329	pshufd	 $0xff, %xmm3, %xmm3
330
331	pshufd	 $0x00, %xmm7, %xmm4
332	pshufd	 $0x55, %xmm7, %xmm5
333	pshufd	 $0xaa, %xmm7, %xmm6
334	pshufd	 $0xff, %xmm7, %xmm7
335
336	movaps	%xmm0,  0 * SIZE(BB)
337	movaps	%xmm1,  4 * SIZE(BB)
338	movaps	%xmm2,  8 * SIZE(BB)
339	movaps	%xmm3, 12 * SIZE(BB)
340	movaps	%xmm4, 16 * SIZE(BB)
341	movaps	%xmm5, 20 * SIZE(BB)
342	movaps	%xmm6, 24 * SIZE(BB)
343	movaps	%xmm7, 28 * SIZE(BB)
344
345	addl	$ 8 * SIZE, B
346	addl	$32 * SIZE, %ecx
347	decl	%eax
348	jne	.L02
349	ALIGN_2
350
351.L05:
352#if defined(LT) || defined(RN)
353	movl	KK, %eax
354#else
355	movl	K, %eax
356	subl	KK, %eax
357#endif
358	andl	$1, %eax
359	BRANCH
360	jle	.L10
361
362	movaps	 0 * SIZE(B), %xmm3
363
364	pshufd	 $0x00, %xmm3, %xmm0
365	pshufd	 $0x55, %xmm3, %xmm1
366	pshufd	 $0xaa, %xmm3, %xmm2
367	pshufd	 $0xff, %xmm3, %xmm3
368
369	movaps	%xmm0,  0 * SIZE(BB)
370	movaps	%xmm1,  4 * SIZE(BB)
371	movaps	%xmm2,  8 * SIZE(BB)
372	movaps	%xmm3, 12 * SIZE(BB)
373
374	addl	$4 * SIZE, B
375	ALIGN_4
376
377.L10:
378#if defined(LT) || defined(RN)
379	movl	A, AA
380#else
381	movl	A, %eax
382	movl	%eax, AORIG
383#endif
384
385	leal	(, LDC, 4), %eax
386
387#ifdef RT
388	subl	%eax, C
389#endif
390	movl	C, CO1
391#ifndef RT
392	addl	%eax, C
393#endif
394
395	movl	M,  %ebx
396	sarl	$2, %ebx	# i = (m >> 2)
397	jle	.L20
398	ALIGN_4
399
400.L11:
401#ifdef LN
402       movl	K, %eax
403       sall	$2 + BASE_SHIFT, %eax
404       subl	%eax, AORIG
405#endif
406
407#if defined(LN) || defined(RT)
408	movl	KK, %eax
409	movl	AORIG, AA
410	leal	(, %eax, SIZE), %eax
411	leal	(AA, %eax, 4), AA
412#endif
413
414	leal	BUFFER, BB
415
416#if defined(LN) || defined(RT)
417	movl	KK, %eax
418	sall	$2 + BASE_SHIFT, %eax
419	leal	(BB, %eax, 4), BB
420#endif
421
422	movaps	 0 * SIZE(AA), %xmm0
423	xorps	%xmm4, %xmm4
424	movaps	16 * SIZE(AA), %xmm1
425	xorps	%xmm5, %xmm5
426	movaps	 0 * SIZE(BB), %xmm2
427	xorps	%xmm6, %xmm6
428	movaps	16 * SIZE(BB), %xmm3
429	xorps	%xmm7, %xmm7
430
431	leal	(LDC, LDC, 2), %eax
432
433	PREFETCHW	3 * SIZE(CO1)
434	PREFETCHW	3 * SIZE(CO1, LDC)
435	PREFETCHW	3 * SIZE(CO1, LDC, 2)
436	PREFETCHW	3 * SIZE(CO1, %eax)
437
438#if defined(LT) || defined(RN)
439	movl	KK, %eax
440#else
441	movl	K, %eax
442	subl	KK, %eax
443#endif
444	sarl	$3, %eax
445	je	.L15
446	ALIGN_4
447
448.L12:
449	KERNEL1(0 * 16)
450	KERNEL2(0 * 16)
451	KERNEL3(0 * 16)
452	KERNEL4(0 * 16)
453	KERNEL5(0 * 16)
454	KERNEL6(0 * 16)
455	KERNEL7(0 * 16)
456	KERNEL8(0 * 16)
457
458	addl   $128 * SIZE, BB
459	addl   $32 * SIZE, AA
460	decl   %eax
461	jne    .L12
462	ALIGN_4
463
464.L15:
465#if defined(LT) || defined(RN)
466	movl	KK, %eax
467#else
468	movl	K, %eax
469	subl	KK, %eax
470#endif
471	andl	$7, %eax		# if (k & 1)
472	BRANCH
473	je .L18
474	ALIGN_4
475
476.L16:
477	mulps	%xmm0, %xmm2
478	addps	%xmm2, %xmm4
479	movaps	 4 * SIZE(BB), %xmm2
480	mulps	%xmm0, %xmm2
481	addps	%xmm2, %xmm5
482	movaps	 8 * SIZE(BB), %xmm2
483	mulps	%xmm0, %xmm2
484	mulps	12 * SIZE(BB), %xmm0
485	addps	%xmm2, %xmm6
486	movaps	16 * SIZE(BB), %xmm2
487	addps	%xmm0, %xmm7
488	movaps	 4 * SIZE(AA), %xmm0
489
490	addl	$ 4 * SIZE, AA
491	addl	$16 * SIZE, BB
492	decl	%eax
493	jg	.L16
494	ALIGN_4
495
496.L18:
497#if defined(LN) || defined(RT)
498	movl	KK, %eax
499#ifdef LN
500	subl	$4, %eax
501#else
502	subl	$4, %eax
503#endif
504
505	movl	AORIG, AA
506	movl	BORIG, B
507	leal	BUFFER, BB
508
509	sall	$2 + BASE_SHIFT, %eax
510	leal	(AA, %eax, 1), AA
511	leal	(B,  %eax, 1), B
512	leal	(BB, %eax, 4), BB
513#endif
514
515#if defined(LN) || defined(LT)
516	movaps	 %xmm4, %xmm0
517	unpcklps %xmm6, %xmm4
518	unpckhps %xmm6, %xmm0
519
520	movaps	 %xmm5, %xmm1
521	unpcklps %xmm7, %xmm5
522	unpckhps %xmm7, %xmm1
523
524	movaps	 %xmm4, %xmm6
525	unpcklps %xmm5, %xmm4
526	unpckhps %xmm5, %xmm6
527
528	movaps	 %xmm0, %xmm2
529	unpcklps %xmm1, %xmm0
530	unpckhps %xmm1, %xmm2
531
532	movaps	 0 * SIZE(B), %xmm1
533	movaps	 4 * SIZE(B), %xmm3
534	movaps	 8 * SIZE(B), %xmm5
535	movaps	12 * SIZE(B), %xmm7
536
537	subps	%xmm4,  %xmm1
538	subps	%xmm6,  %xmm3
539	subps	%xmm0,  %xmm5
540	subps	%xmm2,  %xmm7
541#else
542	movaps	 0 * SIZE(AA), %xmm0
543	movaps	 4 * SIZE(AA), %xmm1
544	movaps	 8 * SIZE(AA), %xmm2
545	movaps	12 * SIZE(AA), %xmm3
546
547	subps	%xmm4, %xmm0
548	subps	%xmm5, %xmm1
549	subps	%xmm6, %xmm2
550	subps	%xmm7, %xmm3
551#endif
552
553#ifdef LN
554	movaps	 12 * SIZE(AA), %xmm4
555	pshufd	 $0xff, %xmm4, %xmm6
556	mulps	 %xmm6, %xmm7
557	pshufd	 $0xaa, %xmm4, %xmm6
558	mulps	 %xmm7, %xmm6
559	subps	 %xmm6, %xmm5
560	pshufd	 $0x55, %xmm4, %xmm6
561	mulps	 %xmm7, %xmm6
562	subps	 %xmm6, %xmm3
563	pshufd	 $0x00, %xmm4, %xmm6
564	mulps	 %xmm7, %xmm6
565	subps	 %xmm6, %xmm1
566
567	movaps	  8 * SIZE(AA), %xmm4
568	pshufd	 $0xaa, %xmm4, %xmm6
569	mulps	 %xmm6, %xmm5
570	pshufd	 $0x55, %xmm4, %xmm6
571	mulps	 %xmm5, %xmm6
572	subps	 %xmm6, %xmm3
573	pshufd	 $0x00, %xmm4, %xmm6
574	mulps	 %xmm5, %xmm6
575	subps	 %xmm6, %xmm1
576
577	movaps	  4 * SIZE(AA), %xmm4
578	pshufd	 $0x55, %xmm4, %xmm6
579	mulps	 %xmm6, %xmm3
580	pshufd	 $0x00, %xmm4, %xmm6
581	mulps	 %xmm3, %xmm6
582	subps	 %xmm6, %xmm1
583
584	movaps	  0 * SIZE(AA), %xmm4
585	pshufd	 $0x00, %xmm4, %xmm6
586	mulps	 %xmm6, %xmm1
587#endif
588
589#ifdef LT
590	movaps	 0 * SIZE(AA), %xmm4
591	pshufd	 $0x00, %xmm4, %xmm6
592	mulps	 %xmm6, %xmm1
593
594	pshufd	 $0x55, %xmm4, %xmm6
595	mulps	 %xmm1, %xmm6
596	subps	 %xmm6, %xmm3
597	pshufd	 $0xaa, %xmm4, %xmm6
598	mulps	 %xmm1, %xmm6
599	subps	 %xmm6, %xmm5
600	pshufd	 $0xff, %xmm4, %xmm6
601	mulps	 %xmm1, %xmm6
602	subps	 %xmm6, %xmm7
603
604	movaps	 4 * SIZE(AA), %xmm4
605	pshufd	 $0x55, %xmm4, %xmm6
606	mulps	 %xmm6, %xmm3
607	pshufd	 $0xaa, %xmm4, %xmm6
608	mulps	 %xmm3, %xmm6
609	subps	 %xmm6, %xmm5
610	pshufd	 $0xff, %xmm4, %xmm6
611	mulps	 %xmm3, %xmm6
612	subps	 %xmm6, %xmm7
613
614	movaps	 8 * SIZE(AA), %xmm4
615	pshufd	 $0xaa, %xmm4, %xmm6
616	mulps	 %xmm6, %xmm5
617	pshufd	 $0xff, %xmm4, %xmm6
618	mulps	 %xmm5, %xmm6
619	subps	 %xmm6, %xmm7
620
621	movaps	12 * SIZE(AA), %xmm4
622	pshufd	 $0xff, %xmm4, %xmm6
623	mulps	 %xmm6, %xmm7
624#endif
625
626#ifdef RN
627	movaps	 0 * SIZE(B), %xmm6
628	pshufd	 $0x00, %xmm6, %xmm7
629	mulps	 %xmm7, %xmm0
630	pshufd	 $0x55, %xmm6, %xmm7
631	mulps	 %xmm0, %xmm7
632	subps	 %xmm7, %xmm1
633	pshufd	 $0xaa, %xmm6, %xmm7
634	mulps	 %xmm0, %xmm7
635	subps	 %xmm7, %xmm2
636	pshufd	 $0xff, %xmm6, %xmm7
637	mulps	 %xmm0, %xmm7
638	subps	 %xmm7, %xmm3
639
640	movaps	 4 * SIZE(B), %xmm6
641	pshufd	 $0x55, %xmm6, %xmm7
642	mulps	 %xmm7, %xmm1
643	pshufd	 $0xaa, %xmm6, %xmm7
644	mulps	 %xmm1, %xmm7
645	subps	 %xmm7, %xmm2
646	pshufd	 $0xff, %xmm6, %xmm7
647	mulps	 %xmm1, %xmm7
648	subps	 %xmm7, %xmm3
649
650	movaps	 8 * SIZE(B), %xmm6
651	pshufd	 $0xaa, %xmm6, %xmm7
652	mulps	 %xmm7, %xmm2
653	pshufd	 $0xff, %xmm6, %xmm7
654	mulps	 %xmm2, %xmm7
655	subps	 %xmm7, %xmm3
656
657	movaps	 12 * SIZE(B), %xmm6
658	pshufd	 $0xff, %xmm6, %xmm7
659	mulps	 %xmm7, %xmm3
660#endif
661
662#ifdef RT
663	movaps	 12 * SIZE(B), %xmm6
664	pshufd	 $0xff, %xmm6, %xmm7
665	mulps	 %xmm7, %xmm3
666	pshufd	 $0xaa, %xmm6, %xmm7
667	mulps	 %xmm3, %xmm7
668	subps	 %xmm7, %xmm2
669	pshufd	 $0x55, %xmm6, %xmm7
670	mulps	 %xmm3, %xmm7
671	subps	 %xmm7, %xmm1
672	pshufd	 $0x00, %xmm6, %xmm7
673	mulps	 %xmm3, %xmm7
674	subps	 %xmm7, %xmm0
675
676	movaps	  8 * SIZE(B), %xmm6
677	pshufd	 $0xaa, %xmm6, %xmm7
678	mulps	 %xmm7, %xmm2
679	pshufd	 $0x55, %xmm6, %xmm7
680	mulps	 %xmm2, %xmm7
681	subps	 %xmm7, %xmm1
682	pshufd	 $0x00, %xmm6, %xmm7
683	mulps	 %xmm2, %xmm7
684	subps	 %xmm7, %xmm0
685
686	movaps	  4 * SIZE(B), %xmm6
687	pshufd	 $0x55, %xmm6, %xmm7
688	mulps	 %xmm7, %xmm1
689	pshufd	 $0x00, %xmm6, %xmm7
690	mulps	 %xmm1, %xmm7
691	subps	 %xmm7, %xmm0
692
693	movaps	  0 * SIZE(B), %xmm6
694	pshufd	 $0x00, %xmm6, %xmm7
695	mulps	 %xmm7, %xmm0
696#endif
697
698#if defined(LN) || defined(LT)
699	movaps	%xmm1,   0 * SIZE(B)
700	movaps	%xmm3,   4 * SIZE(B)
701	movaps	%xmm5,   8 * SIZE(B)
702	movaps	%xmm7,  12 * SIZE(B)
703
704	pshufd	$0x00, %xmm1, %xmm0
705	pshufd	$0x55, %xmm1, %xmm2
706	pshufd	$0xaa, %xmm1, %xmm4
707	pshufd	$0xff, %xmm1, %xmm6
708	movaps	%xmm0,   0 * SIZE(BB)
709	movaps	%xmm2,   4 * SIZE(BB)
710	movaps	%xmm4,   8 * SIZE(BB)
711	movaps	%xmm6,  12 * SIZE(BB)
712
713	pshufd	$0x00, %xmm3, %xmm0
714	pshufd	$0x55, %xmm3, %xmm2
715	pshufd	$0xaa, %xmm3, %xmm4
716	pshufd	$0xff, %xmm3, %xmm6
717	movaps	%xmm0,  16 * SIZE(BB)
718	movaps	%xmm2,  20 * SIZE(BB)
719	movaps	%xmm4,  24 * SIZE(BB)
720	movaps	%xmm6,  28 * SIZE(BB)
721
722	pshufd	$0x00, %xmm5, %xmm0
723	pshufd	$0x55, %xmm5, %xmm2
724	pshufd	$0xaa, %xmm5, %xmm4
725	pshufd	$0xff, %xmm5, %xmm6
726	movaps	%xmm0,  32 * SIZE(BB)
727	movaps	%xmm2,  36 * SIZE(BB)
728	movaps	%xmm4,  40 * SIZE(BB)
729	movaps	%xmm6,  44 * SIZE(BB)
730
731	pshufd	$0x00, %xmm7, %xmm0
732	pshufd	$0x55, %xmm7, %xmm2
733	pshufd	$0xaa, %xmm7, %xmm4
734	pshufd	$0xff, %xmm7, %xmm6
735	movaps	%xmm0,  48 * SIZE(BB)
736	movaps	%xmm2,  52 * SIZE(BB)
737	movaps	%xmm4,  56 * SIZE(BB)
738	movaps	%xmm6,  60 * SIZE(BB)
739#else
740	movaps	%xmm0,   0 * SIZE(AA)
741	movaps	%xmm1,   4 * SIZE(AA)
742	movaps	%xmm2,   8 * SIZE(AA)
743	movaps	%xmm3,  12 * SIZE(AA)
744#endif
745
746#ifdef LN
747	subl	$4 * SIZE, CO1
748#endif
749
750	leal	(LDC, LDC, 2), %eax
751
752#if defined(LN) || defined(LT)
753	movaps	 %xmm1, %xmm0
754	unpcklps %xmm5, %xmm1
755	unpckhps %xmm5, %xmm0
756
757	movaps	 %xmm3, %xmm4
758	unpcklps %xmm7, %xmm3
759	unpckhps %xmm7, %xmm4
760
761	movaps	 %xmm1, %xmm2
762	unpcklps %xmm3, %xmm1
763	unpckhps %xmm3, %xmm2
764
765	movaps	 %xmm0, %xmm6
766	unpcklps %xmm4, %xmm0
767	unpckhps %xmm4, %xmm6
768
769	movlps	%xmm1, 0 * SIZE(CO1)
770	movhps	%xmm1, 2 * SIZE(CO1)
771	movlps	%xmm2, 0 * SIZE(CO1, LDC, 1)
772	movhps	%xmm2, 2 * SIZE(CO1, LDC, 1)
773	movlps	%xmm0, 0 * SIZE(CO1, LDC, 2)
774	movhps	%xmm0, 2 * SIZE(CO1, LDC, 2)
775	movlps	%xmm6, 0 * SIZE(CO1, %eax, 1)
776	movhps	%xmm6, 2 * SIZE(CO1, %eax, 1)
777#else
778	movlps	%xmm0, 0 * SIZE(CO1)
779	movhps	%xmm0, 2 * SIZE(CO1)
780	movlps	%xmm1, 0 * SIZE(CO1, LDC, 1)
781	movhps	%xmm1, 2 * SIZE(CO1, LDC, 1)
782	movlps	%xmm2, 0 * SIZE(CO1, LDC, 2)
783	movhps	%xmm2, 2 * SIZE(CO1, LDC, 2)
784	movlps	%xmm3, 0 * SIZE(CO1, %eax, 1)
785	movhps	%xmm3, 2 * SIZE(CO1, %eax, 1)
786#endif
787
788#ifndef LN
789	addl	$4 * SIZE, CO1
790#endif
791
792#if defined(LT) || defined(RN)
793	movl	K,  %eax
794	subl	KK, %eax
795	leal	(,%eax, SIZE), %eax
796	leal	(AA, %eax, 4), AA
797#ifdef LT
798	addl	$16 * SIZE, B
799#endif
800#endif
801
802#ifdef LN
803	subl	$4, KK
804	movl	BORIG, B
805#endif
806
807#ifdef LT
808	addl	$4, KK
809#endif
810
811#ifdef RT
812	movl	K, %eax
813	movl	BORIG, B
814	sall	$2 + BASE_SHIFT, %eax
815	addl	%eax, AORIG
816#endif
817
818	decl	%ebx			# i --
819	jg	.L11
820	ALIGN_4
821
822.L20:
823	testl	$2, M
824	je	.L30
825
826#ifdef LN
827       movl	K, %eax
828       sall	$1 + BASE_SHIFT, %eax
829       subl	%eax, AORIG
830#endif
831
832#if defined(LN) || defined(RT)
833	movl	KK, %eax
834	movl	AORIG, AA
835	leal	(, %eax, SIZE), %eax
836	leal	(AA, %eax, 2), AA
837#endif
838
839	leal	BUFFER, BB
840
841#if defined(LN) || defined(RT)
842	movl	KK, %eax
843	sall	$2 + BASE_SHIFT, %eax
844	leal	(BB, %eax, 4), BB
845#endif
846
847#ifdef	movsd
848	xorps	%xmm0, %xmm0
849#endif
850	movsd	 0 * SIZE(AA), %xmm0
851	xorps	%xmm4, %xmm4
852#ifdef	movsd
853	xorps	%xmm1, %xmm1
854#endif
855	movsd	 8 * SIZE(AA), %xmm1
856	xorps	%xmm5, %xmm5
857	movaps	 0 * SIZE(BB), %xmm2
858	xorps	%xmm6, %xmm6
859	movaps	16 * SIZE(BB), %xmm3
860	xorps	%xmm7, %xmm7
861
862#if defined(LT) || defined(RN)
863	movl	KK, %eax
864#else
865	movl	K, %eax
866	subl	KK, %eax
867#endif
868	sarl	$3, %eax
869	je	.L25
870	ALIGN_4
871
872.L22:
873	mulps	%xmm0, %xmm2
874	addps	%xmm2, %xmm4
875#if defined(OPTERON) || defined(BARCELONA)
876	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
877#endif
878	movaps	 4 * SIZE(BB), %xmm2
879	mulps	%xmm0, %xmm2
880	addps	%xmm2, %xmm5
881	movaps	 8 * SIZE(BB), %xmm2
882	mulps	%xmm0, %xmm2
883	addps	%xmm2, %xmm6
884	movaps	12 * SIZE(BB), %xmm2
885	mulps	%xmm0, %xmm2
886	movsd	 2 * SIZE(AA), %xmm0
887	addps	%xmm2, %xmm7
888	movaps	32 * SIZE(BB), %xmm2
889
890	mulps	%xmm0, %xmm3
891	addps	%xmm3, %xmm4
892	movaps	20 * SIZE(BB), %xmm3
893	mulps	%xmm0, %xmm3
894	addps	%xmm3, %xmm5
895	movaps	24 * SIZE(BB), %xmm3
896	mulps	%xmm0, %xmm3
897	addps	%xmm3, %xmm6
898	movaps	28 * SIZE(BB), %xmm3
899	mulps	%xmm0, %xmm3
900	movsd	 4 * SIZE(AA), %xmm0
901	addps	%xmm3, %xmm7
902	movaps	48 * SIZE(BB), %xmm3
903
904	mulps	%xmm0, %xmm2
905	addps	%xmm2, %xmm4
906	movaps	36 * SIZE(BB), %xmm2
907	mulps	%xmm0, %xmm2
908	addps	%xmm2, %xmm5
909	movaps	40 * SIZE(BB), %xmm2
910	mulps	%xmm0, %xmm2
911	addps	%xmm2, %xmm6
912	movaps	44 * SIZE(BB), %xmm2
913	mulps	%xmm0, %xmm2
914	movsd	 6 * SIZE(AA), %xmm0
915	addps	%xmm2, %xmm7
916	movaps	64 * SIZE(BB), %xmm2
917
918	mulps	%xmm0, %xmm3
919	addps	%xmm3, %xmm4
920	movaps	52 * SIZE(BB), %xmm3
921	mulps	%xmm0, %xmm3
922	addps	%xmm3, %xmm5
923	movaps	56 * SIZE(BB), %xmm3
924	mulps	%xmm0, %xmm3
925	addps	%xmm3, %xmm6
926	movaps	60 * SIZE(BB), %xmm3
927	mulps	%xmm0, %xmm3
928	movsd	 16 * SIZE(AA), %xmm0
929	addps	%xmm3, %xmm7
930	movaps	80 * SIZE(BB), %xmm3
931
932	mulps	%xmm1, %xmm2
933	addps	%xmm2, %xmm4
934	movaps	68 * SIZE(BB), %xmm2
935	mulps	%xmm1, %xmm2
936	addps	%xmm2, %xmm5
937	movaps	72 * SIZE(BB), %xmm2
938	mulps	%xmm1, %xmm2
939	addps	%xmm2, %xmm6
940	movaps	76 * SIZE(BB), %xmm2
941	mulps	%xmm1, %xmm2
942	movsd	10 * SIZE(AA), %xmm1
943	addps	%xmm2, %xmm7
944	movaps	96 * SIZE(BB), %xmm2
945
946	mulps	%xmm1, %xmm3
947	addps	%xmm3, %xmm4
948	movaps	84 * SIZE(BB), %xmm3
949	mulps	%xmm1, %xmm3
950	addps	%xmm3, %xmm5
951	movaps	88 * SIZE(BB), %xmm3
952	mulps	%xmm1, %xmm3
953	addps	%xmm3, %xmm6
954	movaps	92 * SIZE(BB), %xmm3
955	mulps	%xmm1, %xmm3
956	movsd	12 * SIZE(AA), %xmm1
957	addps	%xmm3, %xmm7
958	movaps	112 * SIZE(BB), %xmm3
959
960	mulps	%xmm1, %xmm2
961	addps	%xmm2, %xmm4
962	movaps	100 * SIZE(BB), %xmm2
963	mulps	%xmm1, %xmm2
964	addps	%xmm2, %xmm5
965	movaps	104 * SIZE(BB), %xmm2
966	mulps	%xmm1, %xmm2
967	addps	%xmm2, %xmm6
968	movaps	108 * SIZE(BB), %xmm2
969	mulps	%xmm1, %xmm2
970	movsd	 14 * SIZE(AA), %xmm1
971	addps	%xmm2, %xmm7
972	movaps	128 * SIZE(BB), %xmm2
973
974	mulps	%xmm1, %xmm3
975	addps	%xmm3, %xmm4
976	movaps	116 * SIZE(BB), %xmm3
977	mulps	%xmm1, %xmm3
978	addps	%xmm3, %xmm5
979	movaps	120 * SIZE(BB), %xmm3
980	mulps	%xmm1, %xmm3
981	addps	%xmm3, %xmm6
982	movaps	124 * SIZE(BB), %xmm3
983	mulps	%xmm1, %xmm3
984	movsd	 24 * SIZE(AA), %xmm1
985	addps	%xmm3, %xmm7
986	movaps	144 * SIZE(BB), %xmm3
987
988	addl	$ 16 * SIZE, AA
989	addl	$128 * SIZE, BB
990	decl   %eax
991	jne    .L22
992	ALIGN_4
993
994.L25:
995#if defined(LT) || defined(RN)
996	movl	KK, %eax
997#else
998	movl	K, %eax
999	subl	KK, %eax
1000#endif
1001	andl	$7, %eax		# if (k & 1)
1002	BRANCH
1003	je .L28
1004	ALIGN_4
1005
1006.L26:
1007	mulps	%xmm0, %xmm2
1008	addps	%xmm2, %xmm4
1009	movaps	 4 * SIZE(BB), %xmm2
1010	mulps	%xmm0, %xmm2
1011	addps	%xmm2, %xmm5
1012	movaps	 8 * SIZE(BB), %xmm2
1013	mulps	%xmm0, %xmm2
1014	addps	%xmm2, %xmm6
1015	movaps	12 * SIZE(BB), %xmm2
1016	mulps	%xmm0, %xmm2
1017	movsd	 2 * SIZE(AA), %xmm0
1018	addps	%xmm2, %xmm7
1019	movaps	16 * SIZE(BB), %xmm2
1020
1021	addl	$ 2 * SIZE, AA
1022	addl	$16 * SIZE, BB
1023	decl	%eax
1024	jg	.L26
1025	ALIGN_4
1026
1027.L28:
1028#if defined(LN) || defined(RT)
1029	movl	KK, %eax
1030#ifdef LN
1031	subl	$2, %eax
1032#else
1033	subl	$4, %eax
1034#endif
1035
1036	movl	AORIG, AA
1037	movl	BORIG, B
1038	leal	BUFFER, BB
1039
1040	sall	$1 + BASE_SHIFT, %eax
1041	leal	(AA, %eax, 1), AA
1042	leal	(B,  %eax, 2), B
1043	leal	(BB, %eax, 8), BB
1044#endif
1045
1046#if defined(LN) || defined(LT)
1047	unpcklps %xmm6, %xmm4
1048	unpcklps %xmm7, %xmm5
1049
1050	movaps	 %xmm4, %xmm6
1051	unpcklps %xmm5, %xmm4
1052	unpckhps %xmm5, %xmm6
1053
1054	movaps	 0 * SIZE(B), %xmm1
1055	movaps	 4 * SIZE(B), %xmm3
1056
1057	subps	%xmm4,  %xmm1
1058	subps	%xmm6,  %xmm3
1059#else
1060#ifdef	movsd
1061	xorps	%xmm0, %xmm0
1062#endif
1063	movsd	 0 * SIZE(AA), %xmm0
1064#ifdef	movsd
1065	xorps	%xmm1, %xmm1
1066#endif
1067	movsd	 2 * SIZE(AA), %xmm1
1068#ifdef	movsd
1069	xorps	%xmm2, %xmm2
1070#endif
1071	movsd	 4 * SIZE(AA), %xmm2
1072#ifdef	movsd
1073	xorps	%xmm3, %xmm3
1074#endif
1075	movsd	 6 * SIZE(AA), %xmm3
1076
1077	subps	%xmm4, %xmm0
1078	subps	%xmm5, %xmm1
1079	subps	%xmm6, %xmm2
1080	subps	%xmm7, %xmm3
1081#endif
1082
1083#ifdef LN
1084	movaps	  0 * SIZE(AA), %xmm4
1085	pshufd	 $0xff, %xmm4, %xmm6
1086	mulps	 %xmm6, %xmm3
1087	pshufd	 $0xaa, %xmm4, %xmm6
1088	mulps	 %xmm3, %xmm6
1089	subps	 %xmm6, %xmm1
1090
1091	pshufd	 $0x00, %xmm4, %xmm6
1092	mulps	 %xmm6, %xmm1
1093#endif
1094
1095#ifdef LT
1096	movaps	 0 * SIZE(AA), %xmm4
1097	pshufd	 $0x00, %xmm4, %xmm6
1098	mulps	 %xmm6, %xmm1
1099
1100	pshufd	 $0x55, %xmm4, %xmm6
1101	mulps	 %xmm1, %xmm6
1102	subps	 %xmm6, %xmm3
1103
1104	pshufd	 $0xff, %xmm4, %xmm6
1105	mulps	 %xmm6, %xmm3
1106#endif
1107
1108#ifdef RN
1109	movaps	 0 * SIZE(B), %xmm6
1110	pshufd	 $0x00, %xmm6, %xmm7
1111	mulps	 %xmm7, %xmm0
1112	pshufd	 $0x55, %xmm6, %xmm7
1113	mulps	 %xmm0, %xmm7
1114	subps	 %xmm7, %xmm1
1115	pshufd	 $0xaa, %xmm6, %xmm7
1116	mulps	 %xmm0, %xmm7
1117	subps	 %xmm7, %xmm2
1118	pshufd	 $0xff, %xmm6, %xmm7
1119	mulps	 %xmm0, %xmm7
1120	subps	 %xmm7, %xmm3
1121
1122	movaps	 4 * SIZE(B), %xmm6
1123	pshufd	 $0x55, %xmm6, %xmm7
1124	mulps	 %xmm7, %xmm1
1125	pshufd	 $0xaa, %xmm6, %xmm7
1126	mulps	 %xmm1, %xmm7
1127	subps	 %xmm7, %xmm2
1128	pshufd	 $0xff, %xmm6, %xmm7
1129	mulps	 %xmm1, %xmm7
1130	subps	 %xmm7, %xmm3
1131
1132	movaps	 8 * SIZE(B), %xmm6
1133	pshufd	 $0xaa, %xmm6, %xmm7
1134	mulps	 %xmm7, %xmm2
1135	pshufd	 $0xff, %xmm6, %xmm7
1136	mulps	 %xmm2, %xmm7
1137	subps	 %xmm7, %xmm3
1138
1139	movaps	 12 * SIZE(B), %xmm6
1140	pshufd	 $0xff, %xmm6, %xmm7
1141	mulps	 %xmm7, %xmm3
1142#endif
1143
1144#ifdef RT
1145	movaps	 12 * SIZE(B), %xmm6
1146	pshufd	 $0xff, %xmm6, %xmm7
1147	mulps	 %xmm7, %xmm3
1148	pshufd	 $0xaa, %xmm6, %xmm7
1149	mulps	 %xmm3, %xmm7
1150	subps	 %xmm7, %xmm2
1151	pshufd	 $0x55, %xmm6, %xmm7
1152	mulps	 %xmm3, %xmm7
1153	subps	 %xmm7, %xmm1
1154	pshufd	 $0x00, %xmm6, %xmm7
1155	mulps	 %xmm3, %xmm7
1156	subps	 %xmm7, %xmm0
1157
1158	movaps	  8 * SIZE(B), %xmm6
1159	pshufd	 $0xaa, %xmm6, %xmm7
1160	mulps	 %xmm7, %xmm2
1161	pshufd	 $0x55, %xmm6, %xmm7
1162	mulps	 %xmm2, %xmm7
1163	subps	 %xmm7, %xmm1
1164	pshufd	 $0x00, %xmm6, %xmm7
1165	mulps	 %xmm2, %xmm7
1166	subps	 %xmm7, %xmm0
1167
1168	movaps	  4 * SIZE(B), %xmm6
1169	pshufd	 $0x55, %xmm6, %xmm7
1170	mulps	 %xmm7, %xmm1
1171	pshufd	 $0x00, %xmm6, %xmm7
1172	mulps	 %xmm1, %xmm7
1173	subps	 %xmm7, %xmm0
1174
1175	movaps	  0 * SIZE(B), %xmm6
1176	pshufd	 $0x00, %xmm6, %xmm7
1177	mulps	 %xmm7, %xmm0
1178#endif
1179
1180#if defined(LN) || defined(LT)
1181	movaps	%xmm1,   0 * SIZE(B)
1182	movaps	%xmm3,   4 * SIZE(B)
1183
1184	pshufd	$0x00, %xmm1, %xmm0
1185	pshufd	$0x55, %xmm1, %xmm2
1186	pshufd	$0xaa, %xmm1, %xmm4
1187	pshufd	$0xff, %xmm1, %xmm6
1188	movaps	%xmm0,   0 * SIZE(BB)
1189	movaps	%xmm2,   4 * SIZE(BB)
1190	movaps	%xmm4,   8 * SIZE(BB)
1191	movaps	%xmm6,  12 * SIZE(BB)
1192
1193	pshufd	$0x00, %xmm3, %xmm0
1194	pshufd	$0x55, %xmm3, %xmm2
1195	pshufd	$0xaa, %xmm3, %xmm4
1196	pshufd	$0xff, %xmm3, %xmm6
1197	movaps	%xmm0,  16 * SIZE(BB)
1198	movaps	%xmm2,  20 * SIZE(BB)
1199	movaps	%xmm4,  24 * SIZE(BB)
1200	movaps	%xmm6,  28 * SIZE(BB)
1201#else
1202	movlps	%xmm0,   0 * SIZE(AA)
1203	movlps	%xmm1,   2 * SIZE(AA)
1204	movlps	%xmm2,   4 * SIZE(AA)
1205	movlps	%xmm3,   6 * SIZE(AA)
1206#endif
1207
1208#ifdef LN
1209	subl	$2 * SIZE, CO1
1210#endif
1211
1212	leal	(LDC, LDC, 2), %eax
1213
1214#if defined(LN) || defined(LT)
1215	movaps	 %xmm1, %xmm0
1216	unpcklps %xmm5, %xmm1
1217	unpckhps %xmm5, %xmm0
1218
1219	movaps	 %xmm3, %xmm4
1220	unpcklps %xmm7, %xmm3
1221	unpckhps %xmm7, %xmm4
1222
1223	movaps	 %xmm1, %xmm2
1224	unpcklps %xmm3, %xmm1
1225	unpckhps %xmm3, %xmm2
1226
1227	movaps	 %xmm0, %xmm6
1228	unpcklps %xmm4, %xmm0
1229	unpckhps %xmm4, %xmm6
1230
1231	movlps	%xmm1, 0 * SIZE(CO1)
1232	movlps	%xmm2, 0 * SIZE(CO1, LDC, 1)
1233	movlps	%xmm0, 0 * SIZE(CO1, LDC, 2)
1234	movlps	%xmm6, 0 * SIZE(CO1, %eax, 1)
1235#else
1236	movlps	%xmm0, 0 * SIZE(CO1)
1237	movlps	%xmm1, 0 * SIZE(CO1, LDC, 1)
1238	movlps	%xmm2, 0 * SIZE(CO1, LDC, 2)
1239	movlps	%xmm3, 0 * SIZE(CO1, %eax, 1)
1240#endif
1241
1242#ifndef LN
1243	addl	$2 * SIZE, CO1
1244#endif
1245
1246#if defined(LT) || defined(RN)
1247	movl	K,  %eax
1248	subl	KK, %eax
1249	leal	(,%eax, SIZE), %eax
1250	leal	(AA, %eax, 2), AA
1251#ifdef LT
1252	addl	$8 * SIZE, B
1253#endif
1254#endif
1255
1256#ifdef LN
1257	subl	$2, KK
1258	movl	BORIG, B
1259#endif
1260
1261#ifdef LT
1262	addl	$2, KK
1263#endif
1264
1265#ifdef RT
1266	movl	K, %eax
1267	movl	BORIG, B
1268	sall	$1 + BASE_SHIFT, %eax
1269	addl	%eax, AORIG
1270#endif
1271	ALIGN_4
1272
.L30:
# ---- M & 1 cell of the N=4 column block: one row of C by four columns.
# xmm4..xmm7 accumulate the four column dot products as scalars.
	testl	$1, M
	je	.L39

#ifdef LN
# step AORIG back by one row of A (K * 1 element)
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
# BB -> packed B at depth KK; BUFFER carries 16 floats per k step here
# (4 columns, each scalar splatted 4-wide by the copy loop earlier)
	movl	KK, %eax
	sall	$2 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	movss	 0 * SIZE(AA), %xmm0
	xorps	%xmm4, %xmm4
	movss	 4 * SIZE(AA), %xmm1
	xorps	%xmm5, %xmm5
	movss	 0 * SIZE(BB), %xmm2
	xorps	%xmm6, %xmm6
	movss	16 * SIZE(BB), %xmm3
	xorps	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
# main loop runs k/8 iterations; remainder handled at .L36
	sarl	$3, %eax
	je	.L35
	ALIGN_4

.L32:
# 8x-unrolled scalar multiply-accumulate; each k step consumes 1 float of A
# and 16 floats of BB (4 per column), alternating xmm2/xmm3 as B carriers.
	mulss	%xmm0, %xmm2
	addss	%xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	movss	 4 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
	addss	%xmm2, %xmm5
	movss	 8 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm6
	movss	32 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm7
	movss	 1 * SIZE(AA), %xmm0

	mulss	%xmm0, %xmm3
	addss	%xmm3, %xmm4
	movss	20 * SIZE(BB), %xmm3
	mulss	%xmm0, %xmm3
	addss	%xmm3, %xmm5
	movss	24 * SIZE(BB), %xmm3
	mulss	%xmm0, %xmm3
	mulss	28 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm6
	movss	48 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm7
	movss	 2 * SIZE(AA), %xmm0

	mulss	%xmm0, %xmm2
	addss	%xmm2, %xmm4
	movss	36 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
	addss	%xmm2, %xmm5
	movss	40 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
	mulss	44 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm6
	movss	64 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm7
	movss	 3 * SIZE(AA), %xmm0

	mulss	%xmm0, %xmm3
	addss	%xmm3, %xmm4
	movss	52 * SIZE(BB), %xmm3
	mulss	%xmm0, %xmm3
	addss	%xmm3, %xmm5
	movss	56 * SIZE(BB), %xmm3
	mulss	%xmm0, %xmm3
	mulss	60 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm6
	movss	80 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0

# second half of the unroll works from xmm1 (A elements 4..7)
	mulss	%xmm1, %xmm2
	addss	%xmm2, %xmm4
	movss	68 * SIZE(BB), %xmm2
	mulss	%xmm1, %xmm2
	addss	%xmm2, %xmm5
	movss	72 * SIZE(BB), %xmm2
	mulss	%xmm1, %xmm2
	mulss	76 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm6
	movss	96 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm7
	movss	 5 * SIZE(AA), %xmm1

	mulss	%xmm1, %xmm3
	addss	%xmm3, %xmm4
	movss	84 * SIZE(BB), %xmm3
	mulss	%xmm1, %xmm3
	addss	%xmm3, %xmm5
	movss	88 * SIZE(BB), %xmm3
	mulss	%xmm1, %xmm3
	mulss	92 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm6
	movss	112 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm7
	movss	 6 * SIZE(AA), %xmm1

	mulss	%xmm1, %xmm2
	addss	%xmm2, %xmm4
	movss	100 * SIZE(BB), %xmm2
	mulss	%xmm1, %xmm2
	addss	%xmm2, %xmm5
	movss	104 * SIZE(BB), %xmm2
	mulss	%xmm1, %xmm2
	mulss	108 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm6
	movss	128 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm7
	movss	 7 * SIZE(AA), %xmm1

	mulss	%xmm1, %xmm3
	addss	%xmm3, %xmm4
	movss	116 * SIZE(BB), %xmm3
	mulss	%xmm1, %xmm3
	addss	%xmm3, %xmm5
	movss	120 * SIZE(BB), %xmm3
	mulss	%xmm1, %xmm3
	mulss	124 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm6
	movss	144 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	addl	$  8 * SIZE, AA
	addl	$128 * SIZE, BB
	decl   %eax
	jne    .L32
	ALIGN_4
1429
.L35:
# ---- remainder loop for k % 8, then the triangular solve and write-back
# ---- for the M&1 / N=4 cell.
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L38
	ALIGN_4

.L36:
# one k step per iteration: 1 float of A against 4 splatted columns of BB
	mulss	%xmm0, %xmm2
	addss	%xmm2, %xmm4
	movss	 4 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
	addss	%xmm2, %xmm5
	movss	 8 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm6
	movss	16 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm7
	movss	 1 * SIZE(AA), %xmm0

	addl	$ 1 * SIZE, AA
	addl	$16 * SIZE, BB
	decl	%eax
	jg	.L36
	ALIGN_4

.L38:
#if defined(LN) || defined(RT)
# rewind AA/B/BB to the diagonal block this cell must solve against
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$4, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	leal	(AA, %eax, SIZE), AA

	sall	$2 + BASE_SHIFT, %eax
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
# gather the four accumulators into one vector and form b - sum
	unpcklps %xmm6, %xmm4
	unpcklps %xmm7, %xmm5
	unpcklps %xmm5, %xmm4

	movaps	 0 * SIZE(B), %xmm1

	subps	%xmm4,  %xmm1
#else
	movss	 0 * SIZE(AA), %xmm0
	movss	 1 * SIZE(AA), %xmm1
	movss	 2 * SIZE(AA), %xmm2
	movss	 3 * SIZE(AA), %xmm3

	subss	%xmm4, %xmm0
	subss	%xmm5, %xmm1
	subss	%xmm6, %xmm2
	subss	%xmm7, %xmm3
#endif

#if defined(LN) || defined(LT)
# 1x1 triangle: just scale by the diagonal entry of A
# (presumably stored pre-inverted, since only mulps is used -- no divide)
	movss	  0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1
#endif

#ifdef RN
# forward substitution against the upper-triangular 4x4 block of B
	movaps	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulss	 %xmm7, %xmm0
	pshufd	 $0x55, %xmm6, %xmm7
	mulss	 %xmm0, %xmm7
	subss	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulss	 %xmm0, %xmm7
	subss	 %xmm7, %xmm2
	pshufd	 $0xff, %xmm6, %xmm7
	mulss	 %xmm0, %xmm7
	subss	 %xmm7, %xmm3

	movaps	 4 * SIZE(B), %xmm6
	pshufd	 $0x55, %xmm6, %xmm7
	mulss	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulss	 %xmm1, %xmm7
	subss	 %xmm7, %xmm2
	pshufd	 $0xff, %xmm6, %xmm7
	mulss	 %xmm1, %xmm7
	subss	 %xmm7, %xmm3

	movaps	 8 * SIZE(B), %xmm6
	pshufd	 $0xaa, %xmm6, %xmm7
	mulss	 %xmm7, %xmm2
	pshufd	 $0xff, %xmm6, %xmm7
	mulss	 %xmm2, %xmm7
	subss	 %xmm7, %xmm3

	movaps	 12 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulss	 %xmm7, %xmm3
#endif

#ifdef RT
# backward substitution: start from the last column and work up
	movaps	 12 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulss	 %xmm7, %xmm3
	pshufd	 $0xaa, %xmm6, %xmm7
	mulss	 %xmm3, %xmm7
	subss	 %xmm7, %xmm2
	pshufd	 $0x55, %xmm6, %xmm7
	mulss	 %xmm3, %xmm7
	subss	 %xmm7, %xmm1
	pshufd	 $0x00, %xmm6, %xmm7
	mulss	 %xmm3, %xmm7
	subss	 %xmm7, %xmm0

	movaps	  8 * SIZE(B), %xmm6
	pshufd	 $0xaa, %xmm6, %xmm7
	mulss	 %xmm7, %xmm2
	pshufd	 $0x55, %xmm6, %xmm7
	mulss	 %xmm2, %xmm7
	subss	 %xmm7, %xmm1
	pshufd	 $0x00, %xmm6, %xmm7
	mulss	 %xmm2, %xmm7
	subss	 %xmm7, %xmm0

	movaps	  4 * SIZE(B), %xmm6
	pshufd	 $0x55, %xmm6, %xmm7
	mulss	 %xmm7, %xmm1
	pshufd	 $0x00, %xmm6, %xmm7
	mulss	 %xmm1, %xmm7
	subss	 %xmm7, %xmm0

	movaps	  0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulss	 %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
# write the solved row back to B and refresh the splatted copy in BUFFER
	movaps	%xmm1,   0 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	pshufd	$0x55, %xmm1, %xmm2
	pshufd	$0xaa, %xmm1, %xmm4
	pshufd	$0xff, %xmm1, %xmm6
	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm2,   4 * SIZE(BB)
	movaps	%xmm4,   8 * SIZE(BB)
	movaps	%xmm6,  12 * SIZE(BB)
#else
# RN/RT: solved values go back into the A panel
	movss	%xmm0,   0 * SIZE(AA)
	movss	%xmm1,   1 * SIZE(AA)
	movss	%xmm2,   2 * SIZE(AA)
	movss	%xmm3,   3 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

# %eax = 3 * LDC for addressing the fourth C column below
	leal	(LDC, LDC, 2), %eax

#if defined(LN) || defined(LT)
	movaps	 %xmm1, %xmm0
	unpcklps %xmm5, %xmm1
	unpckhps %xmm5, %xmm0

	movaps	 %xmm3, %xmm4
	unpcklps %xmm7, %xmm3
	unpckhps %xmm7, %xmm4

	movaps	 %xmm1, %xmm2
	unpcklps %xmm3, %xmm1
	unpckhps %xmm3, %xmm2

	movaps	 %xmm0, %xmm6
	unpcklps %xmm4, %xmm0
	unpckhps %xmm4, %xmm6

# one scalar store per C column
	movss	%xmm1, 0 * SIZE(CO1)
	movss	%xmm2, 0 * SIZE(CO1, LDC, 1)
	movss	%xmm0, 0 * SIZE(CO1, LDC, 2)
	movss	%xmm6, 0 * SIZE(CO1, %eax, 1)
#else
	movss	%xmm0, 0 * SIZE(CO1)
	movss	%xmm1, 0 * SIZE(CO1, LDC, 1)
	movss	%xmm2, 0 * SIZE(CO1, LDC, 2)
	movss	%xmm3, 0 * SIZE(CO1, %eax, 1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
# AA += (K - KK) elements; LT also retires 8 floats of B
	movl	K,  %eax
	subl	KK, %eax
	leal	(AA, %eax, SIZE), AA
#ifdef LT
	addl	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
1661
.L39:
# ---- end of one N=4 column block: advance B past this panel, bump the
# ---- offset counter KK for the R-side cases, and loop over j.
#ifdef LN
       movl	K, %eax
       leal	(, %eax, SIZE), %eax
       leal 	(B, %eax, 4), B
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(B,  %eax, 4), B
#endif

#ifdef RN
	addl	$4, KK
#endif

#ifdef RT
	subl	$4, KK
#endif

	decl	J			# j --
	jg	.L01
	ALIGN_4
1687
.L40:
# ---- N & 2 column block: same structure as the N=4 loop but two columns.
	testl	$2, N
	je	.L80

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, %ecx

#ifdef RT
# step B back one 2-column panel (K * 2 elements)
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, B
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	B, BORIG
        sall	$1 + BASE_SHIFT, %eax
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax
	jle	.L45
	ALIGN_4

.L42:
# expand B into BUFFER: each scalar is splatted to a full 4-wide vector,
# 4 k-steps (8 floats of B -> 32 floats of BUFFER) per iteration
	movaps	 0 * SIZE(B), %xmm3
	movaps	 4 * SIZE(B), %xmm7

	pshufd	 $0x00, %xmm3, %xmm0
	pshufd	 $0x55, %xmm3, %xmm1
	pshufd	 $0xaa, %xmm3, %xmm2
	pshufd	 $0xff, %xmm3, %xmm3

	pshufd	 $0x00, %xmm7, %xmm4
	pshufd	 $0x55, %xmm7, %xmm5
	pshufd	 $0xaa, %xmm7, %xmm6
	pshufd	 $0xff, %xmm7, %xmm7

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)
	movaps	%xmm4, 16 * SIZE(BB)
	movaps	%xmm5, 20 * SIZE(BB)
	movaps	%xmm6, 24 * SIZE(BB)
	movaps	%xmm7, 28 * SIZE(BB)

	addl	$ 8 * SIZE, B
	addl	$32 * SIZE, %ecx
	decl	%eax
	jne	.L42
	ALIGN_4

.L45:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$3, %eax
	BRANCH
	jle	.L50
	ALIGN_4

.L46:
# remainder: one k step (2 floats of B -> 8 floats of BUFFER)
#ifdef	movsd
	xorps	%xmm3, %xmm3
#endif
	movsd	 0 * SIZE(B), %xmm3

	pshufd	 $0x00, %xmm3, %xmm0
	pshufd	 $0x55, %xmm3, %xmm1

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$8 * SIZE, %ecx
	decl	%eax
	jne	.L46
	ALIGN_4

.L50:
# set up A and C pointers for this two-column sweep over M
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

	leal	(, LDC, 2), %eax

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	addl	%eax, C
#endif

	movl	M,  %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L60
	ALIGN_4
1810
.L51:
# ---- M>>2 cell of the N=2 block: 4 rows x 2 columns.
# xmm4/xmm6 accumulate column 0, xmm5/xmm7 column 1 (pairs merged at .L68-
# style reductions elsewhere; here only xmm4/xmm5 are used).
#ifdef LN
       movl	K, %eax
       sall	$2 + BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
# BB -> packed B at depth KK (8 floats per k step: 2 columns x 4-wide splat)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movaps	 0 * SIZE(AA), %xmm0
	movaps	16 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3

# prime the two destination C columns into cache before the long loop
	PREFETCHW	3 * SIZE(CO1)
	PREFETCHW	3 * SIZE(CO1, LDC)

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L55
	ALIGN_4

.L52:
# 8x-unrolled: each k step is one 4-wide A vector against the two splatted
# B columns; xmm4 += a*b0, xmm5 += a*b1
	mulps	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0

	mulps	%xmm0, %xmm2
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 8 * SIZE(AA), %xmm0

	mulps	%xmm0, %xmm3
	mulps	20 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movaps	12 * SIZE(AA), %xmm0

	mulps	%xmm0, %xmm3
	mulps	28 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	48 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movaps	32 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
#endif
	mulps	%xmm1, %xmm2
	mulps	36 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	40 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	20 * SIZE(AA), %xmm1

	mulps	%xmm1, %xmm2
	mulps	44 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	24 * SIZE(AA), %xmm1

	mulps	%xmm1, %xmm3
	mulps	52 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1

	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	80 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm5
	movaps	48 * SIZE(AA), %xmm1

	addl	$32 * SIZE, AA
	addl	$64 * SIZE, BB
	decl   %eax
	jne    .L52
	ALIGN_4
1924
.L55:
# ---- remainder loop, triangular solve and write-back for M>>2 / N=2.
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L58
	ALIGN_4

.L56:
# one k step per iteration
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L56
	ALIGN_4

.L58:
#if defined(LN) || defined(RT)
# rewind AA/B/BB to the diagonal block for the back substitution
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$1 + BASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
# transpose the 4x2 accumulator tile into row vectors, then b - sum
	movaps	 %xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0

	movaps	 %xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1

	movaps	 %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

	movaps	 %xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2

#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 0 * SIZE(B), %xmm1
#ifdef	movsd
	xorps	%xmm3, %xmm3
#endif
	movsd	 2 * SIZE(B), %xmm3
#ifdef	movsd
	xorps	%xmm5, %xmm5
#endif
	movsd	 4 * SIZE(B), %xmm5
#ifdef	movsd
	xorps	%xmm7, %xmm7
#endif
	movsd	 6 * SIZE(B), %xmm7

	subps	%xmm4,  %xmm1
	subps	%xmm6,  %xmm3
	subps	%xmm0,  %xmm5
	subps	%xmm2,  %xmm7
#else
	movaps	 0 * SIZE(AA), %xmm0
	movaps	 4 * SIZE(AA), %xmm1

	subps	%xmm4, %xmm0
	subps	%xmm5, %xmm1
#endif

#ifdef LN
# backward substitution against the 4x4 triangle of A (last row first);
# diagonal presumably pre-inverted: only mulps, never divps
	movaps	 12 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm7
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm7, %xmm6
	subps	 %xmm6, %xmm5
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm7, %xmm6
	subps	 %xmm6, %xmm3
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm7, %xmm6
	subps	 %xmm6, %xmm1

	movaps	  8 * SIZE(AA), %xmm4
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm6, %xmm5
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm5, %xmm6
	subps	 %xmm6, %xmm3
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm5, %xmm6
	subps	 %xmm6, %xmm1

	movaps	  4 * SIZE(AA), %xmm4
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm1

	movaps	  0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1
#endif

#ifdef LT
# forward substitution (first row first)
	movaps	 0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1

	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm3
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm7

	movaps	 4 * SIZE(AA), %xmm4
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm7

	movaps	 8 * SIZE(AA), %xmm4
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm5, %xmm6
	subps	 %xmm6, %xmm7

	movaps	12 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm7
#endif

#ifdef RN
# solve against the 2x2 triangle of B, forward
	movaps	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
	pshufd	 $0x55, %xmm6, %xmm7
	mulps	 %xmm0, %xmm7
	subps	 %xmm7, %xmm1

	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
#endif

#ifdef RT
# solve against the 2x2 triangle of B, backward
	movaps	  0 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulps	 %xmm1, %xmm7
	subps	 %xmm7, %xmm0

	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
# store the solved rows back to B and refresh the splatted BUFFER copy
	movlps	%xmm1,   0 * SIZE(B)
	movlps	%xmm3,   2 * SIZE(B)
	movlps	%xmm5,   4 * SIZE(B)
	movlps	%xmm7,   6 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	pshufd	$0x55, %xmm1, %xmm2
	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm2,   4 * SIZE(BB)

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm2
	movaps	%xmm0,   8 * SIZE(BB)
	movaps	%xmm2,  12 * SIZE(BB)

	pshufd	$0x00, %xmm5, %xmm0
	pshufd	$0x55, %xmm5, %xmm2
	movaps	%xmm0,  16 * SIZE(BB)
	movaps	%xmm2,  20 * SIZE(BB)

	pshufd	$0x00, %xmm7, %xmm0
	pshufd	$0x55, %xmm7, %xmm2
	movaps	%xmm0,  24 * SIZE(BB)
	movaps	%xmm2,  28 * SIZE(BB)
#else
	movaps	%xmm0,   0 * SIZE(AA)
	movaps	%xmm1,   4 * SIZE(AA)
#endif

#ifdef LN
	subl	$4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
# interleave the solved rows into two 4-wide column vectors and store
	unpcklps %xmm5, %xmm1
	unpcklps %xmm7, %xmm3

	movaps	 %xmm1, %xmm2
	unpcklps %xmm3, %xmm1
	unpckhps %xmm3, %xmm2

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC, 1)
	movhps	%xmm2, 2 * SIZE(CO1, LDC, 1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movlps	%xmm1, 0 * SIZE(CO1, LDC, 1)
	movhps	%xmm1, 2 * SIZE(CO1, LDC, 1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
#ifdef LT
	addl	$8 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$4, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$4, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$2 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L51
	ALIGN_4
2199
.L60:
# ---- M & 2 cell of the N=2 block: 2 rows x 2 columns, movsd loads A pairs.
	testl	$2, M
	je	.L70

#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

# the #ifdef movsd guards clear the register first when movsd is remapped
# (a macro in this codebase), avoiding a false upper-half dependency
#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0
#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L65
	ALIGN_4

.L62:
# 8x-unrolled; xmm4/xmm6 accumulate column 0, xmm5/xmm7 column 1
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2

	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2

	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3

	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3

	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	36 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	40 * SIZE(BB), %xmm2

	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm6
	movaps	44 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	64 * SIZE(BB), %xmm2

	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	52 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	56 * SIZE(BB), %xmm3

	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm6
	movaps	60 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	80 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA
	addl	$64 * SIZE, BB
	decl   %eax
	jne    .L62
	ALIGN_4
2325
.L65:
# ---- remainder loop, solve and write-back for M&2 / N=2.
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L68
	ALIGN_4

.L66:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L66
	ALIGN_4

.L68:
# fold the paired accumulators from the unrolled loop
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

#if defined(LN) || defined(RT)
# rewind pointers to the diagonal block (M-step and N-step are both 2 here,
# hence the identical subtractions in the two #ifdef arms)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB
#endif

#if defined(LN) || defined(LT)
# transpose the 2x2 accumulator tile, then b - sum
	unpcklps %xmm6, %xmm4
	unpcklps %xmm7, %xmm5

	movaps	 %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 0 * SIZE(B), %xmm1
#ifdef	movsd
	xorps	%xmm3, %xmm3
#endif
	movsd	 2 * SIZE(B), %xmm3

	subps	%xmm4,  %xmm1
	subps	%xmm6,  %xmm3
#else
#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0
#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 2 * SIZE(AA), %xmm1

	subps	%xmm4, %xmm0
	subps	%xmm5, %xmm1
#endif

#ifdef LN
# 2x2 backward substitution against A
	movaps	  0 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm1

	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1
#endif

#ifdef LT
# 2x2 forward substitution against A
	movaps	 0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm3

	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3
#endif

#ifdef RN
	movaps	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
	pshufd	 $0x55, %xmm6, %xmm7
	mulps	 %xmm0, %xmm7
	subps	 %xmm7, %xmm1

	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
#endif

#ifdef RT
	movaps	  0 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulps	 %xmm1, %xmm7
	subps	 %xmm7, %xmm0

	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
# write solved rows back to B and refresh the splatted BUFFER copy
	movlps	%xmm1,   0 * SIZE(B)
	movlps	%xmm3,   2 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	pshufd	$0x55, %xmm1, %xmm2
	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm2,   4 * SIZE(BB)

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm2
	movaps	%xmm0,   8 * SIZE(BB)
	movaps	%xmm2,  12 * SIZE(BB)
#else
	movlps	%xmm0,   0 * SIZE(AA)
	movlps	%xmm1,   2 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	unpcklps %xmm3, %xmm1

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 0 * SIZE(CO1, LDC)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movlps	%xmm1, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#ifdef LT
	addl	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$2, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
2518
.L70:
# ---- M & 1 cell of the N=2 block: one row x 2 columns, scalar arithmetic.
	testl	$1, M
	je	.L79

#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movss	 0 * SIZE(AA), %xmm0
	movss	 4 * SIZE(AA), %xmm1
	movss	 0 * SIZE(BB), %xmm2
	movss	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L75
	ALIGN_4

.L72:
# 8x-unrolled scalar loop; xmm4/xmm6 -> column 0, xmm5/xmm7 -> column 1
	mulss	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 1 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm2
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm6
	movss	32 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm7
	movss	 2 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	20 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm4
	movss	24 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm5
	movss	 3 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	28 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm6
	movss	48 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm2
	mulss	36 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm4
	movss	40 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm5
	movss	 5 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm2
	mulss	44 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm6
	movss	64 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm7
	movss	 6 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	52 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm4
	movss	56 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm5
	movss	 7 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	60 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm6
	movss	80 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	addl	$ 8 * SIZE, AA
	addl	$64 * SIZE, BB
	decl   %eax
	jne    .L72
	ALIGN_4
2621
.L75:
# ---- remainder loop, solve and write-back for M&1 / N=2.
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L78
	ALIGN_4

.L76:
	mulss	%xmm0, %xmm2
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 1 * SIZE(AA), %xmm0

	addl	$ 1 * SIZE, AA
	addl	$ 8 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4

.L78:
# fold the paired accumulators
	addss	%xmm6, %xmm4
	addss	%xmm7, %xmm5

#if defined(LN) || defined(RT)
# rewind pointers to the diagonal block (M-step 1, N-step 2)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB
#endif

#if defined(LN) || defined(LT)
# pack the two scalars into one register, then b - sum
	unpcklps %xmm5, %xmm4

#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 0 * SIZE(B), %xmm1

	subps	%xmm4,  %xmm1
#else
	movss	 0 * SIZE(AA), %xmm0
	movss	 1 * SIZE(AA), %xmm1

	subss	%xmm4, %xmm0
	subss	%xmm5, %xmm1
#endif

#if defined(LN) || defined(LT)
# 1x1 triangle: scale by the diagonal entry of A
	movss	  0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1
#endif

#ifdef RN
# 2x2 forward substitution against B
	movaps	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulss	 %xmm7, %xmm0
	pshufd	 $0x55, %xmm6, %xmm7
	mulss	 %xmm0, %xmm7
	subss	 %xmm7, %xmm1

	pshufd	 $0xff, %xmm6, %xmm7
	mulss	 %xmm7, %xmm1
#endif

#ifdef RT
# 2x2 backward substitution against B
	movaps	  0 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulss	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulss	 %xmm1, %xmm7
	subss	 %xmm7, %xmm0

	pshufd	 $0x00, %xmm6, %xmm7
	mulss	 %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
# write the solved pair back to B and refresh the splatted BUFFER copy
	movlps	%xmm1,   0 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	pshufd	$0x55, %xmm1, %xmm2
	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm2,   4 * SIZE(BB)
#else
	movss	%xmm0,   0 * SIZE(AA)
	movss	%xmm1,   1 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
# element 1 of xmm1 goes to the second C column
	pshufd	$1, %xmm1, %xmm3

	movss	%xmm1, 0 * SIZE(CO1)
	movss	%xmm3, 0 * SIZE(CO1, LDC)
#else
	movss	%xmm0, 0 * SIZE(CO1)
	movss	%xmm1, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(AA, %eax, SIZE), AA
#ifdef LT
	addl	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
2772
.L79:
# ---- end of the N & 2 column block: advance B past the 2-wide panel and
# ---- bump the offset counter KK for the R-side cases.
#ifdef LN
       movl	K, %eax
       leal	(, %eax, SIZE), %eax
       leal 	(B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(B,  %eax, 2), B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif
	ALIGN_4
2795
.L80:
/* Handle the final single column (N & 1).                          */
	testl	$1, N
	je	.L999

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		/* KK = M + OFFSET (bottom-up solve) */
#endif

	leal	BUFFER, %ecx		/* %ecx presumably aliases BB -- the
					   copy loops below address via BB */

#ifdef RT
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, B			/* step B back one K-tall column */
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	B, BORIG		/* remember the panel start */
        sall	$BASE_SHIFT, %eax
	leal	(B,  %eax, 1), B	/* B  += KK elements            */
	leal	(BB, %eax, 4), BB	/* BB += KK broadcast groups (x4) */
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* copy loop below is unrolled by 8 */
	jle	.L85
	ALIGN_4
2836
.L82:
/* Copy 8 elements of B into BUFFER, each broadcast to all 4 SSE
   lanes, so the compute loops can use aligned 16-byte movaps loads. */
	movsd	 0 * SIZE(B), %xmm3
	movhps	 2 * SIZE(B), %xmm3
	movsd	 4 * SIZE(B), %xmm7
	movhps	 6 * SIZE(B), %xmm7

	pshufd	 $0x00, %xmm3, %xmm0	/* splat b[0] */
	pshufd	 $0x55, %xmm3, %xmm1	/* splat b[1] */
	pshufd	 $0xaa, %xmm3, %xmm2	/* splat b[2] */
	pshufd	 $0xff, %xmm3, %xmm3	/* splat b[3] */

	pshufd	 $0x00, %xmm7, %xmm4	/* splat b[4] */
	pshufd	 $0x55, %xmm7, %xmm5	/* splat b[5] */
	pshufd	 $0xaa, %xmm7, %xmm6	/* splat b[6] */
	pshufd	 $0xff, %xmm7, %xmm7	/* splat b[7] */

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)
	movaps	%xmm4, 16 * SIZE(BB)
	movaps	%xmm5, 20 * SIZE(BB)
	movaps	%xmm6, 24 * SIZE(BB)
	movaps	%xmm7, 28 * SIZE(BB)

	addl	$ 8 * SIZE, B
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L82
	ALIGN_4

.L85:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		/* remaining (k & 7) elements */
	BRANCH
	jle	.L90
	ALIGN_4

.L86:
/* Remainder: broadcast one element of B at a time.                 */
	movss	 0 * SIZE(B), %xmm3

	pshufd	 $0x00, %xmm3, %xmm0

	movaps	%xmm0,  0 * SIZE(BB)

	addl	$1 * SIZE, B
	addl	$4 * SIZE, BB
	decl	%eax
	jne	.L86
	ALIGN_4
2892
.L90:
/* Set up A and C pointers for this single output column, then loop
   over M in tiles of 4 rows.                                       */
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	subl	LDC, C			/* RT walks C right-to-left */
#endif
	movl	C, CO1			/* CO1 = current output column */
#ifndef RT
	addl	LDC, C
#endif

	movl	M,  %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L100
	ALIGN_4
2913
.L91:
/* 4x1 micro-tile: position AA/BB at the KK offset, clear the four
   accumulators and preload the first A/B vectors.                  */
#ifdef LN
       movl	K, %eax
       sall	$2 + BASE_SHIFT, %eax
       subl	%eax, AORIG		/* step back one 4-row panel */
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA	/* AA += KK * 4 elements */
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	/* BB += KK broadcast groups (x4) */
#endif

	xorps	%xmm4, %xmm4		/* four partial accumulators */
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movaps	 0 * SIZE(AA), %xmm0	/* preload first A vectors */
	movaps	16 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2	/* preload first broadcast B */
	movaps	16 * SIZE(BB), %xmm3

	PREFETCHW	3 * SIZE(CO1)

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* inner loop unrolled by 8 */
	je	.L95
	ALIGN_4
2957
.L92:
/* Main multiply loop for the 4x1 tile: 8 k-steps per iteration of
   packed 4-wide rank-1 updates, software-pipelined loads, results
   spread over %xmm4-%xmm7 (summed together at .L98).               */
	mulps	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	movaps	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2	/* prefetch B for the next iter */
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm5
	movaps	 8 * SIZE(AA), %xmm0
	mulps	 8 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm6
	movaps	12 * SIZE(AA), %xmm0
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0	/* prefetch A for the next iter */
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
#endif
	mulps	%xmm1, %xmm3
	movaps	20 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movaps	48 * SIZE(BB), %xmm3
	mulps	20 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm5
	movaps	24 * SIZE(AA), %xmm1
	mulps	24 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm6
	movaps	28 * SIZE(AA), %xmm1
	mulps	28 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm7
	movaps	48 * SIZE(AA), %xmm1

	addl	$32 * SIZE, AA
	addl	$32 * SIZE, BB
	decl   %eax
	jne    .L92
	ALIGN_4

.L95:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L98
	ALIGN_4

.L96:
/* k remainder: one packed rank-1 step per iteration.               */
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(AA), %xmm0
	movaps	 4 * SIZE(BB), %xmm2

	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L96
	ALIGN_4
3021
.L98:
/* Collapse the four partial accumulators: %xmm4 = four packed sums. */
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	/* Rewind AA/B/BB to the diagonal block of this tile. */
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 4), AA	/* 4 elements of A per k */
	leal	(B,  %eax, 1), B	/* 1 element of B per k  */
	leal	(BB, %eax, 4), BB	/* 4-wide broadcast per k */
#endif

#if defined(LN) || defined(LT)
	/* 4x4-transpose pattern reused to splay the four packed sums
	   into the LOW lanes of %xmm4/%xmm6/%xmm0/%xmm2; the upper
	   lanes pick up stale partials and are ignored by the scalar
	   subss below.                                               */
	movaps	 %xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0

	movaps	 %xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1

	movaps	 %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

	movaps	 %xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2

	/* rhs: x[i] = b[i] - sum[i], one scalar per row */
	movss	 0 * SIZE(B), %xmm1
	movss	 1 * SIZE(B), %xmm3
	movss	 2 * SIZE(B), %xmm5
	movss	 3 * SIZE(B), %xmm7

	subss	%xmm4,  %xmm1
	subss	%xmm6,  %xmm3
	subss	%xmm0,  %xmm5
	subss	%xmm2,  %xmm7
#else
	/* RN/RT: rhs kept packed, %xmm0 = a[0..3] - sum[0..3] */
	movaps	 0 * SIZE(AA), %xmm0

	subps	%xmm4, %xmm0
#endif
3076
#ifdef LN
/* Backward substitution with the packed 4x4 triangular block, rows
   processed 3..0.  Diagonal entries are multiplied, not divided --
   presumably stored pre-inverted by the packing code (verify).     */
	movaps	 12 * SIZE(AA), %xmm4	/* row 3 */
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm6, %xmm7		/* x3 *= inv(a33) */
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm7, %xmm6
	subss	 %xmm6, %xmm5		/* x2 -= a32 * x3 */
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm7, %xmm6
	subss	 %xmm6, %xmm3		/* x1 -= a31 * x3 */
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm7, %xmm6
	subss	 %xmm6, %xmm1		/* x0 -= a30 * x3 */

	movaps	  8 * SIZE(AA), %xmm4	/* row 2 */
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm6, %xmm5		/* x2 *= inv(a22) */
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm5, %xmm6
	subss	 %xmm6, %xmm3		/* x1 -= a21 * x2 */
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm5, %xmm6
	subss	 %xmm6, %xmm1		/* x0 -= a20 * x2 */

	movaps	  4 * SIZE(AA), %xmm4	/* row 1 */
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm6, %xmm3		/* x1 *= inv(a11) */
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm3, %xmm6
	subss	 %xmm6, %xmm1		/* x0 -= a10 * x1 */

	movaps	  0 * SIZE(AA), %xmm4	/* row 0 */
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm6, %xmm1		/* x0 *= inv(a00) */
#endif

#ifdef LT
/* Forward substitution, rows 0..3; same pre-inverted-diagonal
   convention as the LN path above.                                 */
	movaps	 0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm6, %xmm1		/* x0 *= inv(a00) */

	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm1, %xmm6
	subss	 %xmm6, %xmm3		/* x1 -= a01 * x0 */
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm1, %xmm6
	subss	 %xmm6, %xmm5		/* x2 -= a02 * x0 */
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm1, %xmm6
	subss	 %xmm6, %xmm7		/* x3 -= a03 * x0 */

	movaps	 4 * SIZE(AA), %xmm4
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm6, %xmm3		/* x1 *= inv(a11) */
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm3, %xmm6
	subss	 %xmm6, %xmm5		/* x2 -= a12 * x1 */
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm3, %xmm6
	subss	 %xmm6, %xmm7		/* x3 -= a13 * x1 */

	movaps	 8 * SIZE(AA), %xmm4
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm6, %xmm5		/* x2 *= inv(a22) */
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm5, %xmm6
	subss	 %xmm6, %xmm7		/* x3 -= a23 * x2 */

	movaps	12 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm6, %xmm7		/* x3 *= inv(a33) */
#endif
3149
#if defined(RN) || defined(RT)
	/* N = 1: the triangular factor is the single scalar B[0]
	   (multiplied, so presumably stored pre-inverted); scale all
	   four packed results at once.                                */
	movss	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7	/* broadcast the diagonal factor */
	mulps	 %xmm7, %xmm0		/* x[0..3] *= factor, packed in %xmm0 */
#endif

#if defined(LN) || defined(LT)
	/* Solved scalars live in %xmm1/%xmm3/%xmm5/%xmm7: store them
	   back to packed B and refresh the broadcast buffer BB.       */
	movss	%xmm1,   0 * SIZE(B)
	movss	%xmm3,   1 * SIZE(B)
	movss	%xmm5,   2 * SIZE(B)
	movss	%xmm7,   3 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	movaps	%xmm0,   0 * SIZE(BB)
	pshufd	$0x00, %xmm3, %xmm0
	movaps	%xmm0,   4 * SIZE(BB)

	pshufd	$0x00, %xmm5, %xmm0
	movaps	%xmm0,   8 * SIZE(BB)
	pshufd	$0x00, %xmm7, %xmm0
	movaps	%xmm0,  12 * SIZE(BB)
#else
	/* RN/RT: all four solved values are packed in %xmm0 (the subps
	   and mulps above are full-vector ops); %xmm1-%xmm3 only hold
	   stale prefetched data from the multiply loop at this point.
	   Store the whole vector -- as the 2x1 tile and the C write-back
	   below do.  The previous scalar stores of %xmm1-%xmm3 wrote
	   garbage into the packed A buffer.                            */
	movlps	%xmm0,   0 * SIZE(AA)
	movhps	%xmm0,   2 * SIZE(AA)
#endif
3177
#ifdef LN
	subl	$4 * SIZE, CO1		/* LN writes C bottom-up */
#endif

#if defined(LN) || defined(LT)
	/* Interleave the four scalar results back into one vector:
	   %xmm1 = [x0, x1, x2, x3].                                  */
	unpcklps %xmm5, %xmm1
	unpcklps %xmm7, %xmm3

	unpcklps %xmm3, %xmm1

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)	/* RN/RT results packed in %xmm0 */
	movhps	%xmm0, 2 * SIZE(CO1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 4), AA	/* skip the unused k tail of A */
#ifdef LT
	addl	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$4, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$4, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$2 + BASE_SHIFT, %eax
	addl	%eax, AORIG		/* restore AORIG for the next tile */
#endif

	decl	%ebx			# i --
	jg	.L91
	ALIGN_4
3228
.L100:
/* 2x1 micro-tile (M & 2): position AA/BB at the KK offset, clear the
   accumulators and preload the first A/B values.                    */
	testl	$2, M
	je	.L110

#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG		/* step AORIG back one 2-row panel */
#endif

#if defined(LN) || defined(RT)
	/* AA = AORIG + KK * 2 elements, matching the 4x1 (.L91) and
	   1x1 (.L110) tiles.  A spurious extra `sall $1 + BASE_SHIFT`
	   here scaled the offset 8x too far and has been removed.     */
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	/* BB += KK broadcast groups (x4) */
#endif

	xorps	%xmm4, %xmm4		/* partial accumulators */
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

#ifdef	movsd
	xorps	%xmm0, %xmm0		/* clear first when movsd is remapped
					   (avoids a partial-register merge) */
#endif
	movsd	 0 * SIZE(AA), %xmm0	/* preload 2 elements of A */
#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2	/* preload broadcast B */
	movaps	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* inner loop unrolled by 8 */
	je	.L105
	ALIGN_4
3280
.L102:
/* Main multiply loop for the 2x1 tile: 8 k-steps per iteration.
   A is loaded 2 floats at a time (movsd); B comes from the 4-wide
   broadcast buffer, so only the low 2 accumulator lanes are
   meaningful.                                                       */
	mulps	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	16 * SIZE(AA), %xmm0	/* prefetch A for next iteration */
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2	/* prefetch B for next iteration */
	mulps	%xmm1, %xmm3
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA
	addl	$32 * SIZE, BB
	decl   %eax
	jne    .L102
	ALIGN_4

.L105:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L108
	ALIGN_4

.L106:
/* k remainder for the 2x1 tile.                                     */
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movsd	 2 * SIZE(AA), %xmm0
	movaps	 4 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L106
	ALIGN_4
3347
.L108:
/* Collapse the partial accumulators, then rewind AA/B/BB to this
   tile's diagonal block for the solve.                              */
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6
	addps	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA	/* 2 elements of A per k */
	leal	(B,  %eax, 1), B	/* 1 element of B per k  */
	leal	(BB, %eax, 4), BB	/* 4-wide broadcast per k */
#endif
3370
#if defined(LN) || defined(LT)
	pshufd	$1, %xmm4, %xmm6	/* extract lane 1 (sum for row 1) */

	movss	 0 * SIZE(B), %xmm1
	movss	 1 * SIZE(B), %xmm3

	subss	%xmm4,  %xmm1		/* x0 = b0 - sum0 */
	subss	%xmm6,  %xmm3		/* x1 = b1 - sum1 */
#else
#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0

	subps	%xmm4, %xmm0		/* packed rhs for RN/RT */
#endif

#ifdef LN
/* 2x2 backward substitution on the packed block; the diagonal is
   multiplied, so presumably stored pre-inverted (verify).           */
	movaps	  0 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm6, %xmm3		/* x1 *= inv(a11) */
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm3, %xmm6
	subss	 %xmm6, %xmm1		/* x0 -= a10 * x1 */

	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm6, %xmm1		/* x0 *= inv(a00) */
#endif

#ifdef LT
/* 2x2 forward substitution.                                         */
	movaps	 0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm6, %xmm1		/* x0 *= inv(a00) */
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm1, %xmm6
	subss	 %xmm6, %xmm3		/* x1 -= a01 * x0 */

	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm6, %xmm3		/* x1 *= inv(a11) */
#endif

#if defined(RN) || defined(RT)
	movss	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0		/* scale both packed results */
#endif
3417
#if defined(LN) || defined(LT)
	/* Store the solution back to packed B and refresh the
	   broadcast buffer BB.                                       */
	movss	%xmm1,   0 * SIZE(B)
	movss	%xmm3,   1 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	movaps	%xmm0,   0 * SIZE(BB)
	pshufd	$0x00, %xmm3, %xmm0
	movaps	%xmm0,   4 * SIZE(BB)
#else
	movlps	%xmm0,   0 * SIZE(AA)	/* both results packed in %xmm0 */
#endif

#ifdef LN
	subl	$2 * SIZE, CO1		/* LN writes C bottom-up */
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)
	movss	%xmm3, 1 * SIZE(CO1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	/* skip the unused k tail of A */
#ifdef LT
	addl	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$2, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG		/* restore AORIG past this panel */
#endif
	ALIGN_4
3471
.L110:
/* 1x1 micro-tile (M & 1): position AA/BB, clear accumulators,
   preload the first scalars.                                        */
	testl	$1, M
	je	.L119

#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG		/* step back one 1-row panel */
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA	/* AA += KK elements */
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	/* BB += KK broadcast groups (x4) */
#endif

	xorps	%xmm4, %xmm4		/* partial accumulators */
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movss	 0 * SIZE(AA), %xmm0	/* preload first A scalars */
	movss	 4 * SIZE(AA), %xmm1
	movss	 0 * SIZE(BB), %xmm2	/* preload broadcast B */
	movss	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* inner loop unrolled by 8 */
	je	.L115
	ALIGN_4
3515
.L112:
/* Main multiply loop for the 1x1 tile: 8 scalar k-steps per
   iteration, accumulated across %xmm4-%xmm7.                        */
	mulss	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	movss	 1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	32 * SIZE(BB), %xmm2	/* prefetch B for next iteration */
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm5
	movss	 2 * SIZE(AA), %xmm0
	mulss	 8 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm6
	movss	 3 * SIZE(AA), %xmm0
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0	/* prefetch A for next iteration */
	mulss	%xmm1, %xmm3
	movss	 5 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm4
	movss	48 * SIZE(BB), %xmm3
	mulss	20 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm5
	movss	 6 * SIZE(AA), %xmm1
	mulss	24 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm6
	movss	 7 * SIZE(AA), %xmm1
	mulss	28 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	addl	$ 8 * SIZE, AA
	addl	$32 * SIZE, BB
	decl   %eax
	jne    .L112
	ALIGN_4

.L115:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L118
	ALIGN_4

.L116:
/* k remainder for the 1x1 tile.                                     */
	mulss	%xmm0, %xmm2
	movss	 1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	 4 * SIZE(BB), %xmm2

	addl	$ 1 * SIZE, AA
	addl	$ 4 * SIZE, BB
	decl	%eax
	jg	.L116
	ALIGN_4
3576
.L118:
/* Reduce partials, solve the 1x1 system, and write everything back. */
	addss	%xmm5, %xmm4
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4		/* %xmm4 = total sum */

#if defined(LN) || defined(RT)
	/* Rewind AA/B/BB to this tile's diagonal element. */
	movl	KK, %eax
	subl	$1, %eax

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	movss	 0 * SIZE(B), %xmm1
	subss	%xmm4,  %xmm1		/* x = b - sum */
#else
	movss	 0 * SIZE(AA), %xmm0
	subss	%xmm4, %xmm0		/* x = a - sum */
#endif

#if defined(LN) || defined(LT)
	mulss	 0 * SIZE(AA), %xmm1	/* scale by diagonal of A
					   (presumably pre-inverted) */
#endif

#if defined(RN) || defined(RT)
	mulss	 0 * SIZE(B), %xmm0	/* scale by diagonal of B */
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1,   0 * SIZE(B)	/* store solution to packed B */

	pshufd	$0x00, %xmm1, %xmm0
	movaps	%xmm0,   0 * SIZE(BB)	/* refresh broadcast copy */
#else
	movss	%xmm0,   0 * SIZE(AA)	/* store solution to packed A */
#endif

#ifdef LN
	subl	$1 * SIZE, CO1		/* LN writes C bottom-up */
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)
#else
	movss	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(AA, %eax, SIZE), AA	/* skip the unused k tail of A */
#ifdef LT
	addl	$1 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG		/* restore AORIG past this panel */
#endif
	ALIGN_4
3660
.L119:
/* End of the single-column panel: advance B and update KK.          */
#ifdef LN
       movl	K, %eax
       leal 	(B, %eax, SIZE), B	/* B += K elements */
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(B,  %eax, SIZE), B	/* skip the unconsumed rows */
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

.L999:
/* Epilogue: restore the caller's stack pointer and the callee-saved
   registers pushed in the (off-screen) prologue, then return.       */
	movl	OLD_STACK, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
3691