1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
39#define ASSEMBLER
40#include "common.h"
41
/* Bytes pushed by the prologue (4 registers x 4 bytes), i.e. the
   distance from the entry %esp to the first stack argument. */
#define STACK	16

/* Incoming arguments, addressed off the saved entry stack pointer
   (%esi holds the original %esp after the prologue).  Offset 16 is
   skipped -- presumably the caller passes ALPHA there, which this
   kernel does not read; TODO confirm against the caller's layout. */
#define OLD_M	 4 + STACK(%esi)
#define OLD_N	 8 + STACK(%esi)
#define OLD_K	12 + STACK(%esi)
#define OLD_A	20 + STACK(%esi)
#define OLD_B	24 + STACK(%esi)
#define OLD_C	28 + STACK(%esi)
#define OLD_LDC	32 + STACK(%esi)
#define STACK_OFFT	36 + STACK(%esi)

/* Local variables in the realigned (1024-byte-aligned) stack frame.
   OLD_STACK keeps the entry %esp so the epilogue can restore it. */
#define K	16(%esp)
#define N	20(%esp)
#define M	24(%esp)
#define A	28(%esp)
#define C	32(%esp)
#define J	36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET  44(%esp)
#define KK	48(%esp)
#define KKK	52(%esp)
#define AORIG	56(%esp)
#define BORIG	60(%esp)
/* Scratch area where B is repacked with each element broadcast to a
   full 4-float vector (see the .L02 copy loop); 128(%esp) keeps it
   16-byte aligned within the 1024-aligned frame. */
#define BUFFER 128(%esp)
66
/* Per-architecture prefetch selection: AMD cores use their native
   prefetch/prefetchw forms, Intel cores use prefetcht0 for both read
   and write streams.  PREFETCHSIZE is the look-ahead distance in
   elements, tuned per core. */
#if defined(OPTERON) || defined(BARCELONA)
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
#endif

#if defined(PENTIUM4) || defined(PENTIUMM)
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE   96
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE   96
#endif

/* Register roles used throughout the kernel. */
#define B	%edi
#define AA	%edx
#define	BB	%ecx
#define LDC	%ebp
#define CO1	%esi

/* On Opteron, or when SSE2 is unavailable, substitute movlps for
   movsd (same 64-bit memory move for the uses in this file;
   presumably faster / always encodable on those targets -- confirm). */
#if defined(OPTERON) || !defined(HAVE_SSE2)
#define movsd	movlps
#endif

/* With SSE2 available, use the integer-domain pxor for register
   zeroing instead of xorps. */
#ifdef HAVE_SSE2
#define	xorps	pxor
#endif
98
/* KERNEL1..KERNEL8 together form one 8-deep unrolled k-loop iteration
   of the 4x4 micro-kernel.  Each step multiplies one 4-float column
   of A (held in %xmm0 or %xmm1) against four broadcast B vectors from
   the repacked BUFFER (streamed through %xmm2/%xmm3), accumulating
   into %xmm4..%xmm7.  Loads of the B vector for the *next* step are
   interleaved with the arithmetic to hide latency; (address) is a
   block offset applied to both the AA and BB streams.

   KERNEL1: k-step 0; also issues the A-stream prefetch. */
#define KERNEL1(address) \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	PREFETCH (PREFETCHSIZE +  0) * SIZE + (address) * 1 * SIZE(AA); \
	addps	%xmm2, %xmm5; \
	movaps	 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	mulps	12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

/* KERNEL2: k-step 1 (B offsets 16..31, A offset 4..7 via %xmm3). */
#define KERNEL2(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	mulps	28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm3, %xmm6; \
	movaps	48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm0, %xmm7; \
	movaps	 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0

/* KERNEL3: k-step 2 (B offsets 32..47). */
#define KERNEL3(address) \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	mulps	44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	12 * SIZE + (address) * 1 * SIZE(AA), %xmm0

/* KERNEL4: k-step 3; final %xmm0 reload jumps ahead to A offset 32
   (the value consumed after the loop's 32-element AA advance). */
#define KERNEL4(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	mulps	60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm3, %xmm6; \
	movaps	80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm0, %xmm7; \
	movaps	 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0

/* KERNEL5: k-step 4; switches to the second A register %xmm1. */
#define KERNEL5(address) \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	mulps	76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm2, %xmm6; \
	movaps	 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm1, %xmm7; \
	movaps	20 * SIZE + (address) * 1 * SIZE(AA), %xmm1

/* KERNEL6: k-step 5. */
#define KERNEL6(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	mulps	92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

/* KERNEL7: k-step 6. */
#define KERNEL7(address) \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	mulps	108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm2, %xmm6; \
	movaps	128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm1, %xmm7; \
	movaps	28 * SIZE + (address) * 1 * SIZE(AA), %xmm1

/* KERNEL8: k-step 7; final %xmm1 reload jumps ahead to A offset 48,
   pre-loading for the iteration after the loop's AA/BB advance. */
#define KERNEL8(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	mulps	124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
211
212	PROLOGUE
213
214	pushl	%ebp
215	pushl	%edi
216	pushl	%esi
217	pushl	%ebx
218
219	PROFCODE
220
221	movl	%esp, %esi
222
223	subl	$128 + LOCAL_BUFFER_SIZE, %esp
224	andl	$-1024, %esp
225
226	STACK_TOUCHING
227
228	movl	OLD_M, %ebx
229	movl	OLD_N, %eax
230	movl	OLD_K, %ecx
231	movl	OLD_A, %edx
232
233	movl	%ebx, M
234	movl	%eax, N
235	movl	%ecx, K
236	movl	%edx, A
237	movl	%esi, OLD_STACK
238	movss	STACK_OFFT, %xmm4
239
240	movl	OLD_B, B
241	movl	OLD_C, %ebx
242
243	movl	%ebx, C
244	movl	OLD_LDC, LDC
245
246	movss	%xmm4, OFFSET
247	movss	%xmm4, KK
248
249	leal	(, LDC, SIZE), LDC
250
251#ifdef LN
252       movl	M, %eax
253       leal	(, %eax, SIZE), %eax
254       addl	%eax, C
255       imull	K, %eax
256       addl	%eax, A
257#endif
258
259#ifdef RT
260       movl	N, %eax
261       leal	(, %eax, SIZE), %eax
262       imull	K, %eax
263       addl	%eax, B
264       movl	N, %eax
265       imull	LDC, %eax
266       addl	%eax, C
267#endif
268
269#ifdef RN
270	negl	KK
271#endif
272
273#ifdef RT
274       movl	N, %eax
275       subl	OFFSET, %eax
276       movl	%eax, KK
277#endif
278
279	movl	N, %eax
280	sarl	$2, %eax
281	movl	%eax, J
282	jle	.L40
283
284.L01:
285#ifdef LN
286	movl	OFFSET, %eax
287	addl	M, %eax
288	movl	%eax, KK
289#endif
290
291	leal	BUFFER, %ecx
292
293#ifdef RT
294       movl	K, %eax
295       sall	$2 + BASE_SHIFT, %eax
296       subl	%eax, B
297#endif
298
299#if defined(LN) || defined(RT)
300	movl	KK, %eax
301	movl	B, BORIG
302        sall	$2 + BASE_SHIFT, %eax
303	leal	(B,  %eax, 1), B
304	leal	(BB, %eax, 4), BB
305#endif
306
307#ifdef LT
308	movl	OFFSET, %eax
309	movl	%eax, KK
310#endif
311
312#if defined(LT) || defined(RN)
313	movl	KK, %eax
314#else
315	movl	K, %eax
316	subl	KK, %eax
317#endif
318	sarl	$1, %eax
319	jle	.L05
320	ALIGN_4
321
322.L02:
323	movaps	 0 * SIZE(B), %xmm3
324	movaps	 4 * SIZE(B), %xmm7
325
326	pshufd	 $0x00, %xmm3, %xmm0
327	pshufd	 $0x55, %xmm3, %xmm1
328	pshufd	 $0xaa, %xmm3, %xmm2
329	pshufd	 $0xff, %xmm3, %xmm3
330
331	pshufd	 $0x00, %xmm7, %xmm4
332	pshufd	 $0x55, %xmm7, %xmm5
333	pshufd	 $0xaa, %xmm7, %xmm6
334	pshufd	 $0xff, %xmm7, %xmm7
335
336	movaps	%xmm0,  0 * SIZE(BB)
337	movaps	%xmm1,  4 * SIZE(BB)
338	movaps	%xmm2,  8 * SIZE(BB)
339	movaps	%xmm3, 12 * SIZE(BB)
340	movaps	%xmm4, 16 * SIZE(BB)
341	movaps	%xmm5, 20 * SIZE(BB)
342	movaps	%xmm6, 24 * SIZE(BB)
343	movaps	%xmm7, 28 * SIZE(BB)
344
345	addl	$ 8 * SIZE, B
346	addl	$32 * SIZE, %ecx
347	decl	%eax
348	jne	.L02
349	ALIGN_2
350
351.L05:
352#if defined(LT) || defined(RN)
353	movl	KK, %eax
354#else
355	movl	K, %eax
356	subl	KK, %eax
357#endif
358	andl	$1, %eax
359	BRANCH
360	jle	.L10
361
362	movaps	 0 * SIZE(B), %xmm3
363
364	pshufd	 $0x00, %xmm3, %xmm0
365	pshufd	 $0x55, %xmm3, %xmm1
366	pshufd	 $0xaa, %xmm3, %xmm2
367	pshufd	 $0xff, %xmm3, %xmm3
368
369	movaps	%xmm0,  0 * SIZE(BB)
370	movaps	%xmm1,  4 * SIZE(BB)
371	movaps	%xmm2,  8 * SIZE(BB)
372	movaps	%xmm3, 12 * SIZE(BB)
373
374	addl	$4 * SIZE, B
375	ALIGN_4
376
377.L10:
378#if defined(LT) || defined(RN)
379	movl	A, AA
380#else
381	movl	A, %eax
382	movl	%eax, AORIG
383#endif
384
385	leal	(, LDC, 4), %eax
386
387#ifdef RT
388	subl	%eax, C
389#endif
390	movl	C, CO1
391#ifndef RT
392	addl	%eax, C
393#endif
394
395	testl	$1, M
396	je	.L20
397
398#ifdef LN
399       movl	K, %eax
400       sall	$BASE_SHIFT, %eax
401       subl	%eax, AORIG
402#endif
403
404#if defined(LN) || defined(RT)
405	movl	KK, %eax
406	movl	AORIG, AA
407	leal	(AA, %eax, SIZE), AA
408#endif
409
410	leal	BUFFER, BB
411
412#if defined(LN) || defined(RT)
413	movl	KK, %eax
414	sall	$2 + BASE_SHIFT, %eax
415	leal	(BB, %eax, 4), BB
416#endif
417
418	movss	 0 * SIZE(AA), %xmm0
419	xorps	%xmm4, %xmm4
420	movss	 4 * SIZE(AA), %xmm1
421	xorps	%xmm5, %xmm5
422	movss	 0 * SIZE(BB), %xmm2
423	xorps	%xmm6, %xmm6
424	movss	16 * SIZE(BB), %xmm3
425	xorps	%xmm7, %xmm7
426
427#if defined(LT) || defined(RN)
428	movl	KK, %eax
429#else
430	movl	K, %eax
431	subl	KK, %eax
432#endif
433	sarl	$3, %eax
434	je	.L35
435	ALIGN_4
436
437.L32:
438	mulss	%xmm0, %xmm2
439	addss	%xmm2, %xmm4
440#if defined(OPTERON) || defined(BARCELONA)
441	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
442#endif
443	movss	 4 * SIZE(BB), %xmm2
444	mulss	%xmm0, %xmm2
445	addss	%xmm2, %xmm5
446	movss	 8 * SIZE(BB), %xmm2
447	mulss	%xmm0, %xmm2
448	mulss	12 * SIZE(BB), %xmm0
449	addss	%xmm2, %xmm6
450	movss	32 * SIZE(BB), %xmm2
451	addss	%xmm0, %xmm7
452	movss	 1 * SIZE(AA), %xmm0
453
454	mulss	%xmm0, %xmm3
455	addss	%xmm3, %xmm4
456	movss	20 * SIZE(BB), %xmm3
457	mulss	%xmm0, %xmm3
458	addss	%xmm3, %xmm5
459	movss	24 * SIZE(BB), %xmm3
460	mulss	%xmm0, %xmm3
461	mulss	28 * SIZE(BB), %xmm0
462	addss	%xmm3, %xmm6
463	movss	48 * SIZE(BB), %xmm3
464	addss	%xmm0, %xmm7
465	movss	 2 * SIZE(AA), %xmm0
466
467	mulss	%xmm0, %xmm2
468	addss	%xmm2, %xmm4
469	movss	36 * SIZE(BB), %xmm2
470	mulss	%xmm0, %xmm2
471	addss	%xmm2, %xmm5
472	movss	40 * SIZE(BB), %xmm2
473	mulss	%xmm0, %xmm2
474	mulss	44 * SIZE(BB), %xmm0
475	addss	%xmm2, %xmm6
476	movss	64 * SIZE(BB), %xmm2
477	addss	%xmm0, %xmm7
478	movss	 3 * SIZE(AA), %xmm0
479
480	mulss	%xmm0, %xmm3
481	addss	%xmm3, %xmm4
482	movss	52 * SIZE(BB), %xmm3
483	mulss	%xmm0, %xmm3
484	addss	%xmm3, %xmm5
485	movss	56 * SIZE(BB), %xmm3
486	mulss	%xmm0, %xmm3
487	mulss	60 * SIZE(BB), %xmm0
488	addss	%xmm3, %xmm6
489	movss	80 * SIZE(BB), %xmm3
490	addss	%xmm0, %xmm7
491	movss	 8 * SIZE(AA), %xmm0
492
493	mulss	%xmm1, %xmm2
494	addss	%xmm2, %xmm4
495	movss	68 * SIZE(BB), %xmm2
496	mulss	%xmm1, %xmm2
497	addss	%xmm2, %xmm5
498	movss	72 * SIZE(BB), %xmm2
499	mulss	%xmm1, %xmm2
500	mulss	76 * SIZE(BB), %xmm1
501	addss	%xmm2, %xmm6
502	movss	96 * SIZE(BB), %xmm2
503	addss	%xmm1, %xmm7
504	movss	 5 * SIZE(AA), %xmm1
505
506	mulss	%xmm1, %xmm3
507	addss	%xmm3, %xmm4
508	movss	84 * SIZE(BB), %xmm3
509	mulss	%xmm1, %xmm3
510	addss	%xmm3, %xmm5
511	movss	88 * SIZE(BB), %xmm3
512	mulss	%xmm1, %xmm3
513	mulss	92 * SIZE(BB), %xmm1
514	addss	%xmm3, %xmm6
515	movss	112 * SIZE(BB), %xmm3
516	addss	%xmm1, %xmm7
517	movss	 6 * SIZE(AA), %xmm1
518
519	mulss	%xmm1, %xmm2
520	addss	%xmm2, %xmm4
521	movss	100 * SIZE(BB), %xmm2
522	mulss	%xmm1, %xmm2
523	addss	%xmm2, %xmm5
524	movss	104 * SIZE(BB), %xmm2
525	mulss	%xmm1, %xmm2
526	mulss	108 * SIZE(BB), %xmm1
527	addss	%xmm2, %xmm6
528	movss	128 * SIZE(BB), %xmm2
529	addss	%xmm1, %xmm7
530	movss	 7 * SIZE(AA), %xmm1
531
532	mulss	%xmm1, %xmm3
533	addss	%xmm3, %xmm4
534	movss	116 * SIZE(BB), %xmm3
535	mulss	%xmm1, %xmm3
536	addss	%xmm3, %xmm5
537	movss	120 * SIZE(BB), %xmm3
538	mulss	%xmm1, %xmm3
539	mulss	124 * SIZE(BB), %xmm1
540	addss	%xmm3, %xmm6
541	movss	144 * SIZE(BB), %xmm3
542	addss	%xmm1, %xmm7
543	movss	12 * SIZE(AA), %xmm1
544
545	addl	$  8 * SIZE, AA
546	addl	$128 * SIZE, BB
547	decl   %eax
548	jne    .L32
549	ALIGN_4
550
551.L35:
552#if defined(LT) || defined(RN)
553	movl	KK, %eax
554#else
555	movl	K, %eax
556	subl	KK, %eax
557#endif
558	andl	$7, %eax		# if (k & 1)
559	BRANCH
560	je .L38
561	ALIGN_4
562
563.L36:
564	mulss	%xmm0, %xmm2
565	addss	%xmm2, %xmm4
566	movss	 4 * SIZE(BB), %xmm2
567	mulss	%xmm0, %xmm2
568	addss	%xmm2, %xmm5
569	movss	 8 * SIZE(BB), %xmm2
570	mulss	%xmm0, %xmm2
571	mulss	12 * SIZE(BB), %xmm0
572	addss	%xmm2, %xmm6
573	movss	16 * SIZE(BB), %xmm2
574	addss	%xmm0, %xmm7
575	movss	 1 * SIZE(AA), %xmm0
576
577	addl	$ 1 * SIZE, AA
578	addl	$16 * SIZE, BB
579	decl	%eax
580	jg	.L36
581	ALIGN_4
582
583.L38:
584#if defined(LN) || defined(RT)
585	movl	KK, %eax
586#ifdef LN
587	subl	$1, %eax
588#else
589	subl	$4, %eax
590#endif
591
592	movl	AORIG, AA
593	movl	BORIG, B
594	leal	BUFFER, BB
595
596	leal	(AA, %eax, SIZE), AA
597
598	sall	$2 + BASE_SHIFT, %eax
599	leal	(B,  %eax, 1), B
600	leal	(BB, %eax, 4), BB
601#endif
602
603#if defined(LN) || defined(LT)
604	unpcklps %xmm6, %xmm4
605	unpcklps %xmm7, %xmm5
606	unpcklps %xmm5, %xmm4
607
608	movaps	 0 * SIZE(B), %xmm1
609
610	subps	%xmm4,  %xmm1
611#else
612	movss	 0 * SIZE(AA), %xmm0
613	movss	 1 * SIZE(AA), %xmm1
614	movss	 2 * SIZE(AA), %xmm2
615	movss	 3 * SIZE(AA), %xmm3
616
617	subss	%xmm4, %xmm0
618	subss	%xmm5, %xmm1
619	subss	%xmm6, %xmm2
620	subss	%xmm7, %xmm3
621#endif
622
623#if defined(LN) || defined(LT)
624	movss	  0 * SIZE(AA), %xmm4
625	pshufd	 $0x00, %xmm4, %xmm6
626	mulps	 %xmm6, %xmm1
627#endif
628
629#ifdef RN
630	movaps	 0 * SIZE(B), %xmm6
631	pshufd	 $0x00, %xmm6, %xmm7
632	mulss	 %xmm7, %xmm0
633	pshufd	 $0x55, %xmm6, %xmm7
634	mulss	 %xmm0, %xmm7
635	subss	 %xmm7, %xmm1
636	pshufd	 $0xaa, %xmm6, %xmm7
637	mulss	 %xmm0, %xmm7
638	subss	 %xmm7, %xmm2
639	pshufd	 $0xff, %xmm6, %xmm7
640	mulss	 %xmm0, %xmm7
641	subss	 %xmm7, %xmm3
642
643	movaps	 4 * SIZE(B), %xmm6
644	pshufd	 $0x55, %xmm6, %xmm7
645	mulss	 %xmm7, %xmm1
646	pshufd	 $0xaa, %xmm6, %xmm7
647	mulss	 %xmm1, %xmm7
648	subss	 %xmm7, %xmm2
649	pshufd	 $0xff, %xmm6, %xmm7
650	mulss	 %xmm1, %xmm7
651	subss	 %xmm7, %xmm3
652
653	movaps	 8 * SIZE(B), %xmm6
654	pshufd	 $0xaa, %xmm6, %xmm7
655	mulss	 %xmm7, %xmm2
656	pshufd	 $0xff, %xmm6, %xmm7
657	mulss	 %xmm2, %xmm7
658	subss	 %xmm7, %xmm3
659
660	movaps	 12 * SIZE(B), %xmm6
661	pshufd	 $0xff, %xmm6, %xmm7
662	mulss	 %xmm7, %xmm3
663#endif
664
665#ifdef RT
666	movaps	 12 * SIZE(B), %xmm6
667	pshufd	 $0xff, %xmm6, %xmm7
668	mulss	 %xmm7, %xmm3
669	pshufd	 $0xaa, %xmm6, %xmm7
670	mulss	 %xmm3, %xmm7
671	subss	 %xmm7, %xmm2
672	pshufd	 $0x55, %xmm6, %xmm7
673	mulss	 %xmm3, %xmm7
674	subss	 %xmm7, %xmm1
675	pshufd	 $0x00, %xmm6, %xmm7
676	mulss	 %xmm3, %xmm7
677	subss	 %xmm7, %xmm0
678
679	movaps	  8 * SIZE(B), %xmm6
680	pshufd	 $0xaa, %xmm6, %xmm7
681	mulss	 %xmm7, %xmm2
682	pshufd	 $0x55, %xmm6, %xmm7
683	mulss	 %xmm2, %xmm7
684	subss	 %xmm7, %xmm1
685	pshufd	 $0x00, %xmm6, %xmm7
686	mulss	 %xmm2, %xmm7
687	subss	 %xmm7, %xmm0
688
689	movaps	  4 * SIZE(B), %xmm6
690	pshufd	 $0x55, %xmm6, %xmm7
691	mulss	 %xmm7, %xmm1
692	pshufd	 $0x00, %xmm6, %xmm7
693	mulss	 %xmm1, %xmm7
694	subss	 %xmm7, %xmm0
695
696	movaps	  0 * SIZE(B), %xmm6
697	pshufd	 $0x00, %xmm6, %xmm7
698	mulss	 %xmm7, %xmm0
699#endif
700
701#if defined(LN) || defined(LT)
702	movaps	%xmm1,   0 * SIZE(B)
703
704	pshufd	$0x00, %xmm1, %xmm0
705	pshufd	$0x55, %xmm1, %xmm2
706	pshufd	$0xaa, %xmm1, %xmm4
707	pshufd	$0xff, %xmm1, %xmm6
708	movaps	%xmm0,   0 * SIZE(BB)
709	movaps	%xmm2,   4 * SIZE(BB)
710	movaps	%xmm4,   8 * SIZE(BB)
711	movaps	%xmm6,  12 * SIZE(BB)
712#else
713	movss	%xmm0,   0 * SIZE(AA)
714	movss	%xmm1,   1 * SIZE(AA)
715	movss	%xmm2,   2 * SIZE(AA)
716	movss	%xmm3,   3 * SIZE(AA)
717#endif
718
719#ifdef LN
720	subl	$1 * SIZE, CO1
721#endif
722
723	leal	(LDC, LDC, 2), %eax
724
725#if defined(LN) || defined(LT)
726	movaps	 %xmm1, %xmm0
727	unpcklps %xmm5, %xmm1
728	unpckhps %xmm5, %xmm0
729
730	movaps	 %xmm3, %xmm4
731	unpcklps %xmm7, %xmm3
732	unpckhps %xmm7, %xmm4
733
734	movaps	 %xmm1, %xmm2
735	unpcklps %xmm3, %xmm1
736	unpckhps %xmm3, %xmm2
737
738	movaps	 %xmm0, %xmm6
739	unpcklps %xmm4, %xmm0
740	unpckhps %xmm4, %xmm6
741
742	movss	%xmm1, 0 * SIZE(CO1)
743	movss	%xmm2, 0 * SIZE(CO1, LDC, 1)
744	movss	%xmm0, 0 * SIZE(CO1, LDC, 2)
745	movss	%xmm6, 0 * SIZE(CO1, %eax, 1)
746#else
747	movss	%xmm0, 0 * SIZE(CO1)
748	movss	%xmm1, 0 * SIZE(CO1, LDC, 1)
749	movss	%xmm2, 0 * SIZE(CO1, LDC, 2)
750	movss	%xmm3, 0 * SIZE(CO1, %eax, 1)
751#endif
752
753#ifndef LN
754	addl	$1 * SIZE, CO1
755#endif
756
757#if defined(LT) || defined(RN)
758	movl	K,  %eax
759	subl	KK, %eax
760	leal	(AA, %eax, SIZE), AA
761#ifdef LT
762	addl	$4 * SIZE, B
763#endif
764#endif
765
766#ifdef LN
767	subl	$1, KK
768	movl	BORIG, B
769#endif
770
771#ifdef LT
772	addl	$1, KK
773#endif
774
775#ifdef RT
776	movl	K, %eax
777	movl	BORIG, B
778	sall	$BASE_SHIFT, %eax
779	addl	%eax, AORIG
780#endif
781	ALIGN_4
782
783.L20:
784	testl	$2, M
785	je	.L30
786
787#ifdef LN
788       movl	K, %eax
789       sall	$1 + BASE_SHIFT, %eax
790       subl	%eax, AORIG
791#endif
792
793#if defined(LN) || defined(RT)
794	movl	KK, %eax
795	movl	AORIG, AA
796	leal	(, %eax, SIZE), %eax
797	leal	(AA, %eax, 2), AA
798#endif
799
800	leal	BUFFER, BB
801
802#if defined(LN) || defined(RT)
803	movl	KK, %eax
804	sall	$2 + BASE_SHIFT, %eax
805	leal	(BB, %eax, 4), BB
806#endif
807
808#ifdef	movsd
809	xorps	%xmm0, %xmm0
810#endif
811	movsd	 0 * SIZE(AA), %xmm0
812	xorps	%xmm4, %xmm4
813#ifdef	movsd
814	xorps	%xmm1, %xmm1
815#endif
816	movsd	 8 * SIZE(AA), %xmm1
817	xorps	%xmm5, %xmm5
818	movaps	 0 * SIZE(BB), %xmm2
819	xorps	%xmm6, %xmm6
820	movaps	16 * SIZE(BB), %xmm3
821	xorps	%xmm7, %xmm7
822
823#if defined(LT) || defined(RN)
824	movl	KK, %eax
825#else
826	movl	K, %eax
827	subl	KK, %eax
828#endif
829	sarl	$3, %eax
830	je	.L25
831	ALIGN_4
832
833.L22:
834	mulps	%xmm0, %xmm2
835	addps	%xmm2, %xmm4
836#if defined(OPTERON) || defined(BARCELONA)
837	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
838#endif
839	movaps	 4 * SIZE(BB), %xmm2
840	mulps	%xmm0, %xmm2
841	addps	%xmm2, %xmm5
842	movaps	 8 * SIZE(BB), %xmm2
843	mulps	%xmm0, %xmm2
844	addps	%xmm2, %xmm6
845	movaps	12 * SIZE(BB), %xmm2
846	mulps	%xmm0, %xmm2
847	movsd	 2 * SIZE(AA), %xmm0
848	addps	%xmm2, %xmm7
849	movaps	32 * SIZE(BB), %xmm2
850
851	mulps	%xmm0, %xmm3
852	addps	%xmm3, %xmm4
853	movaps	20 * SIZE(BB), %xmm3
854	mulps	%xmm0, %xmm3
855	addps	%xmm3, %xmm5
856	movaps	24 * SIZE(BB), %xmm3
857	mulps	%xmm0, %xmm3
858	addps	%xmm3, %xmm6
859	movaps	28 * SIZE(BB), %xmm3
860	mulps	%xmm0, %xmm3
861	movsd	 4 * SIZE(AA), %xmm0
862	addps	%xmm3, %xmm7
863	movaps	48 * SIZE(BB), %xmm3
864
865	mulps	%xmm0, %xmm2
866	addps	%xmm2, %xmm4
867	movaps	36 * SIZE(BB), %xmm2
868	mulps	%xmm0, %xmm2
869	addps	%xmm2, %xmm5
870	movaps	40 * SIZE(BB), %xmm2
871	mulps	%xmm0, %xmm2
872	addps	%xmm2, %xmm6
873	movaps	44 * SIZE(BB), %xmm2
874	mulps	%xmm0, %xmm2
875	movsd	 6 * SIZE(AA), %xmm0
876	addps	%xmm2, %xmm7
877	movaps	64 * SIZE(BB), %xmm2
878
879	mulps	%xmm0, %xmm3
880	addps	%xmm3, %xmm4
881	movaps	52 * SIZE(BB), %xmm3
882	mulps	%xmm0, %xmm3
883	addps	%xmm3, %xmm5
884	movaps	56 * SIZE(BB), %xmm3
885	mulps	%xmm0, %xmm3
886	addps	%xmm3, %xmm6
887	movaps	60 * SIZE(BB), %xmm3
888	mulps	%xmm0, %xmm3
889	movsd	 16 * SIZE(AA), %xmm0
890	addps	%xmm3, %xmm7
891	movaps	80 * SIZE(BB), %xmm3
892
893	mulps	%xmm1, %xmm2
894	addps	%xmm2, %xmm4
895	movaps	68 * SIZE(BB), %xmm2
896	mulps	%xmm1, %xmm2
897	addps	%xmm2, %xmm5
898	movaps	72 * SIZE(BB), %xmm2
899	mulps	%xmm1, %xmm2
900	addps	%xmm2, %xmm6
901	movaps	76 * SIZE(BB), %xmm2
902	mulps	%xmm1, %xmm2
903	movsd	10 * SIZE(AA), %xmm1
904	addps	%xmm2, %xmm7
905	movaps	96 * SIZE(BB), %xmm2
906
907	mulps	%xmm1, %xmm3
908	addps	%xmm3, %xmm4
909	movaps	84 * SIZE(BB), %xmm3
910	mulps	%xmm1, %xmm3
911	addps	%xmm3, %xmm5
912	movaps	88 * SIZE(BB), %xmm3
913	mulps	%xmm1, %xmm3
914	addps	%xmm3, %xmm6
915	movaps	92 * SIZE(BB), %xmm3
916	mulps	%xmm1, %xmm3
917	movsd	12 * SIZE(AA), %xmm1
918	addps	%xmm3, %xmm7
919	movaps	112 * SIZE(BB), %xmm3
920
921	mulps	%xmm1, %xmm2
922	addps	%xmm2, %xmm4
923	movaps	100 * SIZE(BB), %xmm2
924	mulps	%xmm1, %xmm2
925	addps	%xmm2, %xmm5
926	movaps	104 * SIZE(BB), %xmm2
927	mulps	%xmm1, %xmm2
928	addps	%xmm2, %xmm6
929	movaps	108 * SIZE(BB), %xmm2
930	mulps	%xmm1, %xmm2
931	movsd	 14 * SIZE(AA), %xmm1
932	addps	%xmm2, %xmm7
933	movaps	128 * SIZE(BB), %xmm2
934
935	mulps	%xmm1, %xmm3
936	addps	%xmm3, %xmm4
937	movaps	116 * SIZE(BB), %xmm3
938	mulps	%xmm1, %xmm3
939	addps	%xmm3, %xmm5
940	movaps	120 * SIZE(BB), %xmm3
941	mulps	%xmm1, %xmm3
942	addps	%xmm3, %xmm6
943	movaps	124 * SIZE(BB), %xmm3
944	mulps	%xmm1, %xmm3
945	movsd	 24 * SIZE(AA), %xmm1
946	addps	%xmm3, %xmm7
947	movaps	144 * SIZE(BB), %xmm3
948
949	addl	$ 16 * SIZE, AA
950	addl	$128 * SIZE, BB
951	decl   %eax
952	jne    .L22
953	ALIGN_4
954
955.L25:
956#if defined(LT) || defined(RN)
957	movl	KK, %eax
958#else
959	movl	K, %eax
960	subl	KK, %eax
961#endif
962	andl	$7, %eax		# if (k & 1)
963	BRANCH
964	je .L28
965	ALIGN_4
966
967.L26:
968	mulps	%xmm0, %xmm2
969	addps	%xmm2, %xmm4
970	movaps	 4 * SIZE(BB), %xmm2
971	mulps	%xmm0, %xmm2
972	addps	%xmm2, %xmm5
973	movaps	 8 * SIZE(BB), %xmm2
974	mulps	%xmm0, %xmm2
975	addps	%xmm2, %xmm6
976	movaps	12 * SIZE(BB), %xmm2
977	mulps	%xmm0, %xmm2
978	movsd	 2 * SIZE(AA), %xmm0
979	addps	%xmm2, %xmm7
980	movaps	16 * SIZE(BB), %xmm2
981
982	addl	$ 2 * SIZE, AA
983	addl	$16 * SIZE, BB
984	decl	%eax
985	jg	.L26
986	ALIGN_4
987
988.L28:
989#if defined(LN) || defined(RT)
990	movl	KK, %eax
991#ifdef LN
992	subl	$2, %eax
993#else
994	subl	$4, %eax
995#endif
996
997	movl	AORIG, AA
998	movl	BORIG, B
999	leal	BUFFER, BB
1000
1001	sall	$1 + BASE_SHIFT, %eax
1002	leal	(AA, %eax, 1), AA
1003	leal	(B,  %eax, 2), B
1004	leal	(BB, %eax, 8), BB
1005#endif
1006
1007#if defined(LN) || defined(LT)
1008	unpcklps %xmm6, %xmm4
1009	unpcklps %xmm7, %xmm5
1010
1011	movaps	 %xmm4, %xmm6
1012	unpcklps %xmm5, %xmm4
1013	unpckhps %xmm5, %xmm6
1014
1015	movaps	 0 * SIZE(B), %xmm1
1016	movaps	 4 * SIZE(B), %xmm3
1017
1018	subps	%xmm4,  %xmm1
1019	subps	%xmm6,  %xmm3
1020#else
1021#ifdef	movsd
1022	xorps	%xmm0, %xmm0
1023#endif
1024	movsd	 0 * SIZE(AA), %xmm0
1025#ifdef	movsd
1026	xorps	%xmm1, %xmm1
1027#endif
1028	movsd	 2 * SIZE(AA), %xmm1
1029#ifdef	movsd
1030	xorps	%xmm2, %xmm2
1031#endif
1032	movsd	 4 * SIZE(AA), %xmm2
1033#ifdef	movsd
1034	xorps	%xmm3, %xmm3
1035#endif
1036	movsd	 6 * SIZE(AA), %xmm3
1037
1038	subps	%xmm4, %xmm0
1039	subps	%xmm5, %xmm1
1040	subps	%xmm6, %xmm2
1041	subps	%xmm7, %xmm3
1042#endif
1043
1044#ifdef LN
1045	movaps	  0 * SIZE(AA), %xmm4
1046	pshufd	 $0xff, %xmm4, %xmm6
1047	mulps	 %xmm6, %xmm3
1048	pshufd	 $0xaa, %xmm4, %xmm6
1049	mulps	 %xmm3, %xmm6
1050	subps	 %xmm6, %xmm1
1051
1052	pshufd	 $0x00, %xmm4, %xmm6
1053	mulps	 %xmm6, %xmm1
1054#endif
1055
1056#ifdef LT
1057	movaps	 0 * SIZE(AA), %xmm4
1058	pshufd	 $0x00, %xmm4, %xmm6
1059	mulps	 %xmm6, %xmm1
1060
1061	pshufd	 $0x55, %xmm4, %xmm6
1062	mulps	 %xmm1, %xmm6
1063	subps	 %xmm6, %xmm3
1064
1065	pshufd	 $0xff, %xmm4, %xmm6
1066	mulps	 %xmm6, %xmm3
1067#endif
1068
1069#ifdef RN
1070	movaps	 0 * SIZE(B), %xmm6
1071	pshufd	 $0x00, %xmm6, %xmm7
1072	mulps	 %xmm7, %xmm0
1073	pshufd	 $0x55, %xmm6, %xmm7
1074	mulps	 %xmm0, %xmm7
1075	subps	 %xmm7, %xmm1
1076	pshufd	 $0xaa, %xmm6, %xmm7
1077	mulps	 %xmm0, %xmm7
1078	subps	 %xmm7, %xmm2
1079	pshufd	 $0xff, %xmm6, %xmm7
1080	mulps	 %xmm0, %xmm7
1081	subps	 %xmm7, %xmm3
1082
1083	movaps	 4 * SIZE(B), %xmm6
1084	pshufd	 $0x55, %xmm6, %xmm7
1085	mulps	 %xmm7, %xmm1
1086	pshufd	 $0xaa, %xmm6, %xmm7
1087	mulps	 %xmm1, %xmm7
1088	subps	 %xmm7, %xmm2
1089	pshufd	 $0xff, %xmm6, %xmm7
1090	mulps	 %xmm1, %xmm7
1091	subps	 %xmm7, %xmm3
1092
1093	movaps	 8 * SIZE(B), %xmm6
1094	pshufd	 $0xaa, %xmm6, %xmm7
1095	mulps	 %xmm7, %xmm2
1096	pshufd	 $0xff, %xmm6, %xmm7
1097	mulps	 %xmm2, %xmm7
1098	subps	 %xmm7, %xmm3
1099
1100	movaps	 12 * SIZE(B), %xmm6
1101	pshufd	 $0xff, %xmm6, %xmm7
1102	mulps	 %xmm7, %xmm3
1103#endif
1104
1105#ifdef RT
1106	movaps	 12 * SIZE(B), %xmm6
1107	pshufd	 $0xff, %xmm6, %xmm7
1108	mulps	 %xmm7, %xmm3
1109	pshufd	 $0xaa, %xmm6, %xmm7
1110	mulps	 %xmm3, %xmm7
1111	subps	 %xmm7, %xmm2
1112	pshufd	 $0x55, %xmm6, %xmm7
1113	mulps	 %xmm3, %xmm7
1114	subps	 %xmm7, %xmm1
1115	pshufd	 $0x00, %xmm6, %xmm7
1116	mulps	 %xmm3, %xmm7
1117	subps	 %xmm7, %xmm0
1118
1119	movaps	  8 * SIZE(B), %xmm6
1120	pshufd	 $0xaa, %xmm6, %xmm7
1121	mulps	 %xmm7, %xmm2
1122	pshufd	 $0x55, %xmm6, %xmm7
1123	mulps	 %xmm2, %xmm7
1124	subps	 %xmm7, %xmm1
1125	pshufd	 $0x00, %xmm6, %xmm7
1126	mulps	 %xmm2, %xmm7
1127	subps	 %xmm7, %xmm0
1128
1129	movaps	  4 * SIZE(B), %xmm6
1130	pshufd	 $0x55, %xmm6, %xmm7
1131	mulps	 %xmm7, %xmm1
1132	pshufd	 $0x00, %xmm6, %xmm7
1133	mulps	 %xmm1, %xmm7
1134	subps	 %xmm7, %xmm0
1135
1136	movaps	  0 * SIZE(B), %xmm6
1137	pshufd	 $0x00, %xmm6, %xmm7
1138	mulps	 %xmm7, %xmm0
1139#endif
1140
1141#if defined(LN) || defined(LT)
1142	movaps	%xmm1,   0 * SIZE(B)
1143	movaps	%xmm3,   4 * SIZE(B)
1144
1145	pshufd	$0x00, %xmm1, %xmm0
1146	pshufd	$0x55, %xmm1, %xmm2
1147	pshufd	$0xaa, %xmm1, %xmm4
1148	pshufd	$0xff, %xmm1, %xmm6
1149	movaps	%xmm0,   0 * SIZE(BB)
1150	movaps	%xmm2,   4 * SIZE(BB)
1151	movaps	%xmm4,   8 * SIZE(BB)
1152	movaps	%xmm6,  12 * SIZE(BB)
1153
1154	pshufd	$0x00, %xmm3, %xmm0
1155	pshufd	$0x55, %xmm3, %xmm2
1156	pshufd	$0xaa, %xmm3, %xmm4
1157	pshufd	$0xff, %xmm3, %xmm6
1158	movaps	%xmm0,  16 * SIZE(BB)
1159	movaps	%xmm2,  20 * SIZE(BB)
1160	movaps	%xmm4,  24 * SIZE(BB)
1161	movaps	%xmm6,  28 * SIZE(BB)
1162#else
1163	movlps	%xmm0,   0 * SIZE(AA)
1164	movlps	%xmm1,   2 * SIZE(AA)
1165	movlps	%xmm2,   4 * SIZE(AA)
1166	movlps	%xmm3,   6 * SIZE(AA)
1167#endif
1168
1169#ifdef LN
1170	subl	$2 * SIZE, CO1
1171#endif
1172
1173	leal	(LDC, LDC, 2), %eax
1174
1175#if defined(LN) || defined(LT)
1176	movaps	 %xmm1, %xmm0
1177	unpcklps %xmm5, %xmm1
1178	unpckhps %xmm5, %xmm0
1179
1180	movaps	 %xmm3, %xmm4
1181	unpcklps %xmm7, %xmm3
1182	unpckhps %xmm7, %xmm4
1183
1184	movaps	 %xmm1, %xmm2
1185	unpcklps %xmm3, %xmm1
1186	unpckhps %xmm3, %xmm2
1187
1188	movaps	 %xmm0, %xmm6
1189	unpcklps %xmm4, %xmm0
1190	unpckhps %xmm4, %xmm6
1191
1192	movlps	%xmm1, 0 * SIZE(CO1)
1193	movlps	%xmm2, 0 * SIZE(CO1, LDC, 1)
1194	movlps	%xmm0, 0 * SIZE(CO1, LDC, 2)
1195	movlps	%xmm6, 0 * SIZE(CO1, %eax, 1)
1196#else
1197	movlps	%xmm0, 0 * SIZE(CO1)
1198	movlps	%xmm1, 0 * SIZE(CO1, LDC, 1)
1199	movlps	%xmm2, 0 * SIZE(CO1, LDC, 2)
1200	movlps	%xmm3, 0 * SIZE(CO1, %eax, 1)
1201#endif
1202
1203#ifndef LN
1204	addl	$2 * SIZE, CO1
1205#endif
1206
1207#if defined(LT) || defined(RN)
1208	movl	K,  %eax
1209	subl	KK, %eax
1210	leal	(,%eax, SIZE), %eax
1211	leal	(AA, %eax, 2), AA
1212#ifdef LT
1213	addl	$8 * SIZE, B
1214#endif
1215#endif
1216
1217#ifdef LN
1218	subl	$2, KK
1219	movl	BORIG, B
1220#endif
1221
1222#ifdef LT
1223	addl	$2, KK
1224#endif
1225
1226#ifdef RT
1227	movl	K, %eax
1228	movl	BORIG, B
1229	sall	$1 + BASE_SHIFT, %eax
1230	addl	%eax, AORIG
1231#endif
1232	ALIGN_4
1233
1234.L30:
1235	movl	M,  %ebx
1236	sarl	$2, %ebx	# i = (m >> 2)
1237	jle	.L39
1238	ALIGN_4
1239
1240.L11:
1241#ifdef LN
1242       movl	K, %eax
1243       sall	$2 + BASE_SHIFT, %eax
1244       subl	%eax, AORIG
1245#endif
1246
1247#if defined(LN) || defined(RT)
1248	movl	KK, %eax
1249	movl	AORIG, AA
1250	leal	(, %eax, SIZE), %eax
1251	leal	(AA, %eax, 4), AA
1252#endif
1253
1254	leal	BUFFER, BB
1255
1256#if defined(LN) || defined(RT)
1257	movl	KK, %eax
1258	sall	$2 + BASE_SHIFT, %eax
1259	leal	(BB, %eax, 4), BB
1260#endif
1261
1262	movaps	 0 * SIZE(AA), %xmm0
1263	xorps	%xmm4, %xmm4
1264	movaps	16 * SIZE(AA), %xmm1
1265	xorps	%xmm5, %xmm5
1266	movaps	 0 * SIZE(BB), %xmm2
1267	xorps	%xmm6, %xmm6
1268	movaps	16 * SIZE(BB), %xmm3
1269	xorps	%xmm7, %xmm7
1270
1271	leal	(LDC, LDC, 2), %eax
1272
1273	PREFETCHW	-4 * SIZE(CO1)
1274	PREFETCHW	-4 * SIZE(CO1, LDC)
1275	PREFETCHW	-4 * SIZE(CO1, LDC, 2)
1276	PREFETCHW	-4 * SIZE(CO1, %eax)
1277
1278#if defined(LT) || defined(RN)
1279	movl	KK, %eax
1280#else
1281	movl	K, %eax
1282	subl	KK, %eax
1283#endif
1284	sarl	$3, %eax
1285	je	.L15
1286	ALIGN_4
1287
1288.L12:
1289	KERNEL1(0 * 16)
1290	KERNEL2(0 * 16)
1291	KERNEL3(0 * 16)
1292	KERNEL4(0 * 16)
1293	KERNEL5(0 * 16)
1294	KERNEL6(0 * 16)
1295	KERNEL7(0 * 16)
1296	KERNEL8(0 * 16)
1297
1298	addl   $128 * SIZE, BB
1299	addl   $32 * SIZE, AA
1300	decl   %eax
1301	jne    .L12
1302	ALIGN_4
1303
1304.L15:
1305#if defined(LT) || defined(RN)
1306	movl	KK, %eax
1307#else
1308	movl	K, %eax
1309	subl	KK, %eax
1310#endif
1311	andl	$7, %eax		# if (k & 1)
1312	BRANCH
1313	je .L18
1314	ALIGN_4
1315
1316.L16:
1317	mulps	%xmm0, %xmm2
1318	addps	%xmm2, %xmm4
1319	movaps	 4 * SIZE(BB), %xmm2
1320	mulps	%xmm0, %xmm2
1321	addps	%xmm2, %xmm5
1322	movaps	 8 * SIZE(BB), %xmm2
1323	mulps	%xmm0, %xmm2
1324	mulps	12 * SIZE(BB), %xmm0
1325	addps	%xmm2, %xmm6
1326	movaps	16 * SIZE(BB), %xmm2
1327	addps	%xmm0, %xmm7
1328	movaps	 4 * SIZE(AA), %xmm0
1329
1330	addl	$ 4 * SIZE, AA
1331	addl	$16 * SIZE, BB
1332	decl	%eax
1333	jg	.L16
1334	ALIGN_4
1335
/*
 * .L18: point AA/B/BB at the 4x4 diagonal block for this tile, then form
 * the right-hand-side residual (packed data minus the accumulated product).
 */
.L18:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax
#else
	subl	$4, %eax	/* same as LN branch: both tile dims are 4 here;
				   the #ifdef is kept for symmetry with the
				   rectangular tiles (e.g. .L58 uses 4 vs 2) */
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$2 + BASE_SHIFT, %eax	/* eax = (KK-4) * 4 * SIZE */
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB	/* BUFFER holds each B value broadcast x4 */
#endif

#if defined(LN) || defined(LT)
	/* 4x4 transpose of the accumulators via unpcklps/unpckhps so that
	   each xmm holds one row; then residual = packed B - A*B. */
	movaps	 %xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0

	movaps	 %xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1

	movaps	 %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

	movaps	 %xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2

	movaps	 0 * SIZE(B), %xmm1
	movaps	 4 * SIZE(B), %xmm3
	movaps	 8 * SIZE(B), %xmm5
	movaps	12 * SIZE(B), %xmm7

	subps	%xmm4,  %xmm1
	subps	%xmm6,  %xmm3
	subps	%xmm0,  %xmm5
	subps	%xmm2,  %xmm7
#else
	/* RN/RT: residual taken column-wise against the packed A tile. */
	movaps	 0 * SIZE(AA), %xmm0
	movaps	 4 * SIZE(AA), %xmm1
	movaps	 8 * SIZE(AA), %xmm2
	movaps	12 * SIZE(AA), %xmm3

	subps	%xmm4, %xmm0
	subps	%xmm5, %xmm1
	subps	%xmm6, %xmm2
	subps	%xmm7, %xmm3
#endif
1392
/*
 * Triangular substitution on the 4x4 tile.  Each step broadcasts one
 * element of the packed triangular factor with pshufd and eliminates it
 * from the remaining rows/columns.  Multiplying (not dividing) by the
 * diagonal element implies the packing routine stores diagonals
 * pre-inverted -- NOTE(review): confirm against the pack routine, which
 * is outside this chunk.
 */
#ifdef LN
	/* Lower, non-transposed: backward substitution, row 3 -> row 0. */
	movaps	 12 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm7		/* row3 *= 1/a33 */
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm7, %xmm6
	subps	 %xmm6, %xmm5		/* row2 -= a32 * row3 */
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm7, %xmm6
	subps	 %xmm6, %xmm3		/* row1 -= a31 * row3 */
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm7, %xmm6
	subps	 %xmm6, %xmm1		/* row0 -= a30 * row3 */

	movaps	  8 * SIZE(AA), %xmm4
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm6, %xmm5		/* row2 *= 1/a22 */
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm5, %xmm6
	subps	 %xmm6, %xmm3
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm5, %xmm6
	subps	 %xmm6, %xmm1

	movaps	  4 * SIZE(AA), %xmm4
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3		/* row1 *= 1/a11 */
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm1

	movaps	  0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1		/* row0 *= 1/a00 */
#endif

#ifdef LT
	/* Lower-transposed / upper: forward substitution, row 0 -> row 3. */
	movaps	 0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1

	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm3
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm7

	movaps	 4 * SIZE(AA), %xmm4
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm7

	movaps	 8 * SIZE(AA), %xmm4
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm5, %xmm6
	subps	 %xmm6, %xmm7

	movaps	12 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm7
#endif

#ifdef RN
	/* Right-side, non-transposed: forward over B's columns, 0 -> 3. */
	movaps	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
	pshufd	 $0x55, %xmm6, %xmm7
	mulps	 %xmm0, %xmm7
	subps	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulps	 %xmm0, %xmm7
	subps	 %xmm7, %xmm2
	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm0, %xmm7
	subps	 %xmm7, %xmm3

	movaps	 4 * SIZE(B), %xmm6
	pshufd	 $0x55, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulps	 %xmm1, %xmm7
	subps	 %xmm7, %xmm2
	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm1, %xmm7
	subps	 %xmm7, %xmm3

	movaps	 8 * SIZE(B), %xmm6
	pshufd	 $0xaa, %xmm6, %xmm7
	mulps	 %xmm7, %xmm2
	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm2, %xmm7
	subps	 %xmm7, %xmm3

	movaps	 12 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm3
#endif

#ifdef RT
	/* Right-side transposed: backward over B's columns, 3 -> 0. */
	movaps	 12 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm3
	pshufd	 $0xaa, %xmm6, %xmm7
	mulps	 %xmm3, %xmm7
	subps	 %xmm7, %xmm2
	pshufd	 $0x55, %xmm6, %xmm7
	mulps	 %xmm3, %xmm7
	subps	 %xmm7, %xmm1
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm3, %xmm7
	subps	 %xmm7, %xmm0

	movaps	  8 * SIZE(B), %xmm6
	pshufd	 $0xaa, %xmm6, %xmm7
	mulps	 %xmm7, %xmm2
	pshufd	 $0x55, %xmm6, %xmm7
	mulps	 %xmm2, %xmm7
	subps	 %xmm7, %xmm1
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm2, %xmm7
	subps	 %xmm7, %xmm0

	movaps	  4 * SIZE(B), %xmm6
	pshufd	 $0x55, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm1, %xmm7
	subps	 %xmm7, %xmm0

	movaps	  0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
#endif
1537
/*
 * Write the solved 4x4 tile back: (a) into the packed panel (B plus the
 * broadcast BUFFER for LN/LT, or packed A for RN/RT) so later tiles see
 * the updated values, and (b) into the C matrix; then advance pointers
 * and KK for the next i iteration.
 */
#if defined(LN) || defined(LT)
	movaps	%xmm1,   0 * SIZE(B)
	movaps	%xmm3,   4 * SIZE(B)
	movaps	%xmm5,   8 * SIZE(B)
	movaps	%xmm7,  12 * SIZE(B)

	/* Refresh BUFFER: each solved element re-broadcast to all 4 lanes. */
	pshufd	$0x00, %xmm1, %xmm0
	pshufd	$0x55, %xmm1, %xmm2
	pshufd	$0xaa, %xmm1, %xmm4
	pshufd	$0xff, %xmm1, %xmm6
	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm2,   4 * SIZE(BB)
	movaps	%xmm4,   8 * SIZE(BB)
	movaps	%xmm6,  12 * SIZE(BB)

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm2
	pshufd	$0xaa, %xmm3, %xmm4
	pshufd	$0xff, %xmm3, %xmm6
	movaps	%xmm0,  16 * SIZE(BB)
	movaps	%xmm2,  20 * SIZE(BB)
	movaps	%xmm4,  24 * SIZE(BB)
	movaps	%xmm6,  28 * SIZE(BB)

	pshufd	$0x00, %xmm5, %xmm0
	pshufd	$0x55, %xmm5, %xmm2
	pshufd	$0xaa, %xmm5, %xmm4
	pshufd	$0xff, %xmm5, %xmm6
	movaps	%xmm0,  32 * SIZE(BB)
	movaps	%xmm2,  36 * SIZE(BB)
	movaps	%xmm4,  40 * SIZE(BB)
	movaps	%xmm6,  44 * SIZE(BB)

	pshufd	$0x00, %xmm7, %xmm0
	pshufd	$0x55, %xmm7, %xmm2
	pshufd	$0xaa, %xmm7, %xmm4
	pshufd	$0xff, %xmm7, %xmm6
	movaps	%xmm0,  48 * SIZE(BB)
	movaps	%xmm2,  52 * SIZE(BB)
	movaps	%xmm4,  56 * SIZE(BB)
	movaps	%xmm6,  60 * SIZE(BB)
#else
	movaps	%xmm0,   0 * SIZE(AA)
	movaps	%xmm1,   4 * SIZE(AA)
	movaps	%xmm2,   8 * SIZE(AA)
	movaps	%xmm3,  12 * SIZE(AA)
#endif

#ifdef LN
	subl	$4 * SIZE, CO1		/* LN writes C right-to-left */
#endif

	leal	(LDC, LDC, 2), %eax	/* eax = 3 * LDC again (clobbered above) */

#if defined(LN) || defined(LT)
	/* Solved tile is row-major; transpose back before storing to C. */
	movaps	 %xmm1, %xmm0
	unpcklps %xmm5, %xmm1
	unpckhps %xmm5, %xmm0

	movaps	 %xmm3, %xmm4
	unpcklps %xmm7, %xmm3
	unpckhps %xmm7, %xmm4

	movaps	 %xmm1, %xmm2
	unpcklps %xmm3, %xmm1
	unpckhps %xmm3, %xmm2

	movaps	 %xmm0, %xmm6
	unpcklps %xmm4, %xmm0
	unpckhps %xmm4, %xmm6

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC, 1)
	movhps	%xmm2, 2 * SIZE(CO1, LDC, 1)
	movlps	%xmm0, 0 * SIZE(CO1, LDC, 2)
	movhps	%xmm0, 2 * SIZE(CO1, LDC, 2)
	movlps	%xmm6, 0 * SIZE(CO1, %eax, 1)
	movhps	%xmm6, 2 * SIZE(CO1, %eax, 1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movlps	%xmm1, 0 * SIZE(CO1, LDC, 1)
	movhps	%xmm1, 2 * SIZE(CO1, LDC, 1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC, 2)
	movhps	%xmm2, 2 * SIZE(CO1, LDC, 2)
	movlps	%xmm3, 0 * SIZE(CO1, %eax, 1)
	movhps	%xmm3, 2 * SIZE(CO1, %eax, 1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 4), AA	/* skip the unprocessed (K-KK) k-steps of A */
#ifdef LT
	addl	$16 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$4, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$4, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$2 + BASE_SHIFT, %eax
	addl	%eax, AORIG		/* next 4-row strip of A */
#endif

	decl	%ebx			# i --
	jg	.L11
	ALIGN_4
1661
/*
 * .L39: end of one N=4 column panel -- advance B past the panel, adjust
 * KK for the offset bookkeeping, and iterate the j loop.
 */
.L39:
#ifdef LN
       movl	K, %eax
       leal	(, %eax, SIZE), %eax
       leal 	(B, %eax, 4), B		/* B += K * 4 * SIZE (whole panel) */
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(B,  %eax, 4), B	/* skip remaining (K-KK) rows of the panel */
#endif

#ifdef RN
	addl	$4, KK
#endif

#ifdef RT
	subl	$4, KK
#endif

	decl	J			# j --
	jg	.L01
	ALIGN_4

/*
 * .L40: N & 2 panel -- same structure as the N=4 path but with two
 * columns of B.  Re-initialize KK and position B/BB for this panel.
 */
.L40:
	testl	$2, N
	je	.L80

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		/* LN starts with KK = M + OFFSET */
#endif

	leal	BUFFER, %ecx		/* ecx doubles as the BB copy cursor below */

#ifdef RT
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, B			/* RT walks the B panels backwards */
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	B, BORIG
        sall	$1 + BASE_SHIFT, %eax	/* eax = KK * 2 * SIZE */
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB	/* broadcast storage is 4x as large */
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax		/* copy loop handles 4 k-steps at a time */
	jle	.L45
	ALIGN_4
1728
/*
 * .L42: expand the 2-column B panel into BUFFER, broadcasting every
 * element to all 4 SSE lanes.  Per iteration: 8 source values (4 k-steps
 * x 2 columns) -> 32*SIZE of BUFFER.
 */
.L42:
	movaps	 0 * SIZE(B), %xmm3
	movaps	 4 * SIZE(B), %xmm7

	pshufd	 $0x00, %xmm3, %xmm0
	pshufd	 $0x55, %xmm3, %xmm1
	pshufd	 $0xaa, %xmm3, %xmm2
	pshufd	 $0xff, %xmm3, %xmm3

	pshufd	 $0x00, %xmm7, %xmm4
	pshufd	 $0x55, %xmm7, %xmm5
	pshufd	 $0xaa, %xmm7, %xmm6
	pshufd	 $0xff, %xmm7, %xmm7

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)
	movaps	%xmm4, 16 * SIZE(BB)
	movaps	%xmm5, 20 * SIZE(BB)
	movaps	%xmm6, 24 * SIZE(BB)
	movaps	%xmm7, 28 * SIZE(BB)

	addl	$ 8 * SIZE, B
	addl	$32 * SIZE, %ecx	/* ecx is the BB write cursor (leal BUFFER above) */
	decl	%eax
	jne	.L42
	ALIGN_4

/* .L45/.L46: copy the remaining (k & 3) k-steps, 2 values each. */
.L45:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$3, %eax		# remainder = k & 3
	BRANCH
	jle	.L50
	ALIGN_4

.L46:
#ifdef	movsd
	xorps	%xmm3, %xmm3	/* when movsd is redefined (macro), clear the
				   destination first -- NOTE(review): presumably
				   avoids stale upper lanes / false dependency */
#endif
	movsd	 0 * SIZE(B), %xmm3

	pshufd	 $0x00, %xmm3, %xmm0
	pshufd	 $0x55, %xmm3, %xmm1

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$8 * SIZE, %ecx
	decl	%eax
	jne	.L46
	ALIGN_4

/* .L50: set up A and C pointers for the N=2 panel (C spans 2 columns). */
.L50:
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

	leal	(, LDC, 2), %eax	/* panel width in C = 2 * LDC */

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	addl	%eax, C
#endif

	testl	$1, M
	je	.L60
1808
/*
 * M & 1 tail of the N=2 panel: scalar (1 x 2) micro-kernel.  Two
 * accumulator pairs (xmm4/xmm6 and xmm5/xmm7) are interleaved for ILP
 * and summed at .L78.
 */
#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG		/* step back one 1-row strip of A */
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA	/* AA = AORIG + KK * SIZE */
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	/* BB = BUFFER + KK * 8 * SIZE (2 bcast values/k) */
#endif

	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movss	 0 * SIZE(AA), %xmm0
	movss	 4 * SIZE(AA), %xmm1
	movss	 0 * SIZE(BB), %xmm2
	movss	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* 8 k-steps per main-loop iteration */
	je	.L75
	ALIGN_4

/* 8-way unrolled scalar loop: 8*SIZE of A, 64*SIZE of BB per iteration. */
.L72:
	mulss	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 1 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm2
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm6
	movss	32 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm7
	movss	 2 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	20 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm4
	movss	24 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm5
	movss	 3 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	28 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm6
	movss	48 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm2
	mulss	36 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm4
	movss	40 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm5
	movss	 5 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm2
	mulss	44 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm6
	movss	64 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm7
	movss	 6 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	52 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm4
	movss	56 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm5
	movss	 7 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	60 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm6
	movss	80 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	addl	$ 8 * SIZE, AA
	addl	$64 * SIZE, BB
	decl   %eax
	jne    .L72
	ALIGN_4

/* .L75/.L76: remaining (k & 7) scalar k-steps. */
.L75:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder = k & 7
	BRANCH
	je .L78
	ALIGN_4

.L76:
	mulss	%xmm0, %xmm2
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 1 * SIZE(AA), %xmm0

	addl	$ 1 * SIZE, AA
	addl	$ 8 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4
1933
/*
 * .L78: fold the interleaved accumulators, rewind pointers to the
 * diagonal block, solve the 1x2 tile, and write it back to the packed
 * buffers and to C.
 */
.L78:
	addss	%xmm6, %xmm4		/* merge the two accumulator chains */
	addss	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax		/* 1-row A tile */
#else
	subl	$2, %eax		/* 2-column B tile */
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB	/* broadcast buffer: 8*SIZE per k */
#endif

#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm4		/* pack the two results into one xmm */

#ifdef	movsd
	xorps	%xmm1, %xmm1	/* clear before macro-redefined movsd (partial write) */
#endif
	movsd	 0 * SIZE(B), %xmm1

	subps	%xmm4,  %xmm1		/* residual = packed B - product */
#else
	movss	 0 * SIZE(AA), %xmm0
	movss	 1 * SIZE(AA), %xmm1

	subss	%xmm4, %xmm0
	subss	%xmm5, %xmm1
#endif

#if defined(LN) || defined(LT)
	/* 1x1 A tile: just scale by the (pre-inverted -- see pack routine)
	   diagonal element. */
	movss	  0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1
#endif

#ifdef RN
	/* 2x2 upper-triangular B: forward substitution. */
	movaps	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulss	 %xmm7, %xmm0
	pshufd	 $0x55, %xmm6, %xmm7
	mulss	 %xmm0, %xmm7
	subss	 %xmm7, %xmm1

	pshufd	 $0xff, %xmm6, %xmm7
	mulss	 %xmm7, %xmm1
#endif

#ifdef RT
	/* 2x2 tile, backward substitution. */
	movaps	  0 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulss	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulss	 %xmm1, %xmm7
	subss	 %xmm7, %xmm0

	pshufd	 $0x00, %xmm6, %xmm7
	mulss	 %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
	movlps	%xmm1,   0 * SIZE(B)

	/* Refresh the broadcast copies in BUFFER. */
	pshufd	$0x00, %xmm1, %xmm0
	pshufd	$0x55, %xmm1, %xmm2
	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm2,   4 * SIZE(BB)
#else
	movss	%xmm0,   0 * SIZE(AA)
	movss	%xmm1,   1 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	pshufd	$1, %xmm1, %xmm3	/* xmm3 = element 1 (second column's value) */

	movss	%xmm1, 0 * SIZE(CO1)
	movss	%xmm3, 0 * SIZE(CO1, LDC)
#else
	movss	%xmm0, 0 * SIZE(CO1)
	movss	%xmm1, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(AA, %eax, SIZE), AA
#ifdef LT
	addl	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4

2058
2059
/*
 * .L60: M & 2 tail of the N=2 panel -- 2x2 micro-kernel.  Uses movsd to
 * load 2 A values into the low half of an xmm; accumulators xmm4..xmm7
 * are folded at .L68.
 */
.L60:
	testl	$2, M
	je	.L70

#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG		/* step back one 2-row strip */
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	/* AA = AORIG + KK * 2 * SIZE */
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	/* BB = BUFFER + KK * 8 * SIZE */
#endif

	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

#ifdef	movsd
	xorps	%xmm0, %xmm0	/* clear before macro-redefined movsd (partial write) */
#endif
	movsd	 0 * SIZE(AA), %xmm0
#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* 8 k-steps per main-loop iteration */
	je	.L65
	ALIGN_4

/* 8-way unrolled loop: 16*SIZE of A, 64*SIZE of BB per iteration. */
.L62:
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif

	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2

	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2

	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3

	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3

	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4
	movaps	36 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	40 * SIZE(BB), %xmm2

	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm6
	movaps	44 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	64 * SIZE(BB), %xmm2

	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4
	movaps	52 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	56 * SIZE(BB), %xmm3

	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm6
	movaps	60 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	80 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA
	addl	$64 * SIZE, BB
	decl   %eax
	jne    .L62
	ALIGN_4

/* .L65/.L66: remaining (k & 7) k-steps, one at a time. */
.L65:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder = k & 7
	BRANCH
	je .L68
	ALIGN_4

.L66:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L66
	ALIGN_4
2212
/*
 * .L68: fold accumulators, rewind to the 2x2 diagonal block, run the
 * substitution for the active variant, and write back to the packed
 * buffers and C.
 */
.L68:
	addps	%xmm6, %xmm4		/* merge accumulator chains */
	addps	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$2, %eax	/* identical to LN: both tile dims are 2;
				   #ifdef kept for symmetry with other tiles */
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA	/* AA += (KK-2) * 2 * SIZE */
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB	/* broadcast buffer: 8*SIZE per k */
#endif

#if defined(LN) || defined(LT)
	/* Transpose the 2x2 accumulator and form residual against packed B. */
	unpcklps %xmm6, %xmm4
	unpcklps %xmm7, %xmm5

	movaps	 %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

#ifdef	movsd
	xorps	%xmm1, %xmm1	/* clear before macro-redefined movsd (partial write) */
#endif
	movsd	 0 * SIZE(B), %xmm1
#ifdef	movsd
	xorps	%xmm3, %xmm3
#endif
	movsd	 2 * SIZE(B), %xmm3

	subps	%xmm4,  %xmm1
	subps	%xmm6,  %xmm3
#else
#ifdef	movsd
	xorps	%xmm0, %xmm0
#endif
	movsd	 0 * SIZE(AA), %xmm0
#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 2 * SIZE(AA), %xmm1

	subps	%xmm4, %xmm0
	subps	%xmm5, %xmm1
#endif

#ifdef LN
	/* 2x2 lower tile: backward substitution (row 1 then row 0). */
	movaps	  0 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm1

	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1
#endif

#ifdef LT
	/* 2x2 tile: forward substitution (row 0 then row 1). */
	movaps	 0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm3

	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3
#endif

#ifdef RN
	/* 2x2 B tile: forward substitution over columns. */
	movaps	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
	pshufd	 $0x55, %xmm6, %xmm7
	mulps	 %xmm0, %xmm7
	subps	 %xmm7, %xmm1

	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
#endif

#ifdef RT
	/* 2x2 B tile: backward substitution over columns. */
	movaps	  0 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulps	 %xmm1, %xmm7
	subps	 %xmm7, %xmm0

	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
	movlps	%xmm1,   0 * SIZE(B)
	movlps	%xmm3,   2 * SIZE(B)

	/* Refresh broadcast copies in BUFFER. */
	pshufd	$0x00, %xmm1, %xmm0
	pshufd	$0x55, %xmm1, %xmm2
	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm2,   4 * SIZE(BB)

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm2
	movaps	%xmm0,   8 * SIZE(BB)
	movaps	%xmm2,  12 * SIZE(BB)
#else
	movlps	%xmm0,   0 * SIZE(AA)
	movlps	%xmm1,   2 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1		/* LN writes C right-to-left */
#endif

#if defined(LN) || defined(LT)
	unpcklps %xmm3, %xmm1		/* interleave back to column order */

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 0 * SIZE(CO1, LDC)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movlps	%xmm1, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#ifdef LT
	addl	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$2, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
2378
/*
 * .L70/.L51: main i-loop of the N=2 panel -- 4x2 micro-tiles.  Both
 * product columns accumulate into xmm4/xmm5.
 */
.L70:
	movl	M,  %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L79
	ALIGN_4

.L51:
#ifdef LN
       movl	K, %eax
       sall	$2 + BASE_SHIFT, %eax
       subl	%eax, AORIG		/* step back one 4-row strip */
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA	/* AA = AORIG + KK * 4 * SIZE */
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	/* BB = BUFFER + KK * 8 * SIZE */
#endif

	xorps	%xmm4, %xmm4
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movaps	 0 * SIZE(AA), %xmm0
	movaps	16 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3

	PREFETCHW	-4 * SIZE(CO1)
	PREFETCHW	-4 * SIZE(CO1, LDC)

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* 8 k-steps per main-loop iteration */
	je	.L55
	ALIGN_4

/* 8-way unrolled loop: 32*SIZE of A, 64*SIZE of BB per iteration. */
.L52:
	mulps	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0

	mulps	%xmm0, %xmm2
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 8 * SIZE(AA), %xmm0

	mulps	%xmm0, %xmm3
	mulps	20 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movaps	12 * SIZE(AA), %xmm0

	mulps	%xmm0, %xmm3
	mulps	28 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	48 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movaps	32 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
#endif
	mulps	%xmm1, %xmm2
	mulps	36 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	40 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	20 * SIZE(AA), %xmm1

	mulps	%xmm1, %xmm2
	mulps	44 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	24 * SIZE(AA), %xmm1

	mulps	%xmm1, %xmm3
	mulps	52 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1

	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	80 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm5
	movaps	48 * SIZE(AA), %xmm1

	addl	$32 * SIZE, AA
	addl	$64 * SIZE, BB
	decl   %eax
	jne    .L52
	ALIGN_4

/* .L55/.L56: remaining (k & 7) k-steps. */
.L55:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder = k & 7
	BRANCH
	je .L58
	ALIGN_4

.L56:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L56
	ALIGN_4
2524
/*
 * .L58: rewind to the diagonal block for the 4x2 tile, build the
 * residual, then run the substitution for the active variant.
 */
.L58:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax		/* 4-row A tile */
#else
	subl	$2, %eax		/* 2-column B tile */
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$1 + BASE_SHIFT, %eax	/* eax = offset * 2 * SIZE */
	leal	(AA, %eax, 2), AA	/* A stride per k is 4 * SIZE */
	leal	(B,  %eax, 1), B	/* B stride per k is 2 * SIZE */
	leal	(BB, %eax, 4), BB	/* BB stride per k is 8 * SIZE */
#endif

#if defined(LN) || defined(LT)
	/* Transpose the 4x2 accumulator so each xmm low half is one row,
	   then residual = packed B - product. */
	movaps	 %xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0

	movaps	 %xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1

	movaps	 %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

	movaps	 %xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2

#ifdef	movsd
	xorps	%xmm1, %xmm1	/* clear before macro-redefined movsd (partial write) */
#endif
	movsd	 0 * SIZE(B), %xmm1
#ifdef	movsd
	xorps	%xmm3, %xmm3
#endif
	movsd	 2 * SIZE(B), %xmm3
#ifdef	movsd
	xorps	%xmm5, %xmm5
#endif
	movsd	 4 * SIZE(B), %xmm5
#ifdef	movsd
	xorps	%xmm7, %xmm7
#endif
	movsd	 6 * SIZE(B), %xmm7

	subps	%xmm4,  %xmm1
	subps	%xmm6,  %xmm3
	subps	%xmm0,  %xmm5
	subps	%xmm2,  %xmm7
#else
	movaps	 0 * SIZE(AA), %xmm0
	movaps	 4 * SIZE(AA), %xmm1

	subps	%xmm4, %xmm0
	subps	%xmm5, %xmm1
#endif

#ifdef LN
	/* 4x4 lower A tile: backward substitution, row 3 -> row 0
	   (same pattern as the 4x4 case at .L18). */
	movaps	 12 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm7
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm7, %xmm6
	subps	 %xmm6, %xmm5
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm7, %xmm6
	subps	 %xmm6, %xmm3
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm7, %xmm6
	subps	 %xmm6, %xmm1

	movaps	  8 * SIZE(AA), %xmm4
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm6, %xmm5
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm5, %xmm6
	subps	 %xmm6, %xmm3
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm5, %xmm6
	subps	 %xmm6, %xmm1

	movaps	  4 * SIZE(AA), %xmm4
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm1

	movaps	  0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1
#endif

#ifdef LT
	/* Forward substitution, row 0 -> row 3. */
	movaps	 0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulps	 %xmm6, %xmm1

	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm3
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm1, %xmm6
	subps	 %xmm6, %xmm7

	movaps	 4 * SIZE(AA), %xmm4
	pshufd	 $0x55, %xmm4, %xmm6
	mulps	 %xmm6, %xmm3
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm3, %xmm6
	subps	 %xmm6, %xmm7

	movaps	 8 * SIZE(AA), %xmm4
	pshufd	 $0xaa, %xmm4, %xmm6
	mulps	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm5, %xmm6
	subps	 %xmm6, %xmm7

	movaps	12 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulps	 %xmm6, %xmm7
#endif

#ifdef RN
	/* 2x2 B tile: forward substitution over columns. */
	movaps	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
	pshufd	 $0x55, %xmm6, %xmm7
	mulps	 %xmm0, %xmm7
	subps	 %xmm7, %xmm1

	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
#endif

#ifdef RT
	/* 2x2 B tile: backward substitution over columns. */
	movaps	  0 * SIZE(B), %xmm6
	pshufd	 $0xff, %xmm6, %xmm7
	mulps	 %xmm7, %xmm1
	pshufd	 $0xaa, %xmm6, %xmm7
	mulps	 %xmm1, %xmm7
	subps	 %xmm7, %xmm0

	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
#endif
2686
/*
 * Write the solved 4x2 tile back to the packed buffers (B + broadcast
 * BUFFER for LN/LT, packed A for RN/RT), then to C, then advance
 * pointers/KK and iterate the i loop.
 */
#if defined(LN) || defined(LT)
	movlps	%xmm1,   0 * SIZE(B)
	movlps	%xmm3,   2 * SIZE(B)
	movlps	%xmm5,   4 * SIZE(B)
	movlps	%xmm7,   6 * SIZE(B)

	/* Refresh broadcast copies in BUFFER (2 values per row). */
	pshufd	$0x00, %xmm1, %xmm0
	pshufd	$0x55, %xmm1, %xmm2
	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm2,   4 * SIZE(BB)

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm2
	movaps	%xmm0,   8 * SIZE(BB)
	movaps	%xmm2,  12 * SIZE(BB)

	pshufd	$0x00, %xmm5, %xmm0
	pshufd	$0x55, %xmm5, %xmm2
	movaps	%xmm0,  16 * SIZE(BB)
	movaps	%xmm2,  20 * SIZE(BB)

	pshufd	$0x00, %xmm7, %xmm0
	pshufd	$0x55, %xmm7, %xmm2
	movaps	%xmm0,  24 * SIZE(BB)
	movaps	%xmm2,  28 * SIZE(BB)
#else
	movaps	%xmm0,   0 * SIZE(AA)
	movaps	%xmm1,   4 * SIZE(AA)
#endif

#ifdef LN
	subl	$4 * SIZE, CO1		/* LN writes C right-to-left */
#endif

#if defined(LN) || defined(LT)
	/* Re-interleave row-major results into the two C columns. */
	unpcklps %xmm5, %xmm1
	unpcklps %xmm7, %xmm3

	movaps	 %xmm1, %xmm2
	unpcklps %xmm3, %xmm1
	unpckhps %xmm3, %xmm2

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC, 1)
	movhps	%xmm2, 2 * SIZE(CO1, LDC, 1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movlps	%xmm1, 0 * SIZE(CO1, LDC, 1)
	movhps	%xmm1, 2 * SIZE(CO1, LDC, 1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
#ifdef LT
	addl	$8 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$4, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$4, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$2 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L51
	ALIGN_4
2773
/*
 * .L79: end of the N=2 panel -- advance B past it and adjust KK.
 */
.L79:
#ifdef LN
       movl	K, %eax
       leal	(, %eax, SIZE), %eax
       leal 	(B, %eax, 2), B		/* B += K * 2 * SIZE (whole panel) */
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(B,  %eax, 2), B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif
	ALIGN_4

/*
 * .L80: N & 1 panel -- single-column variant; same setup scheme as the
 * N=4 and N=2 panels with stride 1*SIZE per k in B.
 */
.L80:
	testl	$1, N
	je	.L999

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		/* LN starts with KK = M + OFFSET */
#endif

	leal	BUFFER, %ecx		/* BB copy cursor */

#ifdef RT
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, B			/* RT walks B panels backwards */
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	B, BORIG
        sall	$BASE_SHIFT, %eax	/* eax = KK * SIZE */
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB	/* broadcast storage is 4x as large */
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* copy loop handles 8 k-steps at a time */
	jle	.L85
	ALIGN_4
2837
.L82:
	/* Broadcast-copy 8 scalars of B into BB: each element is splatted
	   across a full SSE register so the kernels can use movaps loads. */
	movsd	 0 * SIZE(B), %xmm3
	movhps	 2 * SIZE(B), %xmm3
	movsd	 4 * SIZE(B), %xmm7
	movhps	 6 * SIZE(B), %xmm7

	pshufd	 $0x00, %xmm3, %xmm0	# splat element 0
	pshufd	 $0x55, %xmm3, %xmm1	# splat element 1
	pshufd	 $0xaa, %xmm3, %xmm2	# splat element 2
	pshufd	 $0xff, %xmm3, %xmm3	# splat element 3

	pshufd	 $0x00, %xmm7, %xmm4	# elements 4..7 likewise
	pshufd	 $0x55, %xmm7, %xmm5
	pshufd	 $0xaa, %xmm7, %xmm6
	pshufd	 $0xff, %xmm7, %xmm7

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)
	movaps	%xmm4, 16 * SIZE(BB)
	movaps	%xmm5, 20 * SIZE(BB)
	movaps	%xmm6, 24 * SIZE(BB)
	movaps	%xmm7, 28 * SIZE(BB)

	addl	$ 8 * SIZE, B
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L82
	ALIGN_4

.L85:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	jle	.L90
	ALIGN_4

.L86:
	/* Remainder: broadcast the leftover (count mod 8) elements one by one. */
	movss	 0 * SIZE(B), %xmm3

	pshufd	 $0x00, %xmm3, %xmm0

	movaps	%xmm0,  0 * SIZE(BB)

	addl	$1 * SIZE, B
	addl	$4 * SIZE, BB
	decl	%eax
	jne	.L86
	ALIGN_4
2893
.L90:
#if defined(LT) || defined(RN)
	movl	A, AA			# forward solves walk A from the top
#else
	movl	A, %eax			# backward solves re-derive AA per tile via AORIG
	movl	%eax, AORIG
#endif

#ifdef RT
	subl	LDC, C			# RT: move to the previous C column first
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

	testl	$1, M			# handle an odd trailing row first
	je	.L100

/* ---- 1x1 micro-kernel: one row of A against the single B column. ---- */

#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG		# step AORIG back one K-long row panel
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA	# skip the first KK elements of the row
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	# 4 broadcast lanes per B element
#endif

	xorps	%xmm4, %xmm4		# clear the four dot-product accumulators
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movss	 0 * SIZE(AA), %xmm0	# preload A and BB for the pipelined loop
	movss	 4 * SIZE(AA), %xmm1
	movss	 0 * SIZE(BB), %xmm2
	movss	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# 8 k-steps per unrolled pass
	je	.L115
	ALIGN_4
2952
.L112:
	/* 1x1 kernel, unrolled by 8 in K: scalar multiply-accumulate into
	   xmm4..xmm7 (four independent chains to hide addss latency). */
	mulss	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	movss	 1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	32 * SIZE(BB), %xmm2	# preload for the next pass
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm5
	movss	 2 * SIZE(AA), %xmm0
	mulss	 8 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm6
	movss	 3 * SIZE(AA), %xmm0
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm3
	movss	 5 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm4
	movss	48 * SIZE(BB), %xmm3	# preload for the next pass
	mulss	20 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm5
	movss	 6 * SIZE(AA), %xmm1
	mulss	24 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm6
	movss	 7 * SIZE(AA), %xmm1
	mulss	28 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	addl	$ 8 * SIZE, AA
	addl	$32 * SIZE, BB
	decl   %eax
	jne    .L112
	ALIGN_4

.L115:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L118
	ALIGN_4

.L116:
	/* Remainder: one k-step at a time. */
	mulss	%xmm0, %xmm2
	movss	 1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	movss	 4 * SIZE(BB), %xmm2

	addl	$ 1 * SIZE, AA
	addl	$ 4 * SIZE, BB
	decl	%eax
	jg	.L116
	ALIGN_4
3013
.L118:
	addss	%xmm5, %xmm4		# fold the four accumulator chains
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4		# xmm4 = full dot product

#if defined(LN) || defined(RT)
	/* Backward solves: point AA/B/BB at the diagonal element (index KK-1). */
	movl	KK, %eax
	subl	$1, %eax

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	movss	 0 * SIZE(B), %xmm1	# x = rhs - dot
	subss	%xmm4,  %xmm1
#else
	movss	 0 * SIZE(AA), %xmm0
	subss	%xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	/* Multiply by the diagonal entry of A (diagonal entries appear to be
	   stored pre-inverted, so this stands in for the division). */
	mulss	 0 * SIZE(AA), %xmm1
#endif

#if defined(RN) || defined(RT)
	mulss	 0 * SIZE(B), %xmm0	# same, with the diagonal taken from B
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1,   0 * SIZE(B)	# write the result back to packed B...

	pshufd	$0x00, %xmm1, %xmm0	# ...and refresh the broadcast copy in BB
	movaps	%xmm0,   0 * SIZE(BB)
#else
	movss	%xmm0,   0 * SIZE(AA)	# write the result back to packed A
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)	# store the result to C
#else
	movss	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(AA, %eax, SIZE), AA	# skip the untouched tail of this A row
#ifdef LT
	addl	$1 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG		# advance to the next row panel of A
#endif
	ALIGN_4
3097
.L100:
	testl	$2, M
	je	.L110

/* ---- 2x1 micro-kernel: two rows of A against the single B column. ---- */

#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG		# step AORIG back one K-long 2-row panel
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	# skip the first KK element-pairs
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	# 4 broadcast lanes per B element
#endif

	xorps	%xmm4, %xmm4		# clear accumulators
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

#ifdef	movsd
	xorps	%xmm0, %xmm0		# when movsd is macro-overridden, clear first
#endif
	movsd	 0 * SIZE(AA), %xmm0	# preload for the pipelined loop
#ifdef	movsd
	xorps	%xmm1, %xmm1
#endif
	movsd	 8 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# 8 k-steps per unrolled pass
	je	.L105
	ALIGN_4
3148
.L102:
	/* 2x1 kernel, unrolled by 8 in K: a 2-float slice of A times the
	   broadcast B element, accumulated pairwise in the low lanes. */
	mulps	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5
	movaps	 8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm6
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	16 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7
	movaps	32 * SIZE(BB), %xmm2	# preload for the next pass
	mulps	%xmm1, %xmm3
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm6
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3	# preload for the next pass

	addl	$16 * SIZE, AA
	addl	$32 * SIZE, BB
	decl   %eax
	jne    .L102
	ALIGN_4

.L105:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L108
	ALIGN_4

.L106:
	/* Remainder: one k-step at a time. */
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movsd	 2 * SIZE(AA), %xmm0
	movaps	 4 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L106
	ALIGN_4
3215
.L108:
	addps	%xmm5, %xmm4		# fold the accumulators; lanes 0/1 of
	addps	%xmm7, %xmm6		# xmm4 hold the two dot products
	addps	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	/* Backward solves: rebase AA/B/BB at the diagonal block. */
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	pshufd	$1, %xmm4, %xmm6	# xmm6 = second dot product

	movss	 0 * SIZE(B), %xmm1
	movss	 1 * SIZE(B), %xmm3

	subss	%xmm4,  %xmm1		# rhs -= dot
	subss	%xmm6,  %xmm3
#else
#ifdef	movsd
	xorps	%xmm0, %xmm0		# when movsd is macro-overridden, clear first
#endif
	movsd	 0 * SIZE(AA), %xmm0

	subps	%xmm4, %xmm0		# rhs pair -= dot pair
#endif

#ifdef LN
	/* Backward substitution with the packed 2x2 diagonal block of A
	   (diagonal entries appear to be stored pre-inverted, so a multiply
	   stands in for the divide). */
	movaps	  0 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm6, %xmm3		# x1 *= inv(diag1)
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm3, %xmm6
	subss	 %xmm6, %xmm1		# eliminate x1 from the x0 equation

	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm6, %xmm1		# x0 *= inv(diag0)
#endif

#ifdef LT
	/* Forward substitution: solve x0 first, then eliminate it from x1. */
	movaps	 0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm6, %xmm1		# x0 *= inv(diag0)
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm1, %xmm6
	subss	 %xmm6, %xmm3		# eliminate x0 from the x1 equation

	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm6, %xmm3		# x1 *= inv(diag1)
#endif

#if defined(RN) || defined(RT)
	/* Right-side solve with a 1x1 B diagonal: scale both results. */
	movss	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1,   0 * SIZE(B)	# write results back to packed B...
	movss	%xmm3,   1 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0	# ...and refresh the broadcast copies in BB
	movaps	%xmm0,   0 * SIZE(BB)
	pshufd	$0x00, %xmm3, %xmm0
	movaps	%xmm0,   4 * SIZE(BB)
#else
	movlps	%xmm0,   0 * SIZE(AA)	# write both results back to packed A
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)	# store the 2-row result to C
	movss	%xmm3, 1 * SIZE(CO1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	# skip the untouched tail of this panel
#ifdef LT
	addl	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$2, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG		# advance to the next 2-row panel of A
#endif
	ALIGN_4
3339
.L110:
	movl	M,  %ebx
	sarl	$2, %ebx	# i = (m >> 2)
	jle	.L119
	ALIGN_4

/* ---- 4x1 micro-kernel: four rows of A against the single B column. ---- */

.L91:
#ifdef LN
       movl	K, %eax
       sall	$2 + BASE_SHIFT, %eax
       subl	%eax, AORIG		# step AORIG back one K-long 4-row panel
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 4), AA	# skip the first KK 4-element columns
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	# 4 broadcast lanes per B element
#endif

	xorps	%xmm4, %xmm4		# clear accumulators
	xorps	%xmm5, %xmm5
	xorps	%xmm6, %xmm6
	xorps	%xmm7, %xmm7

	movaps	 0 * SIZE(AA), %xmm0	# preload for the pipelined loop
	movaps	16 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3

	PREFETCHW	-4 * SIZE(CO1)	# warm the C destination line

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# 8 k-steps per unrolled pass
	je	.L95
	ALIGN_4
3389
.L92:
	/* 4x1 kernel, unrolled by 8 in K: a 4-float column of A times the
	   broadcast B element, accumulated into four independent sums. */
	mulps	%xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 0) * SIZE(AA)
#endif
	movaps	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2	# preload for the next pass
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm5
	movaps	 8 * SIZE(AA), %xmm0
	mulps	 8 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm6
	movaps	12 * SIZE(AA), %xmm0
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
	prefetcht0 (PREFETCHSIZE  + 16) * SIZE(AA)
#endif
	mulps	%xmm1, %xmm3
	movaps	20 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm4
	movaps	48 * SIZE(BB), %xmm3	# preload for the next pass
	mulps	20 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm5
	movaps	24 * SIZE(AA), %xmm1
	mulps	24 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm6
	movaps	28 * SIZE(AA), %xmm1
	mulps	28 * SIZE(BB), %xmm1
	addps	%xmm1, %xmm7
	movaps	48 * SIZE(AA), %xmm1

	addl	$32 * SIZE, AA
	addl	$32 * SIZE, BB
	decl   %eax
	jne    .L92
	ALIGN_4

.L95:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L98
	ALIGN_4

.L96:
	/* Remainder: one k-step at a time. */
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 4 * SIZE(AA), %xmm0
	movaps	 4 * SIZE(BB), %xmm2

	addl	$4 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L96
	ALIGN_4
3453
.L98:
	addps	%xmm5, %xmm4		# fold accumulators; lanes 0..3 of
	addps	%xmm7, %xmm6		# xmm4 hold the four dot products
	addps	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	/* Backward solves: rebase AA/B/BB at the diagonal block. */
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 4), AA
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	/* Spread the four sums into separate scalars, then rhs -= dot. */
	movaps	 %xmm4, %xmm0
	unpcklps %xmm6, %xmm4
	unpckhps %xmm6, %xmm0

	movaps	 %xmm5, %xmm1
	unpcklps %xmm7, %xmm5
	unpckhps %xmm7, %xmm1

	movaps	 %xmm4, %xmm6
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm6

	movaps	 %xmm0, %xmm2
	unpcklps %xmm1, %xmm0
	unpckhps %xmm1, %xmm2

	movss	 0 * SIZE(B), %xmm1
	movss	 1 * SIZE(B), %xmm3
	movss	 2 * SIZE(B), %xmm5
	movss	 3 * SIZE(B), %xmm7

	subss	%xmm4,  %xmm1
	subss	%xmm6,  %xmm3
	subss	%xmm0,  %xmm5
	subss	%xmm2,  %xmm7
#else
	movaps	 0 * SIZE(AA), %xmm0

	subps	%xmm4, %xmm0		# rhs vector -= dot vector
#endif

#ifdef LN
	/* Backward substitution with the 4x4 diagonal block of A, row 3 up to
	   row 0 (diagonal entries appear to be stored pre-inverted). */
	movaps	 12 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm6, %xmm7		# x3 *= inv(diag3)
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm7, %xmm6
	subss	 %xmm6, %xmm5		# eliminate x3 from the x2 equation
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm7, %xmm6
	subss	 %xmm6, %xmm3		# ... and from x1
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm7, %xmm6
	subss	 %xmm6, %xmm1		# ... and from x0

	movaps	  8 * SIZE(AA), %xmm4
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm6, %xmm5		# x2 *= inv(diag2)
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm5, %xmm6
	subss	 %xmm6, %xmm3
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm5, %xmm6
	subss	 %xmm6, %xmm1

	movaps	  4 * SIZE(AA), %xmm4
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm6, %xmm3		# x1 *= inv(diag1)
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm3, %xmm6
	subss	 %xmm6, %xmm1

	movaps	  0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm6, %xmm1		# x0 *= inv(diag0)
#endif

#ifdef LT
	/* Forward substitution, row 0 down to row 3. */
	movaps	 0 * SIZE(AA), %xmm4
	pshufd	 $0x00, %xmm4, %xmm6
	mulss	 %xmm6, %xmm1		# x0 *= inv(diag0)

	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm1, %xmm6
	subss	 %xmm6, %xmm3		# eliminate x0 from the x1 equation
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm1, %xmm6
	subss	 %xmm6, %xmm5		# ... and from x2
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm1, %xmm6
	subss	 %xmm6, %xmm7		# ... and from x3

	movaps	 4 * SIZE(AA), %xmm4
	pshufd	 $0x55, %xmm4, %xmm6
	mulss	 %xmm6, %xmm3		# x1 *= inv(diag1)
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm3, %xmm6
	subss	 %xmm6, %xmm5
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm3, %xmm6
	subss	 %xmm6, %xmm7

	movaps	 8 * SIZE(AA), %xmm4
	pshufd	 $0xaa, %xmm4, %xmm6
	mulss	 %xmm6, %xmm5		# x2 *= inv(diag2)
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm5, %xmm6
	subss	 %xmm6, %xmm7

	movaps	12 * SIZE(AA), %xmm4
	pshufd	 $0xff, %xmm4, %xmm6
	mulss	 %xmm6, %xmm7		# x3 *= inv(diag3)
#endif

#if defined(RN) || defined(RT)
	/* Right-side solve with a 1x1 B diagonal: scale all four results. */
	movss	 0 * SIZE(B), %xmm6
	pshufd	 $0x00, %xmm6, %xmm7
	mulps	 %xmm7, %xmm0
#endif
3587
#if defined(LN) || defined(LT)
	/* Store the four solved scalars back into packed B, and refresh the
	   lane-broadcast copies in BB for the remaining updates. */
	movss	%xmm1,   0 * SIZE(B)
	movss	%xmm3,   1 * SIZE(B)
	movss	%xmm5,   2 * SIZE(B)
	movss	%xmm7,   3 * SIZE(B)

	pshufd	$0x00, %xmm1, %xmm0
	movaps	%xmm0,   0 * SIZE(BB)
	pshufd	$0x00, %xmm3, %xmm0
	movaps	%xmm0,   4 * SIZE(BB)

	pshufd	$0x00, %xmm5, %xmm0
	movaps	%xmm0,   8 * SIZE(BB)
	pshufd	$0x00, %xmm7, %xmm0
	movaps	%xmm0,  12 * SIZE(BB)
#else
	/* RN/RT: the entire solved 4-vector is in xmm0 (loaded with movaps,
	   then subps/mulps above); xmm1-xmm3 hold stale kernel-loop values at
	   this point.  Store all four lanes of xmm0, matching the 4x2 and 2x1
	   write-back paths.  The previous code stored xmm1/xmm2/xmm3 scalars
	   into elements 1..3, corrupting the packed A tile. */
	movaps	%xmm0,   0 * SIZE(AA)
#endif
3609
#ifdef LN
	subl	$4 * SIZE, CO1		# LN walks C backwards: step back one 4-row tile
#endif

#if defined(LN) || defined(LT)
	/* Pack the four solved scalars into one register and store to C. */
	unpcklps %xmm5, %xmm1
	unpcklps %xmm7, %xmm3

	unpcklps %xmm3, %xmm1

	movlps	%xmm1, 0 * SIZE(CO1)
	movhps	%xmm1, 2 * SIZE(CO1)
#else
	movlps	%xmm0, 0 * SIZE(CO1)	# RN/RT: xmm0 already in memory order
	movhps	%xmm0, 2 * SIZE(CO1)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 4), AA	# skip the untouched tail of this panel
#ifdef LT
	addl	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$4, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$4, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$2 + BASE_SHIFT, %eax
	addl	%eax, AORIG		# advance to the next 4-row panel of A
#endif

	decl	%ebx			# i --
	jg	.L91
	ALIGN_4
3660
.L119:
/* Single-column panel finished: advance B and KK for the next panel. */
#ifdef LN
       movl	K, %eax
       leal 	(B, %eax, SIZE), B	# B += K elements (whole panel)
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(B,  %eax, SIZE), B	# B += (K - KK) elements (remaining part)
#endif

#ifdef RN
	addl	$1, KK			# one more RHS column solved (forward)
#endif

#ifdef RT
	subl	$1, KK			# one fewer column remains (backward)
#endif
	ALIGN_4


.L999:
	/* Epilogue: restore the caller's stack pointer and the i386 ABI
	   callee-saved registers, then return. */
	movl	OLD_STACK, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE