1/*********************************************************************/
2/* Copyright 2009, 2010 The University of Texas at Austin.           */
3/* All rights reserved.                                              */
4/*                                                                   */
5/* Redistribution and use in source and binary forms, with or        */
6/* without modification, are permitted provided that the following   */
7/* conditions are met:                                               */
8/*                                                                   */
9/*   1. Redistributions of source code must retain the above         */
10/*      copyright notice, this list of conditions and the following  */
11/*      disclaimer.                                                  */
12/*                                                                   */
13/*   2. Redistributions in binary form must reproduce the above      */
14/*      copyright notice, this list of conditions and the following  */
15/*      disclaimer in the documentation and/or other materials       */
16/*      provided with the distribution.                              */
17/*                                                                   */
18/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32/*                                                                   */
33/* The views and conclusions contained in the software and           */
34/* documentation are those of the authors and should not be          */
35/* interpreted as representing official policies, either expressed   */
36/* or implied, of The University of Texas at Austin.                 */
37/*********************************************************************/
38
#define ASSEMBLER
#include "common.h"

/* This kernel is written against SSE (and expects MMX to be present
   in the build configuration); refuse to assemble otherwise. */
#if !defined(HAVE_SSE) || !defined(HAVE_MMX)
#error  You have to check your configuration.
#endif
45
/* STACK = bytes pushed by the prologue (4 x pushl = 16), so the
   incoming arguments sit at STACK + ARGS + <offset> above the saved
   caller stack pointer. */
#define STACK	16
#define ARGS	 0

/* Incoming arguments, addressed through %esi, which the prologue
   loads with the caller's %esp before the frame is re-aligned
   (see "movl %esp, %esi" below).
   NOTE(review): there is a 4-byte gap at offset 16 + STACK + ARGS,
   between K and A — presumably the alpha argument, unused by this
   kernel; confirm against the caller's argument layout. */
#define STACK_M	 4 + STACK + ARGS(%esi)
#define STACK_N	 8 + STACK + ARGS(%esi)
#define STACK_K	12 + STACK + ARGS(%esi)
#define STACK_A	20 + STACK + ARGS(%esi)
#define STACK_B	24 + STACK + ARGS(%esi)
#define STACK_C	28 + STACK + ARGS(%esi)
#define STACK_LDC	32 + STACK + ARGS(%esi)
#define STACK_OFFT	36 + STACK + ARGS(%esi)

/* Local variables on the private, STACK_ALIGN-aligned frame.
   TRMASK holds the 16-byte constant { 1.0, 0.0, 1.0, 0.0 } written
   by the LN/LT setup code and used as a lane mask in the triangular
   solves.  BUFFER (frame + 128) receives the packed copy of B in
   which every element is splat across a full 4-float vector (see the
   .L102/.L104 packing loops below). */
#define TRMASK	 0(%esp)
#define K	16(%esp)
#define N	20(%esp)
#define M	24(%esp)
#define A	28(%esp)
#define C	32(%esp)
#define J	36(%esp)
#define OLD_STACK 40(%esp)
#define OFFSET  44(%esp)
#define KK	48(%esp)
#define KKK	52(%esp)
#define AORIG	56(%esp)
#define BORIG	60(%esp)
#define BUFFER 128(%esp)
72
/* Prefetch flavor: 3DNow!-capable CPUs get prefetch/prefetchw (write
   hint) with a larger distance; everything else falls back to
   prefetcht0.  (Only PREFETCHW is used in the code visible here;
   PREFETCH/PREFETCHSIZE are presumably used further down —
   TODO confirm.) */
#ifdef HAVE_3DNOW
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
#else
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE   96
#endif

/* Register roles used throughout the kernel:
     B   - current pointer into the source B panel
     AA  - current pointer into the A panel
     BB  - current pointer into the splat-packed copy of B (BUFFER)
     LDC - leading dimension of C, scaled to bytes in the prologue
     CO1 - current C column pointer (%esi is recycled for this after
           the saved stack pointer has been stored to OLD_STACK) */
#define B	%edi
#define AA	%edx
#define	BB	%ecx
#define LDC	%ebp
#define CO1	%esi

/* The private frame is aligned down to a STACK_ALIGN boundary in the
   prologue.  STACK_OFFSET is not referenced in the code visible
   here — presumably used by STACK_TOUCHING or code further down;
   TODO confirm. */
#define STACK_ALIGN	4096
#define STACK_OFFSET	1024

/* Without SSE2 (or on Opteron), replace the SSE2 scalar movsd with
   movlps.  NOTE(review): movlps leaves the destination's upper half
   unmodified where movsd would zero it — the "#ifdef movsd" guards
   later in the file pre-zero the register for exactly this case. */
#if !defined(HAVE_SSE2) || defined(OPTERON)
#define movsd	movlps
#endif

/* With SSE2, use the integer-domain pxor for register clearing. */
#ifdef HAVE_SSE2
#define xorps	pxor
#endif
99
100	PROLOGUE
101
102	pushl	%ebp
103	pushl	%edi
104	pushl	%esi
105	pushl	%ebx
106
107	PROFCODE
108
109	movl	%esp, %esi	# save old stack
110
111	subl	$128 + LOCAL_BUFFER_SIZE, %esp
112	andl	$-STACK_ALIGN, %esp
113
114	STACK_TOUCHING
115
116	movss	STACK_M, %xmm0
117	movl	STACK_N, %eax
118	movss	STACK_K, %xmm1
119	movss	STACK_A, %xmm2
120	movl	STACK_B, B
121	movss	STACK_C, %xmm3
122	movl	STACK_LDC, LDC
123	movss	STACK_OFFT, %xmm4
124
125	movss	%xmm1, K
126	movl	%eax,  N
127	movss	%xmm0, M
128	movss	%xmm2, A
129	movss	%xmm3, C
130	movl	%esi,  OLD_STACK
131	movss	%xmm4, OFFSET
132	movss	%xmm4, KK
133
134	leal	(, LDC, SIZE), LDC
135
136#ifdef LN
137       movl	M, %eax
138       leal	(, %eax, SIZE), %eax
139       addl	%eax, C
140       imull	K, %eax
141       addl	%eax, A
142#endif
143
144#ifdef RT
145       movl	N, %eax
146       leal	(, %eax, SIZE), %eax
147       imull	K, %eax
148       addl	%eax, B
149       movl	N, %eax
150       imull	LDC, %eax
151       addl	%eax, C
152#endif
153
154#ifdef RN
155	negl	KK
156#endif
157
158#ifdef RT
159       movl	N, %eax
160       subl	OFFSET, %eax
161       movl	%eax, KK
162#endif
163
164#if defined(LN) || defined(LT)
165	movl	  $0x3f800000,   0 + TRMASK	#  1.0
166	movl	  $0x00000000,   4 + TRMASK	#  0.0
167	movl	  $0x3f800000,   8 + TRMASK	#  1.0
168	movl	  $0x00000000,  12 + TRMASK	#  0.0
169#endif
170
171	testl	$1, N
172	jle	.L100
173
174#ifdef LN
175	movl	OFFSET, %eax
176	addl	M, %eax
177	movl	%eax, KK
178#endif
179
180	leal	BUFFER, BB
181
182#ifdef RT
183       movl	K, %eax
184       sall	$BASE_SHIFT, %eax
185       subl	%eax, B
186#endif
187
188#if defined(LN) || defined(RT)
189	movl	KK, %eax
190	movl	B, BORIG
191        sall	$BASE_SHIFT, %eax
192	leal	(B,  %eax, 1), B
193	leal	(BB, %eax, 4), BB
194#endif
195
196#ifdef LT
197	movl	OFFSET, %eax
198	movl	%eax, KK
199#endif
200
201#if defined(LT) || defined(RN)
202	movl	KK, %eax
203#else
204	movl	K,  %eax
205	subl	KK, %eax
206#endif
207	sarl	$3, %eax
208	jle	.L103
209	ALIGN_4
210
211.L102:
212	movsd	 0 * SIZE(B), %xmm3
213	movhps	 2 * SIZE(B), %xmm3
214	movsd	 4 * SIZE(B), %xmm7
215	movhps	 6 * SIZE(B), %xmm7
216
217#ifdef HAVE_SSE2
218	pshufd	 $0x00, %xmm3, %xmm0
219	pshufd	 $0x55, %xmm3, %xmm1
220	pshufd	 $0xaa, %xmm3, %xmm2
221	pshufd	 $0xff, %xmm3, %xmm3
222
223	pshufd	 $0x00, %xmm7, %xmm4
224	pshufd	 $0x55, %xmm7, %xmm5
225	pshufd	 $0xaa, %xmm7, %xmm6
226	pshufd	 $0xff, %xmm7, %xmm7
227#else
228	movaps	%xmm3, %xmm0
229	shufps	 $0x00, %xmm0, %xmm0
230	movaps	%xmm3, %xmm1
231	shufps	 $0x55, %xmm1, %xmm1
232	movaps	%xmm3, %xmm2
233	shufps	 $0xaa, %xmm2, %xmm2
234	shufps	 $0xff, %xmm3, %xmm3
235
236	movaps	%xmm7, %xmm4
237	shufps	 $0x00, %xmm4, %xmm4
238	movaps	%xmm7, %xmm5
239	shufps	 $0x55, %xmm5, %xmm5
240	movaps	%xmm7, %xmm6
241	shufps	 $0xaa, %xmm6, %xmm6
242	shufps	 $0xff, %xmm7, %xmm7
243#endif
244
245	movaps	%xmm0,  0 * SIZE(BB)
246	movaps	%xmm1,  4 * SIZE(BB)
247	movaps	%xmm2,  8 * SIZE(BB)
248	movaps	%xmm3, 12 * SIZE(BB)
249	movaps	%xmm4, 16 * SIZE(BB)
250	movaps	%xmm5, 20 * SIZE(BB)
251	movaps	%xmm6, 24 * SIZE(BB)
252	movaps	%xmm7, 28 * SIZE(BB)
253
254	addl	$ 8 * SIZE, B
255	addl	$32 * SIZE, BB
256
257	decl	%eax
258	BRANCH
259	jne	.L102
260	ALIGN_2
261
262.L103:
263#if defined(LT) || defined(RN)
264	movl	KK, %eax
265#else
266	movl	K,  %eax
267	subl	KK, %eax
268#endif
269	andl	$7, %eax
270	BRANCH
271	jle	.L105
272	ALIGN_2
273
274.L104:
275	movss	 0 * SIZE(B), %xmm0
276
277	shufps	 $0x00, %xmm0, %xmm0
278
279	movaps	%xmm0,  0 * SIZE(BB)
280
281	addl	$1 * SIZE, B
282	addl	$4 * SIZE, BB
283
284	decl	%eax
285	jne	.L104
286	ALIGN_4
287
288.L105:
289#if defined(LT) || defined(RN)
290	movl	A, AA
291#else
292	movl	A, %eax
293	movl	%eax, AORIG
294#endif
295
296#ifdef RT
297	subl	LDC, C
298#endif
299	movl	C, CO1
300#ifndef RT
301	addl	LDC, C
302#endif
303
304	movl	M,  %ebx
305	sarl	$3, %ebx	# i = (m >> 2)
306	jle	.L130
307	ALIGN_4
308
309.L110:
310#ifdef LN
311       movl	K, %eax
312       sall	$3 + BASE_SHIFT, %eax
313       subl	%eax, AORIG
314#endif
315
316#if defined(LN) || defined(RT)
317	movl	KK, %eax
318	movl	AORIG, AA
319	sall	$3 + BASE_SHIFT, %eax
320	addl	%eax, AA
321#endif
322
323	leal	BUFFER, BB
324
325#if defined(LN) || defined(RT)
326	movl	KK, %eax
327	sall	$BASE_SHIFT, %eax
328	leal	(BB, %eax, 4), BB
329#endif
330
331	movaps	 0 * SIZE(BB), %xmm2
332	xorps	%xmm4, %xmm4
333	movaps	 0 * SIZE(AA), %xmm0
334	xorps	%xmm5, %xmm5
335	movaps	 16 * SIZE(BB), %xmm3
336	xorps	%xmm6, %xmm6
337	movaps	 16 * SIZE(AA), %xmm1
338	xorps	%xmm7, %xmm7
339
340	PREFETCHW      7 * SIZE(CO1)
341
342#if defined(LT) || defined(RN)
343	movl	KK, %eax
344#else
345	movl	K, %eax
346	subl	KK, %eax
347#endif
348	sarl	$3, %eax
349	je	.L112
350	ALIGN_2
351
352.L111:
353	mulps	%xmm2, %xmm0
354	mulps	 4 * SIZE(AA), %xmm2
355	addps	%xmm0, %xmm4
356	movaps	 8 * SIZE(AA), %xmm0
357	addps	%xmm2, %xmm6
358	movaps	 4 * SIZE(BB), %xmm2
359	mulps	%xmm2, %xmm0
360	mulps	12 * SIZE(AA), %xmm2
361	addps	%xmm0, %xmm5
362	movaps	32 * SIZE(AA), %xmm0
363	addps	%xmm2, %xmm7
364	movaps	 8 * SIZE(BB), %xmm2
365	mulps	%xmm2, %xmm1
366	mulps	20 * SIZE(AA), %xmm2
367	addps	%xmm1, %xmm4
368	movaps	24 * SIZE(AA), %xmm1
369	addps	%xmm2, %xmm6
370	movaps	12 * SIZE(BB), %xmm2
371	mulps	%xmm2, %xmm1
372	mulps	28 * SIZE(AA), %xmm2
373	addps	%xmm1, %xmm5
374	movaps	48 * SIZE(AA), %xmm1
375	addps	%xmm2, %xmm7
376	movaps	32 * SIZE(BB), %xmm2
377	mulps	%xmm3, %xmm0
378	mulps	36 * SIZE(AA), %xmm3
379	addps	%xmm0, %xmm4
380	movaps	40 * SIZE(AA), %xmm0
381	addps	%xmm3, %xmm6
382	movaps	20 * SIZE(BB), %xmm3
383	mulps	%xmm3, %xmm0
384	mulps	44 * SIZE(AA), %xmm3
385	addps	%xmm0, %xmm5
386	movaps	64 * SIZE(AA), %xmm0
387	addps	%xmm3, %xmm7
388	movaps	24 * SIZE(BB), %xmm3
389	mulps	%xmm3, %xmm1
390	mulps	52 * SIZE(AA), %xmm3
391	addps	%xmm1, %xmm4
392	movaps	56 * SIZE(AA), %xmm1
393	addps	%xmm3, %xmm6
394	movaps	28 * SIZE(BB), %xmm3
395	mulps	%xmm3, %xmm1
396	mulps	60 * SIZE(AA), %xmm3
397	addps	%xmm1, %xmm5
398	movaps	80 * SIZE(AA), %xmm1
399	addps	%xmm3, %xmm7
400	movaps	48 * SIZE(BB), %xmm3
401
402	addl   $64 * SIZE, AA
403	addl   $32 * SIZE, BB
404	decl   %eax
405	jne    .L111
406	ALIGN_2
407
408.L112:
409#if defined(LT) || defined(RN)
410	movl	KK, %eax
411#else
412	movl	K, %eax
413	subl	KK, %eax
414#endif
415	andl	$7, %eax		# if (k & 1)
416	BRANCH
417	je .L114
418
419.L113:
420	movaps	 0 * SIZE(BB), %xmm2
421	movaps	 0 * SIZE(AA), %xmm0
422	mulps	%xmm2, %xmm0
423	addps	%xmm0, %xmm4
424	mulps	 4 * SIZE(AA), %xmm2
425	addps	%xmm2, %xmm6
426
427	addl	$8 * SIZE, AA
428	addl	$4 * SIZE, BB
429	subl	$1, %eax
430	jg	.L113
431	ALIGN_4
432
433.L114:
434	addps	%xmm5, %xmm4
435	addps	%xmm7, %xmm6
436
437#if defined(LN) || defined(RT)
438	movl	KK, %eax
439#ifdef LN
440	subl	$8, %eax
441#else
442	subl	$1, %eax
443#endif
444
445	movl	AORIG, AA
446	movl	BORIG, B
447	leal	BUFFER, BB
448
449	sall	$BASE_SHIFT, %eax
450	leal	(AA, %eax, 8), AA
451	leal	(B,  %eax, 1), B
452	leal	(BB, %eax, 4), BB
453#endif
454
455#if defined(LN) || defined(LT)
456	movsd	 0 * SIZE(B), %xmm2
457	movhps	 2 * SIZE(B), %xmm2
458	movsd	 4 * SIZE(B), %xmm5
459	movhps	 6 * SIZE(B), %xmm5
460
461	subps	%xmm4,  %xmm2
462	subps	%xmm6,  %xmm5
463
464	xorps	%xmm0,  %xmm0
465
466	movaps	 %xmm2, %xmm3
467	unpcklps %xmm0, %xmm2
468	unpckhps %xmm0, %xmm3
469
470	movaps	 %xmm5, %xmm7
471	unpcklps %xmm0, %xmm5
472	unpckhps %xmm0, %xmm7
473#else
474	movaps	 0 * SIZE(AA), %xmm0
475	movaps	 4 * SIZE(AA), %xmm1
476
477	subps	%xmm4, %xmm0
478	subps	%xmm6, %xmm1
479#endif
480
481#if defined(LN) || defined(LT)
482	movaps	TRMASK, %xmm6
483#endif
484
485#ifdef LN
486	movss	63 * SIZE(AA), %xmm0
487	movaps	 %xmm6, %xmm1
488	shufps	$0x00, %xmm0,  %xmm1
489	mulps	%xmm1, %xmm7
490
491	movaps	%xmm7, %xmm1
492	shufps	$0xee, %xmm1, %xmm1
493
494	movss	62 * SIZE(AA), %xmm0
495	shufps	$0x50, %xmm0, %xmm0
496	mulps	%xmm1, %xmm0
497	subps	%xmm0, %xmm7
498
499	movsd	60 * SIZE(AA), %xmm0
500	shufps	$0x50, %xmm0, %xmm0
501	mulps	%xmm1, %xmm0
502	subps	%xmm0, %xmm5
503
504	movsd	58 * SIZE(AA), %xmm0
505	shufps	$0x50, %xmm0, %xmm0
506	mulps	%xmm1, %xmm0
507	subps	%xmm0, %xmm3
508
509	movsd	56 * SIZE(AA), %xmm0
510	shufps	$0x50, %xmm0, %xmm0
511	mulps	%xmm1, %xmm0
512	subps	%xmm0, %xmm2
513
514	movss	54 * SIZE(AA), %xmm0
515	shufps	$0x00, %xmm6,  %xmm0
516	mulps	%xmm0, %xmm7
517
518	movaps	%xmm7, %xmm1
519	shufps	$0x44, %xmm1, %xmm1
520
521	movsd	52 * SIZE(AA), %xmm0
522	shufps	$0x50, %xmm0, %xmm0
523	mulps	%xmm1, %xmm0
524	subps	%xmm0, %xmm5
525
526	movsd	50 * SIZE(AA), %xmm0
527	shufps	$0x50, %xmm0, %xmm0
528	mulps	%xmm1, %xmm0
529	subps	%xmm0, %xmm3
530
531	movsd	48 * SIZE(AA), %xmm0
532	shufps	$0x50, %xmm0, %xmm0
533	mulps	%xmm1, %xmm0
534	subps	%xmm0, %xmm2
535
536
537	movss	45 * SIZE(AA), %xmm0
538	movaps	 %xmm6, %xmm1
539	shufps	$0x00, %xmm0,  %xmm1
540	mulps	%xmm1, %xmm5
541
542	movaps	%xmm5, %xmm1
543	shufps	$0xee, %xmm1, %xmm1
544
545	movss	44 * SIZE(AA), %xmm0
546	shufps	$0x50, %xmm0, %xmm0
547	mulps	%xmm1, %xmm0
548	subps	%xmm0, %xmm5
549
550	movsd	42 * SIZE(AA), %xmm0
551	shufps	$0x50, %xmm0, %xmm0
552	mulps	%xmm1, %xmm0
553	subps	%xmm0, %xmm3
554
555	movsd	40 * SIZE(AA), %xmm0
556	shufps	$0x50, %xmm0, %xmm0
557	mulps	%xmm1, %xmm0
558	subps	%xmm0, %xmm2
559
560	movss	36 * SIZE(AA), %xmm0
561	shufps	$0x00, %xmm6,  %xmm0
562	mulps	%xmm0, %xmm5
563
564	movaps	%xmm5, %xmm1
565	shufps	$0x44, %xmm1, %xmm1
566
567	movsd	34 * SIZE(AA), %xmm0
568	shufps	$0x50, %xmm0, %xmm0
569	mulps	%xmm1, %xmm0
570	subps	%xmm0, %xmm3
571
572	movsd	32 * SIZE(AA), %xmm0
573	shufps	$0x50, %xmm0, %xmm0
574	mulps	%xmm1, %xmm0
575	subps	%xmm0, %xmm2
576
577	movss	27 * SIZE(AA), %xmm0
578	movaps	 %xmm6, %xmm1
579	shufps	$0x00, %xmm0,  %xmm1
580	mulps	%xmm1, %xmm3
581
582	movaps	%xmm3, %xmm1
583	shufps	$0xee, %xmm1, %xmm1
584
585	movss	26 * SIZE(AA), %xmm0
586	shufps	$0x50, %xmm0, %xmm0
587	mulps	%xmm1, %xmm0
588	subps	%xmm0, %xmm3
589
590	movsd	24 * SIZE(AA), %xmm0
591	shufps	$0x50, %xmm0, %xmm0
592	mulps	%xmm1, %xmm0
593	subps	%xmm0, %xmm2
594
595	movss	18 * SIZE(AA), %xmm0
596	shufps	$0x00, %xmm6,  %xmm0
597	mulps	%xmm0, %xmm3
598
599	movaps	%xmm3, %xmm1
600	shufps	$0x44, %xmm1, %xmm1
601
602	movsd	16 * SIZE(AA), %xmm0
603	shufps	$0x50, %xmm0, %xmm0
604	mulps	%xmm1, %xmm0
605	subps	%xmm0, %xmm2
606
607	movss	 9 * SIZE(AA), %xmm0
608	movaps	 %xmm6, %xmm1
609	shufps	$0x00, %xmm0,  %xmm1
610	mulps	%xmm1, %xmm2
611
612	movaps	%xmm2, %xmm1
613	shufps	$0xee, %xmm1, %xmm1
614
615	movss	 8 * SIZE(AA), %xmm0
616	shufps	$0x50, %xmm0, %xmm0
617	mulps	%xmm1, %xmm0
618	subps	%xmm0, %xmm2
619
620	movss	 0 * SIZE(AA), %xmm0
621	shufps	$0x00, %xmm6,  %xmm0
622	mulps	%xmm0, %xmm2
623#endif
624
625#ifdef LT
626	movss	 0 * SIZE(AA), %xmm0
627	shufps	$0x00, %xmm6,  %xmm0
628	mulps	%xmm0, %xmm2
629
630	movaps	%xmm2, %xmm1
631	shufps	$0x44, %xmm1, %xmm1
632
633	movss	 1 * SIZE(AA), %xmm0
634	shufps	$0x05, %xmm0, %xmm0
635	mulps	%xmm1, %xmm0
636	subps	%xmm0, %xmm2
637
638	movsd	 2 * SIZE(AA), %xmm0
639	shufps	$0x50, %xmm0, %xmm0
640	mulps	%xmm1, %xmm0
641	subps	%xmm0, %xmm3
642
643	movsd	 4 * SIZE(AA), %xmm0
644	shufps	$0x50, %xmm0, %xmm0
645	mulps	%xmm1, %xmm0
646	subps	%xmm0, %xmm5
647
648	movsd	 6 * SIZE(AA), %xmm0
649	shufps	$0x50, %xmm0, %xmm0
650	mulps	%xmm1, %xmm0
651	subps	%xmm0, %xmm7
652
653	movss	 9 * SIZE(AA), %xmm0
654	movaps	 %xmm6, %xmm1
655	shufps	$0x00, %xmm0,  %xmm1
656	mulps	%xmm1, %xmm2
657
658	movaps	%xmm2, %xmm1
659	shufps	$0xee, %xmm1, %xmm1
660
661	movsd	10 * SIZE(AA), %xmm0
662	shufps	$0x50, %xmm0, %xmm0
663	mulps	%xmm1, %xmm0
664	subps	%xmm0, %xmm3
665
666	movsd	12 * SIZE(AA), %xmm0
667	shufps	$0x50, %xmm0, %xmm0
668	mulps	%xmm1, %xmm0
669	subps	%xmm0, %xmm5
670
671	movsd	14 * SIZE(AA), %xmm0
672	shufps	$0x50, %xmm0, %xmm0
673	mulps	%xmm1, %xmm0
674	subps	%xmm0, %xmm7
675
676	movss	18 * SIZE(AA), %xmm0
677	shufps	$0x00, %xmm6,  %xmm0
678	mulps	%xmm0, %xmm3
679
680	movaps	%xmm3, %xmm1
681	shufps	$0x44, %xmm1, %xmm1
682
683	movss	19 * SIZE(AA), %xmm0
684	shufps	$0x05, %xmm0, %xmm0
685	mulps	%xmm1, %xmm0
686	subps	%xmm0, %xmm3
687
688	movsd	20 * SIZE(AA), %xmm0
689	shufps	$0x50, %xmm0, %xmm0
690	mulps	%xmm1, %xmm0
691	subps	%xmm0, %xmm5
692
693	movsd	22 * SIZE(AA), %xmm0
694	shufps	$0x50, %xmm0, %xmm0
695	mulps	%xmm1, %xmm0
696	subps	%xmm0, %xmm7
697
698	movss	27 * SIZE(AA), %xmm0
699	movaps	 %xmm6, %xmm1
700	shufps	$0x00, %xmm0,  %xmm1
701	mulps	%xmm1, %xmm3
702
703	movaps	%xmm3, %xmm1
704	shufps	$0xee, %xmm1, %xmm1
705
706	movsd	28 * SIZE(AA), %xmm0
707	shufps	$0x50, %xmm0, %xmm0
708	mulps	%xmm1, %xmm0
709	subps	%xmm0, %xmm5
710
711	movsd	30 * SIZE(AA), %xmm0
712	shufps	$0x50, %xmm0, %xmm0
713	mulps	%xmm1, %xmm0
714	subps	%xmm0, %xmm7
715
716	movss	36 * SIZE(AA), %xmm0
717	shufps	$0x00, %xmm6,  %xmm0
718	mulps	%xmm0, %xmm5
719
720	movaps	%xmm5, %xmm1
721	shufps	$0x44, %xmm1, %xmm1
722
723	movss	37 * SIZE(AA), %xmm0
724	shufps	$0x05, %xmm0, %xmm0
725	mulps	%xmm1, %xmm0
726	subps	%xmm0, %xmm5
727
728	movsd	38 * SIZE(AA), %xmm0
729	shufps	$0x50, %xmm0, %xmm0
730	mulps	%xmm1, %xmm0
731	subps	%xmm0, %xmm7
732
733	movss	45 * SIZE(AA), %xmm0
734	movaps	 %xmm6, %xmm1
735	shufps	$0x00, %xmm0,  %xmm1
736	mulps	%xmm1, %xmm5
737
738	movaps	%xmm5, %xmm1
739	shufps	$0xee, %xmm1, %xmm1
740
741	movsd	46 * SIZE(AA), %xmm0
742	shufps	$0x50, %xmm0, %xmm0
743	mulps	%xmm1, %xmm0
744	subps	%xmm0, %xmm7
745
746	movss	54 * SIZE(AA), %xmm0
747	shufps	$0x00, %xmm6,  %xmm0
748	mulps	%xmm0, %xmm7
749
750	movaps	%xmm7, %xmm1
751	shufps	$0x44, %xmm1, %xmm1
752
753	movss	55 * SIZE(AA), %xmm0
754	shufps	$0x05, %xmm0, %xmm0
755	mulps	%xmm1, %xmm0
756	subps	%xmm0, %xmm7
757
758	movss	63 * SIZE(AA), %xmm0
759	movaps	 %xmm6, %xmm1
760	shufps	$0x00, %xmm0,  %xmm1
761	mulps	%xmm1, %xmm7
762#endif
763
764#if defined(RN) || defined(RT)
765	movss	 0 * SIZE(B), %xmm6
766	shufps	$0x00, %xmm6, %xmm6
767
768	mulps	%xmm6, %xmm0
769	mulps	%xmm6, %xmm1
770#endif
771
772#if defined(LN) || defined(LT)
773	shufps	$0x88, %xmm3, %xmm2
774	shufps	$0x88, %xmm7, %xmm5
775
776	movlps	%xmm2,   0 * SIZE(B)
777	movhps	%xmm2,   2 * SIZE(B)
778	movlps	%xmm5,   4 * SIZE(B)
779	movhps	%xmm5,   6 * SIZE(B)
780
781#ifdef HAVE_SSE2
782	pshufd	$0x00, %xmm2, %xmm0
783	pshufd	$0x55, %xmm2, %xmm1
784	pshufd	$0xaa, %xmm2, %xmm4
785	pshufd	$0xff, %xmm2, %xmm6
786#else
787	movaps	%xmm2, %xmm0
788	shufps	$0x00, %xmm0, %xmm0
789	movaps	%xmm2, %xmm1
790	shufps	$0x55, %xmm1, %xmm1
791	movaps	%xmm2, %xmm4
792	shufps	$0xaa, %xmm4, %xmm4
793	movaps	%xmm2, %xmm6
794	shufps	$0xff, %xmm6, %xmm6
795#endif
796
797	movaps	%xmm0,   0 * SIZE(BB)
798	movaps	%xmm1,   4 * SIZE(BB)
799	movaps	%xmm4,   8 * SIZE(BB)
800	movaps	%xmm6,  12 * SIZE(BB)
801
802#ifdef HAVE_SSE2
803	pshufd	$0x00, %xmm5, %xmm0
804	pshufd	$0x55, %xmm5, %xmm1
805	pshufd	$0xaa, %xmm5, %xmm4
806	pshufd	$0xff, %xmm5, %xmm6
807#else
808	movaps	%xmm5, %xmm0
809	shufps	$0x00, %xmm0, %xmm0
810	movaps	%xmm5, %xmm1
811	shufps	$0x55, %xmm1, %xmm1
812	movaps	%xmm5, %xmm4
813	shufps	$0xaa, %xmm4, %xmm4
814	movaps	%xmm5, %xmm6
815	shufps	$0xff, %xmm6, %xmm6
816#endif
817
818	movaps	%xmm0,  16 * SIZE(BB)
819	movaps	%xmm1,  20 * SIZE(BB)
820	movaps	%xmm4,  24 * SIZE(BB)
821	movaps	%xmm6,  28 * SIZE(BB)
822#else
823	movaps	%xmm0,   0 * SIZE(AA)
824	movaps	%xmm1,   4 * SIZE(AA)
825#endif
826
827#ifdef LN
828	subl	$8 * SIZE, CO1
829#endif
830
831#if defined(LN) || defined(LT)
832	movlps	%xmm2, 0 * SIZE(CO1)
833	movhps	%xmm2, 2 * SIZE(CO1)
834	movlps	%xmm5, 4 * SIZE(CO1)
835	movhps	%xmm5, 6 * SIZE(CO1)
836#else
837	movlps	%xmm0, 0 * SIZE(CO1)
838	movhps	%xmm0, 2 * SIZE(CO1)
839	movlps	%xmm1, 4 * SIZE(CO1)
840	movhps	%xmm1, 6 * SIZE(CO1)
841#endif
842
843#ifndef LN
844	addl	$8 * SIZE, CO1
845#endif
846
847#if defined(LT) || defined(RN)
848	movl	K,  %eax
849	subl	KK, %eax
850	leal	(,%eax, SIZE), %eax
851	leal	(AA, %eax, 8), AA
852#ifdef LT
853	addl	$8 * SIZE, B
854#endif
855#endif
856
857#ifdef LN
858	subl	$8, KK
859	movl	BORIG, B
860#endif
861
862#ifdef LT
863	addl	$8, KK
864#endif
865
866#ifdef RT
867	movl	K, %eax
868	movl	BORIG, B
869	sall	$3 + BASE_SHIFT, %eax
870	addl	%eax, AORIG
871#endif
872
873	decl	%ebx			# i --
874	jg	.L110
875	ALIGN_2
876
877.L130:
878	testl	$4, M
879	jle	.L150
880
881#ifdef LN
882       movl	K, %eax
883       sall	$2 + BASE_SHIFT, %eax
884       subl	%eax, AORIG
885#endif
886
887#if defined(LN) || defined(RT)
888	movl	KK, %eax
889	movl	AORIG, AA
890	sall	$2 + BASE_SHIFT, %eax
891	addl	%eax, AA
892#endif
893
894	leal	BUFFER, BB
895
896#if defined(LN) || defined(RT)
897	movl	KK, %eax
898	sall	$BASE_SHIFT, %eax
899	leal	(BB, %eax, 4), BB
900#endif
901
902	movaps	 0 * SIZE(BB), %xmm2
903	xorps	%xmm4, %xmm4
904	movsd	 0 * SIZE(AA), %xmm0
905	movhps	 2 * SIZE(AA), %xmm0
906	xorps	%xmm5, %xmm5
907	movaps	16 * SIZE(BB), %xmm3
908	xorps	%xmm6, %xmm6
909	movsd	16 * SIZE(AA), %xmm1
910	movhps	18 * SIZE(AA), %xmm1
911	xorps	%xmm7, %xmm7
912
913#if defined(LT) || defined(RN)
914	movl	KK, %eax
915#else
916	movl	K, %eax
917	subl	KK, %eax
918#endif
919	sarl	$3, %eax
920	je	.L132
921	ALIGN_2
922
923.L131:
924	mulps	%xmm0, %xmm2
925	movaps	 4 * SIZE(AA), %xmm0
926	addps	%xmm2, %xmm4
927	mulps	 4 * SIZE(BB), %xmm0
928	movaps	32 * SIZE(BB), %xmm2
929	addps	%xmm0, %xmm5
930	movaps	 8 * SIZE(AA), %xmm0
931	mulps	 8 * SIZE(BB), %xmm0
932	addps	%xmm0, %xmm6
933	movaps	12 * SIZE(AA), %xmm0
934	mulps	12 * SIZE(BB), %xmm0
935	addps	%xmm0, %xmm7
936	movaps	32 * SIZE(AA), %xmm0
937	mulps	%xmm1, %xmm3
938	movaps	20 * SIZE(AA), %xmm1
939	addps	%xmm3, %xmm4
940	mulps	20 * SIZE(BB), %xmm1
941	movaps	48 * SIZE(BB), %xmm3
942	addps	%xmm1, %xmm5
943	movaps	24 * SIZE(AA), %xmm1
944	mulps	24 * SIZE(BB), %xmm1
945	addps	%xmm1, %xmm6
946	movaps	28 * SIZE(AA), %xmm1
947	mulps	28 * SIZE(BB), %xmm1
948	addps	%xmm1, %xmm7
949	movaps	48 * SIZE(AA), %xmm1
950
951	addl   $32 * SIZE, AA
952	addl   $32 * SIZE, BB
953	decl   %eax
954	jne    .L131
955	ALIGN_2
956
957.L132:
958#if defined(LT) || defined(RN)
959	movl	KK, %eax
960#else
961	movl	K, %eax
962	subl	KK, %eax
963#endif
964	andl	$7, %eax		# if (k & 1)
965	BRANCH
966	je .L134
967
968.L133:
969	movaps	 0 * SIZE(BB), %xmm2
970	movaps	 0 * SIZE(AA), %xmm0
971	mulps	%xmm0, %xmm2
972	addps	%xmm2, %xmm4
973
974	addl	$4 * SIZE, AA
975	addl	$4 * SIZE, BB
976	decl	%eax
977	jg	.L133
978	ALIGN_4
979
980.L134:
981	addps	%xmm5, %xmm4
982	addps	%xmm7, %xmm6
983	addps	%xmm6, %xmm4
984
985#if defined(LN) || defined(RT)
986	movl	KK, %eax
987#ifdef LN
988	subl	$4, %eax
989#else
990	subl	$1, %eax
991#endif
992
993	movl	AORIG, AA
994	movl	BORIG, B
995	leal	BUFFER, BB
996
997	sall	$BASE_SHIFT, %eax
998	leal	(AA, %eax, 4), AA
999	leal	(B,  %eax, 1), B
1000	leal	(BB, %eax, 4), BB
1001#endif
1002
1003#if defined(LN) || defined(LT)
1004	movsd	 0 * SIZE(B), %xmm2
1005	movhps	 2 * SIZE(B), %xmm2
1006
1007	subps	%xmm4,  %xmm2
1008
1009	xorps	%xmm5, %xmm5
1010
1011	movaps	 %xmm2, %xmm3
1012	unpcklps %xmm5, %xmm2
1013	unpckhps %xmm5, %xmm3
1014#else
1015	movaps	 0 * SIZE(AA), %xmm0
1016	subps	%xmm4, %xmm0
1017#endif
1018
1019#if defined(LN) || defined(LT)
1020	movaps	TRMASK, %xmm6
1021#endif
1022
1023#ifdef LN
1024	movss	15 * SIZE(AA), %xmm0
1025	movaps	 %xmm6, %xmm1
1026	shufps	$0x00, %xmm0,  %xmm1
1027	mulps	%xmm1, %xmm3
1028
1029	movaps	%xmm3, %xmm1
1030	shufps	$0xee, %xmm1, %xmm1
1031
1032	movss	14 * SIZE(AA), %xmm0
1033	shufps	$0x50, %xmm0, %xmm0
1034	mulps	%xmm1, %xmm0
1035	subps	%xmm0, %xmm3
1036
1037	movsd	12 * SIZE(AA), %xmm0
1038	shufps	$0x50, %xmm0, %xmm0
1039	mulps	%xmm1, %xmm0
1040	subps	%xmm0, %xmm2
1041
1042	movss	10 * SIZE(AA), %xmm0
1043	shufps	$0x00, %xmm6,  %xmm0
1044	mulps	%xmm0, %xmm3
1045
1046	movaps	%xmm3, %xmm1
1047	shufps	$0x44, %xmm1, %xmm1
1048
1049	movsd	 8 * SIZE(AA), %xmm0
1050	shufps	$0x50, %xmm0, %xmm0
1051	mulps	%xmm1, %xmm0
1052	subps	%xmm0, %xmm2
1053
1054	movss	 5 * SIZE(AA), %xmm0
1055	movaps	 %xmm6, %xmm1
1056	shufps	$0x00, %xmm0,  %xmm1
1057	mulps	%xmm1, %xmm2
1058
1059	movaps	%xmm2, %xmm1
1060	shufps	$0xee, %xmm1, %xmm1
1061
1062	movss	 4 * SIZE(AA), %xmm0
1063	shufps	$0x50, %xmm0, %xmm0
1064	mulps	%xmm1, %xmm0
1065	subps	%xmm0, %xmm2
1066
1067	movss	 0 * SIZE(AA), %xmm0
1068	shufps	$0x00, %xmm6,  %xmm0
1069	mulps	%xmm0, %xmm2
1070#endif
1071
1072#ifdef LT
1073	movss	 0 * SIZE(AA), %xmm0
1074	shufps	$0x00, %xmm6,  %xmm0
1075	mulps	%xmm0, %xmm2
1076
1077	movaps	%xmm2, %xmm1
1078	shufps	$0x44, %xmm1, %xmm1
1079
1080	movss	 1 * SIZE(AA), %xmm0
1081	shufps	$0x05, %xmm0, %xmm0
1082	mulps	%xmm1, %xmm0
1083	subps	%xmm0, %xmm2
1084
1085	movsd	 2 * SIZE(AA), %xmm0
1086	shufps	$0x50, %xmm0, %xmm0
1087	mulps	%xmm1, %xmm0
1088	subps	%xmm0, %xmm3
1089
1090	movss	 5 * SIZE(AA), %xmm0
1091	movaps	 %xmm6, %xmm1
1092	shufps	$0x00, %xmm0,  %xmm1
1093	mulps	%xmm1, %xmm2
1094
1095	movaps	%xmm2, %xmm1
1096	shufps	$0xee, %xmm1, %xmm1
1097
1098	movsd	 6 * SIZE(AA), %xmm0
1099	shufps	$0x50, %xmm0, %xmm0
1100	mulps	%xmm1, %xmm0
1101	subps	%xmm0, %xmm3
1102
1103	movss	10 * SIZE(AA), %xmm0
1104	shufps	$0x00, %xmm6,  %xmm0
1105	mulps	%xmm0, %xmm3
1106
1107	movaps	%xmm3, %xmm1
1108	shufps	$0x44, %xmm1, %xmm1
1109
1110	movss	11 * SIZE(AA), %xmm0
1111	shufps	$0x05, %xmm0, %xmm0
1112	mulps	%xmm1, %xmm0
1113	subps	%xmm0, %xmm3
1114
1115	movss	15 * SIZE(AA), %xmm0
1116	movaps	 %xmm6, %xmm1
1117	shufps	$0x00, %xmm0,  %xmm1
1118	mulps	%xmm1, %xmm3
1119#endif
1120
1121#ifdef RN
1122	movss	 0 * SIZE(B), %xmm6
1123	shufps	$0x00, %xmm6, %xmm6
1124	mulps	%xmm6, %xmm0
1125#endif
1126
1127#ifdef RT
1128	movss	 0 * SIZE(B), %xmm6
1129	shufps	$0x00, %xmm6, %xmm6
1130	mulps	%xmm6, %xmm0
1131#endif
1132
1133#if defined(LN) || defined(LT)
1134	shufps	$0x88, %xmm3, %xmm2
1135
1136	movlps	%xmm2,   0 * SIZE(B)
1137	movhps	%xmm2,   2 * SIZE(B)
1138
1139#ifdef HAVE_SSE2
1140	pshufd	$0x00, %xmm2, %xmm0
1141	pshufd	$0x55, %xmm2, %xmm1
1142	pshufd	$0xaa, %xmm2, %xmm4
1143	pshufd	$0xff, %xmm2, %xmm6
1144#else
1145	movaps	%xmm2, %xmm0
1146	shufps	$0x00, %xmm0, %xmm0
1147	movaps	%xmm2, %xmm1
1148	shufps	$0x55, %xmm1, %xmm1
1149	movaps	%xmm2, %xmm4
1150	shufps	$0xaa, %xmm4, %xmm4
1151	movaps	%xmm2, %xmm6
1152	shufps	$0xff, %xmm6, %xmm6
1153#endif
1154
1155	movaps	%xmm0,   0 * SIZE(BB)
1156	movaps	%xmm1,   4 * SIZE(BB)
1157	movaps	%xmm4,   8 * SIZE(BB)
1158	movaps	%xmm6,  12 * SIZE(BB)
1159#else
1160	movaps	%xmm0,   0 * SIZE(AA)
1161#endif
1162
1163#ifdef LN
1164	subl	$4 * SIZE, CO1
1165#endif
1166
1167#if defined(LN) || defined(LT)
1168	movlps	%xmm2, 0 * SIZE(CO1)
1169	movhps	%xmm2, 2 * SIZE(CO1)
1170#else
1171	movlps	%xmm0, 0 * SIZE(CO1)
1172	movhps	%xmm0, 2 * SIZE(CO1)
1173#endif
1174
1175#ifndef LN
1176	addl	$4 * SIZE, CO1
1177#endif
1178
1179#if defined(LT) || defined(RN)
1180	movl	K,  %eax
1181	subl	KK, %eax
1182	leal	(,%eax, SIZE), %eax
1183	leal	(AA, %eax, 4), AA
1184#ifdef LT
1185	addl	$4 * SIZE, B
1186#endif
1187#endif
1188
1189#ifdef LN
1190	subl	$4, KK
1191	movl	BORIG, B
1192#endif
1193
1194#ifdef LT
1195	addl	$4, KK
1196#endif
1197
1198#ifdef RT
1199	movl	K, %eax
1200	movl	BORIG, B
1201	sall	$2 + BASE_SHIFT, %eax
1202	addl	%eax, AORIG
1203#endif
1204	ALIGN_2
1205
1206.L150:
1207	testl	$2, M
1208	jle	.L170
1209
1210#ifdef LN
1211       movl	K, %eax
1212       sall	$1 + BASE_SHIFT, %eax
1213       subl	%eax, AORIG
1214#endif
1215
1216#if defined(LN) || defined(RT)
1217	movl	KK, %eax
1218	movl	AORIG, AA
1219	sall	$1 + BASE_SHIFT, %eax
1220	addl	%eax, AA
1221#endif
1222
1223	leal	BUFFER, BB
1224
1225#if defined(LN) || defined(RT)
1226	movl	KK, %eax
1227	sall	$BASE_SHIFT, %eax
1228	leal	(BB, %eax, 4), BB
1229#endif
1230
1231	movaps	 0 * SIZE(BB), %xmm2
1232	xorps	%xmm4, %xmm4
1233#ifdef	movsd
1234	xorps	%xmm0, %xmm0
1235#endif
1236	movsd	 0 * SIZE(AA), %xmm0
1237	xorps	%xmm5, %xmm5
1238	movaps	 16 * SIZE(BB), %xmm3
1239	xorps	%xmm6, %xmm6
1240#ifdef	movsd
1241	xorps	%xmm1, %xmm1
1242#endif
1243	movsd	 8 * SIZE(AA), %xmm1
1244	xorps	%xmm7, %xmm7
1245
1246#if defined(LT) || defined(RN)
1247	movl	KK, %eax
1248#else
1249	movl	K, %eax
1250	subl	KK, %eax
1251#endif
1252	sarl	$3, %eax
1253	je	.L152
1254	ALIGN_2
1255
1256.L151:
1257	mulps	%xmm0, %xmm2
1258	movsd	 2 * SIZE(AA), %xmm0
1259	addps	%xmm2, %xmm4
1260	movaps	 4 * SIZE(BB), %xmm2
1261	mulps	%xmm0, %xmm2
1262	movsd	 4 * SIZE(AA), %xmm0
1263	addps	%xmm2, %xmm5
1264	movaps	 8 * SIZE(BB), %xmm2
1265	mulps	%xmm0, %xmm2
1266	movsd	 6 * SIZE(AA), %xmm0
1267	addps	%xmm2, %xmm6
1268	movaps	12 * SIZE(BB), %xmm2
1269	mulps	%xmm0, %xmm2
1270	movsd	16 * SIZE(AA), %xmm0
1271	addps	%xmm2, %xmm7
1272	movaps	32 * SIZE(BB), %xmm2
1273	mulps	%xmm1, %xmm3
1274	movsd	10 * SIZE(AA), %xmm1
1275	addps	%xmm3, %xmm4
1276	movaps	20 * SIZE(BB), %xmm3
1277	mulps	%xmm1, %xmm3
1278	movsd	12 * SIZE(AA), %xmm1
1279	addps	%xmm3, %xmm5
1280	movaps	24 * SIZE(BB), %xmm3
1281	mulps	%xmm1, %xmm3
1282	movsd	14 * SIZE(AA), %xmm1
1283	addps	%xmm3, %xmm6
1284	movaps	28 * SIZE(BB), %xmm3
1285	mulps	%xmm1, %xmm3
1286	movsd	24 * SIZE(AA), %xmm1
1287	addps	%xmm3, %xmm7
1288	movaps	48 * SIZE(BB), %xmm3
1289
1290	addl   $16 * SIZE, AA
1291	addl   $32 * SIZE, BB
1292	decl   %eax
1293	jne    .L151
1294	ALIGN_2
1295
1296.L152:
1297#if defined(LT) || defined(RN)
1298	movl	KK, %eax
1299#else
1300	movl	K, %eax
1301	subl	KK, %eax
1302#endif
1303	andl	$7, %eax		# if (k & 1)
1304	BRANCH
1305	je .L154
1306
1307.L153:
1308	mulps	%xmm0, %xmm2
1309	movsd	 2 * SIZE(AA), %xmm0
1310	addps	%xmm2, %xmm4
1311	movaps	 4 * SIZE(BB), %xmm2
1312
1313	addl	$2 * SIZE, AA
1314	addl	$4 * SIZE, BB
1315	decl	%eax
1316	jg	.L153
1317	ALIGN_4
1318
1319.L154:
1320	addps	%xmm5, %xmm4
1321	addps	%xmm7, %xmm6
1322	addps	%xmm6, %xmm4
1323
1324#if defined(LN) || defined(RT)
1325	movl	KK, %eax
1326#ifdef LN
1327	subl	$2, %eax
1328#else
1329	subl	$1, %eax
1330#endif
1331
1332	movl	AORIG, AA
1333	movl	BORIG, B
1334	leal	BUFFER, BB
1335
1336	sall	$BASE_SHIFT, %eax
1337	leal	(AA, %eax, 2), AA
1338	leal	(B,  %eax, 1), B
1339	leal	(BB, %eax, 4), BB
1340#endif
1341
1342#if defined(LN) || defined(LT)
1343	movaps	%xmm4, %xmm5
1344	shufps	$1, %xmm5, %xmm5
1345
1346	movss	 0 * SIZE(B), %xmm0
1347	movss	 1 * SIZE(B), %xmm1
1348
1349	subss	%xmm4,  %xmm0
1350	subss	%xmm5,  %xmm1
1351#else
1352#ifdef	movsd
1353	xorps	%xmm0, %xmm0
1354#endif
1355	movsd	 0 * SIZE(AA), %xmm0
1356	subps	%xmm4, %xmm0
1357#endif
1358
1359#ifdef LN
1360	movaps	  0 * SIZE(AA), %xmm4
1361
1362	movaps	 %xmm4, %xmm6
1363	shufps	 $0xff, %xmm6, %xmm6
1364	mulss	 %xmm6, %xmm1
1365
1366	movaps	 %xmm4, %xmm6
1367	shufps	 $0xaa, %xmm6, %xmm6
1368	mulss	 %xmm1, %xmm6
1369	subss	 %xmm6, %xmm0
1370	mulss	 %xmm4, %xmm0
1371#endif
1372
1373#ifdef LT
1374	movaps	 0 * SIZE(AA), %xmm4
1375	mulss	 %xmm4, %xmm0
1376	movaps	 %xmm4, %xmm6
1377	shufps	 $0x55, %xmm6, %xmm6
1378	mulss	 %xmm0, %xmm6
1379	subss	 %xmm6, %xmm1
1380	movaps	 %xmm4, %xmm6
1381	shufps	 $0xff, %xmm6, %xmm6
1382	mulss	 %xmm6, %xmm1
1383#endif
1384
1385#ifdef RN
1386	movss	 0 * SIZE(B), %xmm6
1387	shufps	$0x00, %xmm6, %xmm6
1388	mulps	%xmm6, %xmm0
1389#endif
1390
1391#ifdef RT
1392	movss	 0 * SIZE(B), %xmm6
1393	shufps	$0x00, %xmm6, %xmm6
1394	mulps	%xmm6, %xmm0
1395#endif
1396
1397#if defined(LN) || defined(LT)
1398	movss	%xmm0,   0 * SIZE(B)
1399	movss	%xmm1,   1 * SIZE(B)
1400
1401	shufps	$0x00, %xmm0, %xmm0
1402	shufps	$0x00, %xmm1, %xmm1
1403	movaps	%xmm0,   0 * SIZE(BB)
1404	movaps	%xmm1,   4 * SIZE(BB)
1405#else
1406	movlps	%xmm0,   0 * SIZE(AA)
1407#endif
1408
1409#ifdef LN
1410	subl	$2 * SIZE, CO1
1411#endif
1412
1413#if defined(LN) || defined(LT)
1414	movss	%xmm0, 0 * SIZE(CO1)
1415	movss	%xmm1, 1 * SIZE(CO1)
1416#else
1417	movlps	%xmm0, 0 * SIZE(CO1)
1418#endif
1419
1420#ifndef LN
1421	addl	$2 * SIZE, CO1
1422#endif
1423
1424#if defined(LT) || defined(RN)
1425	movl	K,  %eax
1426	subl	KK, %eax
1427	leal	(,%eax, SIZE), %eax
1428	leal	(AA, %eax, 2), AA
1429#ifdef LT
1430	addl	$2 * SIZE, B
1431#endif
1432#endif
1433
1434#ifdef LN
1435	subl	$2, KK
1436	movl	BORIG, B
1437#endif
1438
1439#ifdef LT
1440	addl	$2, KK
1441#endif
1442
1443#ifdef RT
1444	movl	K, %eax
1445	movl	BORIG, B
1446	sall	$1 + BASE_SHIFT, %eax
1447	addl	%eax, AORIG
1448#endif
1449	ALIGN_2
1450
/* M-remainder: last single row (M & 1) of the single remaining
   column (N & 1).  Computes a length-k dot product into xmm4 and
   applies the 1x1 triangular solve for each LN/LT/RN/RT variant. */
.L170:
	testl	$1, M
	jle	.L179

#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	movss	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movss	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movss	16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movss	 4 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# unrolled 8x over k
	je	.L172
	ALIGN_2

/* Dot-product loop, 8 k-iterations per pass; four partial sums
   (xmm4-xmm7) hide the addss latency. */
.L171:
	mulss	%xmm0, %xmm2
	movss	 1 * SIZE(AA), %xmm0
	addss	%xmm2, %xmm4
	mulss	 4 * SIZE(BB), %xmm0
	movss	32 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5
	movss	 2 * SIZE(AA), %xmm0
	mulss	 8 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm6
	movss	 3 * SIZE(AA), %xmm0
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0
	mulss	%xmm1, %xmm3
	movss	 5 * SIZE(AA), %xmm1
	addss	%xmm3, %xmm4
	mulss	20 * SIZE(BB), %xmm1
	movss	48 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm5
	movss	 6 * SIZE(AA), %xmm1
	mulss	24 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm6
	movss	 7 * SIZE(AA), %xmm1
	mulss	28 * SIZE(BB), %xmm1
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1

	addl   $ 8 * SIZE, AA
	addl   $32 * SIZE, BB
	decl   %eax
	jne    .L171
	ALIGN_2

.L172:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder iterations (k & 7)
	BRANCH
	je .L174

/* One k-iteration at a time for the leftover (k & 7) elements. */
.L173:
	movss	 0 * SIZE(AA), %xmm0
	movss	 0 * SIZE(BB), %xmm2
	mulss	%xmm0, %xmm2
	addss	%xmm2, %xmm4

	addl	$1 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L173
	ALIGN_4

.L174:
	addss	%xmm5, %xmm4	# collapse the four partial sums into xmm4
	addss	%xmm7, %xmm6
	addss	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	movl	KK, %eax
	subl	$1, %eax

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$ BASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	movss	 0 * SIZE(B), %xmm1
	subss	%xmm4,  %xmm1		# residual: b - sum
#else
	movss	 0 * SIZE(AA), %xmm0
	subss	%xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	mulss	 0 * SIZE(AA), %xmm1	# scale by diagonal entry (presumably pre-inverted; verify against packing)
#endif

#if defined(RN) || defined(RT)
	mulss	 0 * SIZE(B), %xmm0
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1,   0 * SIZE(B)

	shufps	$0x00, %xmm1, %xmm1	# broadcast solved value back into BUFFER
	movaps	%xmm1,   0 * SIZE(BB)
#else
	movss	%xmm0,   0 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movss	%xmm1, 0 * SIZE(CO1)
#else
	movss	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(AA, %eax, SIZE), AA
#ifdef LT
	addl	$1 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_2
/* Epilogue of the N & 1 column: advance B past the column and
   update the KK offset for the next panel. */
.L179:
#ifdef LN
       movl	K, %eax
       leal 	(B, %eax, SIZE), B
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(B,  %eax, SIZE), B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4
1654
/* Head of the loop over column pairs: J = N >> 1 iterations. */
.L100:
	movl	N,  %eax
	sarl	$1, %eax	# j = (n >> 1)
	movl	%eax, J
	jle	.L999
	ALIGN_2
1661
/* Per-column-pair setup: reset KK, position B for the RT variant,
   then pack the 2-wide B panel into BUFFER with each scalar
   broadcast to a 4-wide vector (so the 4-wide kernels can use
   aligned mulps directly). */
.L01:
#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

	leal	BUFFER, BB

#ifdef RT
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, B
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	B, BORIG
        sall	$1 + BASE_SHIFT, %eax
	leal	(B,  %eax, 1), B
	leal	(BB, %eax, 4), BB
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K,  %eax
	subl	KK, %eax
#endif
	sarl	$2, %eax		# pack 4 k-iterations (8 floats) per pass
	jle	.L03
	ALIGN_4

/* Packing loop: 8 B scalars in, 8 broadcast vectors (32 floats) out. */
.L02:
	movsd	 0 * SIZE(B), %xmm3
	movhps	 2 * SIZE(B), %xmm3
	movsd	 4 * SIZE(B), %xmm7
	movhps	 6 * SIZE(B), %xmm7

#ifdef HAVE_SSE2
	pshufd	 $0x00, %xmm3, %xmm0
	pshufd	 $0x55, %xmm3, %xmm1
	pshufd	 $0xaa, %xmm3, %xmm2
	pshufd	 $0xff, %xmm3, %xmm3

	pshufd	 $0x00, %xmm7, %xmm4
	pshufd	 $0x55, %xmm7, %xmm5
	pshufd	 $0xaa, %xmm7, %xmm6
	pshufd	 $0xff, %xmm7, %xmm7
#else
	movaps	%xmm3, %xmm0
	shufps	 $0x00, %xmm0, %xmm0
	movaps	%xmm3, %xmm1
	shufps	 $0x55, %xmm1, %xmm1
	movaps	%xmm3, %xmm2
	shufps	 $0xaa, %xmm2, %xmm2
	shufps	 $0xff, %xmm3, %xmm3

	movaps	%xmm7, %xmm4
	shufps	 $0x00, %xmm4, %xmm4
	movaps	%xmm7, %xmm5
	shufps	 $0x55, %xmm5, %xmm5
	movaps	%xmm7, %xmm6
	shufps	 $0xaa, %xmm6, %xmm6
	shufps	 $0xff, %xmm7, %xmm7
#endif

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)
	movaps	%xmm4, 16 * SIZE(BB)
	movaps	%xmm5, 20 * SIZE(BB)
	movaps	%xmm6, 24 * SIZE(BB)
	movaps	%xmm7, 28 * SIZE(BB)

	addl	$ 8 * SIZE, B
	addl	$32 * SIZE, BB
	decl	%eax
	BRANCH
	jne	.L02
	ALIGN_2

.L03:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$3, %eax		# leftover k-iterations to pack
	BRANCH
	jle	.L05
	ALIGN_2

/* Remainder packing: one k-iteration (2 B scalars) per pass. */
.L04:
	movsd	 0 * SIZE(B), %xmm3

#ifdef HAVE_SSE2
	pshufd	 $0x00, %xmm3, %xmm0
	pshufd	 $0x55, %xmm3, %xmm1
#else
	movaps	%xmm3, %xmm0
	shufps	 $0x00, %xmm0, %xmm0
	movaps	%xmm3, %xmm1
	shufps	 $0x55, %xmm1, %xmm1
#endif

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)

	addl	$2 * SIZE, B
	addl	$8 * SIZE, BB

	decl	%eax
	jne	.L04
	ALIGN_4

/* Set up A and the two C column pointers (CO1, CO1+LDC). */
.L05:
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

	leal	(, LDC, 2), %eax

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	addl	%eax, C
#endif

	movl	M,  %ebx
	sarl	$3, %ebx		# i = (m >> 3): 8-row cells first
	jle	.L30
	ALIGN_4
1807
/* 8x2 cell: position AA/BB for the current cell, clear the four
   accumulators, prefetch the C destinations, then run the k-loop. */
.L10:
#ifdef LN
       movl	K, %eax
       sall	$3 + BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$3 + BASE_SHIFT, %eax
	addl	%eax, AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 8 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

	PREFETCHW      7 * SIZE(CO1)
	PREFETCHW      7 * SIZE(CO1, LDC)

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# unrolled 8x over k
	je	.L12
	ALIGN_2

/* Main 8x2 loop, 8 k-iterations per pass.  Each k-iteration:
   xmm4/xmm6 accumulate column 0, xmm5/xmm7 column 1 (low/high
   4 rows); loads are software-pipelined one pass ahead. */
.L11:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 0 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	16 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	16 * SIZE(AA), %xmm0

	mulps	%xmm1, %xmm3
	mulps	12 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	 8 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	12 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	12 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1

	mulps	%xmm0, %xmm2
	mulps	20 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	16 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	20 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	20 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0

	mulps	%xmm1, %xmm3
	mulps	28 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	24 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	28 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	40 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	40 * SIZE(AA), %xmm1

	mulps	%xmm0, %xmm2
	mulps	36 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	32 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	36 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	36 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	48 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	48 * SIZE(AA), %xmm0

	mulps	%xmm1, %xmm3
	mulps	44 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	40 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	44 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	44 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	56 * SIZE(AA), %xmm1

	mulps	%xmm0, %xmm2
	mulps	52 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	48 * SIZE(BB), %xmm2

	addps	%xmm0, %xmm5
	movaps	52 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	52 * SIZE(BB), %xmm0

	addps	%xmm2, %xmm6
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	64 * SIZE(AA), %xmm0

	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	56 * SIZE(BB), %xmm3

	addps	%xmm1, %xmm5
	movaps	60 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1

	addps	%xmm3, %xmm6
	movaps	72 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	72 * SIZE(AA), %xmm1

	addl   $64 * SIZE, BB
	addl   $64 * SIZE, AA
	decl   %eax
	jne    .L11
	ALIGN_2
1978
/* k-remainder for the 8x2 cell, then (LN/RT) rewind AA/B/BB to the
   start of the current cell's panel for the solve phase. */
.L12:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder iterations (k & 7)
	BRANCH
	je .L14

/* One k-iteration per pass: 8 A values x 2 broadcast B vectors. */
.L13:
	movaps	 4 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4
	movaps	 0 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm1
	movaps	 4 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm5
	movaps	 4 * SIZE(BB), %xmm1
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm6
	movaps	 8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm1
	movaps	 8 * SIZE(AA), %xmm0
	addps	%xmm1, %xmm7

	addl	$8 * SIZE, AA
	addl	$8 * SIZE, BB
	subl	$1, %eax
	jg	.L13
	ALIGN_4

.L14:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$8, %eax		# step back 8 rows (cell height)
#else
	subl	$2, %eax		# step back 2 columns (cell width)
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 8), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB
#endif
2030
/* Form the residual (rhs - accumulated product).  For LN/LT the
   accumulators are interleaved into row-pair order and subtracted
   from the packed B panel; otherwise subtracted from A in place. */
#if defined(LN) || defined(LT)
	movaps	 %xmm4, %xmm0
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm0

	movaps	 %xmm6, %xmm1
	unpcklps %xmm7, %xmm6
	unpckhps %xmm7, %xmm1

	movsd	 0 * SIZE(B), %xmm2
	movhps	 2 * SIZE(B), %xmm2
	movsd	 4 * SIZE(B), %xmm3
	movhps	 6 * SIZE(B), %xmm3
	movsd	 8 * SIZE(B), %xmm5
	movhps	10 * SIZE(B), %xmm5
	movsd	12 * SIZE(B), %xmm7
	movhps	14 * SIZE(B), %xmm7

	subps	%xmm4,  %xmm2
	subps	%xmm0,  %xmm3
	subps	%xmm6,  %xmm5
	subps	%xmm1,  %xmm7
#else
	movaps	 0 * SIZE(AA), %xmm0
	movaps	 4 * SIZE(AA), %xmm1
	movaps	 8 * SIZE(AA), %xmm2
	movaps	12 * SIZE(AA), %xmm3

	subps	%xmm4, %xmm0
	subps	%xmm6, %xmm1
	subps	%xmm5, %xmm2
	subps	%xmm7, %xmm3
#endif

#if defined(LN) || defined(LT)
	movaps	TRMASK, %xmm6	# lane-select mask used to form diagonal multipliers
#endif

/* LN: backward substitution over the 8x8 A block (offsets are
   elements of an 8x8 tile; 63 = last diagonal).  Each step scales a
   row pair by the diagonal element (presumably pre-inverted -
   verify against the packing routine) and eliminates it from the
   remaining rows. */
#ifdef LN
	movss	63 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm7

	movaps	%xmm7, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movss	62 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movsd	60 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	58 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	56 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	54 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm7

	movaps	%xmm7, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movsd	52 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	50 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	48 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2


	movss	45 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm5

	movaps	%xmm5, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movss	44 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	42 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	40 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	36 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm5

	movaps	%xmm5, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movsd	34 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	32 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	27 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm3

	movaps	%xmm3, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movss	26 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	24 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	18 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm3

	movaps	%xmm3, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movsd	16 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	 9 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movss	 8 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	 0 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm2
#endif
2208
/* LT: forward substitution over the 8x8 A block (offsets walk the
   rows 0,9,18,...,63 of the tile).  Mirrors the LN path but in
   ascending row order; diagonal elements are multiplied in
   (presumably pre-inverted - verify against the packing routine). */
#ifdef LT
	movss	 0 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movss	 1 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movsd	 2 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	 4 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	 6 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	 9 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movsd	10 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	12 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	14 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	18 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm3

	movaps	%xmm3, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movss	19 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	20 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	22 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	27 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm3

	movaps	%xmm3, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movsd	28 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	30 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	36 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm5

	movaps	%xmm5, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movss	37 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm5

	movsd	38 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	45 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm5

	movaps	%xmm5, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movsd	46 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	54 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm7

	movaps	%xmm7, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movss	55 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm7

	movss	63 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm7
#endif
2347
/* RN/RT: 2x2 solve on the B triangle, applied to the full 8-row
   residual (xmm0/xmm1 = column 0, xmm2/xmm3 = column 1).  RN goes
   forward (element 0 then 3), RT backward; the off-diagonal element
   is broadcast and eliminated between the two scalings. */
#ifdef RN
	movss	 0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm0
	mulps	%xmm6, %xmm1

	movss	 1 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6
	movaps	%xmm6, %xmm5

	mulps	%xmm0, %xmm5
	mulps	%xmm1, %xmm6

	subps	%xmm5, %xmm2
	subps	%xmm6, %xmm3

	movss	 3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm2
	mulps	%xmm6, %xmm3
#endif

#ifdef RT
	movss	 3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm2
	mulps	%xmm6, %xmm3

	movss	 2 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6
	movaps	%xmm6, %xmm5

	mulps	%xmm2, %xmm5
	mulps	%xmm3, %xmm6

	subps	%xmm5, %xmm0
	subps	%xmm6, %xmm1

	movss	 0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm0
	mulps	%xmm6, %xmm1
#endif
2395
/* Store phase for the 8x2 cell: write the solved panel back to B
   (and re-broadcast every scalar into BUFFER) for LN/LT, or back
   to A for RN/RT; then write the 8x2 result into the two C columns
   and advance all bookkeeping pointers/offsets. */
#if defined(LN) || defined(LT)
	movlps	%xmm2,   0 * SIZE(B)
	movhps	%xmm2,   2 * SIZE(B)
	movlps	%xmm3,   4 * SIZE(B)
	movhps	%xmm3,   6 * SIZE(B)
	movlps	%xmm5,   8 * SIZE(B)
	movhps	%xmm5,  10 * SIZE(B)
	movlps	%xmm7,  12 * SIZE(B)
	movhps	%xmm7,  14 * SIZE(B)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm2, %xmm0
	pshufd	$0x55, %xmm2, %xmm1
	pshufd	$0xaa, %xmm2, %xmm4
	pshufd	$0xff, %xmm2, %xmm6
#else
	movaps	%xmm2, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm2, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm2, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm2, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif
	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm1,   4 * SIZE(BB)
	movaps	%xmm4,   8 * SIZE(BB)
	movaps	%xmm6,  12 * SIZE(BB)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1
	pshufd	$0xaa, %xmm3, %xmm4
	pshufd	$0xff, %xmm3, %xmm6
#else
	movaps	%xmm3, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm3, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm3, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm3, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif
	movaps	%xmm0,  16 * SIZE(BB)
	movaps	%xmm1,  20 * SIZE(BB)
	movaps	%xmm4,  24 * SIZE(BB)
	movaps	%xmm6,  28 * SIZE(BB)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm5, %xmm0
	pshufd	$0x55, %xmm5, %xmm1
	pshufd	$0xaa, %xmm5, %xmm4
	pshufd	$0xff, %xmm5, %xmm6
#else
	movaps	%xmm5, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm5, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm5, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm5, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif
	movaps	%xmm0,  32 * SIZE(BB)
	movaps	%xmm1,  36 * SIZE(BB)
	movaps	%xmm4,  40 * SIZE(BB)
	movaps	%xmm6,  44 * SIZE(BB)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm7, %xmm0
	pshufd	$0x55, %xmm7, %xmm1
	pshufd	$0xaa, %xmm7, %xmm4
	pshufd	$0xff, %xmm7, %xmm6
#else
	movaps	%xmm7, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm7, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm7, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm7, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif
	movaps	%xmm0,  48 * SIZE(BB)
	movaps	%xmm1,  52 * SIZE(BB)
	movaps	%xmm4,  56 * SIZE(BB)
	movaps	%xmm6,  60 * SIZE(BB)
#else
	movaps	%xmm0,   0 * SIZE(AA)
	movaps	%xmm1,   4 * SIZE(AA)
	movaps	%xmm2,   8 * SIZE(AA)
	movaps	%xmm3,  12 * SIZE(AA)
#endif

#ifdef LN
	subl	$8 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movaps	 %xmm2, %xmm0
	shufps	 $0x88, %xmm3, %xmm2	# de-interleave back to column order for C
	shufps	 $0xdd, %xmm3, %xmm0

	movaps	 %xmm5, %xmm4
	shufps	 $0x88, %xmm7, %xmm5
	shufps	 $0xdd, %xmm7, %xmm4

	movlps	%xmm2, 0 * SIZE(CO1)
	movhps	%xmm2, 2 * SIZE(CO1)
	movlps	%xmm5, 4 * SIZE(CO1)
	movhps	%xmm5, 6 * SIZE(CO1)
	movlps	%xmm0, 0 * SIZE(CO1, LDC)
	movhps	%xmm0, 2 * SIZE(CO1, LDC)
	movlps	%xmm4, 4 * SIZE(CO1, LDC)
	movhps	%xmm4, 6 * SIZE(CO1, LDC)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movlps	%xmm1, 4 * SIZE(CO1)
	movhps	%xmm1, 6 * SIZE(CO1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC)
	movhps	%xmm2, 2 * SIZE(CO1, LDC)
	movlps	%xmm3, 4 * SIZE(CO1, LDC)
	movhps	%xmm3, 6 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$8 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 8), AA
#ifdef LT
	addl	$16 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$8, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$8, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$3 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L10
	ALIGN_2
2557
/* 4x2 cell (M & 4): same structure as the 8x2 cell but one 4-wide
   accumulator pair per column, unrolled 8x over k. */
.L30:
	testl	$4, M
	jle	.L50

#ifdef LN
       movl	K, %eax
       sall	$2 + BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$2 + BASE_SHIFT, %eax
	addl	%eax, AA
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#endif

	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 16 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		# unrolled 8x over k
	je	.L32
	ALIGN_2

/* Main 4x2 loop: xmm4/xmm6 accumulate column 0, xmm5/xmm7 column 1;
   loads for the next pass are pipelined ahead. */
.L31:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm2
	mulps	12 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm6
	movaps	32 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm7
	movaps	 8 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	20 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm4
	movaps	24 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm5
	movaps	12 * SIZE(AA), %xmm0
	mulps	%xmm0, %xmm3
	mulps	28 * SIZE(BB), %xmm0
	addps	%xmm3, %xmm6
	movaps	48 * SIZE(BB), %xmm3
	addps	%xmm0, %xmm7
	movaps	32 * SIZE(AA), %xmm0
	mulps	%xmm1, %xmm2
	mulps	36 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm4
	movaps	40 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm5
	movaps	20 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm2
	mulps	44 * SIZE(BB), %xmm1
	addps	%xmm2, %xmm6
	movaps	64 * SIZE(BB), %xmm2
	addps	%xmm1, %xmm7
	movaps	24 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	52 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm4
	movaps	56 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm5
	movaps	28 * SIZE(AA), %xmm1
	mulps	%xmm1, %xmm3
	mulps	60 * SIZE(BB), %xmm1
	addps	%xmm3, %xmm6
	movaps	80 * SIZE(BB), %xmm3
	addps	%xmm1, %xmm7
	movaps	48 * SIZE(AA), %xmm1

	addl   $32 * SIZE, AA
	addl   $64 * SIZE, BB
	decl   %eax
	jne    .L31
	ALIGN_2
2657
/* k-remainder for the 4x2 cell, merge partial sums, then (LN/RT)
   rewind AA/B/BB to the start of the current cell's panel. */
.L32:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# remainder iterations (k & 7)
	BRANCH
	je .L34

/* One k-iteration per pass: 4 A values x 2 broadcast B vectors. */
.L33:
	mulps	%xmm0, %xmm2
	mulps	 4 * SIZE(BB), %xmm0
	addps	%xmm2, %xmm4
	movaps	 8 * SIZE(BB), %xmm2
	addps	%xmm0, %xmm5
	movaps	 4 * SIZE(AA), %xmm0

	addl	$4 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L33
	ALIGN_4

.L34:
	addps	%xmm6, %xmm4	# merge the two accumulator pairs
	addps	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$4, %eax		# step back 4 rows (cell height)
#else
	subl	$2, %eax		# step back 2 columns (cell width)
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 4), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB
#endif
2704
/* Form the 4x2 residual (rhs - accumulated product), then, for LN,
   run backward substitution over the 4x4 A block (offsets 15,10,5,0
   are its diagonal; diagonal values are multiplied in, presumably
   pre-inverted - verify against the packing routine). */
#if defined(LN) || defined(LT)
	movaps	 %xmm4, %xmm0
	unpcklps %xmm5, %xmm4
	unpckhps %xmm5, %xmm0

	movsd	 0 * SIZE(B), %xmm2
	movhps	 2 * SIZE(B), %xmm2
	movsd	 4 * SIZE(B), %xmm3
	movhps	 6 * SIZE(B), %xmm3

	subps	%xmm4,  %xmm2
	subps	%xmm0,  %xmm3
#else
	movaps	 0 * SIZE(AA), %xmm0
	movaps	 4 * SIZE(AA), %xmm2

	subps	%xmm4, %xmm0
	subps	%xmm5, %xmm2
#endif

#if defined(LN) || defined(LT)
	movaps	TRMASK, %xmm6	# lane-select mask used to form diagonal multipliers
#endif

#ifdef LN
	movss	15 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm3

	movaps	%xmm3, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movss	14 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movsd	12 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	10 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm3

	movaps	%xmm3, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movsd	 8 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	 5 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movss	 4 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movss	 0 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm2

#endif
2778
/* LT: forward substitution over the 4x4 A block (diagonal offsets
   0,5,10,15); same pattern as the 8x8 LT path.  Diagonal values are
   multiplied in (presumably pre-inverted - verify against packing). */
#ifdef LT
	movss	 0 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movss	 1 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2

	movsd	 2 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movss	 5 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0xee, %xmm1, %xmm1

	movsd	 6 * SIZE(AA), %xmm0
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movss	10 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm3

	movaps	%xmm3, %xmm1
	shufps	$0x44, %xmm1, %xmm1

	movss	11 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm3

	movss	15 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm3
#endif
2827
/* RN/RT: 2x2 solve on the B triangle applied to the 4-row residual
   (xmm0 = column 0, xmm2 = column 1).  RN forward, RT backward. */
#ifdef RN
	movss	 0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm0

	movss	 1 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6
	movaps	%xmm6, %xmm5

	mulps	%xmm0, %xmm5
	subps	%xmm5, %xmm2

	movss	 3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm2
#endif

#ifdef RT
	movss	 3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm2

	movss	 2 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6
	movaps	%xmm6, %xmm5

	mulps	%xmm2, %xmm5

	subps	%xmm5, %xmm0

	movss	 0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm0
#endif
2866
/* Store phase for the 4x2 cell: write the solved panel back to B
   (re-broadcasting into BUFFER) for LN/LT or back to A otherwise,
   write the 4x2 result to the two C columns, then update pointers
   and the KK offset. */
#if defined(LN) || defined(LT)
	movlps	%xmm2,   0 * SIZE(B)
	movhps	%xmm2,   2 * SIZE(B)
	movlps	%xmm3,   4 * SIZE(B)
	movhps	%xmm3,   6 * SIZE(B)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm2, %xmm0
	pshufd	$0x55, %xmm2, %xmm1
	pshufd	$0xaa, %xmm2, %xmm4
	pshufd	$0xff, %xmm2, %xmm6
#else
	movaps	%xmm2, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm2, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm2, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm2, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif

	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm1,   4 * SIZE(BB)
	movaps	%xmm4,   8 * SIZE(BB)
	movaps	%xmm6,  12 * SIZE(BB)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1
	pshufd	$0xaa, %xmm3, %xmm4
	pshufd	$0xff, %xmm3, %xmm6
#else
	movaps	%xmm3, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm3, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm3, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm3, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif

	movaps	%xmm0,  16 * SIZE(BB)
	movaps	%xmm1,  20 * SIZE(BB)
	movaps	%xmm4,  24 * SIZE(BB)
	movaps	%xmm6,  28 * SIZE(BB)
#else
	movaps	%xmm0,   0 * SIZE(AA)
	movaps	%xmm2,   4 * SIZE(AA)
#endif

#ifdef LN
	subl	$4 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movaps	 %xmm2, %xmm0
	shufps	 $0x88, %xmm3, %xmm2	# de-interleave back to column order for C
	shufps	 $0xdd, %xmm3, %xmm0

	movlps	%xmm2, 0 * SIZE(CO1)
	movhps	%xmm2, 2 * SIZE(CO1)
	movlps	%xmm0, 0 * SIZE(CO1, LDC)
	movhps	%xmm0, 2 * SIZE(CO1, LDC)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC)
	movhps	%xmm2, 2 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 4), AA
#ifdef LT
	addl	$8 * SIZE, B
#endif
#endif

#ifdef LN
	subl	$4, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$4, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$2 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_2
2969
.L50:
	/* M & 2: handle a 2-row strip of A with the 2x2 GEMM-TRSM kernel. */
	testl	$2, M
	jle	.L70

#ifdef LN
	/* LN walks A backwards: step AORIG down by K rows * 2 elements. */
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AA		# AA = AORIG + KK * 2 * SIZE
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	# skip KK steps of the expanded B buffer
#endif

	/* Preload first A/B vectors and clear the four accumulators. */
	movaps	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movaps	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movaps	 16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movaps	 8 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax		# iterate over the first KK of K
#else
	movl	K, %eax
	subl	KK, %eax		# iterate over the last K-KK of K
#endif
	sarl	$3, %eax		# main loop is unrolled by 8
	je	.L52
	ALIGN_2
3013
.L51:
	/* Inner-product loop, unrolled 8x: each step multiplies a 2-element
	   A pair (movsd load, low half of xmm0/xmm1) by the two broadcast
	   B columns from BB, accumulating into xmm4..xmm7 (folded in .L54).
	   Loads for the next step are interleaved to hide latency. */
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4		# k+0, column 0
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5		# k+0, column 1
	movaps	 8 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm6		# k+1, column 0
	movaps	12 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm7		# k+1, column 1
	movaps	32 * SIZE(BB), %xmm2	# prefetch B for the second half
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm4		# k+2
	movaps	20 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	 6 * SIZE(AA), %xmm0
	addps	%xmm3, %xmm5
	movaps	24 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	addps	%xmm3, %xmm6		# k+3
	movaps	28 * SIZE(BB), %xmm3
	mulps	%xmm0, %xmm3
	movsd	16 * SIZE(AA), %xmm0	# A for next loop iteration
	addps	%xmm3, %xmm7
	movaps	48 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm4		# k+4
	movaps	36 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	10 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm5
	movaps	40 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	addps	%xmm2, %xmm6		# k+5
	movaps	44 * SIZE(BB), %xmm2
	mulps	%xmm1, %xmm2
	movsd	12 * SIZE(AA), %xmm1
	addps	%xmm2, %xmm7
	movaps	64 * SIZE(BB), %xmm2	# B for next loop iteration
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm4		# k+6
	movaps	52 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	14 * SIZE(AA), %xmm1
	addps	%xmm3, %xmm5
	movaps	56 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	addps	%xmm3, %xmm6		# k+7
	movaps	60 * SIZE(BB), %xmm3
	mulps	%xmm1, %xmm3
	movsd	24 * SIZE(AA), %xmm1	# A for next loop iteration
	addps	%xmm3, %xmm7
	movaps	80 * SIZE(BB), %xmm3

	addl   $16 * SIZE, AA		# 8 k-steps x 2 rows
	addl   $64 * SIZE, BB		# 8 k-steps x 2 cols x 4-lane expansion
	decl   %eax
	jne    .L51
	ALIGN_2
3077
.L52:
	/* Remainder loop: handle the (k mod 8) leftover iterations. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L54

.L53:
	mulps	%xmm0, %xmm2
	addps	%xmm2, %xmm4		# column-0 accumulator
	movaps	 4 * SIZE(BB), %xmm2
	mulps	%xmm0, %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	addps	%xmm2, %xmm5		# column-1 accumulator
	movaps	 8 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L53
	ALIGN_4
3103
.L54:
	/* Fold the four partials: xmm4 = column-0 dots, xmm5 = column-1. */
	addps	%xmm6, %xmm4
	addps	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	/* Backward-walking variants: rewind AA/B/BB to this step's
	   diagonal block (2 rows of A, 2 columns of B). */
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB
#endif

#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm4		# row-interleave: (x00,x01,x10,x11)

	movsd	 0 * SIZE(B), %xmm2
	movhps	 2 * SIZE(B), %xmm2

	subps	%xmm4,  %xmm2		# rhs -= accumulated A*X
#else
#ifdef	movsd
	xorps	%xmm0, %xmm0		# movsd may expand to movlps: clear high half
#endif
	movsd	 0 * SIZE(AA), %xmm0
#ifdef	movsd
	xorps	%xmm2, %xmm2
#endif
	movsd	 2 * SIZE(AA), %xmm2

	subps	%xmm4, %xmm0
	subps	%xmm5, %xmm2
#endif

#if defined(LN) || defined(LT)
	movaps	TRMASK, %xmm6		# mask used to build per-row multipliers
#endif

#ifdef LN
	/* LN: 2x2 lower-triangular solve, bottom row first.  Diagonal
	   entries are multiplied in, so they are assumed to hold
	   pre-inverted reciprocals -- TODO confirm against packing code. */
	movss	 3 * SIZE(AA), %xmm0	# a11 (diagonal)
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm2		# scale row 1 (both columns)

	movaps	%xmm2, %xmm1
	shufps	$0xee, %xmm1, %xmm1	# broadcast solved row-1 pair

	movss	 2 * SIZE(AA), %xmm0	# a10 (off-diagonal)
	shufps	$0x50, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2		# row0 -= a10 * row1

	movss	 0 * SIZE(AA), %xmm0	# a00 (diagonal)
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm2		# scale row 0

#endif

#ifdef LT
	/* LT: 2x2 upper-triangular solve, top row first (mirror of LN). */
	movss	 0 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm2

	movaps	%xmm2, %xmm1
	shufps	$0x44, %xmm1, %xmm1	# broadcast solved row-0 pair

	movss	 1 * SIZE(AA), %xmm0
	shufps	$0x05, %xmm0, %xmm0
	mulps	%xmm1, %xmm0
	subps	%xmm0, %xmm2		# row1 -= a01 * row0

	movss	 3 * SIZE(AA), %xmm0
	movaps	 %xmm6, %xmm1
	shufps	$0x00, %xmm0,  %xmm1
	mulps	%xmm1, %xmm2
#endif

#ifdef RN
	/* RN: forward substitution with the 2x2 upper-triangular B. */
	movss	 0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm0		# col0 *= 1/b00

	movss	 1 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6
	movaps	%xmm6, %xmm5

	mulps	%xmm0, %xmm5
	subps	%xmm5, %xmm2		# col1 -= b01 * col0

	movss	 3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm2		# col1 *= 1/b11
#endif

#ifdef RT
	/* RT: backward substitution -- mirror of RN, col1 solved first. */
	movss	 3 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm2

	movss	 2 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6
	movaps	%xmm6, %xmm5

	mulps	%xmm2, %xmm5

	subps	%xmm5, %xmm0

	movss	 0 * SIZE(B), %xmm6
	shufps	$0x00, %xmm6, %xmm6

	mulps	%xmm6, %xmm0
#endif

#if defined(LN) || defined(LT)
	/* Write the 2x2 solution back to B and broadcast it into BB. */
	movlps	%xmm2,   0 * SIZE(B)
	movhps	%xmm2,   2 * SIZE(B)

#ifdef HAVE_SSE2
	pshufd	$0x00, %xmm2, %xmm0
	pshufd	$0x55, %xmm2, %xmm1
	pshufd	$0xaa, %xmm2, %xmm4
	pshufd	$0xff, %xmm2, %xmm6
#else
	movaps	%xmm2, %xmm0
	shufps	$0x00, %xmm0, %xmm0
	movaps	%xmm2, %xmm1
	shufps	$0x55, %xmm1, %xmm1
	movaps	%xmm2, %xmm4
	shufps	$0xaa, %xmm4, %xmm4
	movaps	%xmm2, %xmm6
	shufps	$0xff, %xmm6, %xmm6
#endif

	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm1,   4 * SIZE(BB)
	movaps	%xmm4,   8 * SIZE(BB)
	movaps	%xmm6,  12 * SIZE(BB)
#else
	movlps	%xmm0,   0 * SIZE(AA)
	movlps	%xmm2,   2 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1		# LN walks C backwards
#endif

#if defined(LN) || defined(LT)
	/* De-interleave and store 2 elements per C column.  Only the low
	   halves are stored, so xmm3's lanes are don't-care here. */
	movaps	 %xmm2, %xmm0
	shufps	 $0x88, %xmm3, %xmm2	# even lanes -> column 0
	shufps	 $0xdd, %xmm3, %xmm0	# odd lanes  -> column 1

	movlps	%xmm2, 0 * SIZE(CO1)
	movlps	%xmm0, 0 * SIZE(CO1, LDC)
#else
	movlps	%xmm0, 0 * SIZE(CO1)
	movlps	%xmm2, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	/* Advance AA past the remaining K-KK rows of this 2-wide A panel. */
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#ifdef LT
	addl	$4 * SIZE, B		# 2 rows x 2 cols consumed from B
#endif
#endif

#ifdef LN
	subl	$2, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$1 + BASE_SHIFT, %eax	# K * 2 elements
	addl	%eax, AORIG
#endif
	ALIGN_2
3304
.L70:
	/* M & 1: handle the final single row of A (1x2 micro-kernel). */
	testl	$1, M
	jle	.L99

#ifdef LN
	/* LN walks A backwards: step AORIG down by K rows * 1 element. */
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$BASE_SHIFT, %eax
	addl	%eax, AA		# AA = AORIG + KK * SIZE
#endif

	leal	BUFFER, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB	# skip KK steps of the expanded B buffer
#endif

	/* Scalar preloads and accumulator clears. */
	movss	 0 * SIZE(BB), %xmm2
	xorps	%xmm4, %xmm4
	movss	 0 * SIZE(AA), %xmm0
	xorps	%xmm5, %xmm5
	movss	 16 * SIZE(BB), %xmm3
	xorps	%xmm6, %xmm6
	movss	 4 * SIZE(AA), %xmm1
	xorps	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax		# iterate over the first KK of K
#else
	movl	K, %eax
	subl	KK, %eax		# iterate over the last K-KK of K
#endif
	sarl	$3, %eax		# main loop is unrolled by 8
	je	.L72
	ALIGN_2
3348
.L71:
	/* Scalar inner-product loop, unrolled 8x: one A element times the
	   two B columns per k-step, partials in xmm4..xmm7 (folded in .L74). */
	mulss	%xmm0, %xmm2
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4		# k+0, column 0
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5		# k+0, column 1
	movss	 1 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm2
	mulss	12 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm6		# k+1
	movss	32 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm7
	movss	 2 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	20 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm4		# k+2
	movss	24 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm5
	movss	 3 * SIZE(AA), %xmm0
	mulss	%xmm0, %xmm3
	mulss	28 * SIZE(BB), %xmm0
	addss	%xmm3, %xmm6		# k+3
	movss	48 * SIZE(BB), %xmm3
	addss	%xmm0, %xmm7
	movss	 8 * SIZE(AA), %xmm0	# A for next loop iteration
	mulss	%xmm1, %xmm2
	mulss	36 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm4		# k+4
	movss	40 * SIZE(BB), %xmm2
	addss	%xmm1, %xmm5
	movss	 5 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm2
	mulss	44 * SIZE(BB), %xmm1
	addss	%xmm2, %xmm6		# k+5
	movss	64 * SIZE(BB), %xmm2	# B for next loop iteration
	addss	%xmm1, %xmm7
	movss	 6 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	52 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm4		# k+6
	movss	56 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm5
	movss	 7 * SIZE(AA), %xmm1
	mulss	%xmm1, %xmm3
	mulss	60 * SIZE(BB), %xmm1
	addss	%xmm3, %xmm6		# k+7
	movss	80 * SIZE(BB), %xmm3
	addss	%xmm1, %xmm7
	movss	12 * SIZE(AA), %xmm1	# A for next loop iteration

	addl   $ 8 * SIZE, AA		# 8 k-steps x 1 row
	addl   $64 * SIZE, BB		# 8 k-steps x 2 cols x 4-lane expansion
	decl   %eax
	jne    .L71
	ALIGN_2
3404
.L72:
	/* Remainder loop: handle the (k mod 8) leftover iterations. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 1)
	BRANCH
	je .L74

.L73:
	mulss	%xmm0, %xmm2
	mulss	 4 * SIZE(BB), %xmm0
	addss	%xmm2, %xmm4		# column-0 accumulator
	movss	 8 * SIZE(BB), %xmm2
	addss	%xmm0, %xmm5		# column-1 accumulator
	movss	 1 * SIZE(AA), %xmm0

	addl	$1 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L73
	ALIGN_4
3429
.L74:
	/* Fold partials: xmm4 = column-0 dot product, xmm5 = column-1. */
	addss	%xmm6, %xmm4
	addss	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	/* Backward-walking variants: rewind AA/B/BB to this step's
	   diagonal block (1 row of A, 2 columns of B). */
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	movl	BORIG, B
	leal	BUFFER, BB

	sall	$BASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB
#endif

#if defined(LN) || defined(LT)
	unpcklps %xmm5, %xmm4		# xmm4 = (x0, x1, ?, ?)

#ifdef	movsd
	xorps	%xmm2, %xmm2		# movsd may expand to movlps: clear high half before load
#endif
	movsd	 0 * SIZE(B), %xmm2

	subps	%xmm4,  %xmm2		# rhs -= accumulated A*X
#else
	movss	 0 * SIZE(AA), %xmm0
	movss	 1 * SIZE(AA), %xmm2

	subss	%xmm4, %xmm0
	subss	%xmm5, %xmm2
#endif

#if defined(LN) || defined(LT)
	movaps	TRMASK, %xmm6
#endif

#if defined(LN) || defined(LT)
	/* 1x1 diagonal of A: scale both columns by the diagonal element
	   (multiplied in, so presumably a pre-inverted reciprocal --
	   TODO confirm against the packing code). */
	movss	 0 * SIZE(AA), %xmm0
	shufps	$0x00, %xmm6,  %xmm0
	mulps	%xmm0, %xmm2
#endif

#ifdef RN
	/* RN: forward substitution with the 2x2 upper-triangular B. */
	movss	 0 * SIZE(B), %xmm6
	mulss	%xmm6, %xmm0		# x0 *= 1/b00

	movss	 1 * SIZE(B), %xmm6
	movaps	%xmm6, %xmm5

	mulss	%xmm0, %xmm5
	subss	%xmm5, %xmm2		# x1 -= b01 * x0

	movss	 3 * SIZE(B), %xmm6
	mulss	%xmm6, %xmm2		# x1 *= 1/b11
#endif

#ifdef RT
	/* RT: backward substitution -- mirror of RN, x1 solved first. */
	movss	 3 * SIZE(B), %xmm6
	mulss	%xmm6, %xmm2

	movss	 2 * SIZE(B), %xmm6
	movaps	%xmm6, %xmm5

	mulss	%xmm2, %xmm5
	subss	%xmm5, %xmm0

	movss	 0 * SIZE(B), %xmm6
	mulss	%xmm6, %xmm0
#endif

#if defined(LN) || defined(LT)
	/* Write the solved pair back to B, then broadcast each element
	   into BB (the expanded buffer read by the GEMM loops).
	   BUGFIX: the original guarded this store with
	       #ifdef movsd / xorps %xmm2, %xmm2 / #endif
	   -- a copy of the *load* idiom used elsewhere in this file.  When
	   the movsd->movlps macro is in effect, that xorps zeroes xmm2
	   BEFORE it is stored and before it is reused below (BB broadcast
	   and the C store), destroying the result.  A movlps store is
	   byte-identical to a movsd store and needs no guard (stores never
	   read the high half), matching the other store sites in this file. */
	movlps	%xmm2,   0 * SIZE(B)

	movaps	%xmm2, %xmm0
	shufps	$0x00, %xmm0, %xmm0	# broadcast x0
	movaps	%xmm2, %xmm1
	shufps	$0x55, %xmm1, %xmm1	# broadcast x1

	movaps	%xmm0,   0 * SIZE(BB)
	movaps	%xmm1,   4 * SIZE(BB)
#else
	movss	%xmm0,   0 * SIZE(AA)
	movss	%xmm2,   1 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1		# LN walks C backwards
#endif

#if defined(LN) || defined(LT)
	/* De-interleave and store one element per C column.  Only lane 0
	   of each shuffle result is stored; xmm3's lanes are don't-care. */
	movaps	 %xmm2, %xmm0
	shufps	 $0x88, %xmm3, %xmm2
	shufps	 $0xdd, %xmm3, %xmm0

	movss	%xmm2, 0 * SIZE(CO1)
	movss	%xmm0, 0 * SIZE(CO1, LDC)
#else
	movss	%xmm0, 0 * SIZE(CO1)
	movss	%xmm2, 0 * SIZE(CO1, LDC)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	/* Advance AA past the remaining K-KK rows of this 1-wide A panel. */
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
#ifdef LT
	addl	$2 * SIZE, B		# 1 row x 2 cols consumed from B
#endif
#endif

#ifdef LN
	subl	$1, KK
	movl	BORIG, B
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	movl	BORIG, B
	sall	$BASE_SHIFT, %eax	# K * 1 element
	addl	%eax, AORIG
#endif
	ALIGN_2
3571
.L99:
	/* End of this 2-column panel: advance B past the panel, update KK,
	   and loop back for the next pair of columns. */
#ifdef LN
       movl	K, %eax
       leal	(, %eax, SIZE), %eax
       leal 	(B, %eax, 2), B
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(B,  %eax, 2), B	# skip the K-KK unprocessed rows of B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J			# j --
	jg	.L01
	ALIGN_2
3597
.L999:
	/* Function epilogue: restore the caller's stack pointer (saved in
	   the prologue, outside this chunk) and the callee-saved registers.
	   NOTE(review): pop order must mirror the prologue's push order --
	   verify against the function entry, which is not visible here. */
	movl	OLD_STACK, %esp

	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
3608