1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
#define ASSEMBLER
#include "common.h"

/* Stack layout: ARGS bytes of scratch are reserved below the four
   saved callee-saved registers (STACK = 4 pushes * 4 bytes).  All
   incoming cdecl arguments are therefore addressed with a
   STACK + ARGS bias relative to %esp.  */

#define STACK	16
#define ARGS	16

/* Incoming arguments: trsm_kernel(M, N, K, ALPHA(double), A, B, C, LDC, OFFSET) */
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA	16 + STACK + ARGS(%esp)
#define A	24 + STACK + ARGS(%esp)
#define ARG_B	28 + STACK + ARGS(%esp)
#define C	32 + STACK + ARGS(%esp)
#define ARG_LDC	36 + STACK + ARGS(%esp)
#define OFFSET	40 + STACK + ARGS(%esp)

/* Spilled locals kept in the scratch area below the saved registers. */
#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)
#define AORIG	12 + STACK(%esp)

/* Per-microarchitecture software-prefetch selection. */
#ifdef PENTIUM4
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef PENTIUMM
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 84
#endif

/* Register roles used throughout the kernel. */
#define AA	%edx	/* current packed-A panel pointer */
#define BB	%ecx	/* current packed-B panel pointer */
#define LDC	%ebp	/* leading dimension of C, in bytes after scaling */
#define B	%edi	/* base of packed B for the current column block */
#define CO1	%esi	/* current C output column pointer */
63
	PROLOGUE

	subl	$ARGS, %esp	# reserve local scratch area

	pushl	%ebp		# save callee-saved registers (cdecl)
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_B,   B
	movl	ARG_LDC, LDC

	/* KK tracks the current diagonal offset; RN walks it negatively. */
	movl	OFFSET, %eax
#ifdef RN
	negl	%eax
#endif
	movl	%eax, KK

	leal	(, LDC, SIZE), LDC	# convert LDC from elements to bytes

#ifdef LN
	/* Left/Non-transposed: start from the right edge, A += M*K, C += M. */
	movl	M, %eax
	leal	(, %eax, SIZE), %eax
	addl	%eax, C
	imull	K, %eax
	addl	%eax, A
#endif

#ifdef RT
	/* Right/Transposed: start from the last column block, B += N*K, C += N*LDC. */
	movl	N, %eax
	leal	(, %eax, SIZE), %eax
	imull	K, %eax
	addl	%eax, B
	movl	N, %eax
	imull	LDC, %eax
	addl	%eax, C
#endif

#ifdef RT
	movl	N, %eax
	subl	OFFSET, %eax
	movl	%eax, KK
#endif

	/* Outer loop over column blocks of 4: j = n >> 2. */
	movl	N, %eax
	sarl	$2, %eax
	movl	%eax, J
	jle	.L30
	ALIGN_2
115
.L10:
	/* Head of one 4-column block: set up A/B/C pointers and KK,
	   then handle the M&1 edge row (1x4 micro-tile) first.  */
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$2 + BASE_SHIFT, %eax	# step B back by K*4 elements
	subl	%eax, B
#endif

	leal	(, LDC, 4), %eax	# byte span of 4 C columns

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  %ebx
	testl	$1, %ebx	# if (m & 1): handle the single edge row
	jle	.L20

#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG	# step A panel back by one row (K elements)
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA	# skip KK solved elements of A
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$2 + BASE_SHIFT, %eax	# skip KK*4 solved elements of B
	addl	%eax, BB
#endif

	/* Preload first A/B operands and clear the four accumulators. */
	movddup	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movddup	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movapd	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

	/* Trip count: KK for LT/RN (solved prefix), K-KK for LN/RT. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$4, %eax	# main loop is unrolled 16x in k
	je	.L25
	ALIGN_4
193
.L22:
	/* 1x4 GEMM update, unrolled 16x over k: one broadcast A element
	   (movddup) times two packed B pairs per k step; xmm4..xmm7
	   accumulate partial sums that are folded together at .L28.  */
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm0, %xmm2
	mulpd	 2 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm4
	movapd	 4 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movddup	 1 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm6
	movapd	16 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm7
	movddup	 2 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	mulpd	10 * SIZE(BB), %xmm0
	addpd	 %xmm3, %xmm4
	movapd	12 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm5
	movddup	 3 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	mulpd	14 * SIZE(BB), %xmm0
	addpd	 %xmm3, %xmm6
	movapd	24 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm7
	movddup	 4 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	18 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm4
	movapd	20 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movddup	 5 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm2
	mulpd	22 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm6
	movapd	32 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm7
	movddup	 6 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	mulpd	26 * SIZE(BB), %xmm0
	addpd	 %xmm3, %xmm4
	movapd	28 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm5
	movddup	 7 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	mulpd	30 * SIZE(BB), %xmm0
	addpd	 %xmm3, %xmm6
	movapd	40 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm7
	movddup	16 * SIZE(AA), %xmm0	# preload for next iteration
	mulpd	 %xmm1, %xmm2
	mulpd	34 * SIZE(BB), %xmm1
	addpd	 %xmm2, %xmm4
	movapd	36 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm5
	movddup	 9 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm2
	mulpd	38 * SIZE(BB), %xmm1
	addpd	 %xmm2, %xmm6
	movapd	48 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm7
	movddup	10 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	mulpd	42 * SIZE(BB), %xmm1
	addpd	 %xmm3, %xmm4
	movapd	44 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm5
	movddup	11 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	mulpd	46 * SIZE(BB), %xmm1
	addpd	 %xmm3, %xmm6
	movapd	56 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm7
	movddup	12 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm2
	mulpd	50 * SIZE(BB), %xmm1
	addpd	 %xmm2, %xmm4
	movapd	52 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm5
	movddup	13 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm2
	mulpd	54 * SIZE(BB), %xmm1
	addpd	 %xmm2, %xmm6
	movapd	64 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm7
	movddup	14 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	mulpd	58 * SIZE(BB), %xmm1
	addpd	 %xmm3, %xmm4
	movapd	60 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm5
	movddup	15 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	mulpd	62 * SIZE(BB), %xmm1
	addpd	 %xmm3, %xmm6
	movapd	72 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm7
	movddup	24 * SIZE(AA), %xmm1	# preload for next iteration

	addl   $16 * SIZE, AA
	addl   $64 * SIZE, BB
	decl   %eax
	jne    .L22
	ALIGN_4

.L25:
	/* k remainder: up to 15 leftover iterations, one k per pass. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$15, %eax		# if (k & 15)
	BRANCH
	je .L28

.L26:
	mulpd	 %xmm0, %xmm2
	mulpd	 2 * SIZE(BB), %xmm0
	addpd	 %xmm2, %xmm4
	movapd	 4 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movddup	 1 * SIZE(AA), %xmm0

	addl	$1 * SIZE, AA
	addl	$4 * SIZE, BB

	decl	%eax
	jg	.L26
	ALIGN_4
324
.L28:
	/* Fold partial accumulators, then do the 1x4 triangular solve:
	   subtract the GEMM update from the packed RHS, multiply by the
	   (inverted) diagonal factors, back/forward substitute across
	   the 4x4 B triangle (RN forward, RT backward), and write the
	   result both to the packed buffer and to C.  */
	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax	# step back one row of A
#else
	subl	$4, %eax	# step back four columns of B
#endif

	movl	AORIG, AA

	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	/* RHS lives in packed B: load and subtract the update. */
	movapd	 0 * SIZE(BB), %xmm0
	movapd	 2 * SIZE(BB), %xmm1

	subpd	%xmm4,  %xmm0
	subpd	%xmm5,  %xmm1
#else
	/* RHS lives in packed A: subtract, then split into scalars. */
	movapd	 0 * SIZE(AA), %xmm1
	movapd	 2 * SIZE(AA), %xmm3

	subpd	%xmm4, %xmm1
	subpd	%xmm5, %xmm3

	movapd	       %xmm1, %xmm0
	unpckhpd       %xmm1, %xmm1
	movapd	       %xmm3, %xmm2
	unpckhpd       %xmm3, %xmm3
#endif

#ifdef LN
	/* 1x1 A triangle: just scale by the stored reciprocal diagonal. */
	movddup	 0 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm0
	mulpd	 %xmm4, %xmm1
#endif

#ifdef LT
	movddup	 0 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm0
	mulpd	 %xmm4, %xmm1
#endif

#ifdef RN
	/* Forward substitution through the upper 4x4 triangle of B. */
	movsd	 0 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm0
	movsd	 1 * SIZE(BB), %xmm4
	mulsd	 %xmm0, %xmm4
	subsd	 %xmm4, %xmm1
	movsd	 2 * SIZE(BB), %xmm4
	mulsd	 %xmm0, %xmm4
	subsd	 %xmm4, %xmm2
	movsd	 3 * SIZE(BB), %xmm4
	mulsd	 %xmm0, %xmm4
	subsd	 %xmm4, %xmm3

	movsd	 5 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm1
	movsd	 6 * SIZE(BB), %xmm4
	mulsd	 %xmm1, %xmm4
	subsd	 %xmm4, %xmm2
	movsd	 7 * SIZE(BB), %xmm4
	mulsd	 %xmm1, %xmm4
	subsd	 %xmm4, %xmm3

	movsd	10 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm2
	movsd	11 * SIZE(BB), %xmm4
	mulsd	 %xmm2, %xmm4
	subsd	 %xmm4, %xmm3

	movsd	15 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm3
#endif

#ifdef RT
	/* Backward substitution through the lower 4x4 triangle of B. */
	movsd	15 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm3
	movsd	14 * SIZE(BB), %xmm4
	mulsd	 %xmm3, %xmm4
	subsd	 %xmm4, %xmm2
	movsd	13 * SIZE(BB), %xmm4
	mulsd	 %xmm3, %xmm4
	subsd	 %xmm4, %xmm1
	movsd	12 * SIZE(BB), %xmm4
	mulsd	 %xmm3, %xmm4
	subsd	 %xmm4, %xmm0

	movsd	10 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm2
	movsd	 9 * SIZE(BB), %xmm4
	mulsd	 %xmm2, %xmm4
	subsd	 %xmm4, %xmm1
	movsd	 8 * SIZE(BB), %xmm4
	mulsd	 %xmm2, %xmm4
	subsd	 %xmm4, %xmm0

	movsd	 5 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm1
	movsd	 4 * SIZE(BB), %xmm4
	mulsd	 %xmm1, %xmm4
	subsd	 %xmm4, %xmm0

	movsd	 0 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm0
#endif

	/* Store solved values back into the packed panel ... */
#if defined(LN) || defined(LT)
	movapd	%xmm0,   0 * SIZE(BB)
	movapd	%xmm1,   2 * SIZE(BB)
#else
	movsd	%xmm0,   0 * SIZE(AA)
	movsd	%xmm1,   1 * SIZE(AA)
	movsd	%xmm2,   2 * SIZE(AA)
	movsd	%xmm3,   3 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1	# LN walks C right-to-left
#endif

	leal	(LDC, LDC, 2), %eax	# 3*LDC for the fourth column

	/* ... and into the four C columns. */
#if defined(LN) || defined(LT)
	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 0 * SIZE(CO1, LDC, 1)
	movsd	%xmm1, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm1, 0 * SIZE(CO1, %eax, 1)
#else
	movsd	%xmm0, 0 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO1, LDC, 1)
	movsd	%xmm2, 0 * SIZE(CO1, LDC, 2)
	movsd	%xmm3, 0 * SIZE(CO1, %eax, 1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA	# advance past remaining k
	leal	(BB, %eax, 4), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
493
.L20:
	/* Main i-loop over pairs of rows: 2x4 micro-tiles. */
	movl	M,  %ebx
	sarl	$1, %ebx	# i = (m >> 1)
	jle	.L29
	ALIGN_4

.L11:
#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG	# step A panel back by two rows
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	# skip KK solved 2-element columns of A
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$2 + BASE_SHIFT, %eax	# skip KK*4 solved elements of B
	addl	%eax, BB
#endif

	/* Preload operands and clear accumulators for the 2x4 tile. */
	movapd	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movddup	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movddup	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

	leal	(LDC, LDC, 2), %eax	# 3*LDC

	/* Prime the four destination C columns in cache. */
#ifdef LN
	prefetchnta	-2 * SIZE(CO1)
	prefetchnta	-2 * SIZE(CO1, LDC, 1)
	prefetchnta	-2 * SIZE(CO1, LDC, 2)
	prefetchnta	-2 * SIZE(CO1, %eax, 1)
#else
	prefetchnta	 2 * SIZE(CO1)
	prefetchnta	 2 * SIZE(CO1, LDC, 1)
	prefetchnta	 2 * SIZE(CO1, LDC, 2)
	prefetchnta	 2 * SIZE(CO1, %eax, 1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax	# main loop is unrolled 8x in k
	je	.L15
	ALIGN_4
554
.L12:
	/* 2x4 GEMM update, unrolled 8x over k: packed A pair (movapd)
	   times broadcast B scalars (movddup); xmm4..xmm7 hold the
	   four column accumulators.  */
	mulpd	 %xmm0, %xmm2
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	addpd	 %xmm2, %xmm4
	movddup	 1 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 3 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	 4 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 5 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	 6 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 7 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 4 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	16 * SIZE(BB), %xmm2	# preload for next iteration
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	 9 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm5
	movddup	10 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	11 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	movapd	 6 * SIZE(AA), %xmm0
	addpd	 %xmm3, %xmm7
	movddup	12 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	13 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm5
	movddup	14 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	15 * SIZE(BB), %xmm3
	mulpd	 %xmm0, %xmm3
	movapd	16 * SIZE(AA), %xmm0
	addpd	 %xmm3, %xmm7
	movddup	24 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	17 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	18 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	19 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	movapd	10 * SIZE(AA), %xmm1
	addpd	 %xmm2, %xmm7
	movddup	20 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	21 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	22 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	23 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm2
	movapd	12 * SIZE(AA), %xmm1
	addpd	 %xmm2, %xmm7
	movddup	32 * SIZE(BB), %xmm2
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	25 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm5
	movddup	26 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	27 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	14 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm7
	movddup	28 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	29 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm5
	movddup	30 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	31 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	24 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm7
	movddup	40 * SIZE(BB), %xmm3

	addl   $32 * SIZE, BB
	addl   $16 * SIZE, AA
	decl   %eax
	jne    .L12
	ALIGN_4

.L15:
	/* k remainder: up to 7 leftover iterations, one k per pass. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je .L18
	ALIGN_3

.L16:
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 1 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 3 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	 4 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L16
	ALIGN_4
700
.L18:
	/* 2x4 triangular solve: rewind pointers, subtract the GEMM
	   update from the packed RHS, apply the inverted A diagonal
	   (LN/LT: 2x2 A triangle) or substitute through the 4x4 B
	   triangle (RN forward, RT backward), then store to the packed
	   panel and to C.  */
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax	# step back two rows of A
#else
	subl	$4, %eax	# step back four columns of B
#endif

	movl	AORIG, AA

	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 4), BB
#endif

#if defined(LN) || defined(LT)
	/* Transpose accumulators from column to row order. */
	movapd	 %xmm4, %xmm0
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm0

	movapd	 %xmm6, %xmm1
	unpcklpd %xmm7, %xmm6
	unpckhpd %xmm7, %xmm1

	movapd	 0 * SIZE(BB), %xmm2
	movapd	 2 * SIZE(BB), %xmm5
	movapd	 4 * SIZE(BB), %xmm3
	movapd	 6 * SIZE(BB), %xmm7

	subpd	%xmm4,  %xmm2
	subpd	%xmm6,  %xmm5
	subpd	%xmm0,  %xmm3
	subpd	%xmm1,  %xmm7
#else
	movapd	 0 * SIZE(AA), %xmm0
	movapd	 2 * SIZE(AA), %xmm1
	movapd	 4 * SIZE(AA), %xmm2
	movapd	 6 * SIZE(AA), %xmm3

	subpd	%xmm4, %xmm0
	subpd	%xmm5, %xmm1
	subpd	%xmm6, %xmm2
	subpd	%xmm7, %xmm3
#endif

#ifdef LN
	/* Backward substitution through the 2x2 A triangle. */
	movddup	 3 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm3
	mulpd	 %xmm4, %xmm7

	movddup	 2 * SIZE(AA), %xmm4
	movapd	 %xmm4, %xmm6
	mulpd	 %xmm3, %xmm4
	subpd	 %xmm4, %xmm2
	mulpd	 %xmm7, %xmm6
	subpd	 %xmm6, %xmm5

	movddup	 0 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm2
	mulpd	 %xmm4, %xmm5

#endif

#ifdef LT
	/* Forward substitution through the 2x2 A triangle. */
	movddup	 0 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm2
	mulpd	 %xmm4, %xmm5

	movddup	 1 * SIZE(AA), %xmm4
	movapd	 %xmm4, %xmm6
	mulpd	 %xmm2, %xmm4
	subpd	 %xmm4, %xmm3
	mulpd	 %xmm5, %xmm6
	subpd	 %xmm6, %xmm7

	movddup	 3 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm3
	mulpd	 %xmm4, %xmm7
#endif

#ifdef RN
	/* Forward substitution through the upper 4x4 B triangle. */
	movddup	 0 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm0
	movddup	 1 * SIZE(BB), %xmm4
	mulpd	 %xmm0, %xmm4
	subpd	 %xmm4, %xmm1
	movddup	 2 * SIZE(BB), %xmm4
	mulpd	 %xmm0, %xmm4
	subpd	 %xmm4, %xmm2
	movddup	 3 * SIZE(BB), %xmm4
	mulpd	 %xmm0, %xmm4
	subpd	 %xmm4, %xmm3

	movddup	 5 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm1
	movddup	 6 * SIZE(BB), %xmm4
	mulpd	 %xmm1, %xmm4
	subpd	 %xmm4, %xmm2
	movddup	 7 * SIZE(BB), %xmm4
	mulpd	 %xmm1, %xmm4
	subpd	 %xmm4, %xmm3

	movddup	10 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm2
	movddup	11 * SIZE(BB), %xmm4
	mulpd	 %xmm2, %xmm4
	subpd	 %xmm4, %xmm3

	movddup	15 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm3
#endif

#ifdef RT
	/* Backward substitution through the lower 4x4 B triangle. */
	movddup	15 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm3
	movddup	14 * SIZE(BB), %xmm4
	mulpd	 %xmm3, %xmm4
	subpd	 %xmm4, %xmm2
	movddup	13 * SIZE(BB), %xmm4
	mulpd	 %xmm3, %xmm4
	subpd	 %xmm4, %xmm1
	movddup	12 * SIZE(BB), %xmm4
	mulpd	 %xmm3, %xmm4
	subpd	 %xmm4, %xmm0

	movddup	10 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm2
	movddup	 9 * SIZE(BB), %xmm4
	mulpd	 %xmm2, %xmm4
	subpd	 %xmm4, %xmm1
	movddup	 8 * SIZE(BB), %xmm4
	mulpd	 %xmm2, %xmm4
	subpd	 %xmm4, %xmm0

	movddup	 5 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm1
	movddup	 4 * SIZE(BB), %xmm4
	mulpd	 %xmm1, %xmm4
	subpd	 %xmm4, %xmm0

	movddup	 0 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm0
#endif

	/* Store solved values into the packed panel ... */
#if defined(LN) || defined(LT)
	movapd	%xmm2,   0 * SIZE(BB)
	movapd	%xmm5,   2 * SIZE(BB)
	movapd	%xmm3,   4 * SIZE(BB)
	movapd	%xmm7,   6 * SIZE(BB)
#else
	movapd	%xmm0,   0 * SIZE(AA)
	movapd	%xmm1,   2 * SIZE(AA)
	movapd	%xmm2,   4 * SIZE(AA)
	movapd	%xmm3,   6 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1	# LN walks C right-to-left
#endif

	leal	(LDC, LDC, 2), %eax	# 3*LDC

	/* ... and into the four C columns (two rows each). */
#if defined(LN) || defined(LT)
	movsd	%xmm2, 0 * SIZE(CO1)
	movsd	%xmm3, 1 * SIZE(CO1)
	movhpd	%xmm2, 0 * SIZE(CO1, LDC, 1)
	movhpd	%xmm3, 1 * SIZE(CO1, LDC, 1)
	movsd	%xmm5, 0 * SIZE(CO1, LDC, 2)
	movsd	%xmm7, 1 * SIZE(CO1, LDC, 2)
	movhpd	%xmm5, 0 * SIZE(CO1, %eax, 1)
	movhpd	%xmm7, 1 * SIZE(CO1, %eax, 1)
#else
	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO1, LDC, 1)
	movhpd	%xmm1, 1 * SIZE(CO1, LDC, 1)
	movsd	%xmm2, 0 * SIZE(CO1, LDC, 2)
	movhpd	%xmm2, 1 * SIZE(CO1, LDC, 2)
	movsd	%xmm3, 0 * SIZE(CO1, %eax, 1)
	movhpd	%xmm3, 1 * SIZE(CO1, %eax, 1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	# advance past remaining k
	leal	(BB, %eax, 4), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L11
	ALIGN_4
913
.L29:
	/* End of one 4-column block: advance B past this block's
	   packed data and update KK for the next block.  */
#ifdef LN
       movl	K, %eax
       leal	(, %eax, SIZE), %eax
       leal 	(B, %eax, 4), B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B	# BB already points past the consumed panel
#endif

#ifdef RN
	addl	$4, KK
#endif

#ifdef RT
	subl	$4, KK
#endif

	decl	J			# j --
	jg	.L10
	ALIGN_4
936
.L30:
	/* N&2 column pair: same structure as the 4-column block but with
	   two B columns per k step.  Handle the M&1 edge row first.  */
	testl	$2, N
	je	.L60

#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax	# step B back by K*2 elements
	subl	%eax, B
#endif

	leal	(, LDC, 2), %eax	# byte span of 2 C columns

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  %ebx
	testl	$1, %ebx	# if (m & 1): handle the single edge row
	jle	.L50

#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG	# step A panel back by one row
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax	# skip KK*2 solved elements of B
	addl	%eax, BB
#endif

	/* Preload operands and clear accumulators for the 1x2 tile. */
	movddup	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movddup	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movapd	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movapd	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$4, %eax	# main loop is unrolled 16x in k
	je	.L55
	ALIGN_4
1017
.L52:
	/* 1x2 GEMM update, unrolled 16x over k: broadcast A element
	   times one packed B pair per k step.  */
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm0, %xmm2
	movddup	 1 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm4
	mulpd	 2 * SIZE(BB), %xmm0
	movapd	16 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movddup	 2 * SIZE(AA), %xmm0
	mulpd	 4 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm6
	movddup	 3 * SIZE(AA), %xmm0
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm7
	movddup	 4 * SIZE(AA), %xmm0
	mulpd	 %xmm0, %xmm3
	movddup	 5 * SIZE(AA), %xmm0
	addpd	 %xmm3, %xmm4
	mulpd	10 * SIZE(BB), %xmm0
	movapd	24 * SIZE(BB), %xmm3
	addpd	 %xmm0, %xmm5
	movddup	 6 * SIZE(AA), %xmm0
	mulpd	12 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm6
	movddup	 7 * SIZE(AA), %xmm0
	mulpd	14 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm7
	movddup	16 * SIZE(AA), %xmm0	# preload for next iteration
	mulpd	 %xmm1, %xmm2
	movddup	 9 * SIZE(AA), %xmm1
	addpd	 %xmm2, %xmm4
	mulpd	18 * SIZE(BB), %xmm1
	movapd	32 * SIZE(BB), %xmm2
	addpd	 %xmm1, %xmm5
	movddup	10 * SIZE(AA), %xmm1
	mulpd	20 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm6
	movddup	11 * SIZE(AA), %xmm1
	mulpd	22 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm7
	movddup	12 * SIZE(AA), %xmm1
	mulpd	 %xmm1, %xmm3
	movddup	13 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm4
	mulpd	26 * SIZE(BB), %xmm1
	movapd	40 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm5
	movddup	14 * SIZE(AA), %xmm1
	mulpd	28 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm6
	movddup	15 * SIZE(AA), %xmm1
	mulpd	30 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm7
	movddup	24 * SIZE(AA), %xmm1	# preload for next iteration

	addl   $16 * SIZE, AA
	addl   $32 * SIZE, BB
	decl   %eax
	jne    .L52
	ALIGN_4

.L55:
	/* k remainder: up to 15 leftover iterations. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$15, %eax		# if (k & 15)
	BRANCH
	je .L58

.L56:
	mulpd	 %xmm0, %xmm2
	movddup	 1 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm4
	movapd	 2 * SIZE(BB), %xmm2

	addl	$1 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L56
	ALIGN_4
1101
.L58:
	/* 1x2 triangular solve: fold accumulators, subtract from the
	   packed RHS, substitute through the 2x2 B triangle (RN
	   forward, RT backward), store to the panel and to C.  */
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax	# step back one row of A
#else
	subl	$2, %eax	# step back two columns of B
#endif

	movl	AORIG, AA

	leal	(, %eax, SIZE), %eax
	addl	%eax, AA
	leal	(B,  %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
	movapd	 0 * SIZE(BB), %xmm0

	subpd	%xmm4,  %xmm0
#else
	movapd	 0 * SIZE(AA), %xmm1

	subpd	%xmm4, %xmm1

	movapd	       %xmm1, %xmm0
	unpckhpd       %xmm1, %xmm1
#endif

#ifdef LN
	/* 1x1 A triangle: scale by the stored reciprocal diagonal. */
	movddup	 0 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm0
#endif

#ifdef LT
	movddup	 0 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm0
#endif

#ifdef RN
	/* Forward substitution through the upper 2x2 B triangle. */
	movsd	 0 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	 1 * SIZE(BB), %xmm4
	mulsd	 %xmm0, %xmm4
	subsd	 %xmm4, %xmm1

	movsd	 3 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm1
#endif

#ifdef RT
	/* Backward substitution through the lower 2x2 B triangle. */
	movsd	 3 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm1

	movsd	 2 * SIZE(BB), %xmm4
	mulsd	 %xmm1, %xmm4
	subsd	 %xmm4, %xmm0

	movsd	 0 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm0,   0 * SIZE(BB)
#else
	movsd	%xmm0,   0 * SIZE(AA)
	movsd	%xmm1,   1 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1	# LN walks C right-to-left
#endif

	/* Write the two C columns. */
#if defined(LN) || defined(LT)
	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 0 * SIZE(CO1, LDC, 1)
#else
	movsd	%xmm0, 0 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO1, LDC, 1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA	# advance past remaining k
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4
1214
.L50:
	/* i-loop over pairs of rows: 2x2 micro-tiles. */
	movl	M,  %ebx
	sarl	$1, %ebx	# i = (m >> 1)
	jle	.L59
	ALIGN_4

.L41:
#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG	# step A panel back by two rows
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + BASE_SHIFT, %eax	# skip KK*2 solved elements of B
	addl	%eax, BB
#endif

	/* Preload operands and clear accumulators for the 2x2 tile. */
	movapd	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movddup	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movddup	 8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

	/* Prime the two destination C columns in cache. */
#ifdef LN
	prefetchnta	-2 * SIZE(CO1)
	prefetchnta	-2 * SIZE(CO1, LDC, 1)
#else
	prefetchnta	 2 * SIZE(CO1)
	prefetchnta	 2 * SIZE(CO1, LDC, 1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax	# main loop is unrolled 8x in k
	je	.L45
	ALIGN_4
1269
.L42:
	/* 2x2 GEMM update, unrolled 8x over k: packed A pair times
	   broadcast B scalars; xmm4/xmm5 (and xmm6/xmm7) accumulate
	   the two columns and are folded at .L48.  */
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 1 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 3 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 4 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	 4 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 5 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 6 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm5
	movddup	 6 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 7 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	16 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm7
	movddup	16 * SIZE(BB), %xmm2	# preload for next iteration
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	 9 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	10 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm5
	movddup	10 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	11 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	12 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm7
	movddup	12 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm4
	movddup	13 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	14 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm5
	movddup	14 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	addpd	 %xmm3, %xmm6
	movddup	15 * SIZE(BB), %xmm3
	mulpd	 %xmm1, %xmm3
	movapd	24 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm7
	movddup	24 * SIZE(BB), %xmm3	# preload for next iteration

	addl   $16 * SIZE, AA
	addl   $16 * SIZE, BB
	decl   %eax
	jne    .L42
	ALIGN_4

.L45:
	/* k remainder: up to 7 leftover iterations. */
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je .L48
	ALIGN_3

.L46:
	mulpd	 %xmm0, %xmm2
	addpd	 %xmm2, %xmm4
	movddup	 1 * SIZE(BB), %xmm2
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L46
	ALIGN_4
1361
.L48:
	/* 2x2 triangular solve: fold accumulators, subtract from the
	   packed RHS, substitute through the 2x2 triangle of A (LN/LT)
	   or of B (RN forward, RT backward), store to panel and C.  */
	addpd	%xmm6, %xmm4
	addpd	%xmm7, %xmm5

#if defined(LN) || defined(RT)
	movl	KK, %eax
	/* Both arms subtract 2 here: mr == nr == 2 for this tile. */
#ifdef LN
	subl	$2, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA

	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 2), BB
#endif

#if defined(LN) || defined(LT)
	/* Transpose accumulators from column to row order. */
	movapd	 %xmm4, %xmm0
	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm0

	movapd	 0 * SIZE(BB), %xmm2
	movapd	 2 * SIZE(BB), %xmm3

	subpd	%xmm4,  %xmm2
	subpd	%xmm0,  %xmm3
#else
	movapd	 0 * SIZE(AA), %xmm0
	movapd	 2 * SIZE(AA), %xmm1

	subpd	%xmm4, %xmm0
	subpd	%xmm5, %xmm1
#endif

#ifdef LN
	/* Backward substitution through the 2x2 A triangle. */
	movddup	 3 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm3

	movddup	 2 * SIZE(AA), %xmm4
	mulpd	 %xmm3, %xmm4
	subpd	 %xmm4, %xmm2

	movddup	 0 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm2

#endif

#ifdef LT
	/* Forward substitution through the 2x2 A triangle. */
	movddup	 0 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm2

	movddup	 1 * SIZE(AA), %xmm4
	mulpd	 %xmm2, %xmm4
	subpd	 %xmm4, %xmm3

	movddup	 3 * SIZE(AA), %xmm4
	mulpd	 %xmm4, %xmm3
#endif

#ifdef RN
	/* Forward substitution through the upper 2x2 B triangle. */
	movddup	 0 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm0

	movddup	 1 * SIZE(BB), %xmm4
	mulpd	 %xmm0, %xmm4
	subpd	 %xmm4, %xmm1

	movddup	 3 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm1
#endif

#ifdef RT
	/* Backward substitution through the lower 2x2 B triangle. */
	movddup	 3 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm1

	movddup	 2 * SIZE(BB), %xmm4
	mulpd	 %xmm1, %xmm4
	subpd	 %xmm4, %xmm0

	movddup	 0 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movapd	%xmm2,   0 * SIZE(BB)
	movapd	%xmm3,   2 * SIZE(BB)
#else
	movapd	%xmm0,   0 * SIZE(AA)
	movapd	%xmm1,   2 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1	# LN walks C right-to-left
#endif

	/* Write the two C columns (two rows each). */
#if defined(LN) || defined(LT)
	movsd	%xmm2, 0 * SIZE(CO1)
	movsd	%xmm3, 1 * SIZE(CO1)
	movhpd	%xmm2, 0 * SIZE(CO1, LDC, 1)
	movhpd	%xmm3, 1 * SIZE(CO1, LDC, 1)
#else
	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO1, LDC, 1)
	movhpd	%xmm1, 1 * SIZE(CO1, LDC, 1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	# advance past remaining k
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L41
	ALIGN_4
1501
/* .L59: bookkeeping after finishing a pair of N columns —
   advance B past the consumed panel and step KK by the N unroll (2). */
.L59:
#ifdef LN
       movl	K, %eax
       leal	(, %eax, SIZE), %eax
       leal 	(B, %eax, 2), B		# B += K * 2 values (one 2-wide panel)
#endif

#if defined(LT) || defined(RN)
	movl	BB, B			# BB already walked to the panel end
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif
	ALIGN_4
1521
/* .L60: handle the leftover single N column (N & 1).  Set up A/B/C
   pointers and the KK offset for a 1-wide panel, then dispatch on M. */
.L60:
	testl	$1, N
	je	.L999			# no odd column -> function epilogue

#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG		# LN/RT rewind from the panel end
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, B			# back B up over one 1-wide panel
#endif

#ifdef RT
	subl	LDC, C			# RT walks C columns right-to-left
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  %ebx
	testl	$1, %ebx	# m & 1: odd leftover row first
	jle	.L80
1561
/* 1x1 cell (M & 1, N & 1): scalar dot product over K accumulated with
   SIMD, then a trivial one-element triangular solve. */
#ifdef LN
       movl	K, %eax
       sall	$BASE_SHIFT, %eax
       subl	%eax, AORIG		# rewind AORIG by one K-long row
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA	# AA -> first unfinished k element
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, BB
#endif

	/* Preload A/B streams and clear the four partial accumulators. */
	movsd	 0 * SIZE(AA), %xmm0
	movhpd	 1 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movsd	 8 * SIZE(AA), %xmm1
	movhpd	 9 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movsd	 0 * SIZE(BB), %xmm2
	movhpd	 1 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movsd	 8 * SIZE(BB), %xmm3
	movhpd	 9 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax		# loop length: solved prefix
#else
	movl	K, %eax
	subl	KK, %eax		# loop length: unsolved suffix
#endif
	sarl	$4, %eax		# main loop: 16 k steps per pass
	je	.L85
	ALIGN_4

/* .L82: unrolled dot-product loop — 16 multiply-adds per iteration,
   spread over 4 accumulators to hide addpd latency. */
.L82:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm0, %xmm2
	movapd	 2 * SIZE(AA), %xmm0
	addpd	 %xmm2, %xmm4
	mulpd	 2 * SIZE(BB), %xmm0
	movapd	16 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm5
	movapd	 4 * SIZE(AA), %xmm0
	mulpd	 4 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm6
	movapd	 6 * SIZE(AA), %xmm0
	mulpd	 6 * SIZE(BB), %xmm0
	addpd	 %xmm0, %xmm7
	movapd	16 * SIZE(AA), %xmm0
	mulpd	 %xmm1, %xmm3
	movapd	10 * SIZE(AA), %xmm1
	addpd	 %xmm3, %xmm4
	mulpd	10 * SIZE(BB), %xmm1
	movapd	24 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm5
	movapd	12 * SIZE(AA), %xmm1
	mulpd	12 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm6
	movapd	14 * SIZE(AA), %xmm1
	mulpd	14 * SIZE(BB), %xmm1
	addpd	 %xmm1, %xmm7
	movapd	24 * SIZE(AA), %xmm1

	addl   $16 * SIZE, AA
	addl   $16 * SIZE, BB
	decl   %eax
	jne    .L82
	ALIGN_4

/* .L85: scalar remainder loop for the k % 16 leftover elements. */
.L85:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$15, %eax		# k & 15 remainder
	BRANCH
	je .L88

.L86:
	mulsd	 %xmm0, %xmm2
	movsd	 1 * SIZE(AA), %xmm0
	addsd	 %xmm2, %xmm4
	movsd	 1 * SIZE(BB), %xmm2

	addl	$1 * SIZE, AA
	addl	$1 * SIZE, BB
	decl	%eax
	jg	.L86
	ALIGN_4

/* .L88: reduce the accumulators to a single scalar dot product,
   then solve the 1x1 system (one multiply by the diagonal). */
.L88:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

	haddpd	%xmm4, %xmm4		# low lane = full dot product

#if defined(LN) || defined(RT)
	/* Rewind AA/BB to the diagonal element (unroll width 1 for both
	   cases, hence the identical #ifdef branches). */
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA

	leal	(, %eax, SIZE), %eax
	addl	%eax, AA
	leal	(B,  %eax, 1), BB
#endif

#if defined(LN) || defined(LT)
	movsd	 0 * SIZE(BB), %xmm0
	subsd	%xmm4,  %xmm0		# rhs - dot
#else
	movsd	 0 * SIZE(AA), %xmm0
	subsd	%xmm4, %xmm0
#endif

/* All four variants reduce to one multiply by the (pre-inverted —
   NOTE(review): confirm in the packing routine) diagonal entry. */
#ifdef LN
	movsd	 0 * SIZE(AA), %xmm4
	mulsd	 %xmm4, %xmm0
#endif

#ifdef LT
	movsd	 0 * SIZE(AA), %xmm4
	mulsd	 %xmm4, %xmm0
#endif

#ifdef RN
	movsd	 0 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm0
#endif

#ifdef RT
	movsd	 0 * SIZE(BB), %xmm4
	mulsd	 %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movsd	%xmm0,   0 * SIZE(BB)	# store back into the packed buffer
#else
	movsd	%xmm0,   0 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movsd	%xmm0, 0 * SIZE(CO1)
#else
	movsd	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	/* Skip AA/BB over the K-KK tail (one value per k step). */
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	addl	%eax, AA
	addl	%eax, BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG		# advance one K-long A row
#endif
	ALIGN_4
1755
1756
/* .L80: loop over the M/2 row pairs of the single leftover N column. */
.L80:
	movl	M,  %ebx
	sarl	$1, %ebx	# i = (m >> 1): number of 2-row tiles
	jle	.L89
	ALIGN_4
1762
/* .L71: 2x1 tile (two A rows, one B column): SIMD dot products over K,
   then a 2x2-triangular solve against one right-hand-side column. */
.L71:
#ifdef LN
       movl	K, %eax
       sall	$1 + BASE_SHIFT, %eax
       subl	%eax, AORIG		# rewind AORIG by one 2-row panel
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	# AA -> first unfinished k element
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, BB
#endif

	/* Preload streams and clear the four partial accumulators. */
	movapd	 0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	 8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movddup	 0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movddup	 4 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifdef LN
	prefetchnta	-2 * SIZE(CO1)	# LN writes leftwards of CO1
#else
	prefetchnta	 2 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax		# loop length: solved prefix
#else
	movl	K, %eax
	subl	KK, %eax		# loop length: unsolved suffix
#endif
	sarl	$3, %eax		# main loop: 8 k steps per pass
	je	.L75
	ALIGN_4

/* .L72: unrolled loop — 8 k steps, broadcast B element against a
   2-wide A vector, 4 accumulators to hide addpd latency. */
.L72:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	 %xmm2, %xmm0
	movddup	 1 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm4
	movapd	16 * SIZE(AA), %xmm0
	mulpd	 2 * SIZE(AA), %xmm2
	addpd	 %xmm2, %xmm5
	movddup	 2 * SIZE(BB), %xmm2
	mulpd	 4 * SIZE(AA), %xmm2
	addpd	 %xmm2, %xmm6
	movddup	 3 * SIZE(BB), %xmm2
	mulpd	 6 * SIZE(AA), %xmm2
	addpd	 %xmm2, %xmm7
	movddup	 8 * SIZE(BB), %xmm2
	mulpd	 %xmm3, %xmm1
	movddup	 5 * SIZE(BB), %xmm3
	addpd	 %xmm1, %xmm4
	movapd	24 * SIZE(AA), %xmm1
	mulpd	10 * SIZE(AA), %xmm3
	addpd	 %xmm3, %xmm5
	movddup	 6 * SIZE(BB), %xmm3
	mulpd	12 * SIZE(AA), %xmm3
	addpd	 %xmm3, %xmm6
	movddup	 7 * SIZE(BB), %xmm3
	mulpd	14 * SIZE(AA), %xmm3
	addpd	 %xmm3, %xmm7
	movddup	12 * SIZE(BB), %xmm3

	addl   $16 * SIZE, AA
	addl   $ 8 * SIZE, BB
	decl   %eax
	jne    .L72
	ALIGN_4

/* .L75: remainder loop for the k % 8 leftover steps. */
.L75:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# k & 7 remainder
	BRANCH
	je .L78
	ALIGN_3

.L76:
	mulpd	 %xmm2, %xmm0
	movddup	 1 * SIZE(BB), %xmm2
	addpd	 %xmm0, %xmm4
	movapd	 2 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$1 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4

/* .L78: fold accumulators, then solve the 2x1 system. */
.L78:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	/* Rewind AA/BB to the diagonal block: KK minus the M unroll (2)
	   for LN, minus the N unroll (1) for RT. */
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA

	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 1), BB
#endif

#if defined(LN) || defined(LT)
	/* rhs = packedB - product; split into two scalars (xmm0 low row,
	   xmm1 high row) for the scalar substitution below. */
	movapd	 0 * SIZE(BB), %xmm1

	subpd	%xmm4,  %xmm1

	movapd	%xmm1, %xmm0
	unpckhpd %xmm1, %xmm1
#else
	movapd	 0 * SIZE(AA), %xmm0

	subpd	%xmm4, %xmm0
#endif

#ifdef LN
	/* Backward substitution with the 2x2 triangular A block.
	   NOTE(review): diagonal is multiplied — presumably pre-inverted
	   by the packing stage; confirm there. */
	movsd	 3 * SIZE(AA), %xmm4
	mulsd	 %xmm4, %xmm1

	movsd	 2 * SIZE(AA), %xmm4
	mulsd	 %xmm1, %xmm4
	subsd	 %xmm4, %xmm0

	movsd	 0 * SIZE(AA), %xmm4
	mulsd	 %xmm4, %xmm0

#endif

#ifdef LT
	/* Forward substitution (transposed-A ordering). */
	movsd	 0 * SIZE(AA), %xmm4
	mulsd	 %xmm4, %xmm0

	movsd	 1 * SIZE(AA), %xmm4
	mulsd	 %xmm0, %xmm4
	subsd	 %xmm4, %xmm1

	movsd	 3 * SIZE(AA), %xmm4
	mulsd	 %xmm4, %xmm1
#endif

#ifdef RN
	/* 1x1 B block: scale both rows by the single diagonal entry. */
	movddup	 0 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm0
#endif

#ifdef RT
	movddup	 0 * SIZE(BB), %xmm4
	mulpd	 %xmm4, %xmm0
#endif

#if defined(LN) || defined(LT)
	movsd	%xmm0,   0 * SIZE(BB)	# store solved pair back to packed B
	movsd	%xmm1,   1 * SIZE(BB)
#else
	movapd	%xmm0,   0 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

#if defined(LN) || defined(LT)
	movsd	%xmm0, 0 * SIZE(CO1)
	movsd	%xmm1, 1 * SIZE(CO1)
#else
	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	/* Advance AA/BB past the K-KK tail (A: 2/step, B: 1/step). */
	movl	K,  %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	addl	%eax, BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG		# advance one 2-row A panel
#endif

	decl	%ebx			# i --
	jg	.L71
	ALIGN_4
1985
/* .L89: bookkeeping after the final single N column, then fall through
   to the shared function epilogue. */
.L89:
#ifdef LN
       movl	K, %eax
       leal	(B, %eax, SIZE), B	# B += K values (one 1-wide panel)
#endif

#if defined(LT) || defined(RN)
	movl	BB, B			# BB already walked to the panel end
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

/* .L999: restore the callee-saved registers pushed by the prologue
   (outside this view), release the local stack area, and return. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE
2015