/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

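/*
 * Annotation: double-complex TRSM kernel for 32-bit x86 (SSE2/SSE3),
 * processing one row of A against one or two columns of B per iteration.
 * The LN/LT/RN/RT macros select the left/right and (non-)transposed solve
 * variants, CONJ the conjugated forms.  SIZE and ZBASE_SHIFT come from
 * common.h; ZBASE_SHIFT is used throughout as log2 of the byte size of
 * one complex element, so "sall $ZBASE_SHIFT" turns an element count
 * into a byte offset.
 */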
#define STACK	16
#define ARGS	16

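/* Incoming arguments, addressed relative to %esp after the prologue has
   reserved ARGS bytes of scratch and pushed four registers (STACK bytes);
   the first argument therefore lives at 4 + STACK + ARGS(%esp).  The
   J, KK, KKK and AORIG slots further down are locals kept in the ARGS
   scratch area. */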
#define M	 4 + STACK + ARGS(%esp)
#define N	 8 + STACK + ARGS(%esp)
#define K	12 + STACK + ARGS(%esp)
#define ALPHA_R	16 + STACK + ARGS(%esp)
#define ALPHA_I	24 + STACK + ARGS(%esp)
#define A	32 + STACK + ARGS(%esp)
#define ARG_B	36 + STACK + ARGS(%esp)
#define C	40 + STACK + ARGS(%esp)
#define ARG_LDC	44 + STACK + ARGS(%esp)
#define OFFSET	48 + STACK + ARGS(%esp)

#define J	 0 + STACK(%esp)
#define KK	 4 + STACK(%esp)
#define KKK	 8 + STACK(%esp)
#define AORIG	12 + STACK(%esp)

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef NEHALEM
#define PREFETCH	prefetcht1
#define PREFETCHSIZE 84
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHSIZE (8 * 2)
#endif

#define AA	%edx
#define BB	%ecx
#define LDC	%ebp
#define B	%edi
#define CO1	%esi

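/* ADD1/ADD2 are both plain addpd: the inner loops accumulate the a*b and
   a*swap(b) partial products in separate registers, and the sign needed
   for the complex product is applied after the loop with an XOR mask
   (pcmpeqb/psllq below) instead of addsub-style instructions. */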
#define ADD1	  addpd
#define ADD2	  addpd

	PROLOGUE

	subl	$ARGS, %esp

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	ARG_B,   B
	movl	ARG_LDC, LDC
	movl	OFFSET, %eax
#ifdef RN
	negl	%eax
#endif
	movl	%eax, KK

	movl	M,  %ebx
	testl	%ebx, %ebx
	jle	.L999

	subl	$-16 * SIZE, A
	subl	$-16 * SIZE, B

	sall	$ZBASE_SHIFT, LDC

#ifdef LN
       movl	M, %eax
       sall	$ZBASE_SHIFT, %eax
       addl	%eax, C
       imull	K, %eax
       addl	%eax, A
#endif

#ifdef RT
       movl	N, %eax
       sall	$ZBASE_SHIFT, %eax
       imull	K, %eax
       addl	%eax, B

       movl	N, %eax
       imull	LDC, %eax
       addl	%eax, C
#endif

#ifdef RT
       movl	N, %eax
       subl	OFFSET, %eax
       movl	%eax, KK
#endif

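/* If N is odd, the single leftover column of B is solved first by the
   .L110 loop below; the remaining columns are then handled two at a time
   starting at .L01. */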
	movl	N, %eax
	testl	$1, %eax
	jle	.L100

#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

#ifdef RT
	subl	LDC, C
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  %ebx
	ALIGN_4

.L110:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movaps	-16 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-16 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

	pxor	%xmm4, %xmm4
#ifdef LN
	prefetcht0	-2 * SIZE(CO1)
#else
	prefetcht0	 1 * SIZE(CO1)
#endif
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L115
	ALIGN_4

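/* Rank-1 complex update, unrolled 8 times over k: each step loads one
   complex element of A (xmm0) and of B (xmm1), builds the swapped copy of
   B with pshufd $0x4e, multiplies both by A, and accumulates into the
   xmm4/xmm5 and xmm6/xmm7 pairs, which are merged after the loop. */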
.L112:
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AA)

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	-14 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-12 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm6
	movaps	-12 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm7

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-10 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	-10 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -8 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm6
	movaps	 -8 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm7

	PREFETCH (PREFETCHSIZE +  8) * SIZE(AA)

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -6 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	 -6 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -4 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm6
	movaps	 -4 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm7

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	 -2 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	 -2 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	  0 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm6
	movaps	  0 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm7

	subl   $-16 * SIZE, AA
	subl   $-16 * SIZE, BB

	subl   $1, %eax
	jne    .L112
	ALIGN_4

.L115:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je .L118
	ALIGN_4

.L116:
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AA), %xmm0

	ADD1	%xmm1, %xmm4
	movaps	-14 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5

	addl	$2 * SIZE, AA
	addl	$2 * SIZE, BB
	decl	%eax
	jg	.L116
	ALIGN_4

.L118:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), BB
#endif

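/* Merge the two accumulator pairs and build the sign mask: pcmpeqb plus
   psllq $63 leaves 0x8000000000000000 in both quadwords of xmm1.  The
   pshufd/shufps that follow keep the sign bit only in the lane that must
   be negated before haddpd folds the partial products into (real, imag);
   which lane that is differs between the plain and CONJ variants. */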
	addpd	%xmm6, %xmm4
	pcmpeqb	%xmm1, %xmm1
	addpd	%xmm7, %xmm5
	psllq	$63,   %xmm1

#ifndef CONJ
	pshufd	$0x40, %xmm1, %xmm0
	shufps	$0x04, %xmm1, %xmm1

	pxor	%xmm0, %xmm4
#else
#if defined(LN) || defined(LT)
	pshufd	$0x40, %xmm1, %xmm0
#else
	pshufd	$0x04, %xmm1, %xmm0
#endif
	shufps	$0x40, %xmm1, %xmm1

	pxor	%xmm0, %xmm5
#endif

	haddpd	%xmm5, %xmm4

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BB), %xmm5
	subpd	%xmm4,  %xmm5
#else
	movapd	-16 * SIZE(AA), %xmm5
	subpd	%xmm4,  %xmm5
#endif

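/* Divide by the diagonal element of the triangular factor.  The packed
   panel is assumed to hold the already-inverted diagonal entry (there is
   no divide instruction anywhere in this kernel), so the solve reduces to
   a complex multiply: duplicate its real and imaginary parts with
   movddup, multiply the right-hand side and its swapped copy, and
   recombine using the same sign mask as above. */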
#if defined(LN) || defined(LT)
	movddup	-16 * SIZE(AA), %xmm2
	movddup	-15 * SIZE(AA), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm1, %xmm4

	mulpd	 %xmm2, %xmm5
	mulpd	 %xmm3, %xmm4

	addpd	 %xmm4, %xmm5
#endif

#if defined(RN) || defined(RT)
	movddup	-16 * SIZE(BB), %xmm2
	movddup	-15 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm1, %xmm4

	mulpd	 %xmm2, %xmm5
	mulpd	 %xmm3, %xmm4

	addpd	 %xmm4, %xmm5
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

	movlpd	%xmm5,   0 * SIZE(CO1)
	movhpd	%xmm5,   1 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm5, -16 * SIZE(BB)
#else
	movapd	%xmm5, -16 * SIZE(AA)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
	addl	%eax, BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L110

#ifdef LN
       movl	K, %eax
       sall	$ZBASE_SHIFT, %eax
       addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

.L100:
	movl	N, %eax
	sarl	$1, %eax
	movl	%eax, J			# j = n >> 1
	jle	.L999
	ALIGN_4

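/* Main path: iterate over pairs of B columns (J = N >> 1).  The per-row
   kernel at .L10 performs the same rank-1 complex update as above but
   keeps two B columns in flight (xmm1 and xmm3) and produces two results,
   stored at CO1 and CO1 + LDC. */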
.L01:
#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	subl	%eax, B
#endif

	leal	(, LDC, 2), %eax

#ifdef RT
	subl	%eax, C
#endif
	movl	C, CO1
#ifndef RT
	addl	%eax, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	M,  %ebx
	ALIGN_4

.L10:
#ifdef LN
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	addl	%eax, BB
#endif

	movaps	-16 * SIZE(AA), %xmm0
	pxor	%xmm2, %xmm2
	movaps	-16 * SIZE(BB), %xmm1
	pxor	%xmm3, %xmm3

#ifdef LN
	pxor	%xmm4, %xmm4
	prefetcht0     -2 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0     -2 * SIZE(CO1, LDC)
#else
	pxor	%xmm4, %xmm4
	prefetcht0	1 * SIZE(CO1)
	pxor	%xmm5, %xmm5
	prefetcht0	1 * SIZE(CO1, LDC)
#endif
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax
	je	.L15
	ALIGN_4

.L12:
	PREFETCH (PREFETCHSIZE +  0) * SIZE(AA)

	ADD1	%xmm3, %xmm6
	movaps	-14 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	-12 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	-14 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	-10 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	 -8 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	-12 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	 -6 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	 -4 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	-10 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	 -2 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	  0 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	 -8 * SIZE(AA), %xmm0

	PREFETCH (PREFETCHSIZE +  8) * SIZE(AA)

	ADD1	%xmm3, %xmm6
	movaps	  2 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	  4 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	 -6 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	  6 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	  8 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	 -4 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	 10 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	 12 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2
	movaps	 -2 * SIZE(AA), %xmm0

	ADD1	%xmm3, %xmm6
	movaps	 14 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	 16 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	subl   $-32 * SIZE, BB
	mulpd	%xmm0, %xmm2
	movaps	  0 * SIZE(AA), %xmm0

	subl   $-16 * SIZE, AA

	subl   $1, %eax
	jne    .L12
	ALIGN_4

.L15:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		# if (k & 7)
	BRANCH
	je .L18
	ALIGN_4

.L16:
	ADD1	%xmm3, %xmm6
	movaps	-14 * SIZE(BB), %xmm3
	ADD2	%xmm2, %xmm7
	pshufd	$0x4e, %xmm1, %xmm2
	mulpd	%xmm0, %xmm1
	mulpd	%xmm0, %xmm2

	ADD1	%xmm1, %xmm4
	movaps	-12 * SIZE(BB), %xmm1
	ADD2	%xmm2, %xmm5
	pshufd	$0x4e, %xmm3, %xmm2
	mulpd	%xmm0, %xmm3
	mulpd	%xmm0, %xmm2

	movaps	-14 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L16
	ALIGN_4

.L18:
#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax
#else
	subl	$2, %eax
#endif

	movl	AORIG, AA
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 2), BB
#endif

	ADD1	%xmm3, %xmm6
	pcmpeqb	%xmm1, %xmm1
	ADD2	%xmm2, %xmm7
	psllq	$63,   %xmm1

#ifndef CONJ
	pshufd	$0x40, %xmm1, %xmm0
	shufps	$0x04, %xmm1, %xmm1

	pxor	%xmm0, %xmm4
	pxor	%xmm0, %xmm6
#else
#if defined(LN) || defined(LT)
	pshufd	$0x40, %xmm1, %xmm0
#else
	pshufd	$0x04, %xmm1, %xmm0
#endif
	shufps	$0x40, %xmm1, %xmm1

	pxor	%xmm0, %xmm5
	pxor	%xmm0, %xmm7
#endif

	haddpd	%xmm5, %xmm4
	haddpd	%xmm7, %xmm6

#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(BB), %xmm5
	movapd	-14 * SIZE(BB), %xmm7

	subpd	%xmm4,  %xmm5
	subpd	%xmm6,  %xmm7
#else
	movapd	-16 * SIZE(AA), %xmm5
	movapd	-14 * SIZE(AA), %xmm7

	subpd	%xmm4,  %xmm5
	subpd	%xmm6,  %xmm7
#endif

#if defined(LN) || defined(LT)
	movddup	-16 * SIZE(AA), %xmm2
	movddup	-15 * SIZE(AA), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4
	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	 %xmm1, %xmm4
	xorpd	 %xmm1, %xmm6

	mulpd	 %xmm2, %xmm5
	mulpd	 %xmm3, %xmm4
	mulpd	 %xmm2, %xmm7
	mulpd	 %xmm3, %xmm6

	addpd	 %xmm4, %xmm5
	addpd	 %xmm6, %xmm7
#endif

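/* Right-side solves against the 2x2 diagonal block of the packed B panel.
   RN is a forward substitution: solve the first column, subtract the
   off-diagonal contribution from the second, then solve the second.  RT
   does the same steps in reverse order.  Each elementary solve is the
   multiply-by-(pre-inverted)-diagonal pattern described above. */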
#ifdef RN
	movddup	-16 * SIZE(BB), %xmm2
	movddup	-15 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm1, %xmm4

	mulpd	 %xmm2, %xmm5
	mulpd	 %xmm3, %xmm4

	addpd	 %xmm4, %xmm5

	movddup	-14 * SIZE(BB), %xmm2
	movddup -13 * SIZE(BB), %xmm3

	movapd	 %xmm5, %xmm4
	pshufd	$0x4e, %xmm5, %xmm6

	xorpd	 %xmm1, %xmm6

	mulpd	 %xmm2, %xmm4
	mulpd	 %xmm3, %xmm6

	subpd	 %xmm4, %xmm7
	subpd	 %xmm6, %xmm7

	movddup	-10 * SIZE(BB), %xmm2
	movddup	 -9 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	 %xmm1, %xmm6

	mulpd	 %xmm2, %xmm7
	mulpd	 %xmm3, %xmm6

	addpd	 %xmm6, %xmm7
#endif

#ifdef RT
	movddup	-10 * SIZE(BB), %xmm2
	movddup	 -9 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	 %xmm1, %xmm6

	mulpd	 %xmm2, %xmm7
	mulpd	 %xmm3, %xmm6

	addpd	 %xmm6, %xmm7

	movddup	-12 * SIZE(BB), %xmm2
	movddup	-11 * SIZE(BB), %xmm3

	movapd	 %xmm7, %xmm4
	pshufd	$0x4e, %xmm7, %xmm6

	xorpd	 %xmm1, %xmm6

	mulpd	 %xmm2, %xmm4
	mulpd	 %xmm3, %xmm6

	subpd	 %xmm4, %xmm5
	subpd	 %xmm6, %xmm5

	movddup	-16 * SIZE(BB), %xmm2
	movddup	-15 * SIZE(BB), %xmm3

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	 %xmm1, %xmm4

	mulpd	 %xmm2, %xmm5
	mulpd	 %xmm3, %xmm4

	addpd	 %xmm4, %xmm5
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

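/* Store the solved pair: one complex result per column goes to the C tile
   (CO1 and CO1 + LDC) and is also written back into the packed panel
   (BB or AA depending on the variant) so that subsequent blocks of the
   triangular solve see the updated data. */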
	movlpd	%xmm5,   0 * SIZE(CO1)
	movhpd	%xmm5,   1 * SIZE(CO1)

	movlpd	%xmm7,   0 * SIZE(CO1, LDC)
	movhpd	%xmm7,   1 * SIZE(CO1, LDC)

#if defined(LN) || defined(LT)
	movapd	%xmm5,  -16 * SIZE(BB)
	movapd	%xmm7,  -14 * SIZE(BB)
#else
	movapd	%xmm5,  -16 * SIZE(AA)
	movapd	%xmm7,  -14 * SIZE(AA)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K,  %eax
	subl	KK, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AA
	leal	(BB, %eax, 2), BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$ZBASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			# i --
	jg	.L10
	ALIGN_4

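/* End of one column pair: .L99 advances B past the two solved columns and
   adjusts KK for the RN/RT variants before starting the next pair. */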
.L99:
#ifdef LN
       movl	K, %eax
       sall	$1 + ZBASE_SHIFT, %eax
       addl	%eax, B
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif

	decl	J			# j --
	jg	.L01
	ALIGN_4

.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE