1/*********************************************************************/
2/*                                                                   */
3/*             Optimized BLAS libraries                              */
4/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
5/*                                                                   */
6/* Copyright (c) The University of Texas, 2009. All rights reserved. */
7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
13/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
14/* Under no circumstances shall University be liable for incidental, */
15/* special, indirect, direct or consequential damages or loss of     */
16/* profits, interruption of business, or related expenses which may  */
17/* arise from use of Software or Documentation, including but not    */
18/* limited to those resulting from defects in Software and/or        */
19/* Documentation, or loss or inaccuracy of data of any kind.         */
20/*********************************************************************/
21
22#define ASSEMBLER
23#include "common.h"
24
/* Bytes between %esp and the return address once the entry code has pushed
 * its four 32-bit registers (4 * 4 = 16). ARGS is an extra displacement,
 * zero here. */
#define STACK	16
#define ARGS	 0

/* Stack-passed arguments, addressed relative to %esp after the pushes.
 * The leading 4 presumably skips the return address (i386 stack calling
 * convention) -- confirm against the PROLOGUE macro in common.h. */
#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

/* Register roles used throughout the routine:
 *   RET  - running count of elements examined in the index search; it is
 *          the value left in %eax when the routine returns
 *   M    - element count as passed in
 *   X    - base pointer of the vector as passed in
 *   INCX - stride; scaled by SIZE at entry so it can be added to a pointer
 *   I    - iteration counter of the unrolled loops
 *   MM   - working copy of the element count
 *   XX   - working pointer into the vector
 *   TEMP - scratch for movmskpd results and pointer rewinds
 *
 * NOTE: M and TEMP are both %ebx. This is safe only because every path
 * copies M into MM before its first write to TEMP and never reads M again
 * afterwards. */
#define RET	%eax
#define	M	%ebx
#define X	%ecx
#define INCX	%edx
#define I	%esi
#define MM	%ebp
#define XX	%edi
#define TEMP	%ebx

/* With USE_MIN the max instructions are textually replaced by their min
 * counterparts, so the same code body searches for the smallest value. */
#ifdef USE_MIN
#define maxpd	minpd
#define maxsd	minsd
#endif
45
46#include "l1param.h"
47
/*
 * Entry point (symbol name is supplied by the PROLOGUE macro).
 *
 * Two-pass search over a vector of double-precision values:
 *   pass 1 computes the extreme value (max, or min with USE_MIN; of the
 *          absolute values with USE_ABS) using four packed accumulators;
 *   pass 2 rescans from the start and returns, in RET, the 1-based
 *          position of the first element equal to that value.
 * Returns 0 when M <= 0 or INCX <= 0.
 */
	PROLOGUE

	/* Save the four registers this routine overwrites (matches STACK = 16). */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	/* Fetch the three stack arguments. */
	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX

#ifdef F_INTERFACE
	/* Under F_INTERFACE, M and INCX arrive as pointers; dereference them. */
	movl	(M), M
	movl	(INCX), INCX
#endif

	pxor	%xmm0, %xmm0
#ifdef USE_ABS
	/* Zero xmm7 first so the self-compare below works on a known value. */
	pxor	%xmm7, %xmm7
#endif
	/* RET = 0 is the result for the early exits below. */
	xor	RET, RET
	testl	M, M
	jle	.L999
	/* Scale the stride by SIZE so INCX can be added directly to XX. */
	leal	(, INCX, SIZE), INCX
	testl	INCX, INCX
	jle	.L999

	/* Working copies; M and X stay intact for the second pass. */
	movl	M, MM
	movl	X, XX

#ifdef USE_ABS
	/* xmm7 = all ones, then shifted right by one bit per quadword:
	 * a mask that clears the sign bit of each double (fabs via andpd). */
	cmpeqpd	%xmm7, %xmm7
	psrlq	$1, %xmm7
#endif

	/* Seed: consume the first element and broadcast it into both lanes of
	 * all four accumulators xmm0..xmm3. */
	movsd	(XX), %xmm0
	addl	INCX, XX
	decl	MM
#ifdef USE_ABS
	andpd	 %xmm7, %xmm0
#endif
	unpcklpd %xmm0, %xmm0
	movapd	 %xmm0, %xmm1
	movapd	 %xmm0, %xmm2
	movapd	 %xmm0, %xmm3
	/* A stride other than one element takes the strided path at .L80. */
	cmpl	$SIZE, INCX
	jne	.L80
97
/* Alignment check (unit stride only): decide between the movapd-based
 * aligned scan below and the unaligned scan at .L50. */
	/* With 7 or fewer elements left, use the unaligned path. */
	cmpl	$7, MM
	jle	.L50

	/* Pointer not on an 8-byte boundary: stepping by whole elements cannot
	 * reach 16-byte alignment (assumes SIZE is 8 -- TODO confirm in common.h). */
	testl	$7, XX
	jne	.L50		# Purely Unaligned Mode

	testl	$15, XX		# Checking for 128bit align
	je	.L05

	/* Consume one scalar element (folded into xmm3) so that XX advances
	 * to the next boundary before the packed loads start. */
	movsd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	unpcklpd %xmm4, %xmm4
	maxpd	 %xmm4, %xmm3
	decl	MM
	addl	$SIZE, XX
	ALIGN_3

/* Pass 1, aligned: I = number of 16-element iterations. */
.L05:
	movl	MM,  I
	sarl	$4, I
	jle	.L15
	ALIGN_4

/* Main loop: 8 aligned packed loads (16 elements) per iteration, spread
 * round-robin over the accumulators xmm0..xmm3. */
.L11:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movapd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movapd	2 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movapd	4 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movapd	6 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
#endif

	movapd	8 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movapd	10 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movapd	12 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movapd	14 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

	addl	$16 * SIZE, XX
	decl	I
	jg	.L11
	ALIGN_4

/* Tail: the low four bits of MM give the leftover count, handled as
 * chunks of 8, 4, 2 and 1 elements. */
.L15:
	andl	$15,  MM
	jle	.L20

	testl	$8, MM
	je	.L16

	movapd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movapd	2 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movapd	4 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movapd	6 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3
	addl	$8 * SIZE, XX
	ALIGN_3

/* 4 leftover elements. */
.L16:
	testl	$4, MM
	je	.L17

	movapd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movapd	2 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1
	addl	$4 * SIZE, XX
	ALIGN_3

/* 2 leftover elements. */
.L17:
	testl	$2, MM
	je	.L18

	movapd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2
	addl	$2 * SIZE, XX

/* Final single element: scalar load, duplicated into both lanes so the
 * packed max sees the same value twice. */
.L18:
	testl	$1, MM
	je	.L20

	movsd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	unpcklpd  %xmm4, %xmm4
	maxpd	%xmm4, %xmm3
	ALIGN_3
259
/* Finding Index (aligned path).
 * Pass 2: reduce the accumulators to the single extreme value, then rescan
 * the vector from the start. RET is incremented once per element examined,
 * so it holds the 1-based index when a match is found. */
.L20:
	/* Restart from the original pointer and count. */
	movl	X, XX
	movl	M, MM

	/* Fold xmm0..xmm3 together, then fold the two lanes, and broadcast
	 * the result into both lanes of xmm0. */
	maxpd	 %xmm1, %xmm0
	maxpd	 %xmm3, %xmm2
	maxpd	 %xmm2, %xmm0
	movapd	 %xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	 %xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	testl	$15, XX		# Checking for 128bit align
	je	.L21

	/* X itself is not 16-byte aligned: test the first element on its own
	 * so the packed loads below start on an aligned address. */
	movsd	0 * SIZE(XX), %xmm1
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
#endif
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	addl	$SIZE, XX
	decl	MM
	ALIGN_3

/* I = number of 8-element blocks to compare. */
.L21:
	movl	MM,  I
	sarl	$3, I
	jle	.L25
	ALIGN_4

/* Compare 8 elements per iteration against the target; the four equality
 * masks are OR-ed together and their sign bits extracted with movmskpd. */
.L22:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movapd	0 * SIZE(XX), %xmm1
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
#endif
	cmpeqpd	%xmm0, %xmm1

	movapd	2 * SIZE(XX), %xmm2
#ifdef USE_ABS
	andpd	 %xmm7, %xmm2
#endif
	cmpeqpd	%xmm0, %xmm2

	movapd	4 * SIZE(XX), %xmm3
#ifdef USE_ABS
	andpd	 %xmm7, %xmm3
#endif
	cmpeqpd	%xmm0, %xmm3

	movapd	6 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	cmpeqpd	%xmm0, %xmm4

	orpd	%xmm2, %xmm1
	orpd	%xmm4, %xmm3
	orpd	%xmm3, %xmm1
	/* TEMP aliases M (%ebx); M was already copied to MM above. */
	movmskpd %xmm1, TEMP
	/* Non-zero: at least one of the 8 elements matched. */
	testl	 $3, TEMP
	jne	 .L23

	addl	$8 * SIZE, XX
	addl	$8, RET
	decl	I
	jg	.L22
	jmp	.L25
	ALIGN_4

/* A match lies somewhere in the current block of 8 (XX still points at
 * its start). Test elements 0..6 one by one; if none matches, the 8th is
 * taken as the answer without a compare. */
.L23:
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
	andpd	 %xmm7, %xmm3
	andpd	 %xmm7, %xmm4
#endif

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999

	movsd	4 * SIZE(XX), %xmm1
	movsd	5 * SIZE(XX), %xmm2
	movsd	6 * SIZE(XX), %xmm3

#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
	andpd	 %xmm7, %xmm3
#endif

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	jmp	.L999
	ALIGN_3

/* Tail of the index search. MM is not modified by the loop above, so its
 * low bits still describe the leftover count: a group of 4 first. */
.L25:
	testl	$4, MM
	je	.L27

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
	andpd	 %xmm7, %xmm3
	andpd	 %xmm7, %xmm4
#endif
	addl	$4 * SIZE, XX
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999
	ALIGN_3

/* Group of 2 leftover elements. */
.L27:
	testl	$2, MM
	je	.L28

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
#endif
	addl	$2 * SIZE, XX
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	ALIGN_3

/* Nothing matched so far: the next element is returned as the answer
 * without being compared. */
.L28:
	incl	RET
	jmp	.L999
	ALIGN_3
436
.L50:
/* Unaligned Mode */
/* Pass 1 for unit stride when the aligned path was rejected. Each pair of
 * elements is assembled with movsd (low lane) + movhpd (high lane) instead
 * of an aligned packed load. I = number of 16-element iterations. */
	movl	MM,  I
	sarl	$4, I
	jle	.L55
	ALIGN_4

/* Main loop: 16 elements per iteration, accumulators xmm0..xmm3 used
 * round-robin. */
.L51:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm4
	movhpd	1 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	2 * SIZE(XX), %xmm4
	movhpd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	4 * SIZE(XX), %xmm4
	movhpd	5 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	6 * SIZE(XX), %xmm4
	movhpd	7 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
#endif

	movsd	8 * SIZE(XX), %xmm4
	movhpd	9 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	10 * SIZE(XX), %xmm4
	movhpd	11 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	12 * SIZE(XX), %xmm4
	movhpd	13 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	14 * SIZE(XX), %xmm4
	movhpd	15 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

	addl	$16 * SIZE, XX
	decl	I
	jg	.L51
	ALIGN_4

/* Tail: leftover count is MM & 15, handled as chunks of 8, 4, 2 and 1. */
.L55:
	andl	$15,  MM
	jle	.L60

	testl	$8, MM
	je	.L56

	movsd	0 * SIZE(XX), %xmm4
	movhpd	1 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	2 * SIZE(XX), %xmm4
	movhpd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	4 * SIZE(XX), %xmm4
	movhpd	5 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	6 * SIZE(XX), %xmm4
	movhpd	7 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

	addl	$8 * SIZE, XX
	ALIGN_3

/* 4 leftover elements. */
.L56:
	testl	$4, MM
	je	.L57

	movsd	0 * SIZE(XX), %xmm4
	movhpd	1 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	2 * SIZE(XX), %xmm4
	movhpd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1
	addl	$4 * SIZE, XX
	ALIGN_3

/* 2 leftover elements. */
.L57:
	testl	$2, MM
	je	.L58

	movsd	0 * SIZE(XX), %xmm4
	movhpd	1 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2
	addl	$2 * SIZE, XX

/* Final single element, duplicated into both lanes before the packed max.
 * (Here the duplication precedes the abs mask; the result is the same as
 * masking first, since the mask applies to both lanes.) */
.L58:
	testl	$1, MM
	je	.L60

	movsd	0 * SIZE(XX), %xmm4
	unpcklpd  %xmm4, %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3
	ALIGN_3
595
/* Pass 2, unaligned unit-stride path: reduce the accumulators to the
 * extreme value and rescan from the start for its first occurrence.
 * RET counts elements examined, giving a 1-based index on a match. */
.L60:
	/* Restart from the original pointer and count. */
	movl	X, XX
	movl	M, MM

	/* Fold xmm0..xmm3, then the two lanes, and broadcast into xmm0. */
	maxpd	 %xmm1, %xmm0
	maxpd	 %xmm3, %xmm2
	maxpd	 %xmm2, %xmm0
	movapd	 %xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	 %xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	/* I = number of 8-element blocks to compare. */
	movl	MM,  I
	sarl	$3, I
	jle	.L65
	ALIGN_4

/* Compare 8 elements per iteration; pairs are built with movsd + movhpd
 * because the data is not known to be 16-byte aligned. */
.L62:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm1
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
#endif
	cmpeqpd	%xmm0, %xmm1

	movsd	2 * SIZE(XX), %xmm2
	movhpd	3 * SIZE(XX), %xmm2
#ifdef USE_ABS
	andpd	 %xmm7, %xmm2
#endif
	cmpeqpd	%xmm0, %xmm2

	movsd	4 * SIZE(XX), %xmm3
	movhpd	5 * SIZE(XX), %xmm3
#ifdef USE_ABS
	andpd	 %xmm7, %xmm3
#endif
	cmpeqpd	%xmm0, %xmm3

	movsd	6 * SIZE(XX), %xmm4
	movhpd	7 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	cmpeqpd	%xmm0, %xmm4

	orpd	%xmm2, %xmm1
	orpd	%xmm4, %xmm3
	orpd	%xmm3, %xmm1
	/* TEMP aliases M (%ebx); M was already copied to MM above. */
	movmskpd %xmm1, TEMP
	/* Non-zero: at least one of the 8 elements matched. */
	testl	 $3, TEMP
	jne	 .L63

	addl	$8 * SIZE, XX
	addl	$8, RET
	decl	I
	jg	.L62
	jmp	.L65
	ALIGN_4

/* A match lies in the current block of 8 (XX still points at its start).
 * Elements 0..6 are tested individually; otherwise the 8th is the answer. */
.L63:
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4

#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
	andpd	 %xmm7, %xmm3
	andpd	 %xmm7, %xmm4
#endif

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999
	/* Count for element 4 is taken here, ahead of its load and compare
	 * below (the sibling blocks .L23/.L93 increment right before the
	 * compare instead; the resulting index is the same). */
	incl	RET

	movsd	4 * SIZE(XX), %xmm1
	movsd	5 * SIZE(XX), %xmm2
	movsd	6 * SIZE(XX), %xmm3

#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
	andpd	 %xmm7, %xmm3
#endif

	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	jmp	.L999
	ALIGN_3

/* Tail of the index search. MM is untouched by the loop, so its low bits
 * still describe the leftover count: a group of 4 first. */
.L65:
	testl	$4, MM
	je	.L67

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
	andpd	 %xmm7, %xmm3
	andpd	 %xmm7, %xmm4
#endif
	addl	$4 * SIZE, XX
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999
	ALIGN_3

/* Group of 2 leftover elements. */
.L67:
	testl	$2, MM
	je	.L68

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
#endif
	addl	$2 * SIZE, XX
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	ALIGN_3

/* Nothing matched so far: the next element is returned as the answer
 * without being compared. */
.L68:
	incl	RET
	jmp	.L999
	ALIGN_4
761
/* Pass 1, strided path (INCX differs from one element). Elements are
 * gathered one at a time: movsd fills the low lane, movhpd the high lane,
 * and XX is advanced by INCX (already scaled by SIZE) after each load.
 * I = number of 16-element iterations. */
.L80:
	movl	MM,  I
	sarl	$4, I
	jle	.L85
	ALIGN_4

/* Main loop: 8 gathered pairs (16 elements) per iteration, accumulators
 * xmm0..xmm3 used round-robin. */
.L81:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

	/* XX has already moved 8 strides forward, so the same offset is
	 * used for this second prefetch. */
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3

	decl	I
	jg	.L81
	ALIGN_4

/* Tail: leftover count is MM & 15, handled as chunks of 8, 4, 2 and 1. */
.L85:
	andl	$15,  MM
	jle	.L90

	testl	$8, MM
	je	.L86

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm3
	ALIGN_3

/* 4 leftover elements. */
.L86:
	testl	$4, MM
	je	.L87

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm0

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	%xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm1
	ALIGN_3

/* 2 leftover elements. */
.L87:
	testl	$2, MM
	je	.L88

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	maxpd	%xmm4, %xmm2
	ALIGN_3

/* Final single element, duplicated into both lanes before the packed max. */
.L88:
	testl	$1, MM
	je	.L90

	movsd	0 * SIZE(XX), %xmm4
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	unpcklpd  %xmm4, %xmm4
	maxpd	%xmm4, %xmm3
	ALIGN_4
945
/* Pass 2, strided path: reduce the accumulators to the extreme value and
 * rescan from the start for its first occurrence. RET counts elements
 * examined, giving a 1-based index on a match. */
.L90:
	/* Restart from the original pointer and count. */
	movl	X, XX
	movl	M, MM

	/* Fold xmm0..xmm3, then the two lanes, and broadcast into xmm0. */
	maxpd	 %xmm1, %xmm0
	maxpd	 %xmm3, %xmm2
	maxpd	 %xmm2, %xmm0
	movapd	 %xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	 %xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	/* I = number of 8-element blocks to compare. */
	movl	MM,  I
	sarl	$3, I
	jle	.L95
	ALIGN_4

/* Compare 8 gathered elements per iteration. XX is advanced as the
 * elements are loaded, so it points past the block once the compare
 * result is known. */
.L92:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
#endif
	cmpeqpd	%xmm0, %xmm1

	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm2
#endif
	cmpeqpd	%xmm0, %xmm2

	movsd	0 * SIZE(XX), %xmm3
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm3
#endif
	cmpeqpd	%xmm0, %xmm3

	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm4
#endif
	cmpeqpd	%xmm0, %xmm4

	orpd	%xmm2, %xmm1
	orpd	%xmm4, %xmm3
	orpd	%xmm3, %xmm1
	/* TEMP aliases M (%ebx); M was already copied to MM above. */
	movmskpd %xmm1, TEMP
	/* Non-zero: at least one of the 8 elements matched. */
	testl	 $3, TEMP
	jne	 .L93

	addl	$8, RET
	decl	I
	jg	.L92
	jmp	.L95
	ALIGN_4

/* A match lies in the block just scanned. Rewind XX by 8 strides
 * (8 * INCX bytes) to the start of that block, then test elements 0..6
 * one by one; if none matches, the 8th is the answer. */
.L93:
	leal	(, INCX, 8), TEMP
	subl	TEMP, XX

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
	andpd	 %xmm7, %xmm3
	andpd	 %xmm7, %xmm4
#endif
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
	andpd	 %xmm7, %xmm3
#endif

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	jmp	.L999
	ALIGN_3

/* Tail of the index search. MM is untouched by the loop, so its low bits
 * still describe the leftover count: a group of 4 first. */
.L95:
	testl	$4, MM
	je	.L97

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm4
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
	andpd	 %xmm7, %xmm3
	andpd	 %xmm7, %xmm4
#endif
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm4
	je	.L999
	ALIGN_3

/* Group of 2 leftover elements. */
.L97:
	testl	$2, MM
	je	.L98

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
#ifdef USE_ABS
	andpd	 %xmm7, %xmm1
	andpd	 %xmm7, %xmm2
#endif
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm2
	je	.L999
	ALIGN_3

/* Nothing matched so far: the next element is returned as the answer
 * without being compared; execution falls through to the common exit. */
.L98:
	incl	RET
	ALIGN_3
1127
/* Common exit: RET (%eax) holds the result -- 0 from the early exits,
 * otherwise the 1-based index found by the search. Registers are restored
 * in the reverse order of the pushes at entry. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
1136