/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/
/*
 * Preamble: build switches, stack-argument slots and register roles for
 * the kernel that follows.  Syntax is AT&T (source, destination) and the
 * file is run through the C preprocessor.
 */

#define ASSEMBLER
#include "common.h"

/* The kernel pushes four 32-bit registers on entry, i.e. 16 bytes sit
   between %esp and the return address when the arguments are read. */
#define STACK	16
/* Extra offset folded into every argument slot; zero here. */
#define ARGS	 0

/* Incoming arguments, addressed relative to %esp after the pushes
   (the leading 4 skips the return address). */
#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

/*
 * Register roles:
 *   RET   result index, returned in %eax
 *   M     element count as passed in
 *   X     base address of the vector (not advanced by the kernel)
 *   INCX  stride; converted to a byte stride by the kernel
 *   I     loop trip counter
 *   MM    working copy of the element count
 *   XX    moving cursor into the vector
 *   TEMP  scratch register.  It ALIASES M (%ebx): the kernel copies M
 *         into MM before the first write to TEMP and does not read M
 *         afterwards.
 */
#define RET	%eax
#define	M	%ebx
#define X	%ecx
#define INCX	%edx
#define I	%esi
#define MM	%ebp
#define XX	%edi
#define TEMP	%ebx

/* With USE_MIN the same code selects the minimum instead of the maximum:
   every maxpd/maxsd in the kernel is rewritten to minpd/minsd. */
#ifdef USE_MIN
#define maxpd	minpd
#define maxsd	minsd
#endif

/* Presumably supplies PREFETCH / PREFETCHSIZE / PREOFFSET used by the
   kernel loops -- confirm against l1param.h. */
#include "l1param.h"

/*
 * Index-of-extremum kernel for a vector of (re, im) pairs of doubles.
 *
 * For every element the key |re| + |im| is formed.  The routine returns,
 * in RET (%eax), the 1-based position of the first element whose key
 * equals the largest key (smallest key when USE_MIN is defined).
 * RET is 0 when M <= 0 or INCX <= 0.
 *
 * Arguments (on the stack, see STACK_M / STACK_X / STACK_INCX):
 *   M     element count   (pointer to it under F_INTERFACE)
 *   X     address of the first element
 *   INCX  stride in elements (pointer to it under F_INTERFACE)
 *
 * Structure:
 *   pass 1  (.L21-.L27 contiguous, .L61-.L67 strided)
 *           reduce all keys into xmm0 with maxpd/maxsd;
 *   pass 2  (.L30-.L36 contiguous, .L70-.L76 strided)
 *           rescan from X and stop at the first key equal to xmm0.
 *
 * xmm7 holds the sign-clearing mask for the whole routine.
 * %ebp, %edi, %esi and %ebx are saved on entry and restored at .L999.
 */
	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX

#ifdef F_INTERFACE
	/* M and INCX are passed by reference in this configuration. */
	movl	(M), M
	movl	(INCX), INCX
#endif

	pxor	%xmm0, %xmm0
	pxor	%xmm7, %xmm7
	xor	RET, RET		/* result stays 0 on the early exits below */
	testl	M, M
	jle	.L999
	testl	INCX, INCX
	jle	.L999

	/* INCX becomes a byte stride (ZBASE_SHIFT comes from common.h). */
	sall	$ZBASE_SHIFT, INCX
	movl	M, MM
	movl	X, XX

	/* xmm7 = all ones shifted right by one in each 64-bit lane:
	   ANDing with it clears the sign bit of a double. */
	cmpeqpd	%xmm7, %xmm7
	psrlq	$1, %xmm7

	/* Seed the running extremum with the key of element 0 and
	   replicate it into both lanes of xmm0. */
	movsd	0 * SIZE(XX), %xmm0
	movsd	1 * SIZE(XX), %xmm1
	addl	INCX, XX
	decl	MM
	andpd	 %xmm7, %xmm0
	andpd	 %xmm7, %xmm1
	addpd	 %xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	/* A byte stride of 2 * SIZE means the elements are contiguous. */
	cmpl	$2 * SIZE, INCX
	jne	.L60

	/* ---- pass 1, contiguous: 8 elements per iteration ---- */
	movl	MM,  I
	sarl	$3, I
	jle	.L25
	ALIGN_4

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	/* xmm1 = (re0, re1), xmm2 = (im0, im1): two keys per addpd. */
	movsd	 0 * SIZE(XX), %xmm1
	movsd	 1 * SIZE(XX), %xmm2
	movhpd	 2 * SIZE(XX), %xmm1
	movhpd	 3 * SIZE(XX), %xmm2

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	 4 * SIZE(XX), %xmm3
	movsd	 5 * SIZE(XX), %xmm4
	movhpd	 6 * SIZE(XX), %xmm3
	movhpd	 7 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(XX)
#endif

	movsd	 8 * SIZE(XX), %xmm1
	movsd	 9 * SIZE(XX), %xmm2
	movhpd	10 * SIZE(XX), %xmm1
	movhpd	11 * SIZE(XX), %xmm2

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	12 * SIZE(XX), %xmm3
	movsd	13 * SIZE(XX), %xmm4
	movhpd	14 * SIZE(XX), %xmm3
	movhpd	15 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

	addl	$16 * SIZE, XX
	decl	I
	jg	.L21
	ALIGN_4

.L25:
	/* Remainder of pass 1: up to 7 elements, handled as 4 + 2 + 1. */
	andl	$7,  MM
	jle	.L30

	testl	$4, MM
	je	.L26

	movsd	 0 * SIZE(XX), %xmm1
	movsd	 1 * SIZE(XX), %xmm2
	movhpd	 2 * SIZE(XX), %xmm1
	movhpd	 3 * SIZE(XX), %xmm2

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	 4 * SIZE(XX), %xmm3
	movsd	 5 * SIZE(XX), %xmm4
	movhpd	 6 * SIZE(XX), %xmm3
	movhpd	 7 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0
	addl	$8 * SIZE, XX
	ALIGN_3

.L26:
	testl	$2, MM
	je	.L27

	movsd	 0 * SIZE(XX), %xmm1
	movsd	 1 * SIZE(XX), %xmm2
	movhpd	 2 * SIZE(XX), %xmm1
	movhpd	 3 * SIZE(XX), %xmm2

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	addl	$4 * SIZE, XX
	ALIGN_3

.L27:
	testl	$1, MM
	je	.L30

	/* Single trailing element: only the low lane is meaningful,
	   hence the scalar maxsd. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxsd	%xmm1, %xmm0
	ALIGN_4

.L30:
	/* ---- pass 2, contiguous: locate the extremum ---- */
	/* Rewind to the start.  M must be re-read here, before TEMP
	   (the same register as M) is written inside the loop. */
	movl	X, XX
	movl	M, MM

	/* Fold the two lanes of xmm0 into one value and replicate it. */
	movapd	 %xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	 %xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	movl	MM,  I
	sarl	$2, I
	jle	.L35
	ALIGN_4

.L31:
#ifdef PREFETCH
	/* Hint relative to the moving cursor XX (as in .L21), not the
	   fixed base X, so that it tracks the scan. */
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	/* Test four keys at once against the extremum. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movhpd	2 * SIZE(XX), %xmm1
	movhpd	3 * SIZE(XX), %xmm2
	movsd	4 * SIZE(XX), %xmm3
	movsd	5 * SIZE(XX), %xmm4
	movhpd	6 * SIZE(XX), %xmm3
	movhpd	7 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2,  %xmm1
	addpd	%xmm4,  %xmm3

	cmpeqpd	%xmm0, %xmm1
	cmpeqpd	%xmm0, %xmm3

	orpd	%xmm3, %xmm1
	movmskpd %xmm1, TEMP		/* overwrites %ebx, i.e. M */
	testl	 $3, TEMP
	jne	 .L33			/* a match lies in this group of four */

	addl	$8 * SIZE, XX
	addl	$4, RET
	decl	I
	jg	.L31
	jmp	.L35
	ALIGN_4

.L33:
	/* Pinpoint the match inside the group.  RET is bumped before
	   each compare, which makes the returned index 1-based. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2,  %xmm1
	addpd	%xmm4,  %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999

	movsd	4 * SIZE(XX), %xmm1
	movsd	5 * SIZE(XX), %xmm2
	movsd	6 * SIZE(XX), %xmm3
	movsd	7 * SIZE(XX), %xmm4
	addl	$8 * SIZE, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2,  %xmm1
	addpd	%xmm4,  %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L35:
	/* Tail of pass 2: a pair of elements if bit 1 of the count is set. */
	testl	$2, MM
	je	.L36

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
	addl	$4 * SIZE, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2,  %xmm1
	addpd	%xmm4,  %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L36:
	/* Nothing matched so far: the next position is taken as the
	   answer without a compare.
	   NOTE(review): this is also reached when no key compared equal
	   at all (seems possible if the running extremum became NaN);
	   RET then ends up one past the scanned elements -- confirm how
	   callers are expected to treat NaN input. */
	incl	RET
	jmp	.L999
	ALIGN_3

.L60:
	/* ---- pass 1, strided: 8 elements per iteration ---- */
	movl	MM,  I
	sarl	$3, I
	jle	.L65
	ALIGN_4

.L61:
#ifdef PREFETCH
	/* Hint follows the moving cursor XX rather than the fixed base X. */
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	/* Low lanes come from one element, high lanes from the next;
	   XX advances by the byte stride after each element. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

	decl	I
	jg	.L61
	ALIGN_4

.L65:
	/* Remainder of strided pass 1: up to 7 elements as 4 + 2 + 1. */
	andl	$7,  MM
	jle	.L70

	testl	$4, MM
	je	.L66

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0
	ALIGN_3

.L66:
	testl	$2, MM
	je	.L67

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0
	ALIGN_3

.L67:
	testl	$1, MM
	je	.L70

	/* Single trailing element: scalar update of the low lane. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxsd	%xmm1, %xmm0
	ALIGN_3

.L70:
	/* ---- pass 2, strided: locate the extremum ---- */
	/* Rewind; M is re-read before TEMP (same register) is written. */
	movl	X, XX
	movl	M, MM

	/* Fold the two lanes of xmm0 into one value and replicate it. */
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	movl	MM,  I
	sarl	$2, I
	jle	.L75
	ALIGN_4

.L71:
#ifdef PREFETCH
	/* Hint follows the moving cursor XX rather than the fixed base X. */
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	/* Test four keys at once; XX has moved past all four afterwards. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2,  %xmm1
	addpd	%xmm4,  %xmm3

	cmpeqpd	%xmm0, %xmm1
	cmpeqpd	%xmm0, %xmm3

	orpd	%xmm3, %xmm1
	movmskpd %xmm1, TEMP		/* overwrites %ebx, i.e. M */
	testl	 $3, TEMP
	jne	 .L73			/* a match lies in this group of four */

	addl	$4, RET
	decl	I
	jg	.L71
	jmp	.L75
	ALIGN_4

.L73:
	/* Step XX back over the four elements just consumed, then
	   examine them one at a time. */
	leal	(, INCX, 4), TEMP
	subl	TEMP, XX

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2,  %xmm1
	addpd	%xmm4,  %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2,  %xmm1
	addpd	%xmm4,  %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L75:
	/* Tail of strided pass 2: a pair of elements if bit 1 is set. */
	testl	$2, MM
	je	.L76

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2,  %xmm1
	addpd	%xmm4,  %xmm3
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L76:
	/* Same fall-through as .L36: the next position is accepted
	   without a compare.
	   NOTE(review): also reached if nothing compared equal (seems
	   possible with NaN input) -- confirm expected behaviour. */
	incl	RET
	ALIGN_4

.L999:
	/* Common exit: restore saved registers in reverse push order. */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
