/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * Stack layout.  STACK is the number of bytes the routine pushes on entry
 * (four 32-bit registers); ARGS is an extra displacement, currently unused.
 * The additional 4 bytes in each offset presumably skip the return address.
 */
#define STACK	16
#define ARGS	 0

/* Incoming arguments, addressed relative to %esp after the four pushes. */
#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

/*
 * Register roles.
 *   RET   result index, built up during the second pass
 *   M     element count as passed in
 *   X     base pointer as passed in
 *   INCX  stride; scaled to bytes by the routine (sall $ZBASE_SHIFT)
 *   I     loop trip counter
 *   MM    working copy of M
 *   XX    working copy of X (the moving read pointer)
 *   TEMP  scratch; deliberately shares %ebx with M.  The routine only
 *         writes TEMP after M has been copied into MM for the last time.
 */
#define RET	%eax
#define	M	%ebx
#define X	%ecx
#define INCX	%edx
#define I	%esi
#define MM	%ebp
#define XX	%edi
#define TEMP	%ebx

/* USE_MIN turns the max reduction into a min reduction by renaming the
   two instructions that perform it; the rest of the code is shared. */
#ifdef USE_MIN
#define maxps	minps
#define maxss	minss
#endif

/* Without SSE2 fall back to SSE1 encodings: xorps for register clearing
   and movlps for the 64-bit loads into the low half of a register. */
#ifndef HAVE_SSE2
#define pxor	xorps
#define movsd	movlps
#endif

#include "l1param.h"

/*
 * Index search over a vector of single-precision complex numbers stored as
 * (re, im) pairs of floats.
 *
 * Stack arguments (STACK_M / STACK_X / STACK_INCX): element count M, base
 * pointer X and element stride INCX.  With F_INTERFACE, M and INCX arrive
 * by reference and are dereferenced on entry.
 *
 * Result (RET = %eax): 1-based position of the first element whose
 * |re| + |im| equals the extreme value (the maximum, or the minimum when
 * USE_MIN remaps maxps/maxss); 0 when M <= 0 or INCX <= 0.  The result is
 * always within [0, M].
 *
 * Structure: pass 1 reduces the extreme value into every lane of %xmm0,
 * pass 2 rescans from X and stops at the first element equal to it.  Each
 * pass exists in a contiguous variant (.L30 / .L40) and a strided variant
 * (.L70 / .L80).
 *
 * %xmm7 holds the 0x7fffffff sign-clearing mask when USE_ABS is defined.
 * NOTE(review): without USE_ABS %xmm7 stays zero, so every andps below
 * clears its operand -- presumably this file is only built with USE_ABS.
 */
	PROLOGUE

	/* Preserve the callee-saved registers; STACK (16) covers these pushes. */
	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_M, M
	movl	STACK_X, X
	movl	STACK_INCX, INCX

#ifdef F_INTERFACE
	/* Fortran-style call: M and INCX are pointers to the values. */
	movl	(M), M
	movl	(INCX), INCX
#endif

	pxor	%xmm0, %xmm0
	pxor	%xmm7, %xmm7
	xor	RET, RET		/* default result: 0 */
	testl	M, M
	jle	.L999			/* M <= 0: return 0 */
	testl	INCX, INCX
	jle	.L999			/* INCX <= 0: return 0 */

	sall	$ZBASE_SHIFT, INCX	/* stride in elements -> stride in bytes */
	movl	M, MM
	movl	X, XX

#ifdef USE_ABS
#ifndef HAVE_SSE2
	/* SSE1: build the mask through a temporary stack slot. */
	subl	$8, %esp
	movl	$0x7fffffff, (%esp)
	movss	(%esp), %xmm7
	shufps	$0, %xmm7, %xmm7
	addl	$8, %esp
#else
	/* All-ones shifted right by one = 0x7fffffff in every lane. */
	cmpeqps	%xmm7, %xmm7
	psrld	$1, %xmm7
#endif
#endif

	/* Seed the running extreme with element 0 and broadcast it to all
	   four lanes; pass 1 then only has M - 1 elements left to visit. */
	movss	0 * SIZE(XX), %xmm0
	movss	1 * SIZE(XX), %xmm1
	addl	INCX, XX
	decl	MM
	andps	%xmm7, %xmm0
	andps	%xmm7, %xmm1
	addps	%xmm1,  %xmm0
	shufps	$0, %xmm0, %xmm0
	cmpl	$2 * SIZE, INCX
	jne	.L70			/* not contiguous: strided variant */

/* ---- Pass 1, contiguous: reduce |re|+|im| over the remaining elements ---- */
.L30:
	movl	MM,  I
	sarl	$3, I			/* eight elements per iteration */
	jle	.L35
	ALIGN_4

.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	/* xmm1 = (re0, im0, re1, im1), xmm2 = (re2, im2, re3, im3) */
	movsd	 0 * SIZE(XX), %xmm1
	movhps	 2 * SIZE(XX), %xmm1
	movsd	 4 * SIZE(XX), %xmm2
	movhps	 6 * SIZE(XX), %xmm2

	movaps	%xmm1, %xmm3

	/* De-interleave: xmm1 = four real parts, xmm3 = four imaginary parts. */
	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3,  %xmm1
	maxps	%xmm1,  %xmm0

	/* Second group of four elements, same treatment. */
	movsd	 8 * SIZE(XX), %xmm1
	movhps	10 * SIZE(XX), %xmm1
	movsd	12 * SIZE(XX), %xmm2
	movhps	14 * SIZE(XX), %xmm2

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3,  %xmm1
	maxps	%xmm1,  %xmm0

	addl	$16 * SIZE, XX
	decl	I
	jg	.L31
	ALIGN_4

.L35:
	/* Tail of pass 1: up to seven elements, handled as 4 + 2 + 1. */
	andl	$7,  MM
	jle	.L40

	testl	$4, MM
	je	.L36

	movsd	 0 * SIZE(XX), %xmm1
	movhps	 2 * SIZE(XX), %xmm1
	movsd	 4 * SIZE(XX), %xmm2
	movhps	 6 * SIZE(XX), %xmm2

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3,  %xmm1
	maxps	%xmm1,  %xmm0

	addl	$8 * SIZE, XX
	ALIGN_3

.L36:
	testl	$2, MM
	je	.L37

	/* Two elements, scalar: only lane 0 of xmm0 is updated. */
	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	movss	2 * SIZE(XX), %xmm3
	movss	3 * SIZE(XX), %xmm4
	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4
	addps	%xmm2,  %xmm1
	addps	%xmm4,  %xmm3
	maxss	%xmm1,  %xmm0
	maxss	%xmm3,  %xmm0
	addl	$4 * SIZE, XX
	ALIGN_3

.L37:
	testl	$1, MM
	je	.L40

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	addps	%xmm2,  %xmm1
	maxss	%xmm1,  %xmm0
	ALIGN_4

/* ---- Pass 2, contiguous: locate the first element equal to the extreme ---- */
.L40:
	/* Restart from the beginning.  M (%ebx) is dead after this copy,
	   which is what allows TEMP to reuse the register below. */
	movl	X, XX
	movl	M, MM

	/* Horizontal reduction of the four lanes, then broadcast. */
	movaps	%xmm0, %xmm1
	movhlps %xmm0, %xmm0
	maxps	%xmm1, %xmm0
	movaps	%xmm0, %xmm1
	shufps	$1, %xmm0, %xmm0
	maxss	%xmm1, %xmm0
	shufps	$0, %xmm0, %xmm0

	movl	MM,  I
	sarl	$2, I			/* four elements per iteration */
	jle	.L45
	ALIGN_4

.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movhps	2 * SIZE(XX), %xmm1
	movsd	4 * SIZE(XX), %xmm2
	movhps	6 * SIZE(XX), %xmm2

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3,  %xmm1

	/* Any lane equal to the extreme?  Then pin it down one by one. */
	cmpeqps	%xmm0, %xmm1
	movmskps %xmm1, TEMP
	testl	 $15, TEMP
	jne	 .L43

	addl	$8 * SIZE, XX
	addl	$4, RET
	decl	I
	jg	.L41
	jmp	.L45
	ALIGN_4

.L43:
	/* The hit is among the four elements at XX: test them in order,
	   bumping RET before each compare so it is the 1-based index on exit. */
	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	movss	2 * SIZE(XX), %xmm3
	movss	3 * SIZE(XX), %xmm4

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4

	addps	%xmm2,  %xmm1
	addps	%xmm4,  %xmm3

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999

	movss	4 * SIZE(XX), %xmm1
	movss	5 * SIZE(XX), %xmm2
	movss	6 * SIZE(XX), %xmm3
	movss	7 * SIZE(XX), %xmm4

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4

	addps	%xmm2,  %xmm1
	addps	%xmm4,  %xmm3

	addl	$8 * SIZE, XX

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L45:
	/* Tail of pass 2: check two elements explicitly if present. */
	testl	$2, MM
	je	.L47

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	movss	2 * SIZE(XX), %xmm3
	movss	3 * SIZE(XX), %xmm4
	addl	$4 * SIZE, XX

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4
	addps	%xmm2,  %xmm1
	addps	%xmm4,  %xmm3

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L47:
	/*
	 * Nothing before this point matched.  With an odd tail the single
	 * unchecked last element must hold the extreme, so the answer is M
	 * (RET is M - 1 here).  With an even tail no element compared equal
	 * at all -- this happens when a NaN ends up in %xmm0 -- and RET is
	 * already M; incrementing it would return M + 1, one past the end of
	 * the vector.  Loading MM (still equal to M) yields the same result
	 * in the first case and keeps the result inside [1, M] in the second.
	 */
	movl	MM, RET
	jmp	.L999
	ALIGN_3

/* ---- Pass 1, strided: same reduction, advancing XX by INCX per element ---- */
.L70:
	movl	MM,  I
	sarl	$3, I			/* eight elements per iteration */
	jle	.L75
	ALIGN_4

.L71:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	/* Gather four (re, im) pairs, one per stride step. */
	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm2
	addl	INCX, XX

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3,  %xmm1
	maxps	%xmm1,  %xmm0

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm2
	addl	INCX, XX

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3,  %xmm1
	maxps	%xmm1,  %xmm0
	decl	I
	jg	.L71
	ALIGN_4

.L75:
	/* Tail of pass 1: up to seven elements, handled as 4 + 2 + 1. */
	andl	$7,  MM
	jle	.L80

	testl	$4, MM
	je	.L76

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm2
	addl	INCX, XX

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3,  %xmm1
	maxps	%xmm1,  %xmm0
	ALIGN_3

.L76:
	testl	$2, MM
	je	.L77

	/* Two elements, scalar: only lane 0 of xmm0 is updated. */
	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movss	0 * SIZE(XX), %xmm3
	movss	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4
	addps	%xmm2,  %xmm1
	addps	%xmm4,  %xmm3
	maxss	%xmm1,  %xmm0
	maxss	%xmm3,  %xmm0
	ALIGN_3

.L77:
	testl	$1, MM
	je	.L80

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	addps	%xmm2,  %xmm1
	maxss	%xmm1,  %xmm0
	ALIGN_4

/* ---- Pass 2, strided: locate the first element equal to the extreme ---- */
.L80:
	/* Restart from the beginning; M (%ebx) is dead after this copy. */
	movl	X, XX
	movl	M, MM

	/* Horizontal reduction of the four lanes, then broadcast. */
	movaps	%xmm0, %xmm1
	movhlps %xmm0, %xmm0
	maxps	%xmm1, %xmm0
	movaps	%xmm0, %xmm1
	shufps	$1, %xmm0, %xmm0
	maxss	%xmm1, %xmm0
	shufps	$0, %xmm0, %xmm0

	movl	MM,  I
	sarl	$2, I			/* four elements per iteration */
	jle	.L85
	ALIGN_4

.L81:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm2
	addl	INCX, XX

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3,  %xmm1

	/* Any lane equal to the extreme?  Then pin it down one by one. */
	cmpeqps	%xmm0, %xmm1
	movmskps %xmm1, TEMP
	testl	 $15, TEMP
	jne	 .L83

	addl	$4, RET
	decl	I
	jg	.L81
	jmp	.L85
	ALIGN_4

.L83:
	/* XX has already moved past the group: step back four strides. */
	leal	(, INCX, 4), TEMP
	subl	TEMP, XX

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movss	0 * SIZE(XX), %xmm3
	movss	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4

	addps	%xmm2,  %xmm1
	addps	%xmm4,  %xmm3

	/* RET is bumped before each compare so it is the 1-based index on exit. */
	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movss	0 * SIZE(XX), %xmm3
	movss	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4

	addps	%xmm2,  %xmm1
	addps	%xmm4,  %xmm3

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L85:
	/* Tail of pass 2: check two elements explicitly if present. */
	testl	$2, MM
	je	.L87

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movss	0 * SIZE(XX), %xmm3
	movss	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4
	addps	%xmm2, %xmm1
	addps	%xmm4, %xmm3

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L87:
	/*
	 * Nothing before this point matched.  With an odd tail the single
	 * unchecked last element must hold the extreme, so the answer is M
	 * (RET is M - 1 here).  With an even tail no element compared equal
	 * at all -- this happens when a NaN ends up in %xmm0 -- and RET is
	 * already M; incrementing it would return M + 1, one past the end of
	 * the vector.  Loading MM (still equal to M) yields the same result
	 * in the first case and keeps the result inside [1, M] in the second.
	 */
	movl	MM, RET
	ALIGN_4

.L999:
	/* Restore callee-saved registers in reverse push order. */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE
