1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26	.ident	"@(#)__vcos.S	1.8	06/01/23 SMI"
27
28	.file	"__vcos.S"
29
30#include "libm.h"
31
32	RO_DATA
33	.align	64
! Constant pool.  Every entry is one 64-bit pattern written as two
! 32-bit words, high word first.  The trailing note on each entry gives
! its byte offset and the offset macro (defined right after this table)
! that names it.  Decimal values are approximate.
34constants:
35	.word	0x3ec718e3,0xa6972785	! 0x00 p4      ~  1/362880
36	.word	0x3ef9fd39,0x94293940	! 0x08 q4      ~  1/40320
37	.word	0xbf2a019f,0x75ee4be1	! 0x10 p3      ~ -1/5040
38	.word	0xbf56c16b,0xba552569	! 0x18 q3      ~ -1/720
39	.word	0x3f811111,0x1108c703	! 0x20 p2      ~  1/120
40	.word	0x3fa55555,0x554f5b35	! 0x28 q2      ~  1/24
41	.word	0xbfc55555,0x555554d0	! 0x30 p1      ~ -1/6
42	.word	0xbfdfffff,0xffffff85	! 0x38 q1      ~ -1/2
43	.word	0x3ff00000,0x00000000	! 0x40 one     =  1.0
44	.word	0xbfc55555,0x5551fc28	! 0x48 pp1     ~ -1/6
45	.word	0x3f811107,0x62eacc9d	! 0x50 pp2     ~  1/120
46	.word	0xbfdfffff,0xffff6328	! 0x58 qq1     ~ -1/2
47	.word	0x3fa55551,0x5f7acf0c	! 0x60 qq2     ~  1/24
48	.word	0x3fe45f30,0x6dc9c883	! 0x68 invpio2 ~  2/pi
49	.word	0x43380000,0x00000000	! 0x70 round   =  1.5 * 2**52
50	.word	0x3ff921fb,0x54400000	! 0x78 pio2_1  (leading bits of pi/2)
51	.word	0x3dd0b461,0x1a600000	! 0x80 pio2_2
52	.word	0x3ba3198a,0x2e000000	! 0x88 pio2_3
53	.word	0x397b839a,0x252049c1	! 0x90 pio2_3t
54	.word	0x80000000,0x00004000	! 0x98 f30val  (loaded as the pair %f30/%f31)
! The next three entries sit at 0xa0 (mask), 0xa8 (thresh) and 0xb0.
55	.word	0xffff8000,0x00000000	! N.B.: low-order words used
56	.word	0x3fc90000,0x80000000	! for sign bit hacking; see
57	.word	0x3fc40000,0x00000000	! references to "thresh" below
58
! Byte offsets of the 8-byte entries in the constant pool defined just
! above, in table order.  They are used as [base+name] operands of ldd.
! The first group (p4 .. q1) interleaves the p and q coefficients.
59#define p4		0x0
60#define q4		0x08
61#define p3		0x10
62#define q3		0x18
63#define p2		0x20
64#define q2		0x28
65#define p1		0x30
66#define q1		0x38
67#define one		0x40
! Short polynomials used together with the table lookup.
68#define pp1		0x48
69#define pp2		0x50
70#define qq1		0x58
71#define qq2		0x60
! Entries loaded on entry to the medium range code.
72#define invpio2		0x68
73#define round		0x70
74#define pio2_1		0x78
75#define pio2_2		0x80
76#define pio2_3		0x88
77#define pio2_3t		0x90
! Bit masks (f30val is loaded into the register pair %f30/%f31).
78#define f30val		0x98
79#define mask		0xa0
80#define thresh		0xa8
81
82! local storage indices
! Offsets relative to %fp into the temporary area that the save at
! function entry reserves (SA(MINFRAME) plus the size given at the end
! of this list).  xsave and ysave hold the x and y pointers (stx on
! sparcv9, st otherwise); nsave, sxsave and sysave are written with a
! 4-byte st.  The x*_1 and y*_0 slots are spaced 16 bytes apart; the
! primary-range loop uses x0_1 as a harmless target for its first,
! not yet meaningful, result stores.
83
84#define xsave		STACK_BIAS-0x8
85#define ysave		STACK_BIAS-0x10
86#define nsave		STACK_BIAS-0x14
87#define sxsave		STACK_BIAS-0x18
88#define sysave		STACK_BIAS-0x1c
89#define biguns		STACK_BIAS-0x20
90#define n2		STACK_BIAS-0x24
91#define n1		STACK_BIAS-0x28
92#define n0		STACK_BIAS-0x2c
93#define x2_1		STACK_BIAS-0x40
94#define x1_1		STACK_BIAS-0x50
95#define x0_1		STACK_BIAS-0x60
96#define y2_0		STACK_BIAS-0x70
97#define y1_0		STACK_BIAS-0x80
98#define y0_0		STACK_BIAS-0x90
99! sizeof temp storage - must be a multiple of 16 for V9
100#define tmps		0x90
101
102!--------------------------------------------------------------------
103! define pipes for easier reading
! The loop works on three elements at a time.  Each element (lane) owns
! a bank of ten single-precision FP registers, i.e. five double
! registers: P0 = %f0-%f9, P1 = %f10-%f19, P2 = %f20-%f29.
104
105#define P0_f0		%f0
106#define P0_f1		%f1
107#define P0_f2		%f2
108#define P0_f3		%f3
109#define P0_f4		%f4
110#define P0_f5		%f5
111#define P0_f6		%f6
112#define P0_f7		%f7
113#define P0_f8		%f8
114#define P0_f9		%f9
115
116#define P1_f10		%f10
117#define P1_f11		%f11
118#define P1_f12		%f12
119#define P1_f13		%f13
120#define P1_f14		%f14
121#define P1_f15		%f15
122#define P1_f16		%f16
123#define P1_f17		%f17
124#define P1_f18		%f18
125#define P1_f19		%f19
126
127#define P2_f20		%f20
128#define P2_f21		%f21
129#define P2_f22		%f22
130#define P2_f23		%f23
131#define P2_f24		%f24
132#define P2_f25		%f25
133#define P2_f26		%f26
134#define P2_f27		%f27
135#define P2_f28		%f28
136#define P2_f29		%f29
137
138! define __vlibm_TBL_sincos_hi & lo for easy reading
! (base addresses of the two lookup tables, set up at function entry)
139
140#define SC_HI		%l3
141#define SC_LO		%l4
142
143! define constants for easy reading
! (double registers that hold the q1..q4 polynomial coefficients)
144
145#define C_q1 %f46
146#define C_q2 %f48
147#define C_q3 %f50
148#define C_q4 %f52
149
150! one ( 1 ) uno eins echi un
! (the constant 1.0; C_ONE_LO names the low half of the same double)
151#define C_ONE		%f54
152#define C_ONE_LO	%f55
153
154! masks
! MSK_SIGN is the integer 0x80000000.  MSK_BIT31/MSK_BIT13 are the two
! halves of the f30val constant (0x80000000 and 0x00004000), and
! MSK_BITSHI17 is the 64-bit pattern 0xffff8000 00000000.
155#define MSK_SIGN	%i5
156#define MSK_BIT31	%f30
157#define MSK_BIT13	%f31
158#define MSK_BITSHI17	%f44
159
160
161! constants for pp and qq
162#define C_pp1 %f56
163#define C_pp2 %f58
164#define C_qq1 %f60
165#define C_qq2 %f62
166
167! sign mask
! (a second name for %i5, the same register as MSK_SIGN)
168#define C_signM		%i5
169
170#define LIM_l5		%l5
171#define LIM_l6		%l6
172! In the primary range %l5 holds 0x3fc40000, the high-word limit that
! selects between the polynomial and the table path, and %l6 holds
! 0x3e400000, the limit below which an argument counts as tiny.
173! On entry to the medium range %l6 is reused to keep track of biguns.
174#define LIM_l7		%l7
175
176!--------------------------------------------------------------------
177
178
! __vcos: vector cosine.  Per the register notes below the incoming
! arguments are n (%i0), x (%i1), stridex (%i2), y (%i3), stridey (%i4);
! presumably the C prototype is
!	void __vcos(int n, double *x, int stridex, double *y, int stridey)
! -- confirm against the library header.  This part opens a register
! window with room for the temporaries, loads the table and constant
! addresses, saves the arguments on the stack, loads the constants used
! by the primary-range loop and fetches the first argument.
179	ENTRY(__vcos)
180	save	%sp,-SA(MINFRAME)-tmps,%sp
181	PIC_SETUP(g5)
182	PIC_SET(g5,__vlibm_TBL_sincos_hi,l3)
183	PIC_SET(g5,__vlibm_TBL_sincos_lo,l4)
184	PIC_SET(g5,constants,o0)
185	mov	%o0,%g1			! %g1 = base of the constant pool
186	wr	%g0,0x82,%asi		! set %asi for non-faulting loads
187
188! ========== primary range ==========
189
190! register use
191
192! i0  n
193! i1  x
194! i2  stridex
195! i3  y
196! i4  stridey
197! i5  0x80000000
198
199! l0  hx0
200! l1  hx1
201! l2  hx2
202! l3  __vlibm_TBL_sincos_hi
203! l4  __vlibm_TBL_sincos_lo
204! l5  0x3fc40000
205! l6  0x3e400000
206! l7  0x3fe921fb
207
208! the following are 64-bit registers in both V8+ and V9
209
210! g1  scratch
211! g5
212
213! o0  py0
214! o1  py1
215! o2  py2
216! o3  oy0
217! o4  oy1
218! o5  oy2
219! o7  scratch
220
221! f0  x0
222! f2
223! f4
224! f6
225! f8  scratch for table base
226! f9  signbit0
227! f10 x1
228! f12
229! f14
230! f16
231! f18 scratch for table base
232! f19 signbit1
233! f20 x2
234! f22
235! f24
236! f26
237! f28 scratch for table base
238! f29 signbit2
239! f30 0x80000000
240! f31 0x4000
241! f32
242! f34
243! f36
244! f38
245! f40
246! f42
247! f44 0xffff800000000000
248! f46 q1 (loaded from the q1 entry below)
249! f48 q2
250! f50 q3
251! f52 q4
252! f54 one
253! f56 pp1
254! f58 pp2
255! f60 qq1
256! f62 qq2
257
258#ifdef __sparcv9
259	stx	%i1,[%fp+xsave]		! save arguments
260	stx	%i3,[%fp+ysave]
261#else
262	st	%i1,[%fp+xsave]		! save arguments
263	st	%i3,[%fp+ysave]
264#endif
265
266	st	%i0,[%fp+nsave]
267	st	%i2,[%fp+sxsave]
268	st	%i4,[%fp+sysave]
269	sethi	%hi(0x80000000),MSK_SIGN	! load/set up constants
270	sethi	%hi(0x3fc40000),LIM_l5
271	sethi	%hi(0x3e400000),LIM_l6
272	sethi	%hi(0x3fe921fb),LIM_l7
273	or	LIM_l7,%lo(0x3fe921fb),LIM_l7
274	ldd	[%g1+f30val],MSK_BIT31		! fills both %f30 and %f31
275	ldd	[%g1+mask],MSK_BITSHI17
276	ldd	[%g1+q1],C_q1
277	ldd	[%g1+q2],C_q2
278	ldd	[%g1+q3],C_q3
279	ldd	[%g1+q4],C_q4
280	ldd	[%g1+one],C_ONE
281	ldd	[%g1+pp1],C_pp1
282	ldd	[%g1+pp2],C_pp2
283	ldd	[%g1+qq1],C_qq1
284	ldd	[%g1+qq2],C_qq2
285	sll	%i2,3,%i2		! scale strides
286	sll	%i4,3,%i4
! The loop stores the results of the previous pass before computing new
! ones; point all three "old y" pointers at a scratch stack slot so the
! stores made during the first pass are harmless.
287	add	%fp,x0_1,%o3		! precondition loop
288	add	%fp,x0_1,%o4
289	add	%fp,x0_1,%o5
290	ld	[%i1],%l0		! hx = *x
291	ld	[%i1],P0_f0		! x0 = *x, loaded as two 32-bit halves
292	ld	[%i1+4],P0_f1
293	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
294	add	%i1,%i2,%i1		! x += stridex
295
296	ba,pt	%icc,.loop0
297!delay slot
298	nop
299
! Primary-range loop, software pipelined over three elements (lanes 0,
! 1 and 2) per pass.  .loop0, .loop1 and .loop2 each range-check one
! lane, record its output pointer and, for lanes 0 and 1, preload the
! following argument with non-faulting loads.  A lane whose hx is below
! 0x3e400000 or above 0x3fe921fb leaves through .range0/1/2.  When the
! element count runs out after lane 0 or lane 1 the partial triple is
! finished at .endloop1/.endloop2.
!
! After .loop2 each lane is compared with 0x3fc40000 (LIM_l5).  Lanes
! below it use the polynomial and control moves to one of .case1 ..
! .case7; the code that follows here handles the combination in which
! all three lanes use the table:
!	xr = |x| with 0x4000 added to its high word and then masked
!	     with 0xffff8000 00000000,  d = |x| - xr,  z = d*d
!	s  = d*(1 + z*(pp1 + z*pp2)),   c = z*(qq1 + z*qq2)
!	result = T8 + (L8 - (T0*s - T8*c))
! T0 and T8 are the doubles at offsets 0 and 8 of the selected entry of
! __vlibm_TBL_sincos_hi and L8 is the double at offset 8 of the same
! entry of __vlibm_TBL_sincos_lo (presumably sin and cos of xr; the
! tables are defined elsewhere).  The entry is found at byte offset
! ((hx - 0x3fc3c000) >> 10) with the low five bits cleared.
300	.align 32
301.loop0:
302	lda	[%i1]%asi,%l1		! preload next argument
303	sub	%l0,LIM_l6,%g1
304	sub	LIM_l7,%l0,%o7
305	fands	P0_f0,MSK_BIT31,P0_f9		! save signbit
306
307	lda	[%i1]%asi,P1_f10
308	orcc	%o7,%g1,%g0		! negative if either difference is negative
309	mov	%i3,%o0			! py0 = y
310	bl,pn	%icc,.range0		! if hx < 0x3e400000 or > 0x3fe921fb
311
312! delay slot
313	lda	[%i1+4]%asi,P1_f11
314	addcc	%i0,-1,%i0
315	add	%i3,%i4,%i3		! y += stridey
316	ble,pn	%icc,.endloop1		! lane 0 was the last element
317
318! delay slot
319	andn	%l1,MSK_SIGN,%l1
320	add	%i1,%i2,%i1		! x += stridex
321	fabsd	P0_f0,P0_f0
322	fmuld	C_ONE,C_ONE,C_ONE		! one*one; a nop for alignment only
323
324.loop1:
325	lda	[%i1]%asi,%l2		! preload next argument
326	sub	%l1,LIM_l6,%g1
327	sub	LIM_l7,%l1,%o7
328	fands	P1_f10,MSK_BIT31,P1_f19		! save signbit
329
330	lda	[%i1]%asi,P2_f20
331	orcc	%o7,%g1,%g0
332	mov	%i3,%o1			! py1 = y
333	bl,pn	%icc,.range1		! if hx < 0x3e400000 or > 0x3fe921fb
334
335! delay slot
336	lda	[%i1+4]%asi,P2_f21
337	addcc	%i0,-1,%i0
338	add	%i3,%i4,%i3		! y += stridey
339	ble,pn	%icc,.endloop2		! lane 1 was the last element
340
341! delay slot
342	andn	%l2,MSK_SIGN,%l2
343	add	%i1,%i2,%i1		! x += stridex
344	fabsd	P1_f10,P1_f10
345	fmuld	C_ONE,C_ONE,C_ONE		! one*one; a nop for alignment only
346
347.loop2:
348	st	P0_f6,[%o3]		! store lane 0 result of the previous pass
349	sub	%l2,LIM_l6,%g1
350	sub	LIM_l7,%l2,%o7
351	fands	P2_f20,MSK_BIT31,P2_f29		! save signbit
352
353	st	P0_f7,[%o3+4]
354	orcc	%g1,%o7,%g0
355	mov	%i3,%o2			! py2 = y
356	bl,pn	%icc,.range2		! if hx < 0x3e400000 or > 0x3fe921fb
357
358! delay slot
359	add	%i3,%i4,%i3		! y += stridey
360	cmp	%l0,LIM_l5
361	fabsd	P2_f20,P2_f20
362	bl,pn	%icc,.case4		! lane 0 uses the polynomial
363
364! delay slot
365	st	P1_f16,[%o4]		! store lane 1 result of the previous pass
366	cmp	%l1,LIM_l5
367	fpadd32s P0_f0,MSK_BIT13,P0_f8
368	bl,pn	%icc,.case2		! lane 1 uses the polynomial
369
370! delay slot
371	st	P1_f17,[%o4+4]
372	cmp	%l2,LIM_l5
373	fpadd32s P1_f10,MSK_BIT13,P1_f18
374	bl,pn	%icc,.case1		! lane 2 uses the polynomial
375
376! delay slot
377	st	P2_f26,[%o5]		! store lane 2 result of the previous pass
378	mov	%o0,%o3			! remember where this pass stores lane 0
379	sethi	%hi(0x3fc3c000),%o7
380	fpadd32s P2_f20,MSK_BIT13,P2_f28
381
382	st	P2_f27,[%o5+4]
383	fand	P0_f8,MSK_BITSHI17,P0_f2
384	mov	%o1,%o4
385
386	fand	P1_f18,MSK_BITSHI17,P1_f12
387	mov	%o2,%o5
388	sub	%l0,%o7,%l0
389
390	fand	P2_f28,MSK_BITSHI17,P2_f22
391	sub	%l1,%o7,%l1
392	sub	%l2,%o7,%l2
393
394	fsubd	P0_f0,P0_f2,P0_f0
395	srl	%l0,10,%l0
396	add	SC_HI,8,%g1;add	SC_LO,8,%o7	! bases for the offset-8 doubles
397
398	fsubd	P1_f10,P1_f12,P1_f10
399	srl	%l1,10,%l1
400
401	fsubd	P2_f20,P2_f22,P2_f20
402	srl	%l2,10,%l2
403
404	fmuld	P0_f0,P0_f0,P0_f2
405	andn	%l0,0x1f,%l0
406
407	fmuld	P1_f10,P1_f10,P1_f12
408	andn	%l1,0x1f,%l1
409
410	fmuld	P2_f20,P2_f20,P2_f22
411	andn	%l2,0x1f,%l2
412
413	fmuld	P0_f2,C_pp2,P0_f6
414	ldd	[%g1+%l0],%f32
415
416	fmuld	P1_f12,C_pp2,P1_f16
417	ldd	[%g1+%l1],%f36
418
419	fmuld	P2_f22,C_pp2,P2_f26
420	ldd	[%g1+%l2],%f40
421
422	faddd	P0_f6,C_pp1,P0_f6
423	fmuld	P0_f2,C_qq2,P0_f4
424	ldd	[SC_HI+%l0],%f34
425
426	faddd	P1_f16,C_pp1,P1_f16
427	fmuld	P1_f12,C_qq2,P1_f14
428	ldd	[SC_HI+%l1],%f38
429
430	faddd	P2_f26,C_pp1,P2_f26
431	fmuld	P2_f22,C_qq2,P2_f24
432	ldd	[SC_HI+%l2],%f42
433
434	fmuld	P0_f2,P0_f6,P0_f6
435	faddd	P0_f4,C_qq1,P0_f4
436
437	fmuld	P1_f12,P1_f16,P1_f16
438	faddd	P1_f14,C_qq1,P1_f14
439
440	fmuld	P2_f22,P2_f26,P2_f26
441	faddd	P2_f24,C_qq1,P2_f24
442
443	faddd	P0_f6,C_ONE,P0_f6
444	fmuld	P0_f2,P0_f4,P0_f4
445
446	faddd	P1_f16,C_ONE,P1_f16
447	fmuld	P1_f12,P1_f14,P1_f14
448
449	faddd	P2_f26,C_ONE,P2_f26
450	fmuld	P2_f22,P2_f24,P2_f24
451
452	fmuld	P0_f0,P0_f6,P0_f6
453	ldd	[%o7+%l0],P0_f2
454
455	fmuld	P1_f10,P1_f16,P1_f16
456	ldd	[%o7+%l1],P1_f12
457
458	fmuld	P2_f20,P2_f26,P2_f26
459	ldd	[%o7+%l2],P2_f22
460
461	fmuld	P0_f4,%f32,P0_f4
462	lda	[%i1]%asi,%l0		! preload next argument
463
464	fmuld	P1_f14,%f36,P1_f14
465	lda	[%i1]%asi,P0_f0
466
467	fmuld	P2_f24,%f40,P2_f24
468	lda	[%i1+4]%asi,P0_f1
469
470	fmuld	P0_f6,%f34,P0_f6
471	add	%i1,%i2,%i1		! x += stridex
472
473	fmuld	P1_f16,%f38,P1_f16
474
475	fmuld	P2_f26,%f42,P2_f26
476
477	fsubd	P0_f6,P0_f4,P0_f6
478
479	fsubd	P1_f16,P1_f14,P1_f16
480
481	fsubd	P2_f26,P2_f24,P2_f26
482
483	fsubd	P0_f2,P0_f6,P0_f6
484
485	fsubd	P1_f12,P1_f16,P1_f16
486
487	fsubd	P2_f22,P2_f26,P2_f26
488
489	faddd	P0_f6,%f32,P0_f6
490
491	faddd	P1_f16,%f36,P1_f16
492
493	faddd	P2_f26,%f40,P2_f26
494	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
495
! The nops below stand where the sine version ORs the saved sign bit
! into each result; the cosine result is left as computed.
496	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
497	addcc	%i0,-1,%i0
498
499	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
500	bg,pt	%icc,.loop0		! more elements: start the next triple
501
502! delay slot
503	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
504
505	ba,pt	%icc,.endloop0
506! delay slot
507	nop
508
! .case1: lanes 0 and 1 use the table, lane 2 (hx2 < 0x3fc40000) uses
! the polynomial.  Reached from .loop2 after the first word of the
! previous lane 2 result has been stored; the second word is stored
! first thing here.
!	table lanes:      d = |x| - xr, z = d*d,
!	                  result = T8 + (L8 - (T0*d*(1 + z*(pp1 + z*pp2))
!	                                       - T8*z*(qq1 + z*qq2)))
!	polynomial lane:  z = x*x,
!	                  result = 1 + z*(q1 + z*(q2 + z*(q3 + z*q4)))
! T0/T8 are the doubles at offsets 0 and 8 of the __vlibm_TBL_sincos_hi
! entry and L8 the double at offset 8 of the __vlibm_TBL_sincos_lo entry.
509	.align	32
510.case1:
511	st	P2_f27,[%o5+4]
512	sethi	%hi(0x3fc3c000),%o7
513	fand	P0_f8,MSK_BITSHI17,P0_f2
514
515	sub	%l0,%o7,%l0
516	sub	%l1,%o7,%l1
517	add	SC_HI,8,%g1;add	SC_LO,8,%o7
518	fand	P1_f18,MSK_BITSHI17,P1_f12
519	fmuld	P2_f20,P2_f20,P2_f22
520
521	fsubd	P0_f0,P0_f2,P0_f0
522	srl	%l0,10,%l0
523	mov	%o0,%o3
524
525	fsubd	P1_f10,P1_f12,P1_f10
526	srl	%l1,10,%l1
527	mov	%o1,%o4
528
529	fmuld	P2_f22,C_q4,P2_f24
530	mov	%o2,%o5
531
532	fmuld	P0_f0,P0_f0,P0_f2
533	andn	%l0,0x1f,%l0
534
535	fmuld	P1_f10,P1_f10,P1_f12
536	andn	%l1,0x1f,%l1
537
538	faddd	P2_f24,C_q3,P2_f24
539
540	fmuld	P0_f2,C_pp2,P0_f6
541	ldd	[%g1+%l0],%f32
542
543	fmuld	P1_f12,C_pp2,P1_f16
544	ldd	[%g1+%l1],%f36
545
546	fmuld	P2_f22,P2_f24,P2_f24
547
548	faddd	P0_f6,C_pp1,P0_f6
549	fmuld	P0_f2,C_qq2,P0_f4
550	ldd	[SC_HI+%l0],%f34
551
552	faddd	P1_f16,C_pp1,P1_f16
553	fmuld	P1_f12,C_qq2,P1_f14
554	ldd	[SC_HI+%l1],%f38
555
556	faddd	P2_f24,C_q2,P2_f24
557
558	fmuld	P0_f2,P0_f6,P0_f6
559	faddd	P0_f4,C_qq1,P0_f4
560
561	fmuld	P1_f12,P1_f16,P1_f16
562	faddd	P1_f14,C_qq1,P1_f14
563
564	fmuld	P2_f22,P2_f24,P2_f24
565
566	faddd	P0_f6,C_ONE,P0_f6
567	fmuld	P0_f2,P0_f4,P0_f4
568
569	faddd	P1_f16,C_ONE,P1_f16
570	fmuld	P1_f12,P1_f14,P1_f14
571
572	faddd	P2_f24,C_q1,P2_f24
573
574	fmuld	P0_f0,P0_f6,P0_f6
575	ldd	[%o7+%l0],P0_f2
576
577	fmuld	P1_f10,P1_f16,P1_f16
578	ldd	[%o7+%l1],P1_f12
579
580	fmuld	P0_f4,%f32,P0_f4
581	lda	[%i1]%asi,%l0		! preload next argument
582
583	fmuld	P1_f14,%f36,P1_f14
584	lda	[%i1]%asi,P0_f0
585
586	fmuld	P0_f6,%f34,P0_f6
587	lda	[%i1+4]%asi,P0_f1
588
589	fmuld	P1_f16,%f38,P1_f16
590	add	%i1,%i2,%i1		! x += stridex
591
592	fmuld	P2_f22,P2_f24,P2_f24
593
594	fsubd	P0_f6,P0_f4,P0_f6
595
596	fsubd	P1_f16,P1_f14,P1_f16
597
598	!!(vsin)fmuld	P2_f20,P2_f24,P2_f24
599
600	fsubd	P0_f2,P0_f6,P0_f6
601
602	fsubd	P1_f12,P1_f16,P1_f16
603
604	faddd	C_ONE,P2_f24,P2_f26 !!(vsin)faddd	P2_f20,P2_f24,P2_f26
605
606	faddd	P0_f6,%f32,P0_f6
607
608	faddd	P1_f16,%f36,P1_f16
609	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
610
611	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
612	addcc	%i0,-1,%i0
613
614	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
615	bg,pt	%icc,.loop0		! more elements: start the next triple
616
617! delay slot
618	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
619
620	ba,pt	%icc,.endloop0
621! delay slot
622	nop
623
! .case2: lane 0 uses the table and lane 1 (hx1 < 0x3fc40000) uses the
! polynomial.  Lane 2 is tested here: if it is also below 0x3fc40000
! control moves on to .case3, otherwise lane 2 uses the table and all
! three lanes are finished in this block.
!	table lanes:      d = |x| - xr, z = d*d,
!	                  result = T8 + (L8 - (T0*d*(1 + z*(pp1 + z*pp2))
!	                                       - T8*z*(qq1 + z*qq2)))
!	polynomial lane:  z = x*x,
!	                  result = 1 + z*(q1 + z*(q2 + z*(q3 + z*q4)))
! T0/T8 are the doubles at offsets 0 and 8 of the __vlibm_TBL_sincos_hi
! entry and L8 the double at offset 8 of the __vlibm_TBL_sincos_lo entry.
624	.align	32
625.case2:
626	st	P2_f26,[%o5]		! store lane 2 result of the previous pass
627	cmp	%l2,LIM_l5
628	fpadd32s P2_f20,MSK_BIT13,P2_f28
629	bl,pn	%icc,.case3		! lane 2 uses the polynomial too
630
631! delay slot
632	st	P2_f27,[%o5+4]
633	sethi	%hi(0x3fc3c000),%o7
634	fand	P0_f8,MSK_BITSHI17,P0_f2
635
636	sub	%l0,%o7,%l0
637	sub	%l2,%o7,%l2
638	add	SC_HI,8,%g1;add	SC_LO,8,%o7
639	fand	P2_f28,MSK_BITSHI17,P2_f22
640	fmuld	P1_f10,P1_f10,P1_f12
641
642	fsubd	P0_f0,P0_f2,P0_f0
643	srl	%l0,10,%l0
644	mov	%o0,%o3
645
646	fsubd	P2_f20,P2_f22,P2_f20
647	srl	%l2,10,%l2
648	mov	%o2,%o5
649
650	fmuld	P1_f12,C_q4,P1_f14
651	mov	%o1,%o4
652
653	fmuld	P0_f0,P0_f0,P0_f2
654	andn	%l0,0x1f,%l0
655
656	fmuld	P2_f20,P2_f20,P2_f22
657	andn	%l2,0x1f,%l2
658
659	faddd	P1_f14,C_q3,P1_f14
660
661	fmuld	P0_f2,C_pp2,P0_f6
662	ldd	[%g1+%l0],%f32
663
664	fmuld	P2_f22,C_pp2,P2_f26
665	ldd	[%g1+%l2],%f40
666
667	fmuld	P1_f12,P1_f14,P1_f14
668
669	faddd	P0_f6,C_pp1,P0_f6
670	fmuld	P0_f2,C_qq2,P0_f4
671	ldd	[SC_HI+%l0],%f34
672
673	faddd	P2_f26,C_pp1,P2_f26
674	fmuld	P2_f22,C_qq2,P2_f24
675	ldd	[SC_HI+%l2],%f42
676
677	faddd	P1_f14,C_q2,P1_f14
678
679	fmuld	P0_f2,P0_f6,P0_f6
680	faddd	P0_f4,C_qq1,P0_f4
681
682	fmuld	P2_f22,P2_f26,P2_f26
683	faddd	P2_f24,C_qq1,P2_f24
684
685	fmuld	P1_f12,P1_f14,P1_f14
686
687	faddd	P0_f6,C_ONE,P0_f6
688	fmuld	P0_f2,P0_f4,P0_f4
689
690	faddd	P2_f26,C_ONE,P2_f26
691	fmuld	P2_f22,P2_f24,P2_f24
692
693	faddd	P1_f14,C_q1,P1_f14
694
695	fmuld	P0_f0,P0_f6,P0_f6
696	ldd	[%o7+%l0],P0_f2
697
698	fmuld	P2_f20,P2_f26,P2_f26
699	ldd	[%o7+%l2],P2_f22
700
701	fmuld	P0_f4,%f32,P0_f4
702	lda	[%i1]%asi,%l0		! preload next argument
703
704	fmuld	P2_f24,%f40,P2_f24
705	lda	[%i1]%asi,P0_f0
706
707	fmuld	P0_f6,%f34,P0_f6
708	lda	[%i1+4]%asi,P0_f1
709
710	fmuld	P2_f26,%f42,P2_f26
711	add	%i1,%i2,%i1		! x += stridex
712
713	fmuld	P1_f12,P1_f14,P1_f14
714
715	fsubd	P0_f6,P0_f4,P0_f6
716
717	fsubd	P2_f26,P2_f24,P2_f26
718
719	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
720
721	fsubd	P0_f2,P0_f6,P0_f6
722
723	fsubd	P2_f22,P2_f26,P2_f26
724
725	faddd	C_ONE,P1_f14,P1_f16 !!(vsin)faddd	P1_f10,P1_f14,P1_f16
726
727	faddd	P0_f6,%f32,P0_f6
728
729	faddd	P2_f26,%f40,P2_f26
730	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
731
732	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
733	addcc	%i0,-1,%i0
734
735	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
736	bg,pt	%icc,.loop0		! more elements: start the next triple
737
738! delay slot
739	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
740
741	ba,pt	%icc,.endloop0
742! delay slot
743	nop
744
! .case3: lane 0 uses the table; lanes 1 and 2 (both below 0x3fc40000)
! use the polynomial.  Reached from .case2.
!	table lane:       d = |x| - xr, z = d*d,
!	                  result = T8 + (L8 - (T0*d*(1 + z*(pp1 + z*pp2))
!	                                       - T8*z*(qq1 + z*qq2)))
!	polynomial lanes: z = x*x,
!	                  result = 1 + z*(q1 + z*(q2 + z*(q3 + z*q4)))
! T0/T8 are the doubles at offsets 0 and 8 of the __vlibm_TBL_sincos_hi
! entry and L8 the double at offset 8 of the __vlibm_TBL_sincos_lo entry.
745	.align	32
746.case3:
747	sethi	%hi(0x3fc3c000),%o7
748	fand	P0_f8,MSK_BITSHI17,P0_f2
749	fmuld	P1_f10,P1_f10,P1_f12
750
751	sub	%l0,%o7,%l0
752	add	SC_HI,8,%g1;add	SC_LO,8,%o7
753	fmuld	P2_f20,P2_f20,P2_f22
754
755	fsubd	P0_f0,P0_f2,P0_f0
756	srl	%l0,10,%l0
757	mov	%o0,%o3
758
759	fmuld	P1_f12,C_q4,P1_f14
760	mov	%o1,%o4
761
762	fmuld	P2_f22,C_q4,P2_f24
763	mov	%o2,%o5
764
765	fmuld	P0_f0,P0_f0,P0_f2
766	andn	%l0,0x1f,%l0
767
768	faddd	P1_f14,C_q3,P1_f14
769
770	faddd	P2_f24,C_q3,P2_f24
771
772	fmuld	P0_f2,C_pp2,P0_f6
773	ldd	[%g1+%l0],%f32
774
775	fmuld	P1_f12,P1_f14,P1_f14
776
777	fmuld	P2_f22,P2_f24,P2_f24
778
779	faddd	P0_f6,C_pp1,P0_f6
780	fmuld	P0_f2,C_qq2,P0_f4
781	ldd	[SC_HI+%l0],%f34
782
783	faddd	P1_f14,C_q2,P1_f14
784
785	faddd	P2_f24,C_q2,P2_f24
786
787	fmuld	P0_f2,P0_f6,P0_f6
788	faddd	P0_f4,C_qq1,P0_f4
789
790	fmuld	P1_f12,P1_f14,P1_f14
791
792	fmuld	P2_f22,P2_f24,P2_f24
793
794	faddd	P0_f6,C_ONE,P0_f6
795	fmuld	P0_f2,P0_f4,P0_f4
796
797	faddd	P1_f14,C_q1,P1_f14
798
799	faddd	P2_f24,C_q1,P2_f24
800
801	fmuld	P0_f0,P0_f6,P0_f6
802	ldd	[%o7+%l0],P0_f2
803
804	fmuld	P0_f4,%f32,P0_f4
805	lda	[%i1]%asi,%l0		! preload next argument
806
807	fmuld	P1_f12,P1_f14,P1_f14
808	lda	[%i1]%asi,P0_f0
809
810	fmuld	P0_f6,%f34,P0_f6
811	lda	[%i1+4]%asi,P0_f1
812
813	fmuld	P2_f22,P2_f24,P2_f24
814	add	%i1,%i2,%i1		! x += stridex
815
816	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
817
818	fsubd	P0_f6,P0_f4,P0_f6
819
820	!!(vsin)fmuld	P2_f20,P2_f24,P2_f24
821
822	faddd	C_ONE,P1_f14,P1_f16 !!(vsin)faddd	P1_f10,P1_f14,P1_f16
823
824	fsubd	P0_f2,P0_f6,P0_f6
825
826	faddd	C_ONE,P2_f24,P2_f26 !!(vsin)faddd	P2_f20,P2_f24,P2_f26
827
828	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
829	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
830
831	faddd	P0_f6,%f32,P0_f6
832	addcc	%i0,-1,%i0
833
834	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
835	bg,pt	%icc,.loop0		! more elements: start the next triple
836
837! delay slot
838	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
839
840	ba,pt	%icc,.endloop0
841! delay slot
842	nop
843
! .case4: lane 0 (hx0 < 0x3fc40000) uses the polynomial.  Lanes 1 and 2
! are tested here: lane 1 below the limit leads to .case6, otherwise
! lane 2 below the limit leads to .case5, otherwise lanes 1 and 2 use
! the table and all three lanes are finished in this block.  The stores
! interleaved with the tests finish writing the previous pass results.
!	polynomial lane:  z = x*x,
!	                  result = 1 + z*(q1 + z*(q2 + z*(q3 + z*q4)))
!	table lanes:      d = |x| - xr, z = d*d,
!	                  result = T8 + (L8 - (T0*d*(1 + z*(pp1 + z*pp2))
!	                                       - T8*z*(qq1 + z*qq2)))
! T0/T8 are the doubles at offsets 0 and 8 of the __vlibm_TBL_sincos_hi
! entry and L8 the double at offset 8 of the __vlibm_TBL_sincos_lo entry.
844	.align	32
845.case4:
846	st	P1_f17,[%o4+4]
847	cmp	%l1,LIM_l5
848	fpadd32s P1_f10,MSK_BIT13,P1_f18
849	bl,pn	%icc,.case6		! lane 1 uses the polynomial too
850
851! delay slot
852	st	P2_f26,[%o5]
853	cmp	%l2,LIM_l5
854	fpadd32s P2_f20,MSK_BIT13,P2_f28
855	bl,pn	%icc,.case5		! lane 2 uses the polynomial too
856
857! delay slot
858	st	P2_f27,[%o5+4]
859	sethi	%hi(0x3fc3c000),%o7
860	fand	P1_f18,MSK_BITSHI17,P1_f12
861
862	sub	%l1,%o7,%l1
863	sub	%l2,%o7,%l2
864	add	SC_HI,8,%g1;add	SC_LO,8,%o7
865	fand	P2_f28,MSK_BITSHI17,P2_f22
866	fmuld	P0_f0,P0_f0,P0_f2
867
868	fsubd	P1_f10,P1_f12,P1_f10
869	srl	%l1,10,%l1
870	mov	%o1,%o4
871
872	fsubd	P2_f20,P2_f22,P2_f20
873	srl	%l2,10,%l2
874	mov	%o2,%o5
875
876	fmovd	P0_f0,P0_f6		!ID for processing
877	fmuld	P0_f2,C_q4,P0_f4
878	mov	%o0,%o3
879
880	fmuld	P1_f10,P1_f10,P1_f12
881	andn	%l1,0x1f,%l1
882
883	fmuld	P2_f20,P2_f20,P2_f22
884	andn	%l2,0x1f,%l2
885
886	faddd	P0_f4,C_q3,P0_f4
887
888	fmuld	P1_f12,C_pp2,P1_f16
889	ldd	[%g1+%l1],%f36
890
891	fmuld	P2_f22,C_pp2,P2_f26
892	ldd	[%g1+%l2],%f40
893
894	fmuld	P0_f2,P0_f4,P0_f4
895
896	faddd	P1_f16,C_pp1,P1_f16
897	fmuld	P1_f12,C_qq2,P1_f14
898	ldd	[SC_HI+%l1],%f38
899
900	faddd	P2_f26,C_pp1,P2_f26
901	fmuld	P2_f22,C_qq2,P2_f24
902	ldd	[SC_HI+%l2],%f42
903
904	faddd	P0_f4,C_q2,P0_f4
905
906	fmuld	P1_f12,P1_f16,P1_f16
907	faddd	P1_f14,C_qq1,P1_f14
908
909	fmuld	P2_f22,P2_f26,P2_f26
910	faddd	P2_f24,C_qq1,P2_f24
911
912	fmuld	P0_f2,P0_f4,P0_f4
913
914	faddd	P1_f16,C_ONE,P1_f16
915	fmuld	P1_f12,P1_f14,P1_f14
916
917	faddd	P2_f26,C_ONE,P2_f26
918	fmuld	P2_f22,P2_f24,P2_f24
919
920	faddd	P0_f4,C_q1,P0_f4
921
922	fmuld	P1_f10,P1_f16,P1_f16
923	ldd	[%o7+%l1],P1_f12
924
925	fmuld	P2_f20,P2_f26,P2_f26
926	ldd	[%o7+%l2],P2_f22
927
928	fmuld	P1_f14,%f36,P1_f14
929	lda	[%i1]%asi,%l0		! preload next argument
930
931	fmuld	P2_f24,%f40,P2_f24
932	lda	[%i1]%asi,P0_f0
933
934	fmuld	P1_f16,%f38,P1_f16
935	lda	[%i1+4]%asi,P0_f1
936
937	fmuld	P2_f26,%f42,P2_f26
938	add	%i1,%i2,%i1		! x += stridex
939
940	fmuld	P0_f2,P0_f4,P0_f4
941
942	fsubd	P1_f16,P1_f14,P1_f16
943
944	fsubd	P2_f26,P2_f24,P2_f26
945
946	!!(vsin)fmuld	P0_f6,P0_f4,P0_f4
947
948	fsubd	P1_f12,P1_f16,P1_f16
949
950	fsubd	P2_f22,P2_f26,P2_f26
951
952	faddd	C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6	! faddd then spaces for processing
953
954	faddd	P1_f16,%f36,P1_f16
955
956	faddd	P2_f26,%f40,P2_f26
957	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
958
959	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
960	addcc	%i0,-1,%i0
961
962	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
963	bg,pt	%icc,.loop0		! more elements: start the next triple
964
965! delay slot
966	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
967
968	ba,pt	%icc,.endloop0
969! delay slot
970	nop
971
! .case5: lanes 0 and 2 (both below 0x3fc40000) use the polynomial;
! lane 1 uses the table.  Reached from .case4.
!	polynomial lanes: z = x*x,
!	                  result = 1 + z*(q1 + z*(q2 + z*(q3 + z*q4)))
!	table lane:       d = |x| - xr, z = d*d,
!	                  result = T8 + (L8 - (T0*d*(1 + z*(pp1 + z*pp2))
!	                                       - T8*z*(qq1 + z*qq2)))
! T0/T8 are the doubles at offsets 0 and 8 of the __vlibm_TBL_sincos_hi
! entry and L8 the double at offset 8 of the __vlibm_TBL_sincos_lo entry.
972	.align	32
973.case5:
974	sethi	%hi(0x3fc3c000),%o7
975	fand	P1_f18,MSK_BITSHI17,P1_f12
976	fmuld	P0_f0,P0_f0,P0_f2
977
978	sub	%l1,%o7,%l1
979	add	SC_HI,8,%g1;add	SC_LO,8,%o7
980	fmuld	P2_f20,P2_f20,P2_f22
981
982	fsubd	P1_f10,P1_f12,P1_f10
983	srl	%l1,10,%l1
984	mov	%o1,%o4
985
986	fmovd	P0_f0,P0_f6		!ID for processing
987	fmuld	P0_f2,C_q4,P0_f4
988	mov	%o0,%o3
989
990	fmuld	P2_f22,C_q4,P2_f24
991	mov	%o2,%o5
992
993	fmuld	P1_f10,P1_f10,P1_f12
994	andn	%l1,0x1f,%l1
995
996	faddd	P0_f4,C_q3,P0_f4
997
998	faddd	P2_f24,C_q3,P2_f24
999
1000	fmuld	P1_f12,C_pp2,P1_f16
1001	ldd	[%g1+%l1],%f36
1002
1003	fmuld	P0_f2,P0_f4,P0_f4
1004
1005	fmuld	P2_f22,P2_f24,P2_f24
1006
1007	faddd	P1_f16,C_pp1,P1_f16
1008	fmuld	P1_f12,C_qq2,P1_f14
1009	ldd	[SC_HI+%l1],%f38
1010
1011	faddd	P0_f4,C_q2,P0_f4
1012
1013	faddd	P2_f24,C_q2,P2_f24
1014
1015	fmuld	P1_f12,P1_f16,P1_f16
1016	faddd	P1_f14,C_qq1,P1_f14
1017
1018	fmuld	P0_f2,P0_f4,P0_f4
1019
1020	fmuld	P2_f22,P2_f24,P2_f24
1021
1022	faddd	P1_f16,C_ONE,P1_f16
1023	fmuld	P1_f12,P1_f14,P1_f14
1024
1025	faddd	P0_f4,C_q1,P0_f4
1026
1027	faddd	P2_f24,C_q1,P2_f24
1028
1029	fmuld	P1_f10,P1_f16,P1_f16
1030	ldd	[%o7+%l1],P1_f12
1031
1032	fmuld	P1_f14,%f36,P1_f14
1033	lda	[%i1]%asi,%l0		! preload next argument
1034
1035	fmuld	P0_f2,P0_f4,P0_f4
1036	lda	[%i1]%asi,P0_f0
1037
1038	fmuld	P1_f16,%f38,P1_f16
1039	lda	[%i1+4]%asi,P0_f1
1040
1041	fmuld	P2_f22,P2_f24,P2_f24
1042	add	%i1,%i2,%i1		! x += stridex
1043
1044	!!(vsin)fmuld	P0_f6,P0_f4,P0_f4
1045
1046	fsubd	P1_f16,P1_f14,P1_f16
1047
1048	!!(vsin)fmuld	P2_f20,P2_f24,P2_f24
1049
1050	faddd	C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6	! faddd then spaces for processing
1051
1052	fsubd	P1_f12,P1_f16,P1_f16
1053
1054	faddd	C_ONE,P2_f24,P2_f26 !!(vsin)faddd	P2_f20,P2_f24,P2_f26
1055
1056	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
1057	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
1058
1059	faddd	P1_f16,%f36,P1_f16
1060	addcc	%i0,-1,%i0
1061
1062	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
1063	bg,pt	%icc,.loop0		! more elements: start the next triple
1064
1065! delay slot
1066	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
1067
1068	ba,pt	%icc,.endloop0
1069! delay slot
1070	nop
1071
! .case6: lanes 0 and 1 (both below 0x3fc40000) use the polynomial.
! Lane 2 is tested here: below the limit leads to .case7, otherwise
! lane 2 uses the table and all three lanes are finished in this block.
! The first store completes the previous pass lane 2 result.
!	polynomial lanes: z = x*x,
!	                  result = 1 + z*(q1 + z*(q2 + z*(q3 + z*q4)))
!	table lane:       d = |x| - xr, z = d*d,
!	                  result = T8 + (L8 - (T0*d*(1 + z*(pp1 + z*pp2))
!	                                       - T8*z*(qq1 + z*qq2)))
! T0/T8 are the doubles at offsets 0 and 8 of the __vlibm_TBL_sincos_hi
! entry and L8 the double at offset 8 of the __vlibm_TBL_sincos_lo entry.
1072	.align	32
1073.case6:
1074	st	P2_f27,[%o5+4]
1075	cmp	%l2,LIM_l5
1076	fpadd32s P2_f20,MSK_BIT13,P2_f28
1077	bl,pn	%icc,.case7		! lane 2 uses the polynomial too
1078
1079! delay slot
1080	sethi	%hi(0x3fc3c000),%o7
1081	fand	P2_f28,MSK_BITSHI17,P2_f22
1082	fmuld	P0_f0,P0_f0,P0_f2
1083
1084	sub	%l2,%o7,%l2
1085	add	SC_HI,8,%g1;add	SC_LO,8,%o7
1086	fmuld	P1_f10,P1_f10,P1_f12
1087
1088	fsubd	P2_f20,P2_f22,P2_f20
1089	srl	%l2,10,%l2
1090	mov	%o2,%o5
1091
1092	fmovd	P0_f0,P0_f6		!ID for processing
1093	fmuld	P0_f2,C_q4,P0_f4
1094	mov	%o0,%o3
1095
1096	fmuld	P1_f12,C_q4,P1_f14
1097	mov	%o1,%o4
1098
1099	fmuld	P2_f20,P2_f20,P2_f22
1100	andn	%l2,0x1f,%l2
1101
1102	faddd	P0_f4,C_q3,P0_f4
1103
1104	faddd	P1_f14,C_q3,P1_f14
1105
1106	fmuld	P2_f22,C_pp2,P2_f26
1107	ldd	[%g1+%l2],%f40
1108
1109	fmuld	P0_f2,P0_f4,P0_f4
1110
1111	fmuld	P1_f12,P1_f14,P1_f14
1112
1113	faddd	P2_f26,C_pp1,P2_f26
1114	fmuld	P2_f22,C_qq2,P2_f24
1115	ldd	[SC_HI+%l2],%f42
1116
1117	faddd	P0_f4,C_q2,P0_f4
1118
1119	faddd	P1_f14,C_q2,P1_f14
1120
1121	fmuld	P2_f22,P2_f26,P2_f26
1122	faddd	P2_f24,C_qq1,P2_f24
1123
1124	fmuld	P0_f2,P0_f4,P0_f4
1125
1126	fmuld	P1_f12,P1_f14,P1_f14
1127
1128	faddd	P2_f26,C_ONE,P2_f26
1129	fmuld	P2_f22,P2_f24,P2_f24
1130
1131	faddd	P0_f4,C_q1,P0_f4
1132
1133	faddd	P1_f14,C_q1,P1_f14
1134
1135	fmuld	P2_f20,P2_f26,P2_f26
1136	ldd	[%o7+%l2],P2_f22
1137
1138	fmuld	P2_f24,%f40,P2_f24
1139	lda	[%i1]%asi,%l0		! preload next argument
1140
1141	fmuld	P0_f2,P0_f4,P0_f4
1142	lda	[%i1]%asi,P0_f0
1143
1144	fmuld	P2_f26,%f42,P2_f26
1145	lda	[%i1+4]%asi,P0_f1
1146
1147	fmuld	P1_f12,P1_f14,P1_f14
1148	add	%i1,%i2,%i1		! x += stridex
1149
1150	!!(vsin)fmuld	P0_f6,P0_f4,P0_f4
1151
1152	fsubd	P2_f26,P2_f24,P2_f26
1153
1154	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
1155
1156	faddd	C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6	! faddd then spaces for processing
1157
1158	fsubd	P2_f22,P2_f26,P2_f26
1159
1160	faddd	C_ONE,P1_f14,P1_f16 !!(vsin)faddd	P1_f10,P1_f14,P1_f16
1161
1162	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
1163	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
1164
1165	faddd	P2_f26,%f40,P2_f26
1166	addcc	%i0,-1,%i0
1167
1168	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
1169	bg,pt	%icc,.loop0		! more elements: start the next triple
1170
1171! delay slot
1172	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
1173
1174	ba,pt	%icc,.endloop0
1175! delay slot
1176	nop
1177
! .case7: all three lanes are below 0x3fc40000 and use the polynomial
!	z = x*x,  result = 1 + z*(q1 + z*(q2 + z*(q3 + z*q4)))
! No table access is needed.  Reached from .case6.
1178	.align	32
1179.case7:
1180	fmuld	P0_f0,P0_f0,P0_f2
1181	fmovd	P0_f0,P0_f6		!ID for processing
1182	mov	%o0,%o3
1183
1184	fmuld	P1_f10,P1_f10,P1_f12
1185	mov	%o1,%o4
1186
1187	fmuld	P2_f20,P2_f20,P2_f22
1188	mov	%o2,%o5
1189
1190	fmuld	P0_f2,C_q4,P0_f4
1191	lda	[%i1]%asi,%l0		! preload next argument
1192
1193	fmuld	P1_f12,C_q4,P1_f14
1194	lda	[%i1]%asi,P0_f0
1195
1196	fmuld	P2_f22,C_q4,P2_f24
1197	lda	[%i1+4]%asi,P0_f1
1198
1199	faddd	P0_f4,C_q3,P0_f4
1200	add	%i1,%i2,%i1		! x += stridex
1201
1202	faddd	P1_f14,C_q3,P1_f14
1203
1204	faddd	P2_f24,C_q3,P2_f24
1205
1206	fmuld	P0_f2,P0_f4,P0_f4
1207
1208	fmuld	P1_f12,P1_f14,P1_f14
1209
1210	fmuld	P2_f22,P2_f24,P2_f24
1211
1212	faddd	P0_f4,C_q2,P0_f4
1213
1214	faddd	P1_f14,C_q2,P1_f14
1215
1216	faddd	P2_f24,C_q2,P2_f24
1217
1218	fmuld	P0_f2,P0_f4,P0_f4
1219
1220	fmuld	P1_f12,P1_f14,P1_f14
1221
1222	fmuld	P2_f22,P2_f24,P2_f24
1223
1224	faddd	P0_f4,C_q1,P0_f4
1225
1226	faddd	P1_f14,C_q1,P1_f14
1227
1228	faddd	P2_f24,C_q1,P2_f24
1229
1230	fmuld	P0_f2,P0_f4,P0_f4
1231
1232	fmuld	P1_f12,P1_f14,P1_f14
1233
1234	fmuld	P2_f22,P2_f24,P2_f24
1235
1236	!!(vsin)fmuld	P0_f6,P0_f4,P0_f4
1237
1238	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
1239
1240	!!(vsin)fmuld	P2_f20,P2_f24,P2_f24
1241
1242	faddd	C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6	! faddd then spaces for processing
1243
1244	faddd	C_ONE,P1_f14,P1_f16 !!(vsin)faddd	P1_f10,P1_f14,P1_f16
1245
1246	faddd	C_ONE,P2_f24,P2_f26 !!(vsin)faddd	P2_f20,P2_f24,P2_f26
1247	andn	%l0,MSK_SIGN,%l0		! hx &= ~0x80000000
1248
1249	nop	!!(vsin) 	fors	P0_f6,P0_f9,P0_f6
1250	addcc	%i0,-1,%i0
1251
1252	nop	!!(vsin) 	fors	P1_f16,P1_f19,P1_f16
1253	bg,pt	%icc,.loop0		! more elements: start the next triple
1254
1255! delay slot
1256	nop	!!(vsin) 	fors	P2_f26,P2_f29,P2_f26
1257
1258	ba,pt	%icc,.endloop0
1259! delay slot
1260	nop
1261
1262
! .endloop2: the element count ran out after lane 1 was accepted, so
! lane 1 (x in %f10, hx in %l1, output pointer %o1) is finished here on
! its own.  %f20 serves as the result register.  hx below 0x3fc40000
! takes the polynomial at local label 1, otherwise the table path is
! used (same formulas as in the main loop).  Execution then falls
! through into .endloop1 to finish lane 0.
1263	.align	32
1264.endloop2:
1265	cmp	%l1,LIM_l5
1266	bl,pn	%icc,1f
1267! delay slot
1268	fabsd	P1_f10,P1_f10
1269	sethi	%hi(0x3fc3c000),%o7
1270	fpadd32s P1_f10,MSK_BIT13,P1_f18
1271	fand	P1_f18,MSK_BITSHI17,P1_f12	! xr = rounded breakpoint
1272	sub	%l1,%o7,%l1
1273	add	SC_HI,8,%g1;add	SC_LO,8,%o7
1274	fsubd	P1_f10,P1_f12,P1_f10		! d = |x| - xr
1275	srl	%l1,10,%l1
1276	fmuld	P1_f10,P1_f10,P1_f12		! z = d*d
1277	andn	%l1,0x1f,%l1			! table byte offset
1278	fmuld	P1_f12,C_pp2,P2_f20
1279	ldd	[%g1+%l1],%f36
1280	faddd	P2_f20,C_pp1,P2_f20
1281	fmuld	P1_f12,C_qq2,P1_f14
1282	ldd	[SC_HI+%l1],%f38
1283	fmuld	P1_f12,P2_f20,P2_f20
1284	faddd	P1_f14,C_qq1,P1_f14
1285	faddd	P2_f20,C_ONE,P2_f20
1286	fmuld	P1_f12,P1_f14,P1_f14
1287	fmuld	P1_f10,P2_f20,P2_f20
1288	ldd	[%o7+%l1],P1_f12
1289	fmuld	P1_f14,%f36,P1_f14
1290	fmuld	P2_f20,%f38,P2_f20
1291	fsubd	P2_f20,P1_f14,P2_f20
1292	fsubd	P1_f12,P2_f20,P2_f20
1293	ba,pt	%icc,2f
1294! delay slot
1295	faddd	P2_f20,%f36,P2_f20
129961:
! polynomial path: 1 + z*(q1 + z*(q2 + z*(q3 + z*q4))) with z = x*x
1297	fmuld	P1_f10,P1_f10,P1_f12
1298	fmuld	P1_f12,C_q4,P1_f14
1299	faddd	P1_f14,C_q3,P1_f14
1300	fmuld	P1_f12,P1_f14,P1_f14
1301	faddd	P1_f14,C_q2,P1_f14
1302	fmuld	P1_f12,P1_f14,P1_f14
1303	faddd	P1_f14,C_q1,P1_f14
1304	fmuld	P1_f12,P1_f14,P1_f14
1305	!!(vsin)fmuld	P1_f10,P1_f14,P1_f14
1306	faddd	C_ONE,P1_f14,P2_f20 !!(vsin)faddd	P1_f10,P1_f14,P2_f20
130772:
1308	nop	!!(vsin) 	fors	P2_f20,P1_f19,P2_f20
1309	st	P2_f20,[%o1]		! *py1 = result
1310	st	P2_f21,[%o1+4]
1311
! .endloop1: finish lane 0 (x in %f0, hx in %l0, output pointer %o0) on
! its own, either because the element count ran out right after lane 0
! was accepted or by falling through from .endloop2.  %f20 serves as
! the result register.  hx below 0x3fc40000 takes the polynomial at
! local label 1, otherwise the table path is used.  Execution then
! falls through into .endloop0.
1312.endloop1:
1313	cmp	%l0,LIM_l5
1314	bl,pn	%icc,1f
1315! delay slot
1316	fabsd	P0_f0,P0_f0
1317	sethi	%hi(0x3fc3c000),%o7
1318	fpadd32s P0_f0,MSK_BIT13,P0_f8
1319	fand	P0_f8,MSK_BITSHI17,P0_f2	! xr = rounded breakpoint
1320	sub	%l0,%o7,%l0
1321	add	SC_HI,8,%g1;add	SC_LO,8,%o7
1322	fsubd	P0_f0,P0_f2,P0_f0		! d = |x| - xr
1323	srl	%l0,10,%l0
1324	fmuld	P0_f0,P0_f0,P0_f2		! z = d*d
1325	andn	%l0,0x1f,%l0			! table byte offset
1326	fmuld	P0_f2,C_pp2,P2_f20
1327	ldd	[%g1+%l0],%f32
1328	faddd	P2_f20,C_pp1,P2_f20
1329	fmuld	P0_f2,C_qq2,P0_f4
1330	ldd	[SC_HI+%l0],%f34
1331	fmuld	P0_f2,P2_f20,P2_f20
1332	faddd	P0_f4,C_qq1,P0_f4
1333	faddd	P2_f20,C_ONE,P2_f20
1334	fmuld	P0_f2,P0_f4,P0_f4
1335	fmuld	P0_f0,P2_f20,P2_f20
1336	ldd	[%o7+%l0],P0_f2
1337	fmuld	P0_f4,%f32,P0_f4
1338	fmuld	P2_f20,%f34,P2_f20
1339	fsubd	P2_f20,P0_f4,P2_f20
1340	fsubd	P0_f2,P2_f20,P2_f20
1341	ba,pt	%icc,2f
1342! delay slot
1343	faddd	P2_f20,%f32,P2_f20
134441:
! polynomial path: 1 + z*(q1 + z*(q2 + z*(q3 + z*q4))) with z = x*x
1345	fmuld	P0_f0,P0_f0,P0_f2
1346	fmuld	P0_f2,C_q4,P0_f4
1347	faddd	P0_f4,C_q3,P0_f4
1348	fmuld	P0_f2,P0_f4,P0_f4
1349	faddd	P0_f4,C_q2,P0_f4
1350	fmuld	P0_f2,P0_f4,P0_f4
1351	faddd	P0_f4,C_q1,P0_f4
1352	fmuld	P0_f2,P0_f4,P0_f4
1353	!!(vsin)fmuld	P0_f0,P0_f4,P0_f4
1354	faddd	C_ONE,P0_f4,P2_f20 !!(vsin)faddd	P0_f0,P0_f4,P2_f20
135552:
1356	nop	!!(vsin) 	fors	P2_f20,P0_f9,P2_f20
1357	st	P2_f20,[%o0]		! *py0 = result
1358	st	P2_f21,[%o0+4]
1359
! .endloop0: common exit of the primary-range code.  The results of the
! last complete triple are still held in %f6, %f16 and %f26; write them
! through the saved output pointers %o3, %o4 and %o5 (which still point
! at the scratch stack slot if no triple was completed) and return.
1360.endloop0:
1361	st	P0_f6,[%o3]
1362	st	P0_f7,[%o3+4]
1363	st	P1_f16,[%o4]
1364	st	P1_f17,[%o4+4]
1365	st	P2_f26,[%o5]
1366	st	P2_f27,[%o5+4]
1367
1368! return.  finished off with only primary range arguments
1369
1370	ret
1371	restore
1372
1373
! .range0: lane 0 failed the primary-range test.  If hx0 is above
! 0x3e400000 the argument is too large for this loop and control moves
! to .MEDIUM with %l6 = 1.  Otherwise the argument is tiny: 1.0 is
! stored as the result and the argument already preloaded for lane 1
! is moved into lane 0 before re-entering .loop0.
1374	.align	32
1375.range0:
1376	cmp	%l0,LIM_l6
1377	bg,a,pt	%icc,.MEDIUM		! branch to Medium range on big arg.
1378! delay slot, annulled if branch not taken
1379	mov	0x1,LIM_l6		! set biguns flag or
1380	fdtoi	P0_f0,P0_f2; fmovd	C_ONE,P0_f0 ; st	P0_f0,[%o0]		! *y = 1.0; the fdtoi raises inexact if x nonzero
1381	st	P0_f1,[%o0+4]
1382	!nop		! (vsin) fdtoi	P0_f0,P0_f2
1383	addcc	%i0,-1,%i0
1384	ble,pn	%icc,.endloop0		! that was the last element
1385! delay slot, harmless if branch taken
1386	add	%i3,%i4,%i3		! y += stridey
1387	andn	%l1,MSK_SIGN,%l0		! hx &= ~0x80000000
1388	fmovd	P1_f10,P0_f0		! preloaded next x becomes lane 0
1389	ba,pt	%icc,.loop0
1390! delay slot
1391	add	%i1,%i2,%i1		! x += stridex
1392
1393
! .range1: lane 1 failed the primary-range test.  If hx1 is above
! 0x3e400000 control moves to .MEDIUM with %l6 = 2.  Otherwise the
! argument is tiny: 1.0 is stored as the result and the argument
! already preloaded for lane 2 is moved into lane 1 before re-entering
! .loop1.  If no elements remain, lane 0 is finished at .endloop1.
1394	.align	32
1395.range1:
1396	cmp	%l1,LIM_l6
1397	bg,a,pt	%icc,.MEDIUM		! branch to Medium range on big arg.
1398! delay slot, annulled if branch not taken
1399	mov	0x2,LIM_l6		! set biguns flag or
1400	fdtoi	P1_f10,P1_f12; fmovd	C_ONE,P1_f10 ; st	P1_f10,[%o1]		! *y = 1.0; the fdtoi raises inexact if x nonzero
1401	st	P1_f11,[%o1+4]
1402	!nop		! (vsin) fdtoi	P1_f10,P1_f12
1403	addcc	%i0,-1,%i0
1404	ble,pn	%icc,.endloop1		! that was the last element
1405! delay slot, harmless if branch taken
1406	add	%i3,%i4,%i3		! y += stridey
1407	andn	%l2,MSK_SIGN,%l1		! hx &= ~0x80000000
1408	fmovd	P2_f20,P1_f10		! preloaded next x becomes lane 1
1409	ba,pt	%icc,.loop1
1410! delay slot
1411	add	%i1,%i2,%i1		! x += stridex
1412
1413
! .range2: lane 2 failed the primary-range test.  If hx2 is above
! 0x3e400000 control moves to .MEDIUM with %l6 = 3.  Otherwise the
! argument is tiny and 1.0 is stored as the result.  No argument has
! been preloaded beyond lane 2, so the next one is read here with
! ordinary loads once the count shows that it exists; otherwise lanes 1
! and 0 are finished at .endloop2.
1414	.align	32
1415.range2:
1416	cmp	%l2,LIM_l6
1417	bg,a,pt	%icc,.MEDIUM		! branch to Medium range on big arg.
1418! delay slot, annulled if branch not taken
1419	mov	0x3,LIM_l6		! set biguns flag or
1420	fdtoi	P2_f20,P2_f22; fmovd	C_ONE,P2_f20 ; st	P2_f20,[%o2]		! *y = 1.0; the fdtoi raises inexact if x nonzero
1421	st	P2_f21,[%o2+4]
1422	nop		! (vsin) fdtoi	P2_f20,P2_f22
142331:
1424	addcc	%i0,-1,%i0
1425	ble,pn	%icc,.endloop2		! that was the last element
1426! delay slot
1427	nop
1428	ld	[%i1],%l2		! hx2 = high word of next x
1429	ld	[%i1],P2_f20
1430	ld	[%i1+4],P2_f21
1431	andn	%l2,MSK_SIGN,%l2		! hx &= ~0x80000000
1432	ba,pt	%icc,.loop2
1433! delay slot
1434	add	%i1,%i2,%i1		! x += stridex
1435
1436
1437	.align	32
1438.MEDIUM:
1439
1440! ========== medium range ==========
1441
1442! Entered from .range1/.range2 (and, presumably, a matching .range0 above
1443! this excerpt) with LIM_l6 = 1, 2 or 3 naming the element of the
! current triple that was too large for the primary loop.  This prologue
! loads the argument-reduction constants, initialises the stack slots
! used by the polynomial paths, and enters the medium loop at the
! matching stage.
!
! register use
1444! i0  n
1445! i1  x
1446! i2  stridex
1447! i3  y
1448! i4  stridey
1449! i5  0x80000000
1450
1451! l0  hx0
1452! l1  hx1
1453! l2  hx2
1454! l3  __vlibm_TBL_sincos_hi
1455! l4  __vlibm_TBL_sincos_lo
1456! l5  constants
1457! l6  biguns stored here : still called LIM_l6
1458! l7  0x413921fb
1459
1460! the following are 64-bit registers in both V8+ and V9
1461
1462! g1  scratch
1463! g5
1464
1465! o0  py0
1466! o1  py1
1467! o2  py2
1468! o3  n0
1469! o4  n1
1470! o5  n2
1471! o7  scratch
1472
1473! f0  x0
1474! f2  n0,y0
1475! f4
1476! f6
1477! f8  scratch for table base
1478! f9  signbit0
1479! f10 x1
1480! f12 n1,y1
1481! f14
1482! f16
1483! f18 scratch for table base
1484! f19 signbit1
1485! f20 x2
1486! f22 n2,y2
1487! f24
1488! f26
1489! f28 scratch for table base
1490! f29 signbit2
1491! f30 0x80000000
1492! f31 0x4000
1493! f32
1494! f34
1495! f36
1496! f38
1497! f40 invpio2
1498! f42 round
1499! f44 0xffff800000000000
1500! f46 pio2_1
1501! f48 pio2_2
1502! f50 pio2_3
1503! f52 pio2_3t
1504! f54 one
1505! f56 pp1
1506! f58 pp2
1507! f60 qq1
1508! f62 qq2
1509
1510
1511	PIC_SET(g5,constants,l5)
1512
1513	! %o3,%o4,%o5 need to be stored
	! (presumably results still pending from the primary loop, with
	! %o3/%o4/%o5 holding their output addresses there -- confirm)
1514	st      P0_f6,[%o3]
1515	sethi	%hi(0x413921fb),%l7
1516	st      P0_f7,[%o3+4]
1517	or	%l7,%lo(0x413921fb),%l7	! %l7 = 0x413921fb, upper hx limit for this range
1518	st      P1_f16,[%o4]
1519	st      P1_f17,[%o4+4]
1520	st      P2_f26,[%o5]
1521	st      P2_f27,[%o5+4]
1522	ldd	[%l5+invpio2],%f40
1523	ldd	[%l5+round],%f42
1524	ldd	[%l5+pio2_1],%f46
1525	ldd	[%l5+pio2_2],%f48
1526	ldd	[%l5+pio2_3],%f50
1527	ldd	[%l5+pio2_3t],%f52
	! Each x?_1 slot is followed by a copy of %f54 and each y?_0 slot by
	! zero; the polynomial paths pick the first or second doubleword of
	! each pair with a 0-or-8 offset.
1528	std	%f54,[%fp+x0_1+8]	! set up stack data
1529	std	%f54,[%fp+x1_1+8]
1530	std	%f54,[%fp+x2_1+8]
1531	stx	%g0,[%fp+y0_0+8]
1532	stx	%g0,[%fp+y1_0+8]
1533	stx	%g0,[%fp+y2_0+8]
1534
1535!	branched here in the middle of the array.  Need to adjust
1536!	for the members of the triple that were selected in the primary
1537!	loop.
1538
1539!	no adjustment since all three selected here
1540	subcc	LIM_l6,0x1,%g0		! continue in LOOP0?
1541	bz,a	%icc,.LOOP0
1542	mov	0x0,LIM_l6		! delay slot set biguns=0
1543
1544!	adjust 1st element of the triple since 2d and 3d done here
1545	subcc	LIM_l6,0x2,%g0		! continue in LOOP1?
1546	fmuld	%f0,%f40,%f2		! adj LOOP0: the multiply .LOOP0 would have issued
1547	bz,a	%icc,.LOOP1
1548	mov	0x0,LIM_l6		! delay slot set biguns=0
1549
1550!	adjust 1st and 2d elements of the triple since 3d done here
1551	subcc	LIM_l6,0x3,%g0		! continue in LOOP2?
1552	!done fmuld	%f0,%f40,%f2		! adj LOOP0
1553	sub	%i3,%i4,%i3		! adjust to not double increment
1554	fmuld	%f10,%f40,%f12		! adj LOOP1: the multiply .LOOP1 would have issued
1555	faddd	%f2,%f42,%f2		! adj LOOP1: the add .LOOP1 would have issued
1556	bz,a	%icc,.LOOP2
1557	mov	0x0,LIM_l6		! delay slot set biguns=0
1558
	! LIM_l6 was none of 1, 2, 3: start at the top of the loop
1559	ba	.LOOP0
1560	nop
1561
1562! -- 16 byte aligned
1563
1564	.align	32
1565.LOOP0:
! Main medium-range loop, three elements per pass (.LOOP0/.LOOP1/.LOOP2
! set up elements 0/1/2; the arithmetic below is interleaved across all
! three).  Each stage checks its own hx against %l7 (0x413921fb) and
! leaves through .BIGn when it is larger.  If the element count runs out
! after one or two elements, control goes to .ENDLOOP1 / .ENDLOOP2.
! The straight-line code after the .CASEn dispatch is the path where all
! three elements use the table lookup; it falls through into .FIXSIGN.
1566	lda	[%i1]%asi,%l1		! preload next argument
1567	mov	%i3,%o0			! py0 = y
1568
1569	lda	[%i1]%asi,%f10
1570	cmp	%l0,%l7
1571	add	%i3,%i4,%i3		! y += stridey
1572	bg,pn	%icc,.BIG0		! if hx > 0x413921fb
1573
1574! delay slot
1575	lda	[%i1+4]%asi,%f11
1576	addcc	%i0,-1,%i0
1577	add	%i1,%i2,%i1		! x += stridex
1578	ble,pn	%icc,.ENDLOOP1
1579
1580! delay slot
1581	andn	%l1,%i5,%l1
1582	nop
1583	fmuld	%f0,%f40,%f2		! x0 * invpio2
1584	fabsd	%f54,%f54		! a nop for alignment only
1585
1586.LOOP1:
1587	lda	[%i1]%asi,%l2		! preload next argument
1588	mov	%i3,%o1			! py1 = y
1589
1590	lda	[%i1]%asi,%f20
1591	cmp	%l1,%l7
1592	add	%i3,%i4,%i3		! y += stridey
1593	bg,pn	%icc,.BIG1		! if hx > 0x413921fb
1594
1595! delay slot
1596	lda	[%i1+4]%asi,%f21
1597	addcc	%i0,-1,%i0
1598	add	%i1,%i2,%i1		! x += stridex
1599	ble,pn	%icc,.ENDLOOP2
1600
1601! delay slot
1602	andn	%l2,%i5,%l2
1603	nop
1604	fmuld	%f10,%f40,%f12		! x1 * invpio2
1605	faddd	%f2,%f42,%f2		! add the round constant (element 0)
1606
1607.LOOP2:
1608	st	%f3,[%fp+n0]		! low word after adding round: n0 as an integer
1609	mov	%i3,%o2			! py2 = y
1610
1611	cmp	%l2,%l7
1612	add	%i3,%i4,%i3		! y += stridey
1613	fmuld	%f20,%f40,%f22		! x2 * invpio2
1614	bg,pn	%icc,.BIG2		! if hx > 0x413921fb
1615
1616! delay slot
1617	add	%l5,thresh+4,%o7
1618	faddd	%f12,%f42,%f12
1619	st	%f13,[%fp+n1]
1620
1621! -
1622
1623	add	%l5,thresh,%g1
1624	faddd	%f22,%f42,%f22
1625	st	%f23,[%fp+n2]
1626
1627	fsubd	%f2,%f42,%f2		! n
1628
1629	fsubd	%f12,%f42,%f12		! n
1630
1631	fsubd	%f22,%f42,%f22		! n
1632
! Argument reduction: subtract n*pio2_1, n*pio2_2, n*pio2_3 and
! n*pio2_3t in turn, carrying the rounding error of each step, to get a
! reduced argument x (head) and y (tail).
1633	fmuld	%f2,%f46,%f4
1634
1635	fmuld	%f12,%f46,%f14
1636
1637	fmuld	%f22,%f46,%f24
1638
1639	fsubd	%f0,%f4,%f4
1640	fmuld	%f2,%f48,%f6
1641
1642	fsubd	%f10,%f14,%f14
1643	fmuld	%f12,%f48,%f16
1644
1645	fsubd	%f20,%f24,%f24
1646	fmuld	%f22,%f48,%f26
1647
1648	fsubd	%f4,%f6,%f0
1649	ld	[%fp+n0],%o3 ; add	%o3,1,%o3
1650
1651	fsubd	%f14,%f16,%f10
1652	ld	[%fp+n1],%o4 ; add	%o4,1,%o4
1653
1654	fsubd	%f24,%f26,%f20
1655	ld	[%fp+n2],%o5 ; add	%o5,1,%o5
1656
1657	fsubd	%f4,%f0,%f32
1658	and	%o3,1,%o3		! (n0+1) & 1
1659
1660	fsubd	%f14,%f10,%f34
1661	and	%o4,1,%o4
1662
1663	fsubd	%f24,%f20,%f36
1664	and	%o5,1,%o5
1665
1666	fsubd	%f32,%f6,%f32
1667	fmuld	%f2,%f50,%f8
1668	sll	%o3,3,%o3		! %o3 = 0 or 8, used as a byte offset below
1669
1670	fsubd	%f34,%f16,%f34
1671	fmuld	%f12,%f50,%f18
1672	sll	%o4,3,%o4
1673
1674	fsubd	%f36,%f26,%f36
1675	fmuld	%f22,%f50,%f28
1676	sll	%o5,3,%o5
1677
1678	fsubd	%f8,%f32,%f8
1679	ld	[%g1+%o3],%f6		! word at thresh or thresh+8, compared with |x| below
1680
1681	fsubd	%f18,%f34,%f18
1682	ld	[%g1+%o4],%f16
1683
1684	fsubd	%f28,%f36,%f28
1685	ld	[%g1+%o5],%f26
1686
1687	fsubd	%f0,%f8,%f4
1688
1689	fsubd	%f10,%f18,%f14
1690
1691	fsubd	%f20,%f28,%f24
1692
1693	fsubd	%f0,%f4,%f32
1694
1695	fsubd	%f10,%f14,%f34
1696
1697	fsubd	%f20,%f24,%f36
1698
1699	fsubd	%f32,%f8,%f32
1700	fmuld	%f2,%f52,%f2
1701
1702	fsubd	%f34,%f18,%f34
1703	fmuld	%f12,%f52,%f12
1704
1705	fsubd	%f36,%f28,%f36
1706	fmuld	%f22,%f52,%f22
1707
1708	fsubd	%f2,%f32,%f2
1709	ld	[%o7+%o3],%f8		! word at thresh+4 or thresh+12: sign mask used below
1710
1711	fsubd	%f12,%f34,%f12
1712	ld	[%o7+%o4],%f18
1713
1714	fsubd	%f22,%f36,%f22
1715	ld	[%o7+%o5],%f28
1716
1717	fsubd	%f4,%f2,%f0		! x
1718
1719	fsubd	%f14,%f12,%f10		! x
1720
1721	fsubd	%f24,%f22,%f20		! x
1722
1723	fsubd	%f4,%f0,%f4
1724
1725	fsubd	%f14,%f10,%f14
1726
1727	fsubd	%f24,%f20,%f24
1728
1729	fands	%f0,%f30,%f9		! save signbit
1730
1731	fands	%f10,%f30,%f19		! save signbit
1732
1733	fands	%f20,%f30,%f29		! save signbit
1734
1735	fabsd	%f0,%f0
1736	std	%f0,[%fp+x0_1]		! |x0| also kept on the stack for integer and polynomial use
1737
1738	fabsd	%f10,%f10
1739	std	%f10,[%fp+x1_1]
1740
1741	fabsd	%f20,%f20
1742	std	%f20,[%fp+x2_1]
1743
1744	fsubd	%f4,%f2,%f2		! y
1745
1746	fsubd	%f14,%f12,%f12		! y
1747
1748	fsubd	%f24,%f22,%f22		! y
1749
! Compare the threshold word against |x|.  Only bit 1 of each mask is
! tested below (presumably the high-word comparison -- confirm against
! the VIS fcmpgt32 definition); when it is set the element is handled
! by a polynomial path (.CASEn) instead of the table path.
1750	fcmpgt32 %f6,%f0,%l0
1751
1752	fcmpgt32 %f16,%f10,%l1
1753
1754	fcmpgt32 %f26,%f20,%l2
1755
1756! -- 16 byte aligned
1757	fxors	%f2,%f9,%f2		! give y the sign it has relative to |x|
1758
1759	fxors	%f12,%f19,%f12
1760
1761	fxors	%f22,%f29,%f22
1762
! Dispatch.  The .CASE number encodes which elements take the polynomial
! path: 4 = element 0, 2 = element 1, 1 = element 2 (sums for several).
! Note the delay slots: each fands/fpadd32s below also executes when the
! preceding branch is taken.
1763	fands	%f9,%f8,%f9		! if (n & 1) clear sign bit
1764	andcc	%l0,2,%g0
1765	bne,pn	%icc,.CASE4
1766
1767! delay slot
1768	fands	%f19,%f18,%f19		! if (n & 1) clear sign bit
1769	andcc	%l1,2,%g0
1770	bne,pn	%icc,.CASE2
1771
1772! delay slot
1773	fands	%f29,%f28,%f29		! if (n & 1) clear sign bit
1774	andcc	%l2,2,%g0
1775	bne,pn	%icc,.CASE1
1776
1777! delay slot
! Table path for all three elements: round |x| to a table breakpoint
! (add 0x4000 to the high word, mask with %f44), form the residual, and
! build a byte index into the sincos tables from the high word of |x|.
1778	fpadd32s %f0,%f31,%f8
1779	sethi	%hi(0x3fc3c000),%o7
1780	ld	[%fp+x0_1],%l0		! high word of |x0|
1781
1782	fpadd32s %f10,%f31,%f18
1783	add	%l3,8,%g1		! second doubleword of each table entry
1784	ld	[%fp+x1_1],%l1
1785
1786	fpadd32s %f20,%f31,%f28
1787	ld	[%fp+x2_1],%l2
1788
1789	fand	%f8,%f44,%f4
1790	sub	%l0,%o7,%l0
1791
1792	fand	%f18,%f44,%f14
1793	sub	%l1,%o7,%l1
1794
1795	fand	%f28,%f44,%f24
1796	sub	%l2,%o7,%l2
1797
1798	fsubd	%f0,%f4,%f0		! residual = |x| - breakpoint
1799	srl	%l0,10,%l0
1800
1801	fsubd	%f10,%f14,%f10
1802	srl	%l1,10,%l1
1803
1804	fsubd	%f20,%f24,%f20
1805	srl	%l2,10,%l2
1806
1807	faddd	%f0,%f2,%f0		! fold the tail y into the residual
1808	andn	%l0,0x1f,%l0		! 32-byte table stride
1809
1810	faddd	%f10,%f12,%f10
1811	andn	%l1,0x1f,%l1
1812
1813	faddd	%f20,%f22,%f20
1814	andn	%l2,0x1f,%l2
1815
1816	fmuld	%f0,%f0,%f2
1817	add	%l0,%o3,%l0		! + 0 or 8 chosen from (n+1) & 1
1818
1819	fmuld	%f10,%f10,%f12
1820	add	%l1,%o4,%l1
1821
1822	fmuld	%f20,%f20,%f22
1823	add	%l2,%o5,%l2
1824
1825	fmuld	%f2,%f58,%f6
1826	ldd	[%l3+%l0],%f32
1827
1828	fmuld	%f12,%f58,%f16
1829	ldd	[%l3+%l1],%f34
1830
1831	fmuld	%f22,%f58,%f26
1832	ldd	[%l3+%l2],%f36
1833
1834	faddd	%f6,%f56,%f6
1835	fmuld	%f2,%f62,%f4
1836
1837	faddd	%f16,%f56,%f16
1838	fmuld	%f12,%f62,%f14
1839
1840	faddd	%f26,%f56,%f26
1841	fmuld	%f22,%f62,%f24
1842
1843	fmuld	%f2,%f6,%f6
1844	faddd	%f4,%f60,%f4
1845
1846	fmuld	%f12,%f16,%f16
1847	faddd	%f14,%f60,%f14
1848
1849	fmuld	%f22,%f26,%f26
1850	faddd	%f24,%f60,%f24
1851
1852	faddd	%f6,%f54,%f6
1853	fmuld	%f2,%f4,%f4
1854
1855	faddd	%f16,%f54,%f16
1856	fmuld	%f12,%f14,%f14
1857
1858	faddd	%f26,%f54,%f26
1859	fmuld	%f22,%f24,%f24
1860
1861	fmuld	%f0,%f6,%f6
1862	ldd	[%g1+%l0],%f2
1863
1864	fmuld	%f10,%f16,%f16
1865	ldd	[%g1+%l1],%f12
1866
1867	fmuld	%f20,%f26,%f26
1868	ldd	[%g1+%l2],%f22
1869
1870	fmuld	%f4,%f32,%f4
1871	ldd	[%l4+%l0],%f0
1872
1873	fmuld	%f14,%f34,%f14
1874	ldd	[%l4+%l1],%f10
1875
1876	fmuld	%f24,%f36,%f24
1877	ldd	[%l4+%l2],%f20
1878
1879	fmuld	%f6,%f2,%f6
1880
1881	fmuld	%f16,%f12,%f16
1882
1883	fmuld	%f26,%f22,%f26
1884
1885	faddd	%f6,%f4,%f6
1886
1887	faddd	%f16,%f14,%f16
1888
1889	faddd	%f26,%f24,%f26
1890
1891	faddd	%f6,%f0,%f6		! + low-order table value
1892
1893	faddd	%f16,%f10,%f16
1894
1895	faddd	%f26,%f20,%f26
1896
1897	faddd	%f6,%f32,%f6		! + high-order table value: results in f6, f16, f26
1898
1899	faddd	%f16,%f34,%f16
1900
1901	faddd	%f26,%f36,%f26
1902
1903.FIXSIGN:
! Common tail for the all-table path and every .CASEn path.  On entry
! the three results are in %f6, %f16, %f26 and the pending sign words in
! %f9, %f19, %f29.  For each element, ((n+1) & 2) << 2 (0 or 8) selects
! the word at thresh-4 or thresh+4, which is XORed into the sign word;
! the sign word is then ORed into the high word of the result and the
! result stored through %o0/%o1/%o2.  The next x0 is preloaded while this
! happens, and the loop continues at .LOOP0 while elements remain.
1904	ld	[%fp+n0],%o3 ; add	%o3,1,%o3
1905	add	%l5,thresh-4,%g1
1906
1907	ld	[%fp+n1],%o4 ; add	%o4,1,%o4
1908
1909	ld	[%fp+n2],%o5 ; add	%o5,1,%o5
1910	and	%o3,2,%o3
1911
1912	sll	%o3,2,%o3		! 0 or 8
1913	and	%o4,2,%o4
1914	lda	[%i1]%asi,%l0		! preload next argument
1915
1916	sll	%o4,2,%o4
1917	and	%o5,2,%o5
1918	ld	[%g1+%o3],%f8
1919
1920	sll	%o5,2,%o5
1921	ld	[%g1+%o4],%f18
1922
1923	ld	[%g1+%o5],%f28
1924	fxors	%f9,%f8,%f9
1925
1926	lda	[%i1]%asi,%f0
1927	fxors	%f29,%f28,%f29
1928
1929	lda	[%i1+4]%asi,%f1
1930	fxors	%f19,%f18,%f19
1931
1932	fors	%f6,%f9,%f6		! tack on sign
1933	add	%i1,%i2,%i1		! x += stridex
1934	st	%f6,[%o0]
1935
1936	fors	%f26,%f29,%f26		! tack on sign
1937	st	%f7,[%o0+4]
1938
1939	fors	%f16,%f19,%f16		! tack on sign
1940	st	%f26,[%o2]
1941
1942	st	%f27,[%o2+4]
1943	addcc	%i0,-1,%i0		! count the third element of the triple
1944
1945	st	%f16,[%o1]
1946	andn	%l0,%i5,%l0		! hx &= ~0x80000000
1947	bg,pt	%icc,.LOOP0		! more elements: next triple
1948
1949! delay slot
1950	st	%f17,[%o1+4]
1951
1952	ba,pt	%icc,.ENDLOOP0
1953! delay slot
1954	nop
1955
1956	.align	32
1957.CASE1:
! Elements 0 and 1 use the table path; element 2 uses the polynomial
! path.  Entered from the dispatch in the main loop; the fpadd32s for
! element 0 has already run in that branch's delay slot.
!
! Polynomial path (element 2): z = x*x is run through a Horner chain
! whose coefficients are read from the constants table at
! %l5 + %o5 + {0, 0x10, 0x20, 0x30}, with %o5 = 0 or 8 selecting one of
! the two interleaved coefficient sets.  The same offset then selects
! either the saved |x| or the second doubleword of the x2_1 slot, and
! either the saved y or the second doubleword of the y2_0 slot, for the
! final combination.
1958	fpadd32s %f10,%f31,%f18
1959	sethi	%hi(0x3fc3c000),%o7
1960	ld	[%fp+x0_1],%l0
1961
1962	fand	%f8,%f44,%f4
1963	add	%l3,8,%g1
1964	ld	[%fp+x1_1],%l1
1965
1966	fand	%f18,%f44,%f14
1967	sub	%l0,%o7,%l0
1968
1969	fsubd	%f0,%f4,%f0
1970	srl	%l0,10,%l0
1971	sub	%l1,%o7,%l1
1972
1973	fsubd	%f10,%f14,%f10
1974	srl	%l1,10,%l1
1975
1976	fmuld	%f20,%f20,%f20		! z = x2 * x2
1977	ldd	[%l5+%o5],%f36		! first coefficient of the selected set
1978	add	%l5,%o5,%l2		! base for the remaining coefficients
1979
1980	faddd	%f0,%f2,%f0
1981	andn	%l0,0x1f,%l0
1982
1983	faddd	%f10,%f12,%f10
1984	andn	%l1,0x1f,%l1
1985
1986	fmuld	%f20,%f36,%f24
1987	ldd	[%l2+0x10],%f26
1988	add	%fp,%o5,%o5		! %o5 now indexes the stack slot pairs
1989
1990	fmuld	%f0,%f0,%f2
1991	add	%l0,%o3,%l0
1992
1993	fmuld	%f10,%f10,%f12
1994	add	%l1,%o4,%l1
1995
1996	faddd	%f24,%f26,%f24
1997	ldd	[%l2+0x20],%f36
1998
1999	fmuld	%f2,%f58,%f6
2000	ldd	[%l3+%l0],%f32
2001
2002	fmuld	%f12,%f58,%f16
2003	ldd	[%l3+%l1],%f34
2004
2005	fmuld	%f20,%f24,%f24
2006	ldd	[%l2+0x30],%f26
2007
2008	faddd	%f6,%f56,%f6
2009	fmuld	%f2,%f62,%f4
2010
2011	faddd	%f16,%f56,%f16
2012	fmuld	%f12,%f62,%f14
2013
2014	faddd	%f24,%f36,%f24
2015	ldd	[%o5+x2_1],%f36		! |x2| or the second doubleword of the slot
2016
2017	fmuld	%f2,%f6,%f6
2018	faddd	%f4,%f60,%f4
2019
2020	fmuld	%f12,%f16,%f16
2021	faddd	%f14,%f60,%f14
2022
2023	fmuld	%f20,%f24,%f24
2024
2025	faddd	%f6,%f54,%f6
2026	fmuld	%f2,%f4,%f4
2027	ldd	[%g1+%l0],%f2
2028
2029	faddd	%f16,%f54,%f16
2030	fmuld	%f12,%f14,%f14
2031	ldd	[%g1+%l1],%f12
2032
2033	faddd	%f24,%f26,%f24
2034
2035	fmuld	%f0,%f6,%f6
2036	ldd	[%l4+%l0],%f0
2037
2038	fmuld	%f10,%f16,%f16
2039	ldd	[%l4+%l1],%f10
2040
2041	fmuld	%f4,%f32,%f4
2042	std	%f22,[%fp+y2_0]		! save y2 so it can be reloaded via %o5
2043
2044	fmuld	%f14,%f34,%f14
2045
2046	fmuld	%f6,%f2,%f6
2047
2048	fmuld	%f16,%f12,%f16
2049
2050	fmuld	%f20,%f24,%f24
2051
2052	faddd	%f6,%f4,%f6
2053
2054	faddd	%f16,%f14,%f16
2055
2056	fmuld	%f36,%f24,%f24
2057	ldd	[%o5+y2_0],%f22		! y2 or the second doubleword of the slot
2058
2059	faddd	%f6,%f0,%f6
2060
2061	faddd	%f16,%f10,%f16
2062
2063	faddd	%f24,%f22,%f24
2064
2065	faddd	%f6,%f32,%f6
2066
2067	faddd	%f16,%f34,%f16
2068	ba,pt	%icc,.FIXSIGN
2069
2070! delay slot
2071	faddd	%f36,%f24,%f26		! element 2 result
2072
2073	.align	32
2074.CASE2:
! Element 1 uses the polynomial path.  Element 2's mask is tested first:
! if it also needs the polynomial path, control moves on to .CASE3
! (which relies on %f8, %l0 and %o7 as set up here).  Otherwise elements
! 0 and 2 use the table path.  The polynomial coefficients for element 1
! are read from %l5 + %o4 + {0, 0x10, 0x20, 0x30} with %o4 = 0 or 8.
2075	fpadd32s %f0,%f31,%f8
2076	ld	[%fp+x0_1],%l0
2077	andcc	%l2,2,%g0
2078	bne,pn	%icc,.CASE3
2079
2080! delay slot
2081	sethi	%hi(0x3fc3c000),%o7
2082	fpadd32s %f20,%f31,%f28
2083	ld	[%fp+x2_1],%l2
2084
2085	fand	%f8,%f44,%f4
2086	sub	%l0,%o7,%l0
2087	add	%l3,8,%g1
2088
2089	fand	%f28,%f44,%f24
2090	sub	%l2,%o7,%l2
2091
2092	fsubd	%f0,%f4,%f0
2093	srl	%l0,10,%l0
2094
2095	fsubd	%f20,%f24,%f20
2096	srl	%l2,10,%l2
2097
2098	fmuld	%f10,%f10,%f10		! z = x1 * x1
2099	ldd	[%l5+%o4],%f34		! first coefficient of the selected set
2100	add	%l5,%o4,%l1		! base for the remaining coefficients
2101
2102	faddd	%f0,%f2,%f0
2103	andn	%l0,0x1f,%l0
2104
2105	faddd	%f20,%f22,%f20
2106	andn	%l2,0x1f,%l2
2107
2108	fmuld	%f10,%f34,%f14
2109	ldd	[%l1+0x10],%f16
2110	add	%fp,%o4,%o4		! %o4 now indexes the stack slot pairs
2111
2112	fmuld	%f0,%f0,%f2
2113	add	%l0,%o3,%l0
2114
2115	fmuld	%f20,%f20,%f22
2116	add	%l2,%o5,%l2
2117
2118	faddd	%f14,%f16,%f14
2119	ldd	[%l1+0x20],%f34
2120
2121	fmuld	%f2,%f58,%f6
2122	ldd	[%l3+%l0],%f32
2123
2124	fmuld	%f22,%f58,%f26
2125	ldd	[%l3+%l2],%f36
2126
2127	fmuld	%f10,%f14,%f14
2128	ldd	[%l1+0x30],%f16
2129
2130	faddd	%f6,%f56,%f6
2131	fmuld	%f2,%f62,%f4
2132
2133	faddd	%f26,%f56,%f26
2134	fmuld	%f22,%f62,%f24
2135
2136	faddd	%f14,%f34,%f14
2137	ldd	[%o4+x1_1],%f34		! |x1| or the second doubleword of the slot
2138
2139	fmuld	%f2,%f6,%f6
2140	faddd	%f4,%f60,%f4
2141
2142	fmuld	%f22,%f26,%f26
2143	faddd	%f24,%f60,%f24
2144
2145	fmuld	%f10,%f14,%f14
2146
2147	faddd	%f6,%f54,%f6
2148	fmuld	%f2,%f4,%f4
2149	ldd	[%g1+%l0],%f2
2150
2151	faddd	%f26,%f54,%f26
2152	fmuld	%f22,%f24,%f24
2153	ldd	[%g1+%l2],%f22
2154
2155	faddd	%f14,%f16,%f14
2156
2157	fmuld	%f0,%f6,%f6
2158	ldd	[%l4+%l0],%f0
2159
2160	fmuld	%f20,%f26,%f26
2161	ldd	[%l4+%l2],%f20
2162
2163	fmuld	%f4,%f32,%f4
2164	std	%f12,[%fp+y1_0]		! save y1 so it can be reloaded via %o4
2165
2166	fmuld	%f24,%f36,%f24
2167
2168	fmuld	%f6,%f2,%f6
2169
2170	fmuld	%f26,%f22,%f26
2171
2172	fmuld	%f10,%f14,%f14
2173
2174	faddd	%f6,%f4,%f6
2175
2176	faddd	%f26,%f24,%f26
2177
2178	fmuld	%f34,%f14,%f14
2179	ldd	[%o4+y1_0],%f12		! y1 or the second doubleword of the slot
2180
2181	faddd	%f6,%f0,%f6
2182
2183	faddd	%f26,%f20,%f26
2184
2185	faddd	%f14,%f12,%f14
2186
2187	faddd	%f6,%f32,%f6
2188
2189	faddd	%f26,%f36,%f26
2190	ba,pt	%icc,.FIXSIGN
2191
2192! delay slot
2193	faddd	%f34,%f14,%f16		! element 1 result
2194
2195	.align	32
2196.CASE3:
! Elements 1 and 2 use the polynomial path; element 0 uses the table
! path.  Entered only from .CASE2, which has already computed %f8
! (fpadd32s), loaded %l0 from the x0_1 slot and set %o7 in its delay
! slot.  Coefficients for elements 1 and 2 are read from
! %l5 + %o4/%o5 + {0, 0x10, 0x20, 0x30} with %o4/%o5 = 0 or 8.
2197	fand	%f8,%f44,%f4
2198	add	%l3,8,%g1
2199	sub	%l0,%o7,%l0
2200
2201	fmuld	%f10,%f10,%f10		! z1 = x1 * x1
2202	ldd	[%l5+%o4],%f34
2203	add	%l5,%o4,%l1
2204
2205	fsubd	%f0,%f4,%f0
2206	srl	%l0,10,%l0
2207
2208	fmuld	%f20,%f20,%f20		! z2 = x2 * x2
2209	ldd	[%l5+%o5],%f36
2210	add	%l5,%o5,%l2
2211
2212	fmuld	%f10,%f34,%f14
2213	ldd	[%l1+0x10],%f16
2214	add	%fp,%o4,%o4		! %o4 now indexes the stack slot pairs
2215
2216	faddd	%f0,%f2,%f0
2217	andn	%l0,0x1f,%l0
2218
2219	fmuld	%f20,%f36,%f24
2220	ldd	[%l2+0x10],%f26
2221	add	%fp,%o5,%o5		! %o5 now indexes the stack slot pairs
2222
2223	faddd	%f14,%f16,%f14
2224	ldd	[%l1+0x20],%f34
2225
2226	fmuld	%f0,%f0,%f2
2227	add	%l0,%o3,%l0
2228
2229	faddd	%f24,%f26,%f24
2230	ldd	[%l2+0x20],%f36
2231
2232	fmuld	%f10,%f14,%f14
2233	ldd	[%l1+0x30],%f16
2234
2235	fmuld	%f2,%f58,%f6
2236	ldd	[%l3+%l0],%f32
2237
2238	fmuld	%f20,%f24,%f24
2239	ldd	[%l2+0x30],%f26
2240
2241	faddd	%f14,%f34,%f14
2242	ldd	[%o4+x1_1],%f34
2243
2244	faddd	%f6,%f56,%f6
2245	fmuld	%f2,%f62,%f4
2246
2247	faddd	%f24,%f36,%f24
2248	ldd	[%o5+x2_1],%f36
2249
2250	fmuld	%f10,%f14,%f14
2251	std	%f12,[%fp+y1_0]		! save y1 so it can be reloaded via %o4
2252
2253	fmuld	%f2,%f6,%f6
2254	faddd	%f4,%f60,%f4
2255
2256	fmuld	%f20,%f24,%f24
2257	std	%f22,[%fp+y2_0]		! save y2 so it can be reloaded via %o5
2258
2259	faddd	%f14,%f16,%f14
2260
2261	faddd	%f6,%f54,%f6
2262	fmuld	%f2,%f4,%f4
2263	ldd	[%g1+%l0],%f2
2264
2265	faddd	%f24,%f26,%f24
2266
2267	fmuld	%f10,%f14,%f14
2268
2269	fmuld	%f0,%f6,%f6
2270	ldd	[%l4+%l0],%f0
2271
2272	fmuld	%f4,%f32,%f4
2273
2274	fmuld	%f20,%f24,%f24
2275
2276	fmuld	%f6,%f2,%f6
2277
2278	fmuld	%f34,%f14,%f14
2279	ldd	[%o4+y1_0],%f12
2280
2281	fmuld	%f36,%f24,%f24
2282	ldd	[%o5+y2_0],%f22
2283
2284	faddd	%f6,%f4,%f6
2285
2286	faddd	%f14,%f12,%f14
2287
2288	faddd	%f24,%f22,%f24
2289
2290	faddd	%f6,%f0,%f6
2291
2292	faddd	%f34,%f14,%f16		! element 1 result
2293
2294	faddd	%f36,%f24,%f26		! element 2 result
2295	ba,pt	%icc,.FIXSIGN
2296
2297! delay slot
2298	faddd	%f6,%f32,%f6		! element 0 result
2299
2300	.align	32
2301.CASE4:
! Element 0 uses the polynomial path.  The masks of elements 1 and 2 are
! tested here to route to .CASE6 (element 1 also polynomial) or .CASE5
! (element 2 also polynomial); otherwise elements 1 and 2 use the table
! path below.  The andcc in the first delay slot sets the condition
! codes for element 2; they are consumed either by the second bne here
! or by the bne at the top of .CASE6.
! Coefficients for element 0 are read from
! %l5 + %o3 + {0, 0x10, 0x20, 0x30} with %o3 = 0 or 8.
2302	fands	%f29,%f28,%f29		! if (n & 1) clear sign bit
2303	sethi	%hi(0x3fc3c000),%o7
2304	andcc	%l1,2,%g0
2305	bne,pn	%icc,.CASE6
2306
2307! delay slot
2308	andcc	%l2,2,%g0		! element 2 test, used by the next bne (here or in .CASE6)
2309	fpadd32s %f10,%f31,%f18
2310	ld	[%fp+x1_1],%l1
2311	bne,pn	%icc,.CASE5
2312
2313! delay slot
2314	add	%l3,8,%g1
2315	ld	[%fp+x2_1],%l2
2316	fpadd32s %f20,%f31,%f28
2317
2318	fand	%f18,%f44,%f14
2319	sub	%l1,%o7,%l1
2320
2321	fand	%f28,%f44,%f24
2322	sub	%l2,%o7,%l2
2323
2324	fsubd	%f10,%f14,%f10
2325	srl	%l1,10,%l1
2326
2327	fsubd	%f20,%f24,%f20
2328	srl	%l2,10,%l2
2329
2330	fmuld	%f0,%f0,%f0		! z = x0 * x0
2331	ldd	[%l5+%o3],%f32		! first coefficient of the selected set
2332	add	%l5,%o3,%l0		! base for the remaining coefficients
2333
2334	faddd	%f10,%f12,%f10
2335	andn	%l1,0x1f,%l1
2336
2337	faddd	%f20,%f22,%f20
2338	andn	%l2,0x1f,%l2
2339
2340	fmuld	%f0,%f32,%f4
2341	ldd	[%l0+0x10],%f6
2342	add	%fp,%o3,%o3		! %o3 now indexes the stack slot pairs
2343
2344	fmuld	%f10,%f10,%f12
2345	add	%l1,%o4,%l1
2346
2347	fmuld	%f20,%f20,%f22
2348	add	%l2,%o5,%l2
2349
2350	faddd	%f4,%f6,%f4
2351	ldd	[%l0+0x20],%f32
2352
2353	fmuld	%f12,%f58,%f16
2354	ldd	[%l3+%l1],%f34
2355
2356	fmuld	%f22,%f58,%f26
2357	ldd	[%l3+%l2],%f36
2358
2359	fmuld	%f0,%f4,%f4
2360	ldd	[%l0+0x30],%f6
2361
2362	faddd	%f16,%f56,%f16
2363	fmuld	%f12,%f62,%f14
2364
2365	faddd	%f26,%f56,%f26
2366	fmuld	%f22,%f62,%f24
2367
2368	faddd	%f4,%f32,%f4
2369	ldd	[%o3+x0_1],%f32		! |x0| or the second doubleword of the slot
2370
2371	fmuld	%f12,%f16,%f16
2372	faddd	%f14,%f60,%f14
2373
2374	fmuld	%f22,%f26,%f26
2375	faddd	%f24,%f60,%f24
2376
2377	fmuld	%f0,%f4,%f4
2378
2379	faddd	%f16,%f54,%f16
2380	fmuld	%f12,%f14,%f14
2381	ldd	[%g1+%l1],%f12
2382
2383	faddd	%f26,%f54,%f26
2384	fmuld	%f22,%f24,%f24
2385	ldd	[%g1+%l2],%f22
2386
2387	faddd	%f4,%f6,%f4
2388
2389	fmuld	%f10,%f16,%f16
2390	ldd	[%l4+%l1],%f10
2391
2392	fmuld	%f20,%f26,%f26
2393	ldd	[%l4+%l2],%f20
2394
2395	fmuld	%f14,%f34,%f14
2396	std	%f2,[%fp+y0_0]		! save y0 so it can be reloaded via %o3
2397
2398	fmuld	%f24,%f36,%f24
2399
2400	fmuld	%f0,%f4,%f4
2401
2402	fmuld	%f16,%f12,%f16
2403
2404	fmuld	%f26,%f22,%f26
2405
2406	fmuld	%f32,%f4,%f4
2407	ldd	[%o3+y0_0],%f2		! y0 or the second doubleword of the slot
2408
2409	faddd	%f16,%f14,%f16
2410
2411	faddd	%f26,%f24,%f26
2412
2413	faddd	%f4,%f2,%f4
2414
2415	faddd	%f16,%f10,%f16
2416
2417	faddd	%f26,%f20,%f26
2418
2419	faddd	%f32,%f4,%f6		! element 0 result
2420
2421	faddd	%f16,%f34,%f16
2422	ba,pt	%icc,.FIXSIGN
2423
2424! delay slot
2425	faddd	%f26,%f36,%f26
2426
2427	.align	32
2428.CASE5:
! Elements 0 and 2 use the polynomial path; element 1 uses the table
! path.  Entered only from .CASE4, which has already computed %f18
! (fpadd32s), loaded %l1 from the x1_1 slot, set %o7, and set %g1 in the
! branch delay slot.  Coefficients are read from
! %l5 + %o3/%o5 + {0, 0x10, 0x20, 0x30} with %o3/%o5 = 0 or 8.
2429	fand	%f18,%f44,%f14
2430	sub	%l1,%o7,%l1
2431
2432	fmuld	%f0,%f0,%f0		! z0 = x0 * x0
2433	ldd	[%l5+%o3],%f32
2434	add	%l5,%o3,%l0
2435
2436	fsubd	%f10,%f14,%f10
2437	srl	%l1,10,%l1
2438
2439	fmuld	%f20,%f20,%f20		! z2 = x2 * x2
2440	ldd	[%l5+%o5],%f36
2441	add	%l5,%o5,%l2
2442
2443	fmuld	%f0,%f32,%f4
2444	ldd	[%l0+0x10],%f6
2445	add	%fp,%o3,%o3		! %o3 now indexes the stack slot pairs
2446
2447	faddd	%f10,%f12,%f10
2448	andn	%l1,0x1f,%l1
2449
2450	fmuld	%f20,%f36,%f24
2451	ldd	[%l2+0x10],%f26
2452	add	%fp,%o5,%o5		! %o5 now indexes the stack slot pairs
2453
2454	faddd	%f4,%f6,%f4
2455	ldd	[%l0+0x20],%f32
2456
2457	fmuld	%f10,%f10,%f12
2458	add	%l1,%o4,%l1
2459
2460	faddd	%f24,%f26,%f24
2461	ldd	[%l2+0x20],%f36
2462
2463	fmuld	%f0,%f4,%f4
2464	ldd	[%l0+0x30],%f6
2465
2466	fmuld	%f12,%f58,%f16
2467	ldd	[%l3+%l1],%f34
2468
2469	fmuld	%f20,%f24,%f24
2470	ldd	[%l2+0x30],%f26
2471
2472	faddd	%f4,%f32,%f4
2473	ldd	[%o3+x0_1],%f32
2474
2475	faddd	%f16,%f56,%f16
2476	fmuld	%f12,%f62,%f14
2477
2478	faddd	%f24,%f36,%f24
2479	ldd	[%o5+x2_1],%f36
2480
2481	fmuld	%f0,%f4,%f4
2482	std	%f2,[%fp+y0_0]		! save y0 so it can be reloaded via %o3
2483
2484	fmuld	%f12,%f16,%f16
2485	faddd	%f14,%f60,%f14
2486
2487	fmuld	%f20,%f24,%f24
2488	std	%f22,[%fp+y2_0]		! save y2 so it can be reloaded via %o5
2489
2490	faddd	%f4,%f6,%f4
2491
2492	faddd	%f16,%f54,%f16
2493	fmuld	%f12,%f14,%f14
2494	ldd	[%g1+%l1],%f12
2495
2496	faddd	%f24,%f26,%f24
2497
2498	fmuld	%f0,%f4,%f4
2499
2500	fmuld	%f10,%f16,%f16
2501	ldd	[%l4+%l1],%f10
2502
2503	fmuld	%f14,%f34,%f14
2504
2505	fmuld	%f20,%f24,%f24
2506
2507	fmuld	%f16,%f12,%f16
2508
2509	fmuld	%f32,%f4,%f4
2510	ldd	[%o3+y0_0],%f2
2511
2512	fmuld	%f36,%f24,%f24
2513	ldd	[%o5+y2_0],%f22
2514
2515	faddd	%f16,%f14,%f16
2516
2517	faddd	%f4,%f2,%f4
2518
2519	faddd	%f24,%f22,%f24
2520
2521	faddd	%f16,%f10,%f16
2522
2523	faddd	%f32,%f4,%f6		! element 0 result
2524
2525	faddd	%f36,%f24,%f26		! element 2 result
2526	ba,pt	%icc,.FIXSIGN
2527
2528! delay slot
2529	faddd	%f16,%f34,%f16		! element 1 result
2530
2531	.align	32
2532.CASE6:
! Elements 0 and 1 use the polynomial path.  Entered only from .CASE4,
! whose delay-slot "andcc %l2,2,%g0" left the element-2 test in the
! condition codes; the ld/add below do not change them, so the bne
! routes to .CASE7 when element 2 needs the polynomial path as well.
! Otherwise element 2 uses the table path.  %o7 was set in .CASE4.
2533	ld	[%fp+x2_1],%l2
2534	add	%l3,8,%g1
2535	bne,pn	%icc,.CASE7
2536! delay slot
2537	fpadd32s %f20,%f31,%f28
2538
2539	fand	%f28,%f44,%f24
2540	ldd	[%l5+%o3],%f32
2541	add	%l5,%o3,%l0
2542
2543	fmuld	%f0,%f0,%f0		! z0 = x0 * x0
2544	sub	%l2,%o7,%l2
2545
2546	fsubd	%f20,%f24,%f20
2547	srl	%l2,10,%l2
2548
2549	fmuld	%f10,%f10,%f10		! z1 = x1 * x1
2550	ldd	[%l5+%o4],%f34
2551	add	%l5,%o4,%l1
2552
2553	fmuld	%f0,%f32,%f4
2554	ldd	[%l0+0x10],%f6
2555	add	%fp,%o3,%o3		! %o3 now indexes the stack slot pairs
2556
2557	faddd	%f20,%f22,%f20
2558	andn	%l2,0x1f,%l2
2559
2560	fmuld	%f10,%f34,%f14
2561	ldd	[%l1+0x10],%f16
2562	add	%fp,%o4,%o4		! %o4 now indexes the stack slot pairs
2563
2564	faddd	%f4,%f6,%f4
2565	ldd	[%l0+0x20],%f32
2566
2567	fmuld	%f20,%f20,%f22
2568	add	%l2,%o5,%l2
2569
2570	faddd	%f14,%f16,%f14
2571	ldd	[%l1+0x20],%f34
2572
2573	fmuld	%f0,%f4,%f4
2574	ldd	[%l0+0x30],%f6
2575
2576	fmuld	%f22,%f58,%f26
2577	ldd	[%l3+%l2],%f36
2578
2579	fmuld	%f10,%f14,%f14
2580	ldd	[%l1+0x30],%f16
2581
2582	faddd	%f4,%f32,%f4
2583	ldd	[%o3+x0_1],%f32
2584
2585	faddd	%f26,%f56,%f26
2586	fmuld	%f22,%f62,%f24
2587
2588	faddd	%f14,%f34,%f14
2589	ldd	[%o4+x1_1],%f34
2590
2591	fmuld	%f0,%f4,%f4
2592	std	%f2,[%fp+y0_0]		! save y0 so it can be reloaded via %o3
2593
2594	fmuld	%f22,%f26,%f26
2595	faddd	%f24,%f60,%f24
2596
2597	fmuld	%f10,%f14,%f14
2598	std	%f12,[%fp+y1_0]		! save y1 so it can be reloaded via %o4
2599
2600	faddd	%f4,%f6,%f4
2601
2602	faddd	%f26,%f54,%f26
2603	fmuld	%f22,%f24,%f24
2604	ldd	[%g1+%l2],%f22
2605
2606	faddd	%f14,%f16,%f14
2607
2608	fmuld	%f0,%f4,%f4
2609
2610	fmuld	%f20,%f26,%f26
2611	ldd	[%l4+%l2],%f20
2612
2613	fmuld	%f24,%f36,%f24
2614
2615	fmuld	%f10,%f14,%f14
2616
2617	fmuld	%f26,%f22,%f26
2618
2619	fmuld	%f32,%f4,%f4
2620	ldd	[%o3+y0_0],%f2
2621
2622	fmuld	%f34,%f14,%f14
2623	ldd	[%o4+y1_0],%f12
2624
2625	faddd	%f26,%f24,%f26
2626
2627	faddd	%f4,%f2,%f4
2628
2629	faddd	%f14,%f12,%f14
2630
2631	faddd	%f26,%f20,%f26
2632
2633	faddd	%f32,%f4,%f6		! element 0 result
2634
2635	faddd	%f34,%f14,%f16		! element 1 result
2636	ba,pt	%icc,.FIXSIGN
2637
2638! delay slot
2639	faddd	%f26,%f36,%f26		! element 2 result
2640
2641	.align	32
2642.CASE7:
! All three elements use the polynomial path.  Entered only from .CASE6.
! For each element: z = x*x is run through a Horner chain with
! coefficients at %l5 + %oN + {0, 0x10, 0x20, 0x30} (%oN = 0 or 8 picks
! one of the two interleaved coefficient sets); the same offset then
! picks the first or second doubleword of the xN_1 and yN_0 stack slot
! pairs for the final multiply and adds.
2643	fmuld	%f0,%f0,%f0		! z0
2644	ldd	[%l5+%o3],%f32
2645	add	%l5,%o3,%l0
2646
2647	fmuld	%f10,%f10,%f10		! z1
2648	ldd	[%l5+%o4],%f34
2649	add	%l5,%o4,%l1
2650
2651	fmuld	%f20,%f20,%f20		! z2
2652	ldd	[%l5+%o5],%f36
2653	add	%l5,%o5,%l2
2654
2655	fmuld	%f0,%f32,%f4
2656	ldd	[%l0+0x10],%f6
2657	add	%fp,%o3,%o3		! %o3/%o4/%o5 now index the stack slot pairs
2658
2659	fmuld	%f10,%f34,%f14
2660	ldd	[%l1+0x10],%f16
2661	add	%fp,%o4,%o4
2662
2663	fmuld	%f20,%f36,%f24
2664	ldd	[%l2+0x10],%f26
2665	add	%fp,%o5,%o5
2666
2667	faddd	%f4,%f6,%f4
2668	ldd	[%l0+0x20],%f32
2669
2670	faddd	%f14,%f16,%f14
2671	ldd	[%l1+0x20],%f34
2672
2673	faddd	%f24,%f26,%f24
2674	ldd	[%l2+0x20],%f36
2675
2676	fmuld	%f0,%f4,%f4
2677	ldd	[%l0+0x30],%f6
2678
2679	fmuld	%f10,%f14,%f14
2680	ldd	[%l1+0x30],%f16
2681
2682	fmuld	%f20,%f24,%f24
2683	ldd	[%l2+0x30],%f26
2684
2685	faddd	%f4,%f32,%f4
2686	ldd	[%o3+x0_1],%f32
2687
2688	faddd	%f14,%f34,%f14
2689	ldd	[%o4+x1_1],%f34
2690
2691	faddd	%f24,%f36,%f24
2692	ldd	[%o5+x2_1],%f36
2693
2694	fmuld	%f0,%f4,%f4
2695	std	%f2,[%fp+y0_0]		! save y0..y2 so they can be reloaded via %o3..%o5
2696
2697	fmuld	%f10,%f14,%f14
2698	std	%f12,[%fp+y1_0]
2699
2700	fmuld	%f20,%f24,%f24
2701	std	%f22,[%fp+y2_0]
2702
2703	faddd	%f4,%f6,%f4
2704
2705	faddd	%f14,%f16,%f14
2706
2707	faddd	%f24,%f26,%f24
2708
2709	fmuld	%f0,%f4,%f4
2710
2711	fmuld	%f10,%f14,%f14
2712
2713	fmuld	%f20,%f24,%f24
2714
2715	fmuld	%f32,%f4,%f4
2716	ldd	[%o3+y0_0],%f2
2717
2718	fmuld	%f34,%f14,%f14
2719	ldd	[%o4+y1_0],%f12
2720
2721	fmuld	%f36,%f24,%f24
2722	ldd	[%o5+y2_0],%f22
2723
2724	faddd	%f4,%f2,%f4
2725
2726	faddd	%f14,%f12,%f14
2727
2728	faddd	%f24,%f22,%f24
2729
2730	faddd	%f32,%f4,%f6		! element 0 result
2731
2732	faddd	%f34,%f14,%f16		! element 1 result
2733	ba,pt	%icc,.FIXSIGN
2734
2735! delay slot
2736	faddd	%f36,%f24,%f26		! element 2 result
2737
2738
2739	.align	32
2740.ENDLOOP2:
! Tail handling when the element count runs out with two elements of a
! triple pending.  This block finishes element 1 (x in %f10, output
! pointer %o1) on its own, without interleaving, and then falls through
! into .ENDLOOP1, which finishes element 0.
! Steps: argument reduction, threshold test, then either the table path
! (fall through) or the polynomial path (local label 1), then sign
! fix-up and store (local label 2).
2741	fmuld	%f10,%f40,%f12		! x1 * invpio2
2742	add	%l5,thresh,%g1
2743	faddd	%f12,%f42,%f12
2744	st	%f13,[%fp+n1]
2745	fsubd	%f12,%f42,%f12		! n
2746	fmuld	%f12,%f46,%f14
2747	fsubd	%f10,%f14,%f14
2748	fmuld	%f12,%f48,%f16
2749	fsubd	%f14,%f16,%f10
2750	ld	[%fp+n1],%o4 ; add	%o4,1,%o4
2751	fsubd	%f14,%f10,%f34
2752	and	%o4,1,%o4
2753	fsubd	%f34,%f16,%f34
2754	fmuld	%f12,%f50,%f18
2755	sll	%o4,3,%o4		! %o4 = 0 or 8
2756	fsubd	%f18,%f34,%f18
2757	ld	[%g1+%o4],%f16		! threshold word
2758	fsubd	%f10,%f18,%f14
2759	fsubd	%f10,%f14,%f34
2760	add	%l5,thresh+4,%o7
2761	fsubd	%f34,%f18,%f34
2762	fmuld	%f12,%f52,%f12
2763	fsubd	%f12,%f34,%f12
2764	ld	[%o7+%o4],%f18		! sign mask word
2765	fsubd	%f14,%f12,%f10		! x
2766	fsubd	%f14,%f10,%f14
2767	fands	%f10,%f30,%f19		! save signbit
2768	fabsd	%f10,%f10
2769	std	%f10,[%fp+x1_1]
2770	fsubd	%f14,%f12,%f12		! y
2771	fcmpgt32 %f16,%f10,%l1
2772	fxors	%f12,%f19,%f12
2773	fands	%f19,%f18,%f19		! if (n & 1) clear sign bit
2774	andcc	%l1,2,%g0
2775	bne,pn	%icc,1f			! threshold test set: polynomial path
2776! delay slot
2777	nop
! table path
2778	fpadd32s %f10,%f31,%f18
2779	ld	[%fp+x1_1],%l1
2780	fand	%f18,%f44,%f14
2781	sethi	%hi(0x3fc3c000),%o7
2782	add	%l3,8,%g1
2783	fsubd	%f10,%f14,%f10
2784	sub	%l1,%o7,%l1
2785	srl	%l1,10,%l1
2786	faddd	%f10,%f12,%f10
2787	andn	%l1,0x1f,%l1
2788	fmuld	%f10,%f10,%f12
2789	add	%l1,%o4,%l1
2790	fmuld	%f12,%f58,%f16
2791	ldd	[%l3+%l1],%f34
2792	faddd	%f16,%f56,%f16
2793	fmuld	%f12,%f62,%f14
2794	fmuld	%f12,%f16,%f16
2795	faddd	%f14,%f60,%f14
2796	faddd	%f16,%f54,%f16
2797	fmuld	%f12,%f14,%f14
2798	ldd	[%g1+%l1],%f12
2799	fmuld	%f10,%f16,%f16
2800	ldd	[%l4+%l1],%f10
2801	fmuld	%f14,%f34,%f14
2802	fmuld	%f16,%f12,%f16
2803	faddd	%f16,%f14,%f16
2804	faddd	%f16,%f10,%f16
2805	ba,pt	%icc,2f
2806	faddd	%f16,%f34,%f16		! delay slot: result in %f16
28071:
! polynomial path
2808	fmuld	%f10,%f10,%f10
2809	ldd	[%l5+%o4],%f34
2810	add	%l5,%o4,%l1
2811	fmuld	%f10,%f34,%f14
2812	ldd	[%l1+0x10],%f16
2813	add	%fp,%o4,%o4
2814	faddd	%f14,%f16,%f14
2815	ldd	[%l1+0x20],%f34
2816	fmuld	%f10,%f14,%f14
2817	ldd	[%l1+0x30],%f16
2818	faddd	%f14,%f34,%f14
2819	ldd	[%o4+x1_1],%f34
2820	fmuld	%f10,%f14,%f14
2821	std	%f12,[%fp+y1_0]
2822	faddd	%f14,%f16,%f14
2823	fmuld	%f10,%f14,%f14
2824	fmuld	%f34,%f14,%f14
2825	ldd	[%o4+y1_0],%f12
2826	faddd	%f14,%f12,%f14
2827	faddd	%f34,%f14,%f16		! result in %f16
28282:
! sign fix-up and store, as in .FIXSIGN but for element 1 only
2829	add	%l5,thresh-4,%g1
2830	ld	[%fp+n1],%o4 ; add	%o4,1,%o4
2831	and	%o4,2,%o4
2832	sll	%o4,2,%o4
2833	ld	[%g1+%o4],%f18
2834	fxors	%f19,%f18,%f19
2835	fors	%f16,%f19,%f16		! tack on sign
2836	st	%f16,[%o1]
2837	st	%f17,[%o1+4]
2838
2839.ENDLOOP1:
! Tail handling for a single pending element: element 0 (x in %f0,
! output pointer %o0).  Reached by a branch from .LOOP0/.SKIP1/.BIG1 or
! by falling through from .ENDLOOP2.  Same steps as .ENDLOOP2 using the
! element-0 registers; falls through into .ENDLOOP0 when done.
2840	fmuld	%f0,%f40,%f2		! x0 * invpio2
2841	add	%l5,thresh,%g1
2842	faddd	%f2,%f42,%f2
2843	st	%f3,[%fp+n0]
2844	fsubd	%f2,%f42,%f2		! n
2845	fmuld	%f2,%f46,%f4
2846	fsubd	%f0,%f4,%f4
2847	fmuld	%f2,%f48,%f6
2848	fsubd	%f4,%f6,%f0
2849	ld	[%fp+n0],%o3 ; add	%o3,1,%o3
2850	fsubd	%f4,%f0,%f32
2851	and	%o3,1,%o3
2852	fsubd	%f32,%f6,%f32
2853	fmuld	%f2,%f50,%f8
2854	sll	%o3,3,%o3		! %o3 = 0 or 8
2855	fsubd	%f8,%f32,%f8
2856	ld	[%g1+%o3],%f6		! threshold word
2857	fsubd	%f0,%f8,%f4
2858	fsubd	%f0,%f4,%f32
2859	add	%l5,thresh+4,%o7
2860	fsubd	%f32,%f8,%f32
2861	fmuld	%f2,%f52,%f2
2862	fsubd	%f2,%f32,%f2
2863	ld	[%o7+%o3],%f8		! sign mask word
2864	fsubd	%f4,%f2,%f0		! x
2865	fsubd	%f4,%f0,%f4
2866	fands	%f0,%f30,%f9		! save signbit
2867	fabsd	%f0,%f0
2868	std	%f0,[%fp+x0_1]
2869	fsubd	%f4,%f2,%f2		! y
2870	fcmpgt32 %f6,%f0,%l0
2871	fxors	%f2,%f9,%f2
2872	fands	%f9,%f8,%f9		! if (n & 1) clear sign bit
2873	andcc	%l0,2,%g0
2874	bne,pn	%icc,1f			! threshold test set: polynomial path
2875! delay slot
2876	nop
! table path
2877	fpadd32s %f0,%f31,%f8
2878	ld	[%fp+x0_1],%l0
2879	fand	%f8,%f44,%f4
2880	sethi	%hi(0x3fc3c000),%o7
2881	add	%l3,8,%g1
2882	fsubd	%f0,%f4,%f0
2883	sub	%l0,%o7,%l0
2884	srl	%l0,10,%l0
2885	faddd	%f0,%f2,%f0
2886	andn	%l0,0x1f,%l0
2887	fmuld	%f0,%f0,%f2
2888	add	%l0,%o3,%l0
2889	fmuld	%f2,%f58,%f6
2890	ldd	[%l3+%l0],%f32
2891	faddd	%f6,%f56,%f6
2892	fmuld	%f2,%f62,%f4
2893	fmuld	%f2,%f6,%f6
2894	faddd	%f4,%f60,%f4
2895	faddd	%f6,%f54,%f6
2896	fmuld	%f2,%f4,%f4
2897	ldd	[%g1+%l0],%f2
2898	fmuld	%f0,%f6,%f6
2899	ldd	[%l4+%l0],%f0
2900	fmuld	%f4,%f32,%f4
2901	fmuld	%f6,%f2,%f6
2902	faddd	%f6,%f4,%f6
2903	faddd	%f6,%f0,%f6
2904	ba,pt	%icc,2f
2905	faddd	%f6,%f32,%f6		! delay slot: result in %f6
29061:
! polynomial path
2907	fmuld	%f0,%f0,%f0
2908	ldd	[%l5+%o3],%f32
2909	add	%l5,%o3,%l0
2910	fmuld	%f0,%f32,%f4
2911	ldd	[%l0+0x10],%f6
2912	add	%fp,%o3,%o3
2913	faddd	%f4,%f6,%f4
2914	ldd	[%l0+0x20],%f32
2915	fmuld	%f0,%f4,%f4
2916	ldd	[%l0+0x30],%f6
2917	faddd	%f4,%f32,%f4
2918	ldd	[%o3+x0_1],%f32
2919	fmuld	%f0,%f4,%f4
2920	std	%f2,[%fp+y0_0]
2921	faddd	%f4,%f6,%f4
2922	fmuld	%f0,%f4,%f4
2923	fmuld	%f32,%f4,%f4
2924	ldd	[%o3+y0_0],%f2
2925	faddd	%f4,%f2,%f4
2926	faddd	%f32,%f4,%f6		! result in %f6
29272:
! sign fix-up and store, as in .FIXSIGN but for element 0 only
2928	add	%l5,thresh-4,%g1
2929	ld	[%fp+n0],%o3 ; add	%o3,1,%o3
2930	and	%o3,2,%o3
2931	sll	%o3,2,%o3
2932	ld	[%g1+%o3],%f8
2933	fxors	%f9,%f8,%f9
2934	fors	%f6,%f9,%f6		! tack on sign
2935	st	%f6,[%o0]
2936	st	%f7,[%o0+4]
2937
2938.ENDLOOP0:
2939
2940! check for huge arguments remaining
! LIM_l6 is cleared on entry to the medium loop and set nonzero by
! .BIG0/.BIG1/.BIG2 when a finite argument with hx > %l7 was seen.
2941
2942	tst	LIM_l6
2943	be,pt	%icc,.exit		! no huge arguments were seen: done
2944! delay slot
2945	nop
2946
2947! ========== huge range (use C code) ==========
! Reload the saved call arguments and hand the whole vector to
! __vlibm_vcos_big as (n, x, stridex, y, stridey, %l7).
! NOTE(review): the *save slots are written above this excerpt;
! presumably they hold the original arguments -- confirm.
2948
2949#ifdef __sparcv9
2950	ldx	[%fp+xsave],%o1
2951	ldx	[%fp+ysave],%o3
2952#else
2953	ld	[%fp+xsave],%o1
2954	ld	[%fp+ysave],%o3
2955#endif
2956	ld	[%fp+nsave],%o0
2957	ld	[%fp+sxsave],%o2
2958	ld	[%fp+sysave],%o4
2959	sra	%o2,0,%o2		! sign-extend for V9
2960	sra	%o4,0,%o4
2961	call	__vlibm_vcos_big
2962	mov	%l7,%o5			! delay slot
2963
2964.exit:
2965	ret
2966	restore
2967
2968
2969	.align	32
2970.SKIP0:
! Skip element 0 without producing a result: count it, advance y and x,
! promote the preloaded next argument (%l1, %f10) to element 0 and
! restart at .LOOP0.  NOTE(review): no branch to this label appears in
! this part of the file.
2971	addcc	%i0,-1,%i0
2972	ble,pn	%icc,.ENDLOOP0
2973! delay slot, harmless if branch taken
2974	add	%i3,%i4,%i3		! y += stridey
2975	andn	%l1,%i5,%l0		! hx &= ~0x80000000
2976	fmovs	%f10,%f0		! high word of the next x from the preload
2977	ld	[%i1+4],%f1		! low word of the next x from memory
2978	ba,pt	%icc,.LOOP0
2979! delay slot
2980	add	%i1,%i2,%i1		! x += stridex
2981
2982
2983	.align	32
2984.SKIP1:
! Skip element 1 without producing a result: count it, advance y and x,
! promote the preloaded next argument (%l2, %f20) to element 1 and
! restart at .LOOP1; with nothing left, finish element 0 at .ENDLOOP1.
! NOTE(review): no branch to this label appears in this part of the file.
2985	addcc	%i0,-1,%i0
2986	ble,pn	%icc,.ENDLOOP1
2987! delay slot, harmless if branch taken
2988	add	%i3,%i4,%i3		! y += stridey
2989	andn	%l2,%i5,%l1		! hx &= ~0x80000000
2990	fmovs	%f20,%f10		! high word of the next x from the preload
2991	ld	[%i1+4],%f11		! low word of the next x from memory
2992	ba,pt	%icc,.LOOP1
2993! delay slot
2994	add	%i1,%i2,%i1		! x += stridex
2995
2996
2997	.align	32
2998.SKIP2:
! Skip element 2 without producing a result: count it, advance y, load
! a fresh element 2 from [%i1] and restart at .LOOP2; with nothing left,
! finish elements 1 and 0 at .ENDLOOP2.
! NOTE(review): no branch to this label appears in this part of the file.
2999	addcc	%i0,-1,%i0
3000	ble,pn	%icc,.ENDLOOP2
3001! delay slot, harmless if branch taken
3002	add	%i3,%i4,%i3		! y += stridey
3003	ld	[%i1],%l2		! high word of next x, for range tests
3004	ld	[%i1],%f20
3005	ld	[%i1+4],%f21
3006	andn	%l2,%i5,%l2		! hx &= ~0x80000000
3007	ba,pt	%icc,.LOOP2
3008! delay slot
3009	add	%i1,%i2,%i1		! x += stridex
3010
3011
3012	.align	32
3013.BIG0:
! Element 0 has hx > 0x413921fb (branch from .LOOP0).
!   hx <  0x7ff00000 (finite): write nothing here; set LIM_l6 nonzero
!        (copy of %l7) so that .ENDLOOP0 calls __vlibm_vcos_big.
!   hx >= 0x7ff00000 (inf or NaN): store x - x as the result.
! Then count the element, promote the preloaded next argument
! (%l1, %f10) to element 0 and restart at .LOOP0.
3014	sethi	%hi(0x7ff00000),%o7
3015	cmp	%l0,%o7
3016	bl,a,pt	%icc,1f			! if hx < 0x7ff00000
3017! delay slot, annulled if branch not taken
3018	mov	%l7,LIM_l6	! finite huge argument: set biguns flag (nonzero)
3019	fsubd	%f0,%f0,%f0		! y = x - x
3020	st	%f0,[%o0]
3021	st	%f1,[%o0+4]
30221:
3023	addcc	%i0,-1,%i0
3024	ble,pn	%icc,.ENDLOOP0
3025! delay slot, harmless if branch taken
3026	andn	%l1,%i5,%l0		! hx &= ~0x80000000
3027	fmovd	%f10,%f0
3028	ba,pt	%icc,.LOOP0
3029! delay slot
3030	add	%i1,%i2,%i1		! x += stridex
3031
3032
3033	.align	32
3034.BIG1:
! Element 1 has hx > 0x413921fb (branch from .LOOP1).
!   hx <  0x7ff00000 (finite): write nothing here; set LIM_l6 nonzero
!        (copy of %l7) so that .ENDLOOP0 calls __vlibm_vcos_big.
!   hx >= 0x7ff00000 (inf or NaN): store x - x as the result.
! Then count the element, promote the preloaded next argument
! (%l2, %f20) to element 1 and restart at .LOOP1; with nothing left,
! finish element 0 at .ENDLOOP1.
3035	sethi	%hi(0x7ff00000),%o7
3036	cmp	%l1,%o7
3037	bl,a,pt	%icc,1f			! if hx < 0x7ff00000
3038! delay slot, annulled if branch not taken
3039	mov	%l7,LIM_l6		! finite huge argument: set biguns flag (nonzero)
3040	fsubd	%f10,%f10,%f10		! y = x - x
3041	st	%f10,[%o1]
3042	st	%f11,[%o1+4]
30431:
3044	addcc	%i0,-1,%i0
3045	ble,pn	%icc,.ENDLOOP1
3046! delay slot, harmless if branch taken
3047	andn	%l2,%i5,%l1		! hx &= ~0x80000000
3048	fmovd	%f20,%f10
3049	ba,pt	%icc,.LOOP1
3050! delay slot
3051	add	%i1,%i2,%i1		! x += stridex
3052
3053
3054	.align	32
3055.BIG2:
! Element 2 has hx > 0x413921fb (branch from .LOOP2).
!   hx <  0x7ff00000 (finite): write nothing here; set LIM_l6 nonzero
!        (copy of %l7) so that .ENDLOOP0 calls __vlibm_vcos_big.
!   hx >= 0x7ff00000 (inf or NaN): store x - x as the result.
! Then count the element, load a fresh element 2 from [%i1] and restart
! at .LOOP2; with nothing left, finish elements 1 and 0 at .ENDLOOP2.
3056	sethi	%hi(0x7ff00000),%o7
3057	cmp	%l2,%o7
3058	bl,a,pt	%icc,1f			! if hx < 0x7ff00000
3059! delay slot, annulled if branch not taken
3060	mov	%l7,LIM_l6		! finite huge argument: set biguns flag (nonzero)
3061	fsubd	%f20,%f20,%f20		! y = x - x
3062	st	%f20,[%o2]
3063	st	%f21,[%o2+4]
30641:
3065	addcc	%i0,-1,%i0
3066	ble,pn	%icc,.ENDLOOP2
3067! delay slot
3068	nop
3069	ld	[%i1],%l2		! high word of next x, for range tests
3070	ld	[%i1],%f20
3071	ld	[%i1+4],%f21
3072	andn	%l2,%i5,%l2		! hx &= ~0x80000000
3073	ba,pt	%icc,.LOOP2
3074! delay slot
3075	add	%i1,%i2,%i1		! x += stridex
3076
3077	SET_SIZE(__vcos)
3078
3079