xref: /netbsd/sys/arch/m68k/fpsp/stan.sa (revision 6550d01e)
1*	$NetBSD: stan.sa,v 1.4 2000/03/13 23:52:32 soren Exp $
2
3*	MOTOROLA MICROPROCESSOR & MEMORY TECHNOLOGY GROUP
4*	M68000 Hi-Performance Microprocessor Division
5*	M68040 Software Package
6*
7*	M68040 Software Package Copyright (c) 1993, 1994 Motorola Inc.
8*	All rights reserved.
9*
10*	THE SOFTWARE is provided on an "AS IS" basis and without warranty.
11*	To the maximum extent permitted by applicable law,
12*	MOTOROLA DISCLAIMS ALL WARRANTIES WHETHER EXPRESS OR IMPLIED,
13*	INCLUDING IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A
14*	PARTICULAR PURPOSE and any warranty against infringement with
15*	regard to the SOFTWARE (INCLUDING ANY MODIFIED VERSIONS THEREOF)
16*	and any accompanying written materials.
17*
18*	To the maximum extent permitted by applicable law,
19*	IN NO EVENT SHALL MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
20*	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS
21*	PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR
22*	OTHER PECUNIARY LOSS) ARISING OF THE USE OR INABILITY TO USE THE
23*	SOFTWARE.  Motorola assumes no responsibility for the maintenance
24*	and support of the SOFTWARE.
25*
26*	You are hereby granted a copyright license to use, modify, and
27*	distribute the SOFTWARE so long as this entire notice is retained
28*	without alteration in any modified and/or redistributed versions,
29*	and that such modified versions are clearly identified as such.
30*	No licenses are granted by implication, estoppel or otherwise
31*	under any patents or trademarks of Motorola, Inc.
32
33*
34*	stan.sa 3.3 7/29/91
35*
36*	The entry point stan computes the tangent of
37*	an input argument;
38*	stand does the same except for denormalized input.
39*
40*	Input: Double-extended number X in location pointed to
41*		by address register a0.
42*
43*	Output: The value tan(X) returned in floating-point register Fp0.
44*
45*	Accuracy and Monotonicity: The returned result is within 3 ulp in
46*		64 significant bit, i.e. within 0.5001 ulp to 53 bits if the
47*		result is subsequently rounded to double precision. The
48*		result is provably monotonic in double precision.
49*
50*	Speed: The program sTAN takes approximately 170 cycles for
51*		input argument X such that |X| < 15Pi, which is the usual
52*		situation.
53*
54*	Algorithm:
55*
56*	1. If |X| >= 15Pi or |X| < 2**(-40), go to 6.
57*
58*	2. Decompose X as X = N(Pi/2) + r where |r| <= Pi/4. Let
59*		k = N mod 2, so in particular, k = 0 or 1.
60*
61*	3. If k is odd, go to 5.
62*
63*	4. (k is even) Tan(X) = tan(r) and tan(r) is approximated by a
64*		rational function U/V where
65*		U = r + r*s*(P1 + s*(P2 + s*P3)), and
66*		V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))),  s = r*r.
67*		Exit.
68*
69*	4. (k is odd) Tan(X) = -cot(r). Since tan(r) is approximated by a
70*		rational function U/V where
71*		U = r + r*s*(P1 + s*(P2 + s*P3)), and
72*		V = 1 + s*(Q1 + s*(Q2 + s*(Q3 + s*Q4))), s = r*r,
73*		-Cot(r) = -V/U. Exit.
74*
75*	6. If |X| > 1, go to 8.
76*
77*	7. (|X|<2**(-40)) Tan(X) = X. Exit.
78*
79*	8. Overwrite X by X := X rem 2Pi. Now that |X| <= Pi, go back to 2.
80*
81
82STAN	IDNT	2,1 Motorola 040 Floating Point Software Package
83
84	section	8
85
86	include fpsp.h
87
88BOUNDS1	DC.L $3FD78000,$4004BC7E
89TWOBYPI	DC.L $3FE45F30,$6DC9C883
90
91TANQ4	DC.L $3EA0B759,$F50F8688
92TANP3	DC.L $BEF2BAA5,$A8924F04
93
94TANQ3	DC.L $BF346F59,$B39BA65F,$00000000,$00000000
95
96TANP2	DC.L $3FF60000,$E073D3FC,$199C4A00,$00000000
97
98TANQ2	DC.L $3FF90000,$D23CD684,$15D95FA1,$00000000
99
100TANP1	DC.L $BFFC0000,$8895A6C5,$FB423BCA,$00000000
101
102TANQ1	DC.L $BFFD0000,$EEF57E0D,$A84BC8CE,$00000000
103
104INVTWOPI DC.L $3FFC0000,$A2F9836E,$4E44152A,$00000000
105
106TWOPI1	DC.L $40010000,$C90FDAA2,$00000000,$00000000
107TWOPI2	DC.L $3FDF0000,$85A308D4,$00000000,$00000000
108
109*--N*PI/2, -32 <= N <= 32, IN A LEADING TERM IN EXT. AND TRAILING
110*--TERM IN SGL. NOTE THAT PI IS 64-BIT LONG, THUS N*PI/2 IS AT
111*--MOST 69 BITS LONG.
112	xdef	PITBL
113PITBL:
114  DC.L  $C0040000,$C90FDAA2,$2168C235,$21800000
115  DC.L  $C0040000,$C2C75BCD,$105D7C23,$A0D00000
116  DC.L  $C0040000,$BC7EDCF7,$FF523611,$A1E80000
117  DC.L  $C0040000,$B6365E22,$EE46F000,$21480000
118  DC.L  $C0040000,$AFEDDF4D,$DD3BA9EE,$A1200000
119  DC.L  $C0040000,$A9A56078,$CC3063DD,$21FC0000
120  DC.L  $C0040000,$A35CE1A3,$BB251DCB,$21100000
121  DC.L  $C0040000,$9D1462CE,$AA19D7B9,$A1580000
122  DC.L  $C0040000,$96CBE3F9,$990E91A8,$21E00000
123  DC.L  $C0040000,$90836524,$88034B96,$20B00000
124  DC.L  $C0040000,$8A3AE64F,$76F80584,$A1880000
125  DC.L  $C0040000,$83F2677A,$65ECBF73,$21C40000
126  DC.L  $C0030000,$FB53D14A,$A9C2F2C2,$20000000
127  DC.L  $C0030000,$EEC2D3A0,$87AC669F,$21380000
128  DC.L  $C0030000,$E231D5F6,$6595DA7B,$A1300000
129  DC.L  $C0030000,$D5A0D84C,$437F4E58,$9FC00000
130  DC.L  $C0030000,$C90FDAA2,$2168C235,$21000000
131  DC.L  $C0030000,$BC7EDCF7,$FF523611,$A1680000
132  DC.L  $C0030000,$AFEDDF4D,$DD3BA9EE,$A0A00000
133  DC.L  $C0030000,$A35CE1A3,$BB251DCB,$20900000
134  DC.L  $C0030000,$96CBE3F9,$990E91A8,$21600000
135  DC.L  $C0030000,$8A3AE64F,$76F80584,$A1080000
136  DC.L  $C0020000,$FB53D14A,$A9C2F2C2,$1F800000
137  DC.L  $C0020000,$E231D5F6,$6595DA7B,$A0B00000
138  DC.L  $C0020000,$C90FDAA2,$2168C235,$20800000
139  DC.L  $C0020000,$AFEDDF4D,$DD3BA9EE,$A0200000
140  DC.L  $C0020000,$96CBE3F9,$990E91A8,$20E00000
141  DC.L  $C0010000,$FB53D14A,$A9C2F2C2,$1F000000
142  DC.L  $C0010000,$C90FDAA2,$2168C235,$20000000
143  DC.L  $C0010000,$96CBE3F9,$990E91A8,$20600000
144  DC.L  $C0000000,$C90FDAA2,$2168C235,$1F800000
145  DC.L  $BFFF0000,$C90FDAA2,$2168C235,$1F000000
146  DC.L  $00000000,$00000000,$00000000,$00000000
147  DC.L  $3FFF0000,$C90FDAA2,$2168C235,$9F000000
148  DC.L  $40000000,$C90FDAA2,$2168C235,$9F800000
149  DC.L  $40010000,$96CBE3F9,$990E91A8,$A0600000
150  DC.L  $40010000,$C90FDAA2,$2168C235,$A0000000
151  DC.L  $40010000,$FB53D14A,$A9C2F2C2,$9F000000
152  DC.L  $40020000,$96CBE3F9,$990E91A8,$A0E00000
153  DC.L  $40020000,$AFEDDF4D,$DD3BA9EE,$20200000
154  DC.L  $40020000,$C90FDAA2,$2168C235,$A0800000
155  DC.L  $40020000,$E231D5F6,$6595DA7B,$20B00000
156  DC.L  $40020000,$FB53D14A,$A9C2F2C2,$9F800000
157  DC.L  $40030000,$8A3AE64F,$76F80584,$21080000
158  DC.L  $40030000,$96CBE3F9,$990E91A8,$A1600000
159  DC.L  $40030000,$A35CE1A3,$BB251DCB,$A0900000
160  DC.L  $40030000,$AFEDDF4D,$DD3BA9EE,$20A00000
161  DC.L  $40030000,$BC7EDCF7,$FF523611,$21680000
162  DC.L  $40030000,$C90FDAA2,$2168C235,$A1000000
163  DC.L  $40030000,$D5A0D84C,$437F4E58,$1FC00000
164  DC.L  $40030000,$E231D5F6,$6595DA7B,$21300000
165  DC.L  $40030000,$EEC2D3A0,$87AC669F,$A1380000
166  DC.L  $40030000,$FB53D14A,$A9C2F2C2,$A0000000
167  DC.L  $40040000,$83F2677A,$65ECBF73,$A1C40000
168  DC.L  $40040000,$8A3AE64F,$76F80584,$21880000
169  DC.L  $40040000,$90836524,$88034B96,$A0B00000
170  DC.L  $40040000,$96CBE3F9,$990E91A8,$A1E00000
171  DC.L  $40040000,$9D1462CE,$AA19D7B9,$21580000
172  DC.L  $40040000,$A35CE1A3,$BB251DCB,$A1100000
173  DC.L  $40040000,$A9A56078,$CC3063DD,$A1FC0000
174  DC.L  $40040000,$AFEDDF4D,$DD3BA9EE,$21200000
175  DC.L  $40040000,$B6365E22,$EE46F000,$A1480000
176  DC.L  $40040000,$BC7EDCF7,$FF523611,$21E80000
177  DC.L  $40040000,$C2C75BCD,$105D7C23,$20D00000
178  DC.L  $40040000,$C90FDAA2,$2168C235,$A1800000
179
180INARG	equ	FP_SCR4
181
182TWOTO63 equ     L_SCR1
183ENDFLAG	equ	L_SCR2
184N       equ     L_SCR3
185
186	xref	t_frcinx
187	xref	t_extdnrm
188
189	xdef	stand
190stand:
191*--TAN(X) = X FOR DENORMALIZED X
192
193	bra		t_extdnrm
194
195	xdef	stan
196stan:
197	FMOVE.X		(a0),FP0	...LOAD INPUT
198
199	MOVE.L		(A0),D0
200	MOVE.W		4(A0),D0
201	ANDI.L		#$7FFFFFFF,D0
202
203	CMPI.L		#$3FD78000,D0		...|X| >= 2**(-40)?
204	BGE.B		TANOK1
205	BRA.W		TANSM
206TANOK1:
207	CMPI.L		#$4004BC7E,D0		...|X| < 15 PI?
208	BLT.B		TANMAIN
209	BRA.W		REDUCEX
210
211
212TANMAIN:
213*--THIS IS THE USUAL CASE, |X| <= 15 PI.
214*--THE ARGUMENT REDUCTION IS DONE BY TABLE LOOK UP.
215	FMOVE.X		FP0,FP1
216	FMUL.D		TWOBYPI,FP1	...X*2/PI
217
218*--HIDE THE NEXT TWO INSTRUCTIONS
219	lea.l		PITBL+$200,a1 ...TABLE OF N*PI/2, N = -32,...,32
220
221*--FP1 IS NOW READY
222	FMOVE.L		FP1,D0		...CONVERT TO INTEGER
223
224	ASL.L		#4,D0
225	ADDA.L		D0,a1		...ADDRESS N*PIBY2 IN Y1, Y2
226
227	FSUB.X		(a1)+,FP0	...X-Y1
228*--HIDE THE NEXT ONE
229
230	FSUB.S		(a1),FP0	...FP0 IS R = (X-Y1)-Y2
231
232	ROR.L		#5,D0
233	ANDI.L		#$80000000,D0	...D0 WAS ODD IFF D0 < 0
234
235TANCONT:
236
237	TST.L		D0
238	BLT.W		NODD
239
240	FMOVE.X		FP0,FP1
241	FMUL.X		FP1,FP1	 	...S = R*R
242
243	FMOVE.D		TANQ4,FP3
244	FMOVE.D		TANP3,FP2
245
246	FMUL.X		FP1,FP3	 	...SQ4
247	FMUL.X		FP1,FP2	 	...SP3
248
249	FADD.D		TANQ3,FP3	...Q3+SQ4
250	FADD.X		TANP2,FP2	...P2+SP3
251
252	FMUL.X		FP1,FP3	 	...S(Q3+SQ4)
253	FMUL.X		FP1,FP2	 	...S(P2+SP3)
254
255	FADD.X		TANQ2,FP3	...Q2+S(Q3+SQ4)
256	FADD.X		TANP1,FP2	...P1+S(P2+SP3)
257
258	FMUL.X		FP1,FP3	 	...S(Q2+S(Q3+SQ4))
259	FMUL.X		FP1,FP2	 	...S(P1+S(P2+SP3))
260
261	FADD.X		TANQ1,FP3	...Q1+S(Q2+S(Q3+SQ4))
262	FMUL.X		FP0,FP2	 	...RS(P1+S(P2+SP3))
263
264	FMUL.X		FP3,FP1	 	...S(Q1+S(Q2+S(Q3+SQ4)))
265
266
267	FADD.X		FP2,FP0	 	...R+RS(P1+S(P2+SP3))
268
269
270	FADD.S		#:3F800000,FP1	...1+S(Q1+...)
271
272	FMOVE.L		d1,fpcr		;restore users exceptions
273	FDIV.X		FP1,FP0		;last inst - possible exception set
274
275	bra		t_frcinx
276
277NODD:
278	FMOVE.X		FP0,FP1
279	FMUL.X		FP0,FP0	 	...S = R*R
280
281	FMOVE.D		TANQ4,FP3
282	FMOVE.D		TANP3,FP2
283
284	FMUL.X		FP0,FP3	 	...SQ4
285	FMUL.X		FP0,FP2	 	...SP3
286
287	FADD.D		TANQ3,FP3	...Q3+SQ4
288	FADD.X		TANP2,FP2	...P2+SP3
289
290	FMUL.X		FP0,FP3	 	...S(Q3+SQ4)
291	FMUL.X		FP0,FP2	 	...S(P2+SP3)
292
293	FADD.X		TANQ2,FP3	...Q2+S(Q3+SQ4)
294	FADD.X		TANP1,FP2	...P1+S(P2+SP3)
295
296	FMUL.X		FP0,FP3	 	...S(Q2+S(Q3+SQ4))
297	FMUL.X		FP0,FP2	 	...S(P1+S(P2+SP3))
298
299	FADD.X		TANQ1,FP3	...Q1+S(Q2+S(Q3+SQ4))
300	FMUL.X		FP1,FP2	 	...RS(P1+S(P2+SP3))
301
302	FMUL.X		FP3,FP0	 	...S(Q1+S(Q2+S(Q3+SQ4)))
303
304
305	FADD.X		FP2,FP1	 	...R+RS(P1+S(P2+SP3))
306	FADD.S		#:3F800000,FP0	...1+S(Q1+...)
307
308
309	FMOVE.X		FP1,-(sp)
310	EORI.L		#$80000000,(sp)
311
312	FMOVE.L		d1,fpcr	 	;restore users exceptions
313	FDIV.X		(sp)+,FP0	;last inst - possible exception set
314
315	bra		t_frcinx
316
317TANBORS:
318*--IF |X| > 15PI, WE USE THE GENERAL ARGUMENT REDUCTION.
319*--IF |X| < 2**(-40), RETURN X OR 1.
320	CMPI.L		#$3FFF8000,D0
321	BGT.B		REDUCEX
322
323TANSM:
324
325	FMOVE.X		FP0,-(sp)
326	FMOVE.L		d1,fpcr		 ;restore users exceptions
327	FMOVE.X		(sp)+,FP0	;last inst - posibble exception set
328
329	bra		t_frcinx
330
331
332REDUCEX:
333*--WHEN REDUCEX IS USED, THE CODE WILL INEVITABLY BE SLOW.
334*--THIS REDUCTION METHOD, HOWEVER, IS MUCH FASTER THAN USING
335*--THE REMAINDER INSTRUCTION WHICH IS NOW IN SOFTWARE.
336
337	FMOVEM.X	FP2-FP5,-(A7)	...save FP2 through FP5
338	MOVE.L		D2,-(A7)
339        FMOVE.S         #:00000000,FP1
340
341*--If compact form of abs(arg) in d0=$7ffeffff, argument is so large that
342*--there is a danger of unwanted overflow in first LOOP iteration.  In this
343*--case, reduce argument by one remainder step to make subsequent reduction
344*--safe.
345	cmpi.l	#$7ffeffff,d0		;is argument dangerously large?
346	bne.b	LOOP
347	move.l	#$7ffe0000,FP_SCR2(a6)	;yes
348*					;create 2**16383*PI/2
349	move.l	#$c90fdaa2,FP_SCR2+4(a6)
350	clr.l	FP_SCR2+8(a6)
351	ftst.x	fp0			;test sign of argument
352	move.l	#$7fdc0000,FP_SCR3(a6)	;create low half of 2**16383*
353*					;PI/2 at FP_SCR3
354	move.l	#$85a308d3,FP_SCR3+4(a6)
355	clr.l   FP_SCR3+8(a6)
356	fblt.w	red_neg
357	or.w	#$8000,FP_SCR2(a6)	;positive arg
358	or.w	#$8000,FP_SCR3(a6)
359red_neg:
360	fadd.x  FP_SCR2(a6),fp0		;high part of reduction is exact
361	fmove.x  fp0,fp1		;save high result in fp1
362	fadd.x  FP_SCR3(a6),fp0		;low part of reduction
363	fsub.x  fp0,fp1			;determine low component of result
364	fadd.x  FP_SCR3(a6),fp1		;fp0/fp1 are reduced argument.
365
366*--ON ENTRY, FP0 IS X, ON RETURN, FP0 IS X REM PI/2, |X| <= PI/4.
367*--integer quotient will be stored in N
368*--Intermeditate remainder is 66-bit long; (R,r) in (FP0,FP1)
369
370LOOP:
371	FMOVE.X		FP0,INARG(a6)	...+-2**K * F, 1 <= F < 2
372	MOVE.W		INARG(a6),D0
373        MOVE.L          D0,A1		...save a copy of D0
374	ANDI.L		#$00007FFF,D0
375	SUBI.L		#$00003FFF,D0	...D0 IS K
376	CMPI.L		#28,D0
377	BLE.B		LASTLOOP
378CONTLOOP:
379	SUBI.L		#27,D0	 ...D0 IS L := K-27
380	CLR.L		ENDFLAG(a6)
381	BRA.B		WORK
382LASTLOOP:
383	CLR.L		D0		...D0 IS L := 0
384	MOVE.L		#1,ENDFLAG(a6)
385
386WORK:
387*--FIND THE REMAINDER OF (R,r) W.R.T.	2**L * (PI/2). L IS SO CHOSEN
388*--THAT	INT( X * (2/PI) / 2**(L) ) < 2**29.
389
390*--CREATE 2**(-L) * (2/PI), SIGN(INARG)*2**(63),
391*--2**L * (PIby2_1), 2**L * (PIby2_2)
392
393	MOVE.L		#$00003FFE,D2	...BIASED EXPO OF 2/PI
394	SUB.L		D0,D2		...BIASED EXPO OF 2**(-L)*(2/PI)
395
396	MOVE.L		#$A2F9836E,FP_SCR1+4(a6)
397	MOVE.L		#$4E44152A,FP_SCR1+8(a6)
398	MOVE.W		D2,FP_SCR1(a6)	...FP_SCR1 is 2**(-L)*(2/PI)
399
400	FMOVE.X		FP0,FP2
401	FMUL.X		FP_SCR1(a6),FP2
402*--WE MUST NOW FIND INT(FP2). SINCE WE NEED THIS VALUE IN
403*--FLOATING POINT FORMAT, THE TWO FMOVE'S	FMOVE.L FP <--> N
404*--WILL BE TOO INEFFICIENT. THE WAY AROUND IT IS THAT
405*--(SIGN(INARG)*2**63	+	FP2) - SIGN(INARG)*2**63 WILL GIVE
406*--US THE DESIRED VALUE IN FLOATING POINT.
407
408*--HIDE SIX CYCLES OF INSTRUCTION
409        MOVE.L		A1,D2
410        SWAP		D2
411	ANDI.L		#$80000000,D2
412	ORI.L		#$5F000000,D2	...D2 IS SIGN(INARG)*2**63 IN SGL
413	MOVE.L		D2,TWOTO63(a6)
414
415	MOVE.L		D0,D2
416	ADDI.L		#$00003FFF,D2	...BIASED EXPO OF 2**L * (PI/2)
417
418*--FP2 IS READY
419	FADD.S		TWOTO63(a6),FP2	...THE FRACTIONAL PART OF FP1 IS ROUNDED
420
421*--HIDE 4 CYCLES OF INSTRUCTION; creating 2**(L)*Piby2_1  and  2**(L)*Piby2_2
422        MOVE.W		D2,FP_SCR2(a6)
423	CLR.W           FP_SCR2+2(a6)
424	MOVE.L		#$C90FDAA2,FP_SCR2+4(a6)
425	CLR.L		FP_SCR2+8(a6)		...FP_SCR2 is  2**(L) * Piby2_1
426
427*--FP2 IS READY
428	FSUB.S		TWOTO63(a6),FP2		...FP2 is N
429
430	ADDI.L		#$00003FDD,D0
431        MOVE.W		D0,FP_SCR3(a6)
432	CLR.W           FP_SCR3+2(a6)
433	MOVE.L		#$85A308D3,FP_SCR3+4(a6)
434	CLR.L		FP_SCR3+8(a6)		...FP_SCR3 is 2**(L) * Piby2_2
435
436	MOVE.L		ENDFLAG(a6),D0
437
438*--We are now ready to perform (R+r) - N*P1 - N*P2, P1 = 2**(L) * Piby2_1 and
439*--P2 = 2**(L) * Piby2_2
440	FMOVE.X		FP2,FP4
441	FMul.X		FP_SCR2(a6),FP4		...W = N*P1
442	FMove.X		FP2,FP5
443	FMul.X		FP_SCR3(a6),FP5		...w = N*P2
444	FMove.X		FP4,FP3
445*--we want P+p = W+w  but  |p| <= half ulp of P
446*--Then, we need to compute  A := R-P   and  a := r-p
447	FAdd.X		FP5,FP3			...FP3 is P
448	FSub.X		FP3,FP4			...W-P
449
450	FSub.X		FP3,FP0			...FP0 is A := R - P
451        FAdd.X		FP5,FP4			...FP4 is p = (W-P)+w
452
453	FMove.X		FP0,FP3			...FP3 A
454	FSub.X		FP4,FP1			...FP1 is a := r - p
455
456*--Now we need to normalize (A,a) to  "new (R,r)" where R+r = A+a but
457*--|r| <= half ulp of R.
458	FAdd.X		FP1,FP0			...FP0 is R := A+a
459*--No need to calculate r if this is the last loop
460	TST.L		D0
461	BGT.W		RESTORE
462
463*--Need to calculate r
464	FSub.X		FP0,FP3			...A-R
465	FAdd.X		FP3,FP1			...FP1 is r := (A-R)+a
466	BRA.W		LOOP
467
468RESTORE:
469        FMOVE.L		FP2,N(a6)
470	MOVE.L		(A7)+,D2
471	FMOVEM.X	(A7)+,FP2-FP5
472
473
474	MOVE.L		N(a6),D0
475        ROR.L		#1,D0
476
477
478	BRA.W		TANCONT
479
480	end
481