xref: /original-bsd/lib/libm/national/sqrt.s (revision 2301fdfb)
1; Copyright (c) 1985 Regents of the University of California.
2; All rights reserved.
3;
4; Redistribution and use in source and binary forms are permitted
5; provided that the above copyright notice and this paragraph are
6; duplicated in all such forms and that any documentation,
7; advertising materials, and other materials related to such
8; distribution and use acknowledge that the software was developed
9; by the University of California, Berkeley.  The name of the
10; University may not be used to endorse or promote products derived
11; from this software without specific prior written permission.
12; THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
13; IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
14; WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
15;
16; All recipients should regard themselves as participants in an ongoing
17; research project and hence should feel obligated to report their
18; experiences (good or bad) with these elementary function codes, using
19; the sendbug(8) program, to the authors.
20;
21;	@(#)sqrt.s	5.3 (Berkeley) 06/30/88
22;
23
24; double sqrt(x)
25; double x;
26; IEEE double precision sqrt
27; code in NSC assembly by K.C. Ng
28; 12/13/85
29;
30; Method:
31; 	Use Kahan's trick to get 8 bits initial approximation
32;	by integer shift and add/subtract. Then three Newton
33;	iterations to get down error to within one ulp. Finally
34;	twiddle the last bit to make to correctly rounded
35;	according to the rounding mode.
36;
37	.vers	2
38	.text
39	.align	2
40	.globl	_sqrt
41_sqrt:
42	enter	[r3,r4,r5,r6,r7],44
43	movl	f4,tos
44	movl	f6,tos
45	movd	2146435072,r2	; r2 = 0x7ff00000
46	movl	8(fp),f0	; f2 = x
47	movd	12(fp),r3	; r3 = high part of x
48	movd	r3,r4		; make a copy of high part of x in r4
49	andd	r2,r3		; r3 become the bias exponent of x
50	cmpd	r2,r3		; if r3 = 0x7ff00000 then x is INF or NAN
51	bne	L22
52				; to see if x is INF
53	movd	8(fp),r0	; r0 = low part of x
54	movd	r4,r1		; r1 is high part of x again
55	andd	0xfff00000,r1	; mask off the sign and exponent of x
56	ord	r0,r1		; or with low part, if 0 then x is INF
57	cmpqd	0,r1		;
58	bne	L1		; not 0; therefore x is NaN; return x.
59	cmpqd	0,r4		; now x is Inf, is it +inf?
60	blt	L1		; +INF, return x
61				; -INF, return NaN by doing 0/0
62nan:	movl	0f0.0,f0	;
63	divl	f0,f0
64	br	L1
65L22:				; now x is finite
66	cmpl	0f0.0,f0	; x = 0 ?
67	beq	L1		; return x if x is +0 or -0
68	cmpqd	0,r4		; Is x < 0 ?
69	bgt	nan		; if x < 0 return NaN
70	movqd	0,r5		; r5 == scalx initialize to zero
71	cmpqd	0,r3		; is x is subnormal ?  (r3 is the exponent)
72	bne	L21		; if x is normal, goto L21
73	movl	L30,f2		; f2 = 2**54
74	mull	f2,f0		; scale up x by 2**54
75	subd	0x1b00000,r5	; off set the scale factor by -27 in exponent
76L21:
77				; now x is normal
78				; notations:
79				;    r1 == copy of fsr
80				;    r2 == mask of e inexact enable flag
81				;    r3 == mask of i inexact flag
82				;    r4 == mask of r rounding mode
83				;    r5 == x's scale factor (already defined)
84
85	movd	0x20,r2
86	movd	0x40,r3
87	movd	0x180,r4
88	sfsr	r0		; store fsr to r0
89	movd	r0,r1		; make a copy of fsr to r1
90	bicd	[5,6,7,8],r0	; clear e,i, and set r to round to nearest
91	lfsr	r0
92
93				; begin to compute sqrt(x)
94	movl	f0,-8(fp)
95	movd	-4(fp),r0	; r0 the high part of modified x
96	lshd	-1,r0		; r0 >> 1
97	addd	0x1ff80000,r0	; add correction to r0  ...got 5 bits approx.
98	movd	r0,r6
99	lshd	-13,r6		; r6 = r0>>-15
100	andd	0x7c,r6		; obtain 4*leading 5 bits of r0
101	addrd	L29,r7		; r7 = address of L29 = table[0]
102	addd	r6,r7		; r6 = address of L29[r6] = table[r6]
103	subd	0(r7),r0	; r0 = r0 - table[r6]
104	movd	r0,-4(fp)
105	movl	-8(fp),f2	; now f2 = y approximate sqrt(f0) to 8 bits
106
107	movl	0f0.5,f6	; f6 = 0.5
108	movl	f0,f4
109	divl	f2,f4		; t = x/y
110	addl	f4,f2		; y = y + x/y
111	mull	f6,f2		; y = 0.5(y+x/y) got 17 bits approx.
112	movl	f0,f4
113	divl	f2,f4		; t = x/y
114	addl	f4,f2		; y = y + x/y
115	mull	f6,f2		; y = 0.5(y+x/y) got 35 bits approx.
116	movl	f0,f4
117	divl	f2,f4		; t = x/y
118	subl	f2,f4		; t = x/y - y
119	mull	f6,f4		; t = 0.5(x/y-y)
120	addl	f4,f2		; y = y + 0.5(x/y -y)
121				; now y approx. sqrt(x) to within 1 ulp
122
123				; twiddle last bit to force y correctly rounded
124	movd	r1,r0		; restore the old fsr
125	bicd	[6,7,8],r0	; clear inexact bit but retain inexact enable
126	ord	0x80,r0		; set rounding mode to round to zero
127	lfsr	r0
128
129	movl	f0,f4
130	divl	f2,f4		; f4 = x/y
131	sfsr	r0
132	andd	r3,r0		; get the inexact flag
133	cmpqd	0,r0
134	bne	L18
135				; if x/y exact, then ...
136	cmpl	f2,f4		; if y == x/y
137	beq	L2
138	movl	f4,-8(fp)
139	subd	1,-8(fp)
140	subcd	0,-4(fp)
141	movl	-8(fp),f4	; f4 = f4 - ulp
142L18:
143	bicd	[6],r1
144	ord	r3,r1		; set inexact flag in r1
145
146	andd	r1,r4		; r4 = the old rounding mode
147	cmpqd	0,r4		; round to nearest?
148	bne	L17
149	movl	f4,-8(fp)
150	addd	1,-8(fp)
151	addcd	0,-4(fp)
152	movl	-8(fp),f4	; f4 = f4 + ulp
153	br	L16
154L17:
155	cmpd	0x100,r4	; round to positive inf ?
156	bne	L16
157	movl	f4,-8(fp)
158	addd	1,-8(fp)
159	addcd	0,-4(fp)
160	movl	-8(fp),f4	; f4 = f4 + ulp
161
162	movl	f2,-8(fp)
163	addd	1,-8(fp)
164	addcd	0,-4(fp)
165	movl	-8(fp),f2	; f2 = f2 + ulp
166L16:
167	addl	f4,f2		; y  = y + t
168	subd	0x100000,r5	; scalx = scalx - 1
169L2:
170	movl	f2,-8(fp)
171	addd	r5,-4(fp)
172	movl	-8(fp),f0
173	lfsr	r1
174L1:
175	movl	tos,f6
176	movl	tos,f4
177	exit	[r3,r4,r5,r6,r7]
178	ret	0
179	.data
180L28:	.byte	64,40,35,41,115,113,114,116,46,99
181	.byte	9,49,46,49,32,40,117,99,98,46
182	.byte	101,108,101,102,117,110,116,41,32,57
183	.byte	47,49,57,47,56,53,0
184L29:	.blkb	4
185	.double	1204
186	.double	3062
187	.double	5746
188	.double	9193
189	.double	13348
190	.double	18162
191	.double	23592
192	.double	29598
193	.double	36145
194	.double	43202
195	.double	50740
196	.double	58733
197	.double	67158
198	.double	75992
199	.double	85215
200	.double	83599
201	.double	71378
202	.double	60428
203	.double	50647
204	.double	41945
205	.double	34246
206	.double	27478
207	.double	21581
208	.double	16499
209	.double	12183
210	.double	8588
211	.double	5674
212	.double	3403
213	.double	1742
214	.double	661
215	.double	130
216L30:	.blkb	4
217	.double	1129316352 	;L30:	.double 0,0x43500000
218L31:	.blkb	4
219	.double 0x1ff00000
220L32:	.blkb	4
221	.double 0x5ff00000
222