1; Copyright (c) 1985 Regents of the University of California. 2; All rights reserved. 3; 4; Redistribution and use in source and binary forms are permitted 5; provided that the above copyright notice and this paragraph are 6; duplicated in all such forms and that any documentation, 7; advertising materials, and other materials related to such 8; distribution and use acknowledge that the software was developed 9; by the University of California, Berkeley. The name of the 10; University may not be used to endorse or promote products derived 11; from this software without specific prior written permission. 12; THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 13; IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 14; WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 15; 16; All recipients should regard themselves as participants in an ongoing 17; research project and hence should feel obligated to report their 18; experiences (good or bad) with these elementary function codes, using 19; the sendbug(8) program, to the authors. 20; 21; @(#)sqrt.s 5.3 (Berkeley) 06/30/88 22; 23 24; double sqrt(x) 25; double x; 26; IEEE double precision sqrt 27; code in NSC assembly by K.C. Ng 28; 12/13/85 29; 30; Method: 31; Use Kahan's trick to get 8 bits initial approximation 32; by integer shift and add/subtract. Then three Newton 33; iterations to get down error to within one ulp. Finally 34; twiddle the last bit to make to correctly rounded 35; according to the rounding mode. 36; 37 .vers 2 38 .text 39 .align 2 40 .globl _sqrt 41_sqrt: 42 enter [r3,r4,r5,r6,r7],44 43 movl f4,tos 44 movl f6,tos 45 movd 2146435072,r2 ; r2 = 0x7ff00000 46 movl 8(fp),f0 ; f2 = x 47 movd 12(fp),r3 ; r3 = high part of x 48 movd r3,r4 ; make a copy of high part of x in r4 49 andd r2,r3 ; r3 become the bias exponent of x 50 cmpd r2,r3 ; if r3 = 0x7ff00000 then x is INF or NAN 51 bne L22 52 ; to see if x is INF 53 movd 8(fp),r0 ; r0 = low part of x 54 movd r4,r1 ; r1 is high part of x again 55 andd 0xfff00000,r1 ; mask off the sign and exponent of x 56 ord r0,r1 ; or with low part, if 0 then x is INF 57 cmpqd 0,r1 ; 58 bne L1 ; not 0; therefore x is NaN; return x. 59 cmpqd 0,r4 ; now x is Inf, is it +inf? 60 blt L1 ; +INF, return x 61 ; -INF, return NaN by doing 0/0 62nan: movl 0f0.0,f0 ; 63 divl f0,f0 64 br L1 65L22: ; now x is finite 66 cmpl 0f0.0,f0 ; x = 0 ? 67 beq L1 ; return x if x is +0 or -0 68 cmpqd 0,r4 ; Is x < 0 ? 69 bgt nan ; if x < 0 return NaN 70 movqd 0,r5 ; r5 == scalx initialize to zero 71 cmpqd 0,r3 ; is x is subnormal ? (r3 is the exponent) 72 bne L21 ; if x is normal, goto L21 73 movl L30,f2 ; f2 = 2**54 74 mull f2,f0 ; scale up x by 2**54 75 subd 0x1b00000,r5 ; off set the scale factor by -27 in exponent 76L21: 77 ; now x is normal 78 ; notations: 79 ; r1 == copy of fsr 80 ; r2 == mask of e inexact enable flag 81 ; r3 == mask of i inexact flag 82 ; r4 == mask of r rounding mode 83 ; r5 == x's scale factor (already defined) 84 85 movd 0x20,r2 86 movd 0x40,r3 87 movd 0x180,r4 88 sfsr r0 ; store fsr to r0 89 movd r0,r1 ; make a copy of fsr to r1 90 bicd [5,6,7,8],r0 ; clear e,i, and set r to round to nearest 91 lfsr r0 92 93 ; begin to compute sqrt(x) 94 movl f0,-8(fp) 95 movd -4(fp),r0 ; r0 the high part of modified x 96 lshd -1,r0 ; r0 >> 1 97 addd 0x1ff80000,r0 ; add correction to r0 ...got 5 bits approx. 98 movd r0,r6 99 lshd -13,r6 ; r6 = r0>>-15 100 andd 0x7c,r6 ; obtain 4*leading 5 bits of r0 101 addrd L29,r7 ; r7 = address of L29 = table[0] 102 addd r6,r7 ; r6 = address of L29[r6] = table[r6] 103 subd 0(r7),r0 ; r0 = r0 - table[r6] 104 movd r0,-4(fp) 105 movl -8(fp),f2 ; now f2 = y approximate sqrt(f0) to 8 bits 106 107 movl 0f0.5,f6 ; f6 = 0.5 108 movl f0,f4 109 divl f2,f4 ; t = x/y 110 addl f4,f2 ; y = y + x/y 111 mull f6,f2 ; y = 0.5(y+x/y) got 17 bits approx. 112 movl f0,f4 113 divl f2,f4 ; t = x/y 114 addl f4,f2 ; y = y + x/y 115 mull f6,f2 ; y = 0.5(y+x/y) got 35 bits approx. 116 movl f0,f4 117 divl f2,f4 ; t = x/y 118 subl f2,f4 ; t = x/y - y 119 mull f6,f4 ; t = 0.5(x/y-y) 120 addl f4,f2 ; y = y + 0.5(x/y -y) 121 ; now y approx. sqrt(x) to within 1 ulp 122 123 ; twiddle last bit to force y correctly rounded 124 movd r1,r0 ; restore the old fsr 125 bicd [6,7,8],r0 ; clear inexact bit but retain inexact enable 126 ord 0x80,r0 ; set rounding mode to round to zero 127 lfsr r0 128 129 movl f0,f4 130 divl f2,f4 ; f4 = x/y 131 sfsr r0 132 andd r3,r0 ; get the inexact flag 133 cmpqd 0,r0 134 bne L18 135 ; if x/y exact, then ... 136 cmpl f2,f4 ; if y == x/y 137 beq L2 138 movl f4,-8(fp) 139 subd 1,-8(fp) 140 subcd 0,-4(fp) 141 movl -8(fp),f4 ; f4 = f4 - ulp 142L18: 143 bicd [6],r1 144 ord r3,r1 ; set inexact flag in r1 145 146 andd r1,r4 ; r4 = the old rounding mode 147 cmpqd 0,r4 ; round to nearest? 148 bne L17 149 movl f4,-8(fp) 150 addd 1,-8(fp) 151 addcd 0,-4(fp) 152 movl -8(fp),f4 ; f4 = f4 + ulp 153 br L16 154L17: 155 cmpd 0x100,r4 ; round to positive inf ? 156 bne L16 157 movl f4,-8(fp) 158 addd 1,-8(fp) 159 addcd 0,-4(fp) 160 movl -8(fp),f4 ; f4 = f4 + ulp 161 162 movl f2,-8(fp) 163 addd 1,-8(fp) 164 addcd 0,-4(fp) 165 movl -8(fp),f2 ; f2 = f2 + ulp 166L16: 167 addl f4,f2 ; y = y + t 168 subd 0x100000,r5 ; scalx = scalx - 1 169L2: 170 movl f2,-8(fp) 171 addd r5,-4(fp) 172 movl -8(fp),f0 173 lfsr r1 174L1: 175 movl tos,f6 176 movl tos,f4 177 exit [r3,r4,r5,r6,r7] 178 ret 0 179 .data 180L28: .byte 64,40,35,41,115,113,114,116,46,99 181 .byte 9,49,46,49,32,40,117,99,98,46 182 .byte 101,108,101,102,117,110,116,41,32,57 183 .byte 47,49,57,47,56,53,0 184L29: .blkb 4 185 .double 1204 186 .double 3062 187 .double 5746 188 .double 9193 189 .double 13348 190 .double 18162 191 .double 23592 192 .double 29598 193 .double 36145 194 .double 43202 195 .double 50740 196 .double 58733 197 .double 67158 198 .double 75992 199 .double 85215 200 .double 83599 201 .double 71378 202 .double 60428 203 .double 50647 204 .double 41945 205 .double 34246 206 .double 27478 207 .double 21581 208 .double 16499 209 .double 12183 210 .double 8588 211 .double 5674 212 .double 3403 213 .double 1742 214 .double 661 215 .double 130 216L30: .blkb 4 217 .double 1129316352 ;L30: .double 0,0x43500000 218L31: .blkb 4 219 .double 0x1ff00000 220L32: .blkb 4 221 .double 0x5ff00000 222