1/* Copyright (C) 2008-2019 Free Software Foundation, Inc. 2 Contributor: Joern Rennecke <joern.rennecke@embecosm.com> 3 on behalf of Synopsys Inc. 4 5This file is part of GCC. 6 7GCC is free software; you can redistribute it and/or modify it under 8the terms of the GNU General Public License as published by the Free 9Software Foundation; either version 3, or (at your option) any later 10version. 11 12GCC is distributed in the hope that it will be useful, but WITHOUT ANY 13WARRANTY; without even the implied warranty of MERCHANTABILITY or 14FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15for more details. 16 17Under Section 7 of GPL version 3, you are granted additional 18permissions described in the GCC Runtime Library Exception, version 193.1, as published by the Free Software Foundation. 20 21You should have received a copy of the GNU General Public License and 22a copy of the GCC Runtime Library Exception along with this program; 23see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 24<http://www.gnu.org/licenses/>. */ 25 26/* 27 to calculate a := b/x as b*y, with y := 1/x: 28 - x is in the range [1..2) 29 - calculate 15..18 bit inverse y0 using a table of approximating polynoms. 30 Precision is higher for polynoms used to evaluate input with larger 31 value. 32 - Do one newton-raphson iteration step to double the precision, 33 then multiply this with the divisor 34 -> more time to decide if dividend is subnormal 35 - the worst error propagation is on the side of the value range 36 with the least initial defect, thus giving us about 30 bits precision. 37 The truncation error for the either is less than 1 + x/2 ulp. 38 A 31 bit inverse can be simply calculated by using x with implicit 1 39 and chaining the multiplies. For a 32 bit inverse, we multiply y0^2 40 with the bare fraction part of x, then add in y0^2 for the implicit 41 1 of x. 42 - If calculating a 31 bit inverse, the systematic error is less than 43 -1 ulp; likewise, for 32 bit, it is less than -2 ulp. 44 - If we calculate our seed with a 32 bit fraction, we can archive a 45 tentative result strictly better than -2 / +2.5 (1) ulp/128, i.e. we 46 only need to take the step to calculate the 2nd stage rest and 47 rounding adjust 1/32th of the time. However, if we use a 20 bit 48 fraction for the seed, the negative error can exceed -2 ulp/128, (2) 49 thus for a simple add / tst check, we need to do the 2nd stage 50 rest calculation/ rounding adjust 1/16th of the time. 51 (1): The inexactness of the 32 bit inverse contributes an error in the 52 range of (-1 .. +(1+x/2) ) ulp/128. Leaving out the low word of the 53 rest contributes an error < +1/x ulp/128 . In the interval [1,2), 54 x/2 + 1/x <= 1.5 . 55 (2): Unless proven otherwise. I have not actually looked for an 56 example where -2 ulp/128 is exceeded, and my calculations indicate 57 that the excess, if existent, is less than -1/512 ulp. 58 ??? The algorithm is still based on the ARC700 optimized code. 59 Maybe we could make better use of 64 bit multiply results and/or mmed . 60 */ 61#include "../arc-ieee-754.h" 62 63/* N.B. fp-bit.c does double rounding on denormal numbers. */ 64#if 0 /* DEBUG */ 65 .global __divdf3 66 FUNC(__divdf3) 67 .balign 4 68__divdf3: 69 push_s blink 70 push_s r2 71 push_s r3 72 push_s r0 73 bl.d __divdf3_c 74 push_s r1 75 ld_s r2,[sp,12] 76 ld_s r3,[sp,8] 77 st_s r0,[sp,12] 78 st_s r1,[sp,8] 79 pop_s r1 80 bl.d __divdf3_asm 81 pop_s r0 82 pop_s r3 83 pop_s r2 84 pop_s blink 85 cmp r0,r2 86 cmp.eq r1,r3 87 jeq_s [blink] 88 and r12,DBL0H,DBL1H 89 bic.f 0,0x7ff80000,r12 ; both NaN -> OK 90 jeq_s [blink] 91 bl abort 92 ENDFUNC(__divdf3) 93#define __divdf3 __divdf3_asm 94#endif /* DEBUG */ 95 96 FUNC(__divdf3) 97 .balign 4 98.L7ff00000: 99 .long 0x7ff00000 100.Ldivtab: 101 .long 0xfc0fffe1 102 .long 0xf46ffdfb 103 .long 0xed1ffa54 104 .long 0xe61ff515 105 .long 0xdf7fee75 106 .long 0xd91fe680 107 .long 0xd2ffdd52 108 .long 0xcd1fd30c 109 .long 0xc77fc7cd 110 .long 0xc21fbbb6 111 .long 0xbcefaec0 112 .long 0xb7efa100 113 .long 0xb32f92bf 114 .long 0xae8f83b7 115 .long 0xaa2f7467 116 .long 0xa5ef6479 117 .long 0xa1cf53fa 118 .long 0x9ddf433e 119 .long 0x9a0f3216 120 .long 0x965f2091 121 .long 0x92df0f11 122 .long 0x8f6efd05 123 .long 0x8c1eeacc 124 .long 0x88eed876 125 .long 0x85dec615 126 .long 0x82eeb3b9 127 .long 0x800ea10b 128 .long 0x7d3e8e0f 129 .long 0x7a8e7b3f 130 .long 0x77ee6836 131 .long 0x756e5576 132 .long 0x72fe4293 133 .long 0x709e2f93 134 .long 0x6e4e1c7f 135 .long 0x6c0e095e 136 .long 0x69edf6c5 137 .long 0x67cde3a5 138 .long 0x65cdd125 139 .long 0x63cdbe25 140 .long 0x61ddab3f 141 .long 0x600d991f 142 .long 0x5e3d868c 143 .long 0x5c6d7384 144 .long 0x5abd615f 145 .long 0x590d4ecd 146 .long 0x576d3c83 147 .long 0x55dd2a89 148 .long 0x545d18e9 149 .long 0x52dd06e9 150 .long 0x516cf54e 151 .long 0x4ffce356 152 .long 0x4e9cd1ce 153 .long 0x4d3cbfec 154 .long 0x4becae86 155 .long 0x4aac9da4 156 .long 0x496c8c73 157 .long 0x483c7bd3 158 .long 0x470c6ae8 159 .long 0x45dc59af 160 .long 0x44bc4915 161 .long 0x43ac3924 162 .long 0x428c27fb 163 .long 0x418c187a 164 .long 0x407c07bd 165 166__divdf3_support: /* This label makes debugger output saner. */ 167 .balign 4 168.Ldenorm_dbl1: 169 brge r6, \ 170 0x43500000,.Linf_NaN ; large number / denorm -> Inf 171 bmsk.f r12,DBL1H,19 172 mov.eq r12,DBL1L 173 mov.eq DBL1L,0 174 sub.eq r7,r7,32 175 norm.f r11,r12 ; flag for x/0 -> Inf check 176 beq_s .Linf_NaN 177 mov.mi r11,0 178 add.pl r11,r11,1 179 add_s r12,r12,r12 180 asl r8,r12,r11 181 rsub r12,r11,31 182 lsr r12,DBL1L,r12 183 tst_s DBL1H,DBL1H 184 or r8,r8,r12 185 lsr r4,r8,26 186 lsr DBL1H,r8,12 187 ld.as r4,[r10,r4] 188 bxor.mi DBL1H,DBL1H,31 189 sub r11,r11,11 190 asl DBL1L,DBL1L,r11 191 sub r11,r11,1 192 mulu64 r4,r8 193 sub r7,r7,r11 194 b.d .Lpast_denorm_dbl1 195 asl r7,r7,20 196 197 .balign 4 198.Ldenorm_dbl0: 199 bmsk.f r12,DBL0H,19 200 ; wb stall 201 mov.eq r12,DBL0L 202 sub.eq r6,r6,32 203 norm.f r11,r12 ; flag for 0/x -> 0 check 204 brge r7, \ 205 0x43500000, .Lret0_2 ; denorm/large number -> 0 206 beq_s .Lret0_2 207 mov.mi r11,0 208 add.pl r11,r11,1 209 asl r12,r12,r11 210 sub r6,r6,r11 211 add.f 0,r6,31 212 lsr r10,DBL0L,r6 213 mov.mi r10,0 214 add r6,r6,11+32 215 neg.f r11,r6 216 asl DBL0L,DBL0L,r11 217 mov.pl DBL0L,0 218 sub r6,r6,32-1 219 b.d .Lpast_denorm_dbl0 220 asl r6,r6,20 221 222.Linf_NaN: 223 tst_s DBL0L,DBL0L ; 0/0 -> NaN 224 xor_s DBL1H,DBL1H,DBL0H 225 bclr.eq.f DBL0H,DBL0H,31 226 bmsk DBL0H,DBL1H,30 227 xor_s DBL0H,DBL0H,DBL1H 228 sub.eq DBL0H,DBL0H,1 229 mov_s DBL0L,0 230 j_s.d [blink] 231 or DBL0H,DBL0H,r9 232 .balign 4 233.Lret0_2: 234 xor_s DBL1H,DBL1H,DBL0H 235 mov_s DBL0L,0 236 bmsk DBL0H,DBL1H,30 237 j_s.d [blink] 238 xor_s DBL0H,DBL0H,DBL1H 239 .balign 4 240 .global __divdf3 241/* N.B. the spacing between divtab and the sub3 to get its address must 242 be a multiple of 8. */ 243__divdf3: 244 asl r8,DBL1H,12 245 lsr r4,r8,26 246 sub3 r10,pcl,61; (.-.Ldivtab) >> 3 247 ld.as r9,[pcl,-124]; [pcl,(-((.-.L7ff00000) >> 2))] ; 0x7ff00000 248 ld.as r4,[r10,r4] 249 lsr r12,DBL1L,20 250 and.f r7,DBL1H,r9 251 or r8,r8,r12 252 mulu64 r4,r8 253 beq.d .Ldenorm_dbl1 254.Lpast_denorm_dbl1: 255 and.f r6,DBL0H,r9 256 breq.d r7,r9,.Linf_nan_dbl1 257 asl r4,r4,12 258 sub r4,r4,mhi 259 mulu64 r4,r4 260 beq.d .Ldenorm_dbl0 261 lsr r8,r8,1 262 breq.d r6,r9,.Linf_nan_dbl0 263 asl r12,DBL0H,11 264 lsr r10,DBL0L,21 265.Lpast_denorm_dbl0: 266 bset r8,r8,31 267 mulu64 mhi,r8 268 add_s r12,r12,r10 269 bset r5,r12,31 270 cmp r5,r8 271 cmp.eq DBL0L,DBL1L 272 lsr.cc r5,r5,1 273 sub r4,r4,mhi ; u1.31 inverse, about 30 bit 274 mulu64 r5,r4 ; result fraction highpart 275 lsr r8,r8,2 ; u3.29 276 add r5,r6, /* wait for immediate */ \ 277 0x3fe00000 278 mov r11,mhi ; result fraction highpart 279 mulu64 r11,r8 ; u-28.31 280 asl_s DBL1L,DBL1L,9 ; u-29.23:9 281 sbc r6,r5,r7 282 mov r12,mlo ; u-28.31 283 mulu64 r11,DBL1L ; mhi: u-28.23:9 284 add.cs DBL0L,DBL0L,DBL0L 285 asl_s DBL0L,DBL0L,6 ; u-26.25:7 286 asl r10,r11,23 287 sub_l DBL0L,DBL0L,r12 288 lsr r7,r11,9 289 sub r5,DBL0L,mhi ; rest msw ; u-26.31:0 290 mul64 r5,r4 ; mhi: result fraction lowpart 291 xor.f 0,DBL0H,DBL1H 292 and DBL0H,r6,r9 293 add_s DBL0H,DBL0H,r7 294 bclr r12,r9,20 ; 0x7fe00000 295 brhs.d r6,r12,.Linf_denorm 296 bxor.mi DBL0H,DBL0H,31 297 add.f r12,mhi,0x11 298 asr r9,r12,5 299 sub.mi DBL0H,DBL0H,1 300 add.f DBL0L,r9,r10 301 tst r12,0x1c 302 jne.d [blink] 303 add.cs DBL0H,DBL0H,1 304 /* work out exact rounding if we fall through here. */ 305 /* We know that the exact result cannot be represented in double 306 precision. Find the mid-point between the two nearest 307 representable values, multiply with the divisor, and check if 308 the result is larger than the dividend. Since we want to know 309 only the sign bit, it is sufficient to calculate only the 310 highpart of the lower 64 bits. */ 311 mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo 312 sub.f DBL0L,DBL0L,1 313 asl r12,r9,2 ; u-22.30:2 314 sub.cs DBL0H,DBL0H,1 315 sub.f r12,r12,2 316 mov r10,mlo ; rest before considering r12 in r5 : -r10 317 mulu64 r12,DBL1L ; mhi: u-51.32 318 asl r5,r5,25 ; s-51.7:25 319 lsr r10,r10,7 ; u-51.30:2 320 mov r7,mhi ; u-51.32 321 mulu64 r12,r8 ; mlo: u-51.31:1 322 sub r5,r5,r10 323 add.mi r5,r5,DBL1L ; signed multiply adjust for r12*DBL1L 324 bset r7,r7,0 ; make sure that the result is not zero, and that 325 sub r5,r5,r7 ; a highpart zero appears negative 326 sub.f r5,r5,mlo ; rest msw 327 add.pl.f DBL0L,DBL0L,1 328 j_s.d [blink] 329 add.eq DBL0H,DBL0H,1 330 331.Linf_nan_dbl1: ; 0/Inf -> NaN Inf/Inf -> NaN x/Inf-> 0 x/NaN -> NaN 332 or.f 0,r6,DBL0L 333 cmp.ne r6,r9 334 not_s DBL0L,DBL1H 335 sub_s.ne DBL0L,DBL0L,DBL0L 336 tst_s DBL0H,DBL0H 337 add_s DBL0H,DBL1H,DBL0L 338 j_s.d [blink] 339 bxor.mi DBL0H,DBL0H,31 340.Linf_nan_dbl0: 341 tst_s DBL1H,DBL1H 342 j_s.d [blink] 343 bxor.mi DBL0H,DBL0H,31 344 .balign 4 345.Linf_denorm: 346 lsr r12,r6,28 347 brlo.d r12,0xc,.Linf 348.Ldenorm: 349 asr r6,r6,20 350 neg r9,r6 351 mov_s DBL0H,0 352 brhs.d r9,54,.Lret0 353 bxor.mi DBL0H,DBL0H,31 354 add r12,mhi,1 355 and r12,r12,-4 356 rsub r7,r6,5 357 asr r10,r12,28 358 bmsk r4,r12,27 359 min r7,r7,31 360 asr DBL0L,r4,r7 361 add DBL1H,r11,r10 362 abs.f r10,r4 363 sub.mi r10,r10,1 364 add.f r7,r6,32-5 365 asl r4,r4,r7 366 mov.mi r4,r10 367 add.f r10,r6,23 368 rsub r7,r6,9 369 lsr r7,DBL1H,r7 370 asl r10,DBL1H,r10 371 or.pnz DBL0H,DBL0H,r7 372 or.mi r4,r4,r10 373 mov.mi r10,r7 374 add.f DBL0L,r10,DBL0L 375 add.cs.f DBL0H,DBL0H,1 ; carry clear after this point 376 bxor.f 0,r4,31 377 add.pnz.f DBL0L,DBL0L,1 378 add.cs.f DBL0H,DBL0H,1 379 jne_s [blink] 380 /* Calculation so far was not conclusive; calculate further rest. */ 381 mulu64 r11,DBL1L ; rest before considering r12 in r5 : -mlo 382 asr.f r12,r12,3 383 asl r5,r5,25 ; s-51.7:25 384 mov r11,mlo ; rest before considering r12 in r5 : -r11 385 mulu64 r12,r8 ; u-51.31:1 386 and r9,DBL0L,1 ; tie-breaker: round to even 387 lsr r11,r11,7 ; u-51.30:2 388 mov DBL1H,mlo ; u-51.31:1 389 mulu64 r12,DBL1L ; u-51.62:2 390 sub.mi r11,r11,DBL1L ; signed multiply adjust for r12*DBL1L 391 add_s DBL1H,DBL1H,r11 392 sub DBL1H,DBL1H,r5 ; -rest msw 393 add_s DBL1H,DBL1H,mhi ; -rest msw 394 add.f 0,DBL1H,DBL1H ; can't ror.f by 32 :-( 395 tst_s DBL1H,DBL1H 396 cmp.eq mlo,r9 397 add.cs.f DBL0L,DBL0L,1 398 j_s.d [blink] 399 add.cs DBL0H,DBL0H,1 400 401.Lret0: 402 /* return +- 0 */ 403 j_s.d [blink] 404 mov_s DBL0L,0 405.Linf: 406 mov_s DBL0H,r9 407 mov_s DBL0L,0 408 j_s.d [blink] 409 bxor.mi DBL0H,DBL0H,31 410 ENDFUNC(__divdf3) 411