;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; log10.asm
;
; An implementation of the log10 libm function.
28; 29; Prototype: 30; 31; double log10(double x); 32; 33 34; 35; Algorithm: 36; Similar to one presnted in log.asm 37; 38 39.const 40 41ALIGN 16 42 43__real_ninf DQ 0fff0000000000000h ; -inf 44 DQ 0000000000000000h 45__real_inf DQ 7ff0000000000000h ; +inf 46 DQ 0000000000000000h 47__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN 48 DQ 0000000000000000h 49__real_qnanbit DQ 0008000000000000h 50 DQ 0000000000000000h 51__int_1023 DQ 00000000000003ffh 52 DQ 0000000000000000h 53__mask_001 DQ 0000000000000001h 54 DQ 0000000000000000h 55 56__mask_mant DQ 000FFFFFFFFFFFFFh ; mask for mantissa bits 57 DQ 0000000000000000h 58 59__mask_mant_top8 DQ 000ff00000000000h ; mask for top 8 mantissa bits 60 DQ 0000000000000000h 61 62__mask_mant9 DQ 0000080000000000h ; mask for 9th mantissa bit 63 DQ 0000000000000000h 64 65__real_log10_e DQ 3fdbcb7b1526e50eh 66 DQ 0000000000000000h 67 68__real_log10_e_lead DQ 3fdbcb7800000000h ; log10e_lead 4.34293746948242187500e-01 69 DQ 0000000000000000h 70__real_log10_e_tail DQ 3ea8a93728719535h ; log10e_tail 7.3495500964015109100644e-7 71 DQ 0000000000000000h 72 73__real_log10_2_lead DQ 3fd3441350000000h 74 DQ 0000000000000000h 75__real_log10_2_tail DQ 3e03ef3fde623e25h 76 DQ 0000000000000000h 77 78__real_two DQ 4000000000000000h ; 2 79 DQ 0000000000000000h 80 81__real_one DQ 3ff0000000000000h ; 1 82 DQ 0000000000000000h 83 84__real_half DQ 3fe0000000000000h ; 1/2 85 DQ 0000000000000000h 86 87__mask_100 DQ 0000000000000100h 88 DQ 0000000000000000h 89__real_1_over_512 DQ 3f60000000000000h 90 DQ 0000000000000000h 91 92__real_1_over_2 DQ 3fe0000000000000h 93 DQ 0000000000000000h 94__real_1_over_3 DQ 3fd5555555555555h 95 DQ 0000000000000000h 96__real_1_over_4 DQ 3fd0000000000000h 97 DQ 0000000000000000h 98__real_1_over_5 DQ 3fc999999999999ah 99 DQ 0000000000000000h 100__real_1_over_6 DQ 3fc5555555555555h 101 DQ 0000000000000000h 102 103__real_neg_1023 DQ 0c08ff80000000000h 104 DQ 0000000000000000h 105 106__mask_2045 DQ 00000000000007fdh 107 DQ 
0000000000000000h 108 109__real_threshold DQ 3fb0000000000000h ; .0625 110 DQ 0000000000000000h 111 112__real_near_one_lt DQ 3fee000000000000h ; .9375 113 DQ 0000000000000000h 114 115__real_near_one_gt DQ 3ff1000000000000h ; 1.0625 116 DQ 0000000000000000h 117 118__real_min_norm DQ 0010000000000000h 119 DQ 0000000000000000h 120 121__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit 122 DQ 0000000000000000h 123 124__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02 125 DQ 0000000000000000h 126__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02 127 DQ 0000000000000000h 128__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03 129 DQ 0000000000000000h 130__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04 131 DQ 0000000000000000h 132 133__mask_lower DQ 0ffffffff00000000h 134 DQ 0000000000000000h 135 136; these codes and the ones in the corresponding .c file have to match 137__flag_x_zero DD 00000001 138__flag_x_neg DD 00000002 139__flag_x_nan DD 00000003 140 141 142EXTRN __log10_256_lead:QWORD 143EXTRN __log10_256_tail:QWORD 144EXTRN __log_F_inv_qword:QWORD 145EXTRN __use_fma3_lib:DWORD 146 147 148; local variable storage offsets 149save_xmm6 EQU 20h 150dummy_space EQU 30h 151stack_size EQU 058h 152 153include fm.inc 154 155fname TEXTEQU <log10> 156fname_special TEXTEQU <_log10_special> 157 158EXTERN fname_special:PROC 159 160.code 161ALIGN 16 162PUBLIC fname 163fname PROC FRAME 164 StackAllocate stack_size 165 SaveXmm xmm6, save_xmm6 166 .ENDPROLOG 167 168 cmp DWORD PTR __use_fma3_lib, 0 169 jne Llog10_fma3 170 171Llog10_sse2: 172 173 ; compute exponent part 174 movapd xmm3, xmm0 175 movapd xmm4, xmm0 176 psrlq xmm3, 52 177 movd rax, xmm0 178 psubq xmm3, XMMWORD PTR __int_1023 ; xmm3 <-- unbiased exponent 179 180 ; NaN or inf 181 movapd xmm5, xmm0 182 andpd xmm5, XMMWORD PTR __real_inf 183 comisd xmm5, QWORD PTR __real_inf 184 je Llog10_sse2_x_is_inf_or_nan 185 186 movapd xmm2, xmm0 187 cvtdq2pd xmm6, xmm3 ; xmm6 <-- unbiased exp as 
double 188 189 190 pand xmm2, XMMWORD PTR __mask_mant 191 subsd xmm4, QWORD PTR __real_one 192 193 comisd xmm6, QWORD PTR __real_neg_1023 194 je Llog10_sse2_denormal_adjust 195 196Llog10_sse2_continue_common: 197 198 andpd xmm4, XMMWORD PTR __real_notsign 199 ; compute index into the log tables 200 mov r9, rax 201 and rax, QWORD PTR __mask_mant_top8 202 and r9, QWORD PTR __mask_mant9 203 shl r9, 1 204 add rax, r9 205 movd xmm1, rax 206 207 ; near one codepath 208 comisd xmm4, QWORD PTR __real_threshold 209 jb Llog10_sse2_near_one 210 211 ; F, Y 212 shr rax, 44 213 por xmm2, XMMWORD PTR __real_half 214 por xmm1, XMMWORD PTR __real_half 215 lea r9, QWORD PTR __log_F_inv_qword 216 217 ; check for negative numbers or zero 218 xorpd xmm5, xmm5 219 comisd xmm0, xmm5 220 jbe Llog10_sse2_x_is_zero_or_neg 221 222 ; f = F - Y, r = f * inv 223 subsd xmm1, xmm2 224 mulsd xmm1, QWORD PTR [r9+rax*8] 225 226 movapd xmm2, xmm1 227 movapd xmm0, xmm1 228 lea r9, QWORD PTR __log10_256_lead 229 230 ; poly 231 movsd xmm3, QWORD PTR __real_1_over_6 232 movsd xmm1, QWORD PTR __real_1_over_3 233 mulsd xmm3, xmm2 234 mulsd xmm1, xmm2 235 mulsd xmm0, xmm2 236 movapd xmm4, xmm0 237 addsd xmm3, QWORD PTR __real_1_over_5 238 addsd xmm1, QWORD PTR __real_1_over_2 239 mulsd xmm4, xmm0 240 mulsd xmm3, xmm2 241 mulsd xmm1, xmm0 242 addsd xmm3, QWORD PTR __real_1_over_4 243 addsd xmm1, xmm2 244 mulsd xmm3, xmm4 245 addsd xmm1, xmm3 246 247 movsd xmm5, QWORD PTR __real_log10_2_tail 248 mulsd xmm1, QWORD PTR __real_log10_e 249 250 ; m*log(10) + log10(G) - poly 251 mulsd xmm5, xmm6 252 subsd xmm5, xmm1 253 254 movsd xmm0, QWORD PTR [r9+rax*8] 255 lea rdx, QWORD PTR __log10_256_tail 256 movsd xmm2, QWORD PTR [rdx+rax*8] 257 258 movsd xmm4, QWORD PTR __real_log10_2_lead 259 mulsd xmm4, xmm6 260 addsd xmm0, xmm4 261 addsd xmm2, xmm5 262 263 addsd xmm0, xmm2 264 265 RestoreXmm xmm6, save_xmm6 266 StackDeallocate stack_size 267 ret 268 269ALIGN 16 270Llog10_sse2_near_one: 271 272 ; r = x - 1.0 273 movsd 
xmm2, QWORD PTR __real_two 274 subsd xmm0, QWORD PTR __real_one ; r 275 276 addsd xmm2, xmm0 277 movapd xmm1, xmm0 278 divsd xmm1, xmm2 ; r/(2+r) = u/2 279 280 movsd xmm4, QWORD PTR __real_ca2 281 movsd xmm5, QWORD PTR __real_ca4 282 283 movapd xmm6, xmm0 284 mulsd xmm6, xmm1 ; correction 285 286 addsd xmm1, xmm1 ; u 287 movapd xmm2, xmm1 288 289 mulsd xmm2, xmm1 ; u^2 290 291 mulsd xmm4, xmm2 292 mulsd xmm5, xmm2 293 294 addsd xmm4, QWORD PTR __real_ca1 295 addsd xmm5, QWORD PTR __real_ca3 296 297 mulsd xmm2, xmm1 ; u^3 298 mulsd xmm4, xmm2 299 300 mulsd xmm2, xmm2 301 mulsd xmm2, xmm1 ; u^7 302 mulsd xmm5, xmm2 303 304 movsd xmm2, QWORD PTR __real_log10_e_tail 305 addsd xmm4, xmm5 306 subsd xmm4, xmm6 307 movsd xmm6, QWORD PTR __real_log10_e_lead 308 309 movapd xmm3, xmm0 310 pand xmm3, XMMWORD PTR __mask_lower 311 subsd xmm0, xmm3 312 addsd xmm4, xmm0 313 314 movapd xmm0, xmm3 315 movapd xmm1, xmm4 316 317 mulsd xmm4, xmm2 318 mulsd xmm0, xmm2 319 mulsd xmm1, xmm6 320 mulsd xmm3, xmm6 321 322 addsd xmm0, xmm4 323 addsd xmm0, xmm1 324 addsd xmm0, xmm3 325 326 RestoreXmm xmm6, save_xmm6 327 StackDeallocate stack_size 328 ret 329 330Llog10_sse2_denormal_adjust: 331 por xmm2, XMMWORD PTR __real_one 332 subsd xmm2, QWORD PTR __real_one 333 movsd xmm5, xmm2 334 pand xmm2, XMMWORD PTR __mask_mant 335 movd rax, xmm2 336 psrlq xmm5, 52 337 psubd xmm5, XMMWORD PTR __mask_2045 338 cvtdq2pd xmm6, xmm5 339 jmp Llog10_sse2_continue_common 340 341ALIGN 16 342Llog10_sse2_x_is_zero_or_neg: 343 jne Llog10_sse2_x_is_neg 344 345 movsd xmm1, QWORD PTR __real_ninf 346 mov r8d, DWORD PTR __flag_x_zero 347 call fname_special 348 jmp Llog10_sse2_finish 349 350ALIGN 16 351Llog10_sse2_x_is_neg: 352 353 movsd xmm1, QWORD PTR __real_neg_qnan 354 mov r8d, DWORD PTR __flag_x_neg 355 call fname_special 356 jmp Llog10_sse2_finish 357 358ALIGN 16 359Llog10_sse2_x_is_inf_or_nan: 360 361 cmp rax, QWORD PTR __real_inf 362 je Llog10_sse2_finish 363 364 cmp rax, QWORD PTR __real_ninf 365 je 
Llog10_sse2_x_is_neg 366 367 or rax, QWORD PTR __real_qnanbit 368 movd xmm1, rax 369 mov r8d, DWORD PTR __flag_x_nan 370 call fname_special 371 jmp Llog10_sse2_finish 372 373ALIGN 16 374Llog10_sse2_finish: 375 RestoreXmm xmm6, save_xmm6 376 StackDeallocate stack_size 377 ret 378 379ALIGN 16 380Llog10_fma3: 381 ; compute exponent part 382 xor rax,rax 383 vpsrlq xmm3,xmm0,52 384 vmovq rax,xmm0 385 vpsubq xmm3,xmm3,XMMWORD PTR __int_1023 386 vcvtdq2pd xmm6,xmm3 ; xmm6 <-- (double)xexp 387 388 ; NaN or Inf? 389 vpand xmm5,xmm0,__real_inf 390 vcomisd xmm5,QWORD PTR __real_inf 391 je Llog10_fma3_x_is_inf_or_nan 392 393 ; negative number or zero? 394 vpxor xmm5,xmm5,xmm5 395 vcomisd xmm0,xmm5 396 jbe Llog10_fma3_x_is_zero_or_neg 397 398 vpand xmm2,xmm0,__mask_mant 399 vsubsd xmm4,xmm0,QWORD PTR __real_one 400 401 ; Subnormal? 402 vcomisd xmm6,QWORD PTR __real_neg_1023 403 je Llog10_fma3_denormal_adjust 404 405Llog10_fma3_continue_common: 406 ; compute index into the log tables 407 vpand xmm1,xmm0,XMMWORD PTR __mask_mant_top8 408 vpand xmm3,xmm0,XMMWORD PTR __mask_mant9 409 vpsllq xmm3,xmm3,1 410 vpaddq xmm1,xmm3,xmm1 411 vmovq rax,xmm1 412 413 ; near one codepath 414 vpand xmm4,xmm4,XMMWORD PTR __real_notsign 415 vcomisd xmm4,QWORD PTR __real_threshold 416 jb Llog10_fma3_near_one 417 418 ; F,Y 419 shr rax,44 420 vpor xmm2,xmm2,XMMWORD PTR __real_half 421 vpor xmm1,xmm1,XMMWORD PTR __real_half 422 lea r9,DWORD PTR __log_F_inv_qword 423 424 ; f = F - Y,r = f * inv 425 vsubsd xmm1,xmm1,xmm2 426 vmulsd xmm1,xmm1,QWORD PTR [r9 + rax * 8] 427 428 lea r9,DWORD PTR __log10_256_lead 429 430 ; poly 431 vmulsd xmm0,xmm1,xmm1 ; r*r 432 vmovsd xmm3,QWORD PTR __real_1_over_6 433 vmovsd xmm5,QWORD PTR __real_1_over_3 434 vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5 435 vfmadd213sd xmm5,xmm1,QWORD PTR __real_half ; 1/2+r*1/3 436 movsd xmm4,xmm0 ; r*r 437 vfmadd213sd xmm3 ,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6) 438 439 vmulsd xmm4,xmm0,xmm0 ; r*r*r*r 440 
vfmadd231sd xmm1,xmm5,xmm0 ; r*r*(1/2+r*1/3) + r 441 vfmadd231sd xmm1,xmm3,xmm4 442 443 vmulsd xmm1,xmm1,QWORD PTR __real_log10_e 444 ; m*log(2) + log(G) - poly*log10_e 445 vmovsd xmm5,QWORD PTR __real_log10_2_tail 446 vfmsub213sd xmm5,xmm6,xmm1 447 448 movsd xmm0,QWORD PTR [r9 + rax * 8] 449 lea rdx,DWORD PTR __log10_256_tail 450 movsd xmm2,QWORD PTR [rdx + rax * 8] 451 vaddsd xmm2,xmm2,xmm5 452 453 vfmadd231sd xmm0,xmm6,QWORD PTR __real_log10_2_lead 454 455 vaddsd xmm0,xmm0,xmm2 456 AVXRestoreXmm xmm6, save_xmm6 457 StackDeallocate stack_size 458 ret 459 460 461ALIGN 16 462Llog10_fma3_near_one: 463 ; r = x - 1.0 464 vmovsd xmm2,QWORD PTR __real_two 465 vsubsd xmm0,xmm0,QWORD PTR __real_one ; r 466 467 vaddsd xmm2,xmm2,xmm0 468 vdivsd xmm1,xmm0,xmm2 ; r/(2+r) = u/2 469 470 vmovsd xmm4,QWORD PTR __real_ca2 471 vmovsd xmm5,QWORD PTR __real_ca4 472 473 vmulsd xmm6,xmm0,xmm1 ; correction 474 vaddsd xmm1,xmm1,xmm1 ; u 475 476 vmulsd xmm2,xmm1,xmm1 ; u^2 477 vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1 478 vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3 479 480 vmulsd xmm2,xmm2,xmm1 ; u^3 481 vmulsd xmm4,xmm4,xmm2 482 483 vmulsd xmm2,xmm2,xmm2 484 vmulsd xmm2,xmm2,xmm1 ; u^7 485 486 vmulsd xmm5,xmm5,xmm2 487 vaddsd xmm4,xmm4,xmm5 488 vsubsd xmm4,xmm4,xmm6 489 vpand xmm3,xmm0,XMMWORD PTR __mask_lower 490 vsubsd xmm0,xmm0,xmm3 491 vaddsd xmm4,xmm4,xmm0 492 493 vmulsd xmm1,xmm4,QWORD PTR __real_log10_e_lead 494 vmulsd xmm4,xmm4,QWORD PTR __real_log10_e_tail 495 vmulsd xmm0,xmm3,QWORD PTR __real_log10_e_tail 496 vmulsd xmm3,xmm3,QWORD PTR __real_log10_e_lead 497 498 vaddsd xmm0,xmm0,xmm4 499 vaddsd xmm0,xmm0,xmm1 500 vaddsd xmm0,xmm0,xmm3 501 502 AVXRestoreXmm xmm6, save_xmm6 503 StackDeallocate stack_size 504 ret 505 506 507Llog10_fma3_denormal_adjust: 508 vpor xmm2,xmm2,XMMWORD PTR __real_one 509 vsubsd xmm2,xmm2,QWORD PTR __real_one 510 vpsrlq xmm5,xmm2,52 511 vpand xmm2,xmm2,XMMWORD PTR __mask_mant 512 vmovapd xmm0,xmm2 513 vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045 514 
vcvtdq2pd xmm6,xmm5 515 jmp Llog10_fma3_continue_common 516 517ALIGN 16 518Llog10_fma3_x_is_zero_or_neg: 519 jne Llog10_fma3_x_is_neg 520 vmovsd xmm1,QWORD PTR __real_ninf 521 mov r8d,DWORD PTR __flag_x_zero 522 call fname_special 523 524 AVXRestoreXmm xmm6, save_xmm6 525 StackDeallocate stack_size 526 ret 527 528 529ALIGN 16 530Llog10_fma3_x_is_neg: 531 532 vmovsd xmm1,QWORD PTR __real_neg_qnan 533 mov r8d,DWORD PTR __flag_x_neg 534 call fname_special 535 536 AVXRestoreXmm xmm6, save_xmm6 537 StackDeallocate stack_size 538 ret 539 540 541ALIGN 16 542Llog10_fma3_x_is_inf_or_nan: 543 544 cmp rax,QWORD PTR __real_inf 545 je Llog10_fma3_finish 546 547 cmp rax,QWORD PTR __real_ninf 548 je Llog10_fma3_x_is_neg 549 550 or rax,QWORD PTR __real_qnanbit 551 movd xmm1,rax 552 mov r8d,DWORD PTR __flag_x_nan 553 call fname_special 554 jmp Llog10_fma3_finish 555 556ALIGN 16 557Llog10_fma3_finish: 558 559 AVXRestoreXmm xmm6, save_xmm6 560 StackDeallocate stack_size 561 ret 562fname endp 563 564END 565 566