1; 2; MIT License 3; ----------- 4; 5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 6; 7; Permission is hereby granted, free of charge, to any person obtaining a copy 8; of this Software and associated documentaon files (the "Software"), to deal 9; in the Software without restriction, including without limitation the rights 10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11; copies of the Software, and to permit persons to whom the Software is 12; furnished to do so, subject to the following conditions: 13; 14; The above copyright notice and this permission notice shall be included in 15; all copies or substantial portions of the Software. 16; 17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23; THE SOFTWARE. 24; 25; log.asm 26; 27; An implementation of the log libm function. 28; 29; Prototype: 30; 31; double log(double x); 32; 33 34; 35; Algorithm: 36; 37; Based on: 38; Ping-Tak Peter Tang 39; "Table-driven implementation of the logarithm function in IEEE 40; floating-point arithmetic" 41; ACM Transactions on Mathematical Software (TOMS) 42; Volume 16, Issue 4 (December 1990) 43; 44; 45; x very close to 1.0 is handled differently, for x everywhere else 46; a brief explanation is given below 47; 48; x = (2^m)*A 49; x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9)) 50; x = (2^m)*2*(G/2+g/2) 51; x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10)) 52; 53; Y = (2^(-1))*(2^(-m))*(2^m)*A 54; Now, range of Y is: 0.5 <= Y < 1 55; 56; F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit) 57; Now, range of F is: 256 <= F <= 512 58; F = F / 512 59; Now, range of F is: 0.5 <= F <= 1 60; 61; f = -(Y-F), with (f <= 2^(-10)) 62; 63; log(x) = m*log(2) + log(2) + log(F-f) 64; log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F)) 65; log(x) = m*log(2) + log(2*F) + log(1-r) 66; 67; r = (f/F), with (r <= 2^(-9)) 68; r = f*(1/F) with (1/F) precomputed to avoid division 69; 70; log(x) = m*log(2) + log(G) - poly 71; 72; log(G) is precomputed 73; poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4) + (r^5)/5) + (r^6)/6)) 74; 75; log(2) and log(G) need to be maintained in extra precision 76; to avoid losing precision in the calculations 77; 78 79.const 80ALIGN 16 81 82__real_ninf DQ 0fff0000000000000h ; -inf 83 DQ 0000000000000000h 84__real_inf DQ 7ff0000000000000h ; +inf 85 DQ 0000000000000000h 86__real_neg_qnan DQ 0fff8000000000000h ; neg qNaN 87 DQ 0000000000000000h 88__real_qnanbit DQ 0008000000000000h 89 DQ 0000000000000000h 90__real_min_norm DQ 0010000000000000h 91 DQ 0000000000000000h 92__real_mant DQ 000FFFFFFFFFFFFFh ; mantissa bits 93 DQ 0000000000000000h 94__mask_1023 DQ 00000000000003ffh 95 DQ 0000000000000000h 96__mask_001 DQ 0000000000000001h 97 DQ 0000000000000000h 98 99__mask_mant_all8 DQ 000ff00000000000h 100 DQ 0000000000000000h 101__mask_mant9 DQ 0000080000000000h 102 DQ 0000000000000000h 103 104__real_two DQ 4000000000000000h ; 2 105 DQ 0000000000000000h 106 107__real_one DQ 3ff0000000000000h ; 1 108 DQ 0000000000000000h 109 110__real_near_one_lt DQ 3fee000000000000h ; .9375 111 DQ 0000000000000000h 112 113__real_near_one_gt DQ 3ff1000000000000h ; 1.0625 114 DQ 0000000000000000h 115 116__real_half DQ 3fe0000000000000h ; 1/2 117 DQ 0000000000000000h 118 119__mask_100 DQ 0000000000000100h 120 DQ 0000000000000000h 121 122__real_1_over_512 DQ 3f60000000000000h 123 DQ 0000000000000000h 124 125__real_1_over_2 DQ 3fe0000000000000h 126 DQ 0000000000000000h 127__real_1_over_3 DQ 3fd5555555555555h 128 DQ 0000000000000000h 129__real_1_over_4 DQ 3fd0000000000000h 130 DQ 0000000000000000h 131__real_1_over_5 DQ 3fc999999999999ah 132 DQ 0000000000000000h 133__real_1_over_6 DQ 3fc5555555555555h 134 DQ 0000000000000000h 135 136__mask_1023_f DQ 0c08ff80000000000h 137 DQ 0000000000000000h 138 139__mask_2045 DQ 00000000000007fdh 140 DQ 0000000000000000h 141 142__real_threshold DQ 3fb0000000000000h ; .0625 143 DQ 0000000000000000h 144 145__real_notsign DQ 7ffFFFFFFFFFFFFFh ; ^sign bit 146 DQ 0000000000000000h 147 148__real_ca1 DQ 3fb55555555554e6h ; 8.33333333333317923934e-02 149 DQ 0000000000000000h 150__real_ca2 DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02 151 DQ 0000000000000000h 152__real_ca3 DQ 3f62492307f1519fh ; 2.23213998791944806202e-03 153 DQ 0000000000000000h 154__real_ca4 DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04 155 DQ 0000000000000000h 156__real_log2_lead DQ 03fe62e42e0000000h ; 6.93147122859954833984e-01 157 DQ 00000000000000000h 158__real_log2_tail DQ 03e6efa39ef35793ch ; 5.76999904754328540596e-08 159 DQ 00000000000000000h 160 161; these codes and the ones in the corresponding .c file have to match 162__flag_x_zero DD 00000001 163__flag_x_neg DD 00000002 164__flag_x_nan DD 00000003 165 166 167EXTRN __log_256_lead:QWORD 168EXTRN __log_256_tail:QWORD 169EXTRN __log_F_inv_qword:QWORD 170EXTRN __use_fma3_lib:DWORD 171 172 173fname TEXTEQU <log> 174fname_special TEXTEQU <_log_special> 175 176; define local variable storage offsets 177 178save_xmm6 EQU 20h 179dummy_space EQU 40h 180 181stack_size EQU 58h 182 183include fm.inc 184 185; external function 186EXTERN fname_special:PROC 187 188.code 189ALIGN 16 190PUBLIC fname 191fname PROC FRAME 192 StackAllocate stack_size 193 SaveXmm xmm6, save_xmm6 194 .ENDPROLOG 195 196 cmp DWORD PTR __use_fma3_lib, 0 197 jne Llog_fma3 198 199Llog_sse2: 200 201 ; compute exponent part 202 movdqa xmm3, xmm0 203 movapd xmm4, xmm0 204 psrlq xmm3, 52 205 movd rax, xmm0 206 psubq xmm3, XMMWORD PTR __mask_1023 207 208 ; NaN or inf 209 mov rcx, rax 210 btr rcx, 63 211 cmp rcx, QWORD PTR __real_inf 212 jae __x_is_inf_or_nan 213 214 movdqa xmm2, xmm0 215 cvtdq2pd xmm6, xmm3 ; xexp 216 217 218 pand xmm2, XMMWORD PTR __real_mant 219 subsd xmm4, QWORD PTR __real_one 220 221 comisd xmm6, QWORD PTR __mask_1023_f 222 je __denormal_adjust 223 224__continue_common: 225 226 andpd xmm4, XMMWORD PTR __real_notsign 227 ; compute index into the log tables 228 mov r9, rax 229 and rax, QWORD PTR __mask_mant_all8 230 and r9, QWORD PTR __mask_mant9 231 shl r9, 1 232 add rax, r9 233 movd xmm1, rax 234 235 ; near one codepath 236 comisd xmm4, QWORD PTR __real_threshold 237 jb __near_one 238 239 ; F, Y 240 shr rax, 44 241 por xmm2, XMMWORD PTR __real_half 242 por xmm1, XMMWORD PTR __real_half 243 lea r9, __log_F_inv_qword 244 245 ; check for negative numbers or zero 246 xorpd xmm5, xmm5 247 comisd xmm0, xmm5 248 jbe __x_is_zero_or_neg 249 250 ; f = F - Y, r = f * inv 251 subsd xmm1, xmm2 ; xmm1 <-- f = F - Y 252 mulsd xmm1, QWORD PTR [r9+rax*8] ; xmm1 <-- r = f * inv 253 254 movapd xmm2, xmm1 ; xmm2 <-- copy of r 255 movapd xmm0, xmm1 ; xmm0 <-- copy of r 256 lea r9, QWORD PTR __log_256_lead 257 258 ; poly 259 movsd xmm3, QWORD PTR __real_1_over_6 260 movsd xmm1, QWORD PTR __real_1_over_3 261 mulsd xmm3, xmm2 ; xmm3 <-- r/6 262 mulsd xmm1, xmm2 ; xmm1 <-- r/3 263 mulsd xmm0, xmm2 ; xmm0 <-- r*r 264 movapd xmm4, xmm0 ; xmm4 <-- copy of r*r 265 addsd xmm3, QWORD PTR __real_1_over_5 ; xmm3 <-- r/6 + 1/5 266 addsd xmm1, QWORD PTR __real_1_over_2 ; xmm1 <-- r/3 + 1/2 267 mulsd xmm4, xmm0 ; xmm4 <-- r^4 268 mulsd xmm3, xmm2 ; xmm3 <-- (r/6 + 1/5)*r 269 mulsd xmm1, xmm0 ; xmm1 <-- (r/3 + 1/2)*r^2 270 addsd xmm3, QWORD PTR __real_1_over_4 ; xmm3 <-- (r/6 + 1/5)*r + 1/4 271 addsd xmm1, xmm2 ; xmm1 <-- (r/3 + 1/2)*r^2 + r 272 mulsd xmm3, xmm4 ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4 273 addsd xmm1, xmm3 ; xmm1 <-- poly 274 275 ; m*log(2)_tail + log(G)_tail - poly 276 movsd xmm5, QWORD PTR __real_log2_tail 277 mulsd xmm5, xmm6 ; xmm5 <-- m*log2_tail 278 subsd xmm5, xmm1 ; xmm5 <-- m*log2_tail - poly 279 280 movsd xmm0, QWORD PTR [r9+rax*8] ; xmm0 <-- log(G)_lead 281 lea rdx, QWORD PTR __log_256_tail 282 movsd xmm2, QWORD PTR [rdx+rax*8] ; xmm2 <-- log(G)_tail 283 addsd xmm2, xmm5 ; xmm2 <-- (m*log2_tail - poly) + log(G)_tail 284 285 movsd xmm4, QWORD PTR __real_log2_lead 286 mulsd xmm4, xmm6 ; xmm4 <-- m*log2_lead 287 addsd xmm0, xmm4 ; xmm0 <-- m*log2_lead + log(G)_lead 288 289 addsd xmm0, xmm2 ; xmm0 <-- m*log(2)_tail + log(G)_tail - poly 290 291 RestoreXmm xmm6, save_xmm6 292 StackDeallocate stack_size 293 ret 294 295ALIGN 16 296__near_one: 297 298 ; r = x - 1.0 299 movsd xmm2, QWORD PTR __real_two 300 subsd xmm0, QWORD PTR __real_one ; r 301 302 addsd xmm2, xmm0 303 movsd xmm1, xmm0 304 divsd xmm1, xmm2 ; r/(2+r) = u/2 305 306 movsd xmm4, QWORD PTR __real_ca2 307 movsd xmm5, QWORD PTR __real_ca4 308 309 movsd xmm6, xmm0 310 mulsd xmm6, xmm1 ; correction 311 312 addsd xmm1, xmm1 ; u 313 movsd xmm2, xmm1 314 315 mulsd xmm2, xmm1 ; u^2 316 317 mulsd xmm4, xmm2 318 mulsd xmm5, xmm2 319 320 addsd xmm4, __real_ca1 321 addsd xmm5, __real_ca3 322 323 mulsd xmm2, xmm1 ; u^3 324 mulsd xmm4, xmm2 325 326 mulsd xmm2, xmm2 327 mulsd xmm2, xmm1 ; u^7 328 mulsd xmm5, xmm2 329 330 addsd xmm4, xmm5 331 subsd xmm4, xmm6 332 addsd xmm0, xmm4 333 334 RestoreXmm xmm6, save_xmm6 335 StackDeallocate stack_size 336 ret 337 338ALIGN 16 339__denormal_adjust: 340 por xmm2, XMMWORD PTR __real_one 341 subsd xmm2, QWORD PTR __real_one 342 movsd xmm5, xmm2 343 pand xmm2, XMMWORD PTR __real_mant 344 movd rax, xmm2 345 psrlq xmm5, 52 346 psubd xmm5, XMMWORD PTR __mask_2045 347 cvtdq2pd xmm6, xmm5 348 jmp __continue_common 349 350ALIGN 16 351__x_is_zero_or_neg: 352 jne __x_is_neg 353 354 movsd xmm1, QWORD PTR __real_ninf 355 mov r8d, DWORD PTR __flag_x_zero 356 call fname_special 357 jmp __finish 358 359ALIGN 16 360__x_is_neg: 361 362 movsd xmm1, QWORD PTR __real_neg_qnan 363 mov r8d, DWORD PTR __flag_x_neg 364 call fname_special 365 jmp __finish 366 367ALIGN 16 368__x_is_inf_or_nan: 369 370 cmp rax, QWORD PTR __real_inf 371 je __finish 372 373 cmp rax, QWORD PTR __real_ninf 374 je __x_is_neg 375 376 or rax, QWORD PTR __real_qnanbit 377 movd xmm1, rax 378 mov r8d, DWORD PTR __flag_x_nan 379 call fname_special 380 jmp __finish 381 382ALIGN 16 383__finish: 384 RestoreXmm xmm6, save_xmm6 385 StackDeallocate stack_size 386 ret 387 388ALIGN 16 389Llog_fma3: 390 ; compute exponent part 391 xor rax,rax 392 vpsrlq xmm3,xmm0,52 393 vmovq rax,xmm0 394 vpsubq xmm3,xmm3,XMMWORD PTR __mask_1023 395 vcvtdq2pd xmm6,xmm3 ; xexp 396 397 ; NaN or inf 398 vpand xmm5,xmm0,XMMWORD PTR __real_inf 399 vcomisd xmm5,QWORD PTR __real_inf 400 je Llog_fma3_x_is_inf_or_nan 401 402 ; check for negative numbers or zero 403 vpxor xmm5,xmm5,xmm5 404 vcomisd xmm0,xmm5 405 jbe Llog_fma3_x_is_zero_or_neg 406 407 vpand xmm2,xmm0,XMMWORD PTR __real_mant 408 vsubsd xmm4,xmm0,QWORD PTR __real_one 409 410 vcomisd xmm6,QWORD PTR __mask_1023_f 411 je Llog_fma3_denormal_adjust 412 413Llog_fma3_continue_common: 414 ; compute index into the log tables 415 vpand xmm1,xmm0,XMMWORD PTR __mask_mant_all8 416 vpand xmm3,xmm0,XMMWORD PTR __mask_mant9 417 vpsllq xmm3,xmm3,1 418 vpaddq xmm1,xmm3,xmm1 419 vmovq rax,xmm1 420 421 ; near one codepath 422 vpand xmm4,xmm4,XMMWORD PTR __real_notsign 423 vcomisd xmm4,QWORD PTR __real_threshold 424 jb Llog_fma3_near_one 425 426 ; F,Y 427 shr rax,44 428 vpor xmm2,xmm2,XMMWORD PTR __real_half 429 vpor xmm1,xmm1,XMMWORD PTR __real_half 430 lea r9,QWORD PTR __log_F_inv_qword 431 432 ; f = F - Y,r = f * inv 433 vsubsd xmm1,xmm1,xmm2 434 vmulsd xmm1,xmm1,QWORD PTR[r9 + rax * 8] 435 436 lea r9,QWORD PTR __log_256_lead 437 438 ; poly 439 vmulsd xmm0,xmm1,xmm1 ; r*r 440 vmovsd xmm3,QWORD PTR __real_1_over_6 441 vmovsd xmm5,QWORD PTR __real_1_over_3 442 vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5 443 vfmadd213sd xmm5,xmm1,QWORD PTR __real_1_over_2 ; 1/2+r*1/3 444 vmovsd xmm4,xmm0,xmm0 445 vfmadd213sd xmm3,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6) 446 447 vmulsd xmm4,xmm0,xmm0 ; r*r*r*r 448 vfmadd231sd xmm1,xmm5,xmm0 ; r*r*(1/2+r*1/3) + r 449 vfmadd231sd xmm1,xmm3,xmm4 450 451 ; m*log(2) + log(G) - poly 452 vmovsd xmm5,QWORD PTR __real_log2_tail 453 vfmsub213sd xmm5,xmm6,xmm1 454 455 vmovsd xmm0,QWORD PTR[r9 + rax * 8] 456 lea rdx,QWORD PTR __log_256_tail 457 vmovsd xmm1,QWORD PTR[rdx + rax * 8] 458 vaddsd xmm1,xmm1,xmm5 459 460 vfmadd231sd xmm0,xmm6,QWORD PTR __real_log2_lead 461 462 vaddsd xmm0,xmm0,xmm1 463 AVXRestoreXmm xmm6, save_xmm6 464 StackDeallocate stack_size 465 ret 466 467 468ALIGN 16 469Llog_fma3_near_one: 470 471 ; r = x - 1.0 472 vmovsd xmm3,QWORD PTR __real_two 473 vsubsd xmm0,xmm0,QWORD PTR __real_one ; r 474 475 vaddsd xmm3,xmm3,xmm0 476 vdivsd xmm1,xmm0,xmm3 ; r/(2+r) = u/2 477 478 vmovsd xmm4,QWORD PTR __real_ca2 479 vmovsd xmm5,QWORD PTR __real_ca4 480 481 vmulsd xmm3,xmm0,xmm1 ; correction 482 vaddsd xmm1,xmm1,xmm1 ; u 483 484 vmulsd xmm2,xmm1,xmm1 ; u^2 485 vfmadd213sd xmm4,xmm2,QWORD PTR __real_ca1 486 vfmadd213sd xmm5,xmm2,QWORD PTR __real_ca3 487 488 vmulsd xmm2,xmm2,xmm1 ; u^3 489 vmulsd xmm4,xmm4,xmm2 490 491 vmulsd xmm2,xmm2,xmm2 492 vmulsd xmm2,xmm2,xmm1 ; u^7 493 494 vfmadd231sd xmm4,xmm5,xmm2 495 vsubsd xmm4,xmm4,xmm3 496 vaddsd xmm0,xmm0,xmm4 497 498 AVXRestoreXmm xmm6, save_xmm6 499 StackDeallocate stack_size 500 ret 501 502 503Llog_fma3_denormal_adjust: 504 vpor xmm2,xmm2,XMMWORD PTR __real_one 505 vsubsd xmm2,xmm2,QWORD PTR __real_one 506 vpsrlq xmm5,xmm2,52 507 vpand xmm2,xmm2,XMMWORD PTR __real_mant 508 vmovapd xmm0,xmm2 509 vpsubd xmm5,xmm5,XMMWORD PTR __mask_2045 510 vcvtdq2pd xmm6,xmm5 511 jmp Llog_fma3_continue_common 512 513ALIGN 16 514Llog_fma3_x_is_zero_or_neg: 515 jne Llog_fma3_x_is_neg 516 vmovsd xmm1,QWORD PTR __real_ninf 517 mov r8d,DWORD PTR __flag_x_zero 518 call fname_special 519 520 AVXRestoreXmm xmm6, save_xmm6 521 StackDeallocate stack_size 522 ret 523 524ALIGN 16 525Llog_fma3_x_is_neg: 526 527 vmovsd xmm1,QWORD PTR __real_neg_qnan 528 mov r8d,DWORD PTR __flag_x_neg 529 call fname_special 530 531 AVXRestoreXmm xmm6, save_xmm6 532 StackDeallocate stack_size 533 ret 534 535ALIGN 16 536Llog_fma3_x_is_inf_or_nan: 537 538 cmp rax,QWORD PTR __real_inf 539 je Llog_fma3_finish 540 541 cmp rax,QWORD PTR __real_ninf 542 je Llog_fma3_x_is_neg 543 544 or rax,QWORD PTR __real_qnanbit 545 vmovq xmm1,rax 546 mov r8d,DWORD PTR __flag_x_nan 547 call fname_special 548 549ALIGN 16 550Llog_fma3_finish: 551 AVXRestoreXmm xmm6, save_xmm6 552 StackDeallocate stack_size 553 ret 554fname endp 555 556END 557 558