1; 2; MIT License 3; ----------- 4; 5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 6; 7; Permission is hereby granted, free of charge, to any person obtaining a copy 8; of this Software and associated documentaon files (the "Software"), to deal 9; in the Software without restriction, including without limitation the rights 10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11; copies of the Software, and to permit persons to whom the Software is 12; furnished to do so, subject to the following conditions: 13; 14; The above copyright notice and this permission notice shall be included in 15; all copies or substantial portions of the Software. 16; 17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23; THE SOFTWARE. 24; 25; 26; logf.asm 27; 28; An implementation of the logf libm function. 29; 30; Prototype: 31; 32; float logf(float x); 33; 34 35; 36; Algorithm: 37; Similar to one presnted in log.asm 38; 39.const 40 41 42ALIGN 16 43 44L_real_one DQ 0000000003f800000h ; 1.0 45 DQ 0000000000000000h 46L_real_two DQ 00000000040000000h ; 1.0 47 DQ 00000000000000000h 48L_real_ninf DQ 000000000ff800000h ; -inf 49 DQ 0000000000000000h 50L_real_inf DQ 0000000007f800000h ; +inf 51 DQ 0000000000000000h 52L_real_nan DQ 0000000007fc00000h ; NaN 53 DQ 0000000000000000h 54L_real_neg_qnan DQ 000000000ffc00000h 55 DQ 0000000000000000h 56L_real_notsign DQ 0000000007ffFFFFFh ; ^sign bit 57 DQ 0000000000000000h 58L_real_mant DQ 0007FFFFF007FFFFFh ; mantissa bits 59 DQ 0007FFFFF007FFFFFh 60L_mask_127 DQ 00000007f0000007fh ; 61 DQ 00000007f0000007fh 62L_mask_253 DQ 000000000000000fdh 63 DQ 00000000000000000h 64L_mask_mant_all7 DQ 00000000007f0000h 65 DQ 00000000007f0000h 66L_mask_mant8 DQ 0000000000008000h 67 DQ 0000000000000000h 68L_real_ca1 DQ 0000000003DAAAAABh ; 8.33333333333317923934e-02 69 DQ 00000000000000000h 70L_real_ca2 DQ 0000000003C4CCCCDh ; 1.25000000037717509602e-02 71 DQ 00000000000000000h 72L_real_log2_lead DQ 03F3170003F317000h ; 0.693115234375 73 DQ 00000000000000000h 74L_real_log2_tail DQ 0000000003805FDF4h ; 0.000031946183 75 DQ 00000000000000000h 76L_real_half DQ 0000000003f000000h ; 1/2 77 DQ 00000000000000000h 78L_real_1_over_3 DQ 0000000003eaaaaabh 79 DQ 00000000000000000h 80 81L_real_1_over_2 DD 03f000000h 82L_real_neg127 DD 0c2fe0000h 83L_real_qnanbit DD 000400000h ; quiet nan bit 84L_real_threshold DD 03d800000h 85 86; these codes and the ones in the corresponding .c file have to match 87L_flag_x_zero DD 00000001 88L_flag_x_neg DD 00000002 89L_flag_x_nan DD 00000003 90 91EXTRN __log_128_lead:DWORD 92EXTRN __log_128_tail:DWORD 93EXTRN __log_F_inv_dword:DWORD 94EXTRN __use_fma3_lib:DWORD 95 96fname TEXTEQU <logf> 97fname_special TEXTEQU <_logf_special> 98 99; define local variable storage offsets 100 101dummy_space EQU 020h 102stack_size EQU 038h 103 104include fm.inc 105 106; external function 107EXTERN fname_special:PROC 108 109.code 110 111PUBLIC fname 112fname PROC FRAME 113 StackAllocate stack_size 114 .ENDPROLOG 115 cmp DWORD PTR __use_fma3_lib, 0 116 jne Llogf_fma3 117 118 ; Some of the placement of instructions below iwll be odd. 119 ; We are attempting to have no more than one branch per 32-byte block. 120Llogf_sse2: 121 ; Zero the high bits of rax because it will be used as an index later. 122 xor rax, rax 123 movdqa xmm3, xmm0 124 movaps xmm4, xmm0 125 126 ; This computation of the expoonent of x will produce nonsenes if x <= 0., 127 ; but those cases are eliminated below, so it does no harm. 128 psrld xmm3, 23 ; xmm3 <-- biased exp if x > 0. 129 130 ; Is x Inf or NaN? 131 movd eax, xmm0 ; eax <-- x 132 mov ecx, eax 133 btr ecx, 31 ; ecx <-- |x| 134 cmp ecx, DWORD PTR L_real_inf 135 jae Llogf_sse2_x_is_inf_or_nan 136 137 ; Finish computing exponent. 138 psubd xmm3, XMMWORD PTR L_mask_127 ; xmm3 <-- xexp (unbiased) 139 movdqa xmm2, xmm0 140 cvtdq2ps xmm5, xmm3 ; (float)xexp, unless x <= 0. 141 142 ; Is x negative or zero? 143 xorps xmm1, xmm1 144 comiss xmm0, xmm1 145 jbe Llogf_sse2_x_is_zero_or_neg 146 147 pand xmm2, XMMWORD PTR L_real_mant ; xmm2 <-- x mantissa for later 148 subss xmm4, DWORD PTR L_real_one ; xmm4 <-- x - 1. for later 149 150 comiss xmm5, DWORD PTR L_real_neg127 ; x!=0, xexp==0 ==> subnormal 151 je Llogf_sse2_subnormal_adjust 152 153Llogf_sse2_continue_common: 154 ; At this point we need |x| (possibly adjusted) in eax 155 ; and m = xexpx (possibly adjusted) in xmm5 156 ; We also need the value of x - 1. computed above. 157 158 ; compute the index into the log tables 159 mov r9d, eax 160 and eax, DWORD PTR L_mask_mant_all7 ; eax <-- 7 bits of x mantissa 161 and r9d, DWORD PTR L_mask_mant8 ; r9d <-- 8th bit 162 shl r9d, 1 163 add eax, r9d ; use 8th bit to round up 164 movd xmm1, eax 165 166 ; Is x near 1.0 ? 167 ; Note that if x is subnormal it is perforce not near one. 168 andps xmm4, XMMWORD PTR L_real_notsign ; xmm4 <-- |x-1| 169 comiss xmm4, DWORD PTR L_real_threshold ; is |x-1| < 1/16? 170 jb Llogf_sse2_near_one ; if so, handle elsewhere 171 172 ; F, Y 173 ; F is a number in [.5,1) scaled from the rounded mantissa bits computed 174 ; above by oring in the exponent of .5. 175 ; Y is all of the mantissa bits of X scaled to [.5,1.) similarly 176 shr eax, 16 ; shift eax to use as index 177 por xmm2, XMMWORD PTR L_real_half ; xmm2 <-- Y 178 por xmm1, XMMWORD PTR L_real_half ; xmm2 <-- F 179 lea r9, QWORD PTR __log_F_inv_dword 180 181 182 ; f = F - Y, r = f * inv 183 subss xmm1, xmm2 ; xmm1 <-- f 184 mulss xmm1, DWORD PTR [r9+rax*4] ; xmm1 <-- r = f*inv (tabled) 185 186 movaps xmm2, xmm1 187 movaps xmm0, xmm1 188 189 ; poly 190 mulss xmm2, DWORD PTR L_real_1_over_3 ; xmm2 <-- r/3 191 mulss xmm0, xmm1 ; xmm0 <-- r^2 192 addss xmm2, DWORD PTR L_real_1_over_2 193 movaps xmm3, XMMWORD PTR L_real_log2_tail 194 195 lea r9, QWORD PTR __log_128_tail 196 lea r10, QWORD PTR __log_128_lead 197 198 mulss xmm2, xmm0 ; xmm2 <-- r^2 * (r/3 + 1/2) 199 mulss xmm3, xmm5 ; xmm3 <-- (m=xexp)*log2_tail 200 addss xmm1, xmm2 ; xmm1 <-- poly 201 202 ; m*log(2) + log(G) - poly, where G is just 2*F 203 ; log(G) is precomputed to extra precision. 204 ; small pieces and large pieces are separated until the final add, 205 ; to preserve accuracy 206 movaps xmm0, XMMWORD PTR L_real_log2_lead 207 subss xmm3, xmm1 ; xmm3 <-- m*log2_tail - poly 208 mulss xmm0, xmm5 ; xmm0 <-- m*log1_lead 209 addss xmm3, DWORD PTR [r9+rax*4] ; xmm3 += log(G) tail 210 addss xmm0, DWORD PTR [r10+rax*4] ; xmm0 += log(G) lead 211 212 addss xmm0, xmm3 ; xmm0 <-- m*log(2)+log(G)-poly 213 214 StackDeallocate stack_size 215 ret 216 217ALIGN 16 218Llogf_sse2_near_one: 219 ; Computation of the log for x near one requires special techniques. 220 movaps xmm2, DWORD PTR L_real_two 221 subss xmm0, DWORD PTR L_real_one ; xmm0 <-- r = x - 1.0 222 addss xmm2, xmm0 223 movaps xmm1, xmm0 224 divss xmm1, xmm2 ; xmm1 <-- u = r/(2.0+r) 225 movaps xmm4, xmm0 226 mulss xmm4, xmm1 ; xmm4 <-- correction = r*u 227 addss xmm1, xmm1 ; xmm1 <-- u = 2.*u 228 movaps xmm2, xmm1 229 mulss xmm2, xmm2 ; xmm2 <-- u^2 230 231 ; r2 = (u^3 * (ca_1 + u^2 * ca_2) - correction) 232 movaps xmm3, xmm1 233 mulss xmm3, xmm2 ; xmm3 <-- u^3 234 mulss xmm2, DWORD PTR L_real_ca2 ; xmm2 <-- ca2*u^2 235 addss xmm2, DWORD PTR L_real_ca1 ; xmm2 <-- ca2*u^2 + ca1 236 mulss xmm2, xmm3 ; xmm2 <-- u^3*(ca1+u^2*ca2) 237 subss xmm2, xmm4 ; xmm2 <-- r2 238 239 ; return r + r2 240 addss xmm0, xmm2 241 StackDeallocate stack_size 242 ret 243 244ALIGN 16 245Llogf_sse2_subnormal_adjust: 246 ; This code adjusts eax and xmm5. 247 ; It must preserve xmm4. 248 por xmm2, XMMWORD PTR L_real_one 249 subss xmm2, DWORD PTR L_real_one 250 movdqa xmm5, xmm2 251 pand xmm2, XMMWORD PTR L_real_mant 252 movd eax, xmm2 253 psrld xmm5, 23 254 psubd xmm5, XMMWORD PTR L_mask_253 255 cvtdq2ps xmm5, xmm5 256 jmp Llogf_sse2_continue_common 257 258; Until we get to the FMA3 code, the rest of this is special case handling. 259Llogf_sse2_x_is_zero_or_neg: 260 jne Llogf_sse2_x_is_neg 261 262 movaps xmm1, XMMWORD PTR L_real_ninf 263 mov r8d, DWORD PTR L_flag_x_zero 264 call fname_special 265 jmp Llogf_sse2_finish 266 267Llogf_sse2_x_is_neg: 268 269 movaps xmm1, XMMWORD PTR L_real_neg_qnan 270 mov r8d, DWORD PTR L_flag_x_neg 271 call fname_special 272 jmp Llogf_sse2_finish 273 274Llogf_sse2_x_is_inf_or_nan: 275 276 cmp eax, DWORD PTR L_real_inf 277 je Llogf_sse2_finish 278 279 cmp eax, DWORD PTR L_real_ninf 280 je Llogf_sse2_x_is_neg 281 282 or eax, DWORD PTR L_real_qnanbit 283 movd xmm1, eax 284 mov r8d, DWORD PTR L_flag_x_nan 285 call fname_special 286 jmp Llogf_sse2_finish 287 288Llogf_sse2_finish: 289 StackDeallocate stack_size 290 ret 291 292ALIGN 16 293Llogf_fma3: 294 ; compute exponent part 295 vmovaps xmm4,XMMWORD PTR L_real_inf ; preload for inf/nan test 296 xor rax,rax 297 vpsrld xmm3,xmm0,23 ; xmm3 <-- (ux>>23) 298 vmovd eax,xmm0 ;eax = x 299 vpsubd xmm3,xmm3,DWORD PTR L_mask_127 ; xmm3 <-- (ux>>23) - 127 300 vcvtdq2ps xmm5,xmm3 ; xmm5 <-- float((ux>>23)-127) = xexp 301 302 ; NaN or inf 303 vpand xmm1,xmm0,xmm4 ; xmm1 <-- (ux & 07f800000h) 304 vcomiss xmm1,xmm4 305 je Llogf_fma3_x_is_inf_or_nan 306 307 ; check for negative numbers or zero 308 vpxor xmm1,xmm1,xmm1 309 vcomiss xmm0,xmm1 310 jbe Llogf_fma3_x_is_zero_or_neg 311 312 vpand xmm2,xmm0,DWORD PTR L_real_mant ; xmm2 <-- ux & 0007FFFFFh 313 vsubss xmm4,xmm0,DWORD PTR L_real_one ; xmm4 <-- x - 1.0 314 315 vcomiss xmm5,DWORD PTR L_real_neg127 316 je Llogf_fma3_subnormal_adjust 317 318Llogf_fma3_continue_common: 319 320 ; compute the index into the log tables 321 vpand xmm1,xmm0,DWORD PTR L_mask_mant_all7 ; xmm1 = ux & 0007f0000h 322 vpand xmm3,xmm0,DWORD PTR L_mask_mant8 ; xmm3 = ux & 000008000h 323 vpslld xmm3,xmm3,1 ; xmm3 = (ux & 000008000h) << 1 324 vpaddd xmm1,xmm3,xmm1 325 ; eax = (ux & 0007f0000h) + ((ux & 000008000h) << 1) 326 ; eax <-- x/127., rounded to nearest 327 vmovd eax,xmm1 328 329 ; near one codepath 330 vandps xmm4,xmm4,DWORD PTR L_real_notsign ; xmm4 <-- fabs (x - 1.0) 331 vcomiss xmm4,DWORD PTR L_real_threshold 332 jb Llogf_fma3_near_one 333 334 ; F,Y 335 shr eax,16 336 vpor xmm2,xmm2,DWORD PTR L_real_half ; xmm2 <-- Y 337 vpor xmm1,xmm1,DWORD PTR L_real_half ; xmm1 <-- F 338 lea r9,QWORD PTR __log_F_inv_dword 339 340 ; f = F - Y 341 vsubss xmm1,xmm1,xmm2 ; f = F - Y 342 ; r = f * log_F_inv_dword[index] 343 vmulss xmm1,xmm1,DWORD PTR [r9 + rax * 4] 344 345 ; poly 346 vmovaps xmm2,XMMWORD PTR L_real_1_over_3 347 vfmadd213ss xmm2,xmm1,DWORD PTR L_real_1_over_2 ; 1/3*r + 1/2 348 vmulss xmm0,xmm1,xmm1 ; r*r 349 vmovaps xmm3,DWORD PTR L_real_log2_tail; 350 351 lea r9,DWORD PTR __log_128_tail 352 lea r10,DWORD PTR __log_128_lead 353 354 vfmadd231ss xmm1,xmm2,xmm0 ; poly = r + 1/2*r*r + 1/3*r*r*r 355 vfmsub213ss xmm3,xmm5,xmm1 ; (xexp * log2_tail) - poly 356 357 ; m*log(2) + log(G) - poly 358 vmovaps xmm0,DWORD PTR L_real_log2_lead 359 vfmadd213ss xmm0,xmm5,[r10 + rax * 4] 360 ; z2 = (xexp * log2_tail) - poly + log_128_tail[index] 361 vaddss xmm3,xmm3,DWORD PTR [r9 + rax * 4] 362 vaddss xmm0,xmm0,xmm3 ; return z1 + z2 363 364 StackDeallocate stack_size 365 ret 366 367ALIGN 16 368Llogf_fma3_near_one: 369 ; r = x - 1.0; 370 vmovaps xmm2,DWORD PTR L_real_two 371 vsubss xmm0,xmm0,DWORD PTR L_real_one ; xmm0 = r = = x - 1.0 372 373 ; u = r / (2.0 + r) 374 vaddss xmm2,xmm2,xmm0 ; (r+2.0) 375 vdivss xmm1,xmm0,xmm2 ; u = r / (2.0 + r) 376 377 ; correction = r * u 378 vmulss xmm4,xmm0,xmm1 ; correction = u*r 379 380 ; u = u + u; 381 vaddss xmm1,xmm1,xmm1 ; u = u+u 382 vmulss xmm2,xmm1,xmm1 ; v = u^2 383 384 ; r2 = (u * v * (ca_1 + v * ca_2) - correction) 385 vmulss xmm3,xmm1,xmm2 ; u^3 386 vmovaps xmm5,DWORD PTR L_real_ca2 387 vfmadd213ss xmm2,xmm5,DWORD PTR L_real_ca1 388 vfmsub213ss xmm2,xmm3,xmm4 ; r2 = (ca1 + ca2 * v) * u^3 - correction 389 390 ; r + r2 391 vaddss xmm0,xmm0,xmm2 392 StackDeallocate stack_size 393 ret 394 395 396ALIGN 16 397Llogf_fma3_subnormal_adjust: 398 vmovaps xmm3,DWORD PTR L_real_one 399 vpor xmm2,xmm2,xmm3 ; xmm2 = temp = ((ux &0007FFFFFh) | 03f800000h) 400 vsubss xmm2,xmm2,xmm3 ; xmm2 = temp -1.0 401 vpsrld xmm5,xmm2,23 ; xmm5 = (utemp >> 23) 402 vpand xmm2,xmm2,DWORD PTR L_real_mant ; xmm2 = (utemp & 0007FFFFFh) 403 vmovaps xmm0,xmm2 404 vpsubd xmm5,xmm5,DWORD PTR L_mask_253 ; xmm5 = (utemp >> 23) - 253 405 vcvtdq2ps xmm5,xmm5 ; xmm5 = (float) ((utemp >> 23) - 253) 406 jmp Llogf_fma3_continue_common 407 408Llogf_fma3_x_is_zero_or_neg: 409 jne Llogf_fma3_x_is_neg 410 411 vmovaps xmm1,DWORD PTR L_real_ninf 412 mov r8d,DWORD PTR L_flag_x_zero 413 call fname_special 414 415 StackDeallocate stack_size 416 ret 417 418 419Llogf_fma3_x_is_neg: 420 421 vmovaps xmm1,DWORD PTR L_real_neg_qnan 422 mov r8d,DWORD PTR L_flag_x_neg 423 call fname_special 424 425 StackDeallocate stack_size 426 ret 427 428Llogf_fma3_x_is_inf_or_nan: 429 430 cmp eax,DWORD PTR L_real_inf 431 je Llogf_fma3_finish 432 433 cmp eax,DWORD PTR L_real_ninf 434 je Llogf_fma3_x_is_neg 435 436 or eax,DWORD PTR L_real_qnanbit 437 vmovd xmm1,eax 438 mov r8d,DWORD PTR L_flag_x_nan 439 call fname_special 440 441 StackDeallocate stack_size 442 ret 443 444Llogf_fma3_finish: 445 446 StackDeallocate stack_size 447 ret 448 449 450fname endp 451END 452