1; 2; MIT License 3; ----------- 4; 5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 6; 7; Permission is hereby granted, free of charge, to any person obtaining a copy 8; of this Software and associated documentaon files (the "Software"), to deal 9; in the Software without restriction, including without limitation the rights 10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11; copies of the Software, and to permit persons to whom the Software is 12; furnished to do so, subject to the following conditions: 13; 14; The above copyright notice and this permission notice shall be included in 15; all copies or substantial portions of the Software. 16; 17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23; THE SOFTWARE. 24; 25; An implementation of the cosf function. 26; 27; Prototype: 28; 29; float cosf(float x); 30; 31; Computes cosf(x). 32; Based on the NAG C implementation. 33; It will provide proper C99 return values, 34; but may not raise floating point status bits properly. 35; Original Author: Harsha Jagasia 36 37.const 38ALIGN 16 39L_real_one DQ 03ff0000000000000h ; 1.0 40 DQ 0 ; for alignment 41L_one_half DQ 03fe0000000000000h ; 0.5 42 DQ 0 43L_2bypi DQ 03fe45f306dc9c883h ; 2./pi 44 DQ 0 45L_one_sixth DQ 03fc5555555555555h ; 0.166666666666 46 DQ 0 47L_piby2 DQ 03fe921fb54442d18h 48 DQ 0 49L_piby2_1 DQ 03ff921fb54400000h ; piby2_1 50 DQ 0 51L_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail 52 DQ 0 53L_piby2_2 DQ 03dd0b4611a600000h ; piby2_2 54 DQ 0 55L_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail 56 DQ 0 57L_large_x_sse2 DQ 0411E848000000000h ; 5e5 58 DQ 0 59L_large_x_fma3 DQ 041E921FB60000000h ; 3.37325952e9 60 DQ 0 61L_sign_mask DQ 07FFFFFFFFFFFFFFFh 62 DQ 07FFFFFFFFFFFFFFFh 63L__int_three DQ 00000000000000003h 64 DQ 00000000000000003h 65L__min_norm_double DQ 00010000000000000h 66 DQ 00010000000000000h 67L_two_to_neg_7 DQ 03f80000000000000h 68 DQ 0 69L_two_to_neg_13 DQ 03f20000000000000h 70 DQ 0 71L_inf_mask_32 DD 07F800000h 72 DQ 0 73 74fname TEXTEQU <cosf> 75fname_special TEXTEQU <_cosf_special> 76 77;Define name and any external functions being called 78EXTERN __remainder_piby2d2f_forAsm : PROC ; NEAR 79EXTERN __remainder_piby2_fma3_bdl : PROC ; NEAR 80EXTERN __remainder_piby2_fma3 : PROC ; NEAR 81EXTERN fname_special : PROC 82EXTERN _set_statfp : PROC 83 84 85EXTRN __Lcosfarray:QWORD 86EXTRN __Lsinfarray:QWORD 87EXTRN __use_fma3_lib:DWORD 88 89; define local variable storage offsets 90p_temp equ 020h ; temporary for get/put bits operation 91p_temp1 equ 030h ; temporary for get/put bits operation 92dummy_space EQU 040h 93stack_size EQU 068h 94 95include fm.inc 96 97.code 98 99ALIGN 16 100PUBLIC fname 101fname PROC FRAME 102 StackAllocate stack_size 103 .ENDPROLOG 104 cmp DWORD PTR __use_fma3_lib, 0 105 jne Lcosf_fma3 106 107Lcosf_sse2: 108 109 xorpd xmm2, xmm2 ; zeroed out for later use 110 111;; if NaN or inf 112 movd edx, xmm0 113 mov eax, 07f800000h 114 mov r10d, eax 115 and r10d, edx 116 cmp r10d, eax 117 jz Lcosf_sse2_naninf 118 119 cvtss2sd xmm0, xmm0 120 movd rdx, xmm0 121 122; ax = (ux & ~SIGNBIT_DP64); 123 mov r10, rdx 124 btr r10, 63 ; r10 <-- |x| 125 mov r8d, 1 ; for determining region later on 126 127 movapd xmm1, xmm0 ; xmm1 <-- copy of x 128 129 130;; if (ax <= 3fe921fb54442d18h) /* abs(x) <= pi/4 */ 131 mov rax, 03fe921fb54442d18h 132 cmp r10, rax 133 jg Lcosf_sse2_absx_gt_piby4 134 135; *c = cos_piby4(x, 0.0); 136 movapd xmm2, xmm0 137 mulsd xmm2, xmm2 ;x^2 138 xor eax, eax 139 mov rdx, r10 140 movsd xmm5, QWORD PTR L_one_half 141 jmp Lcosf_sse2_calc_sincosf_piby4 ; done 142 143 144ALIGN 16 145Lcosf_sse2_absx_gt_piby4: 146; reduce the argument to be in a range from -pi/4 to +pi/4 147; by subtracting multiples of pi/2 148; xneg = (ax != ux); 149 movd xmm0, r10 ; xmm0 <-- |x| 150 cmp r10, QWORD PTR L_large_x_sse2 151 jae Lcosf_sse2_reduce_precise 152 153;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 154; xmm0=abs(x), xmm1=x 155;/* How many pi/2 is x a multiple of? */ 156 157 movapd xmm2, xmm0 158 movsd xmm3, QWORD PTR L_2bypi 159 movapd xmm4, xmm0 160 movsd xmm5, QWORD PTR L_one_half 161 mulsd xmm2, xmm3 162 163; movsd xmm5, QWORD PTR L_one_half 164; movapd xmm2, xmm0 165; mulsd xmm2, QWORD PTR L_2bypi 166; movapd xmm4, xmm0 167 168 mov r9, r10 169 shr r9, 52 ; r9 <-- biased exponent of x 170 171; npi2 = (int)(x * twobypi + 0.5); 172 addsd xmm2, xmm5 ; npi2 173 174 movsd xmm3, QWORD PTR L_piby2_1 ; piby2_1 175 cvttpd2dq xmm0, xmm2 ; xmm0 <-- npi2 176 movsd xmm1, QWORD PTR L_piby2_1tail ; piby2_1tail 177 cvtdq2pd xmm2, xmm0 ; xmm2 <-- (double)npi2 178 179; Subtract the multiple from x to get an extra-precision remainder 180; rhead = x - npi2 * piby2_1; 181 182 mulsd xmm3, xmm2 ; use piby2_1 183 subsd xmm4, xmm3 ; rhead 184 185; rtail = npi2 * piby2_1tail; 186 mulsd xmm1, xmm2 ; rtail 187 movd eax, xmm0 188 189; GET_BITS_DP64(rhead-rtail, uy); 190; originally only rhead 191 movapd xmm0, xmm4 192 subsd xmm0, xmm1 193 194 movsd xmm3, QWORD PTR L_piby2_2 ; piby2_2 195 movd rcx, xmm0 ; rcx <-- rhead-rtail 196 movsd xmm5, QWORD PTR L_piby2_2tail ; piby2_2tail 197 198; region = npi2 & 3; 199; and eax, 3 200; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); 201 shl rcx, 1 ; strip any sign bit 202 shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 203 sub r9, rcx ; expdiff 204 205;; if (expdiff > 15) 206 cmp r9, 15 207 jle Lcosf_sse2_expdiff_le_15 208 209; The remainder is pretty small compared with x, which 210; implies that x is a near multiple of pi/2 211; (x matches the multiple to at least 15 bits) 212; t = rhead; 213 movapd xmm1, xmm4 214 215; rtail = npi2 * piby2_2; 216 mulsd xmm3, xmm2 217 218; rhead = t - rtail; 219 mulsd xmm5, xmm2 ; npi2 * piby2_2tail 220 subsd xmm4, xmm3 ; rhead 221 222; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); 223 subsd xmm1, xmm4 ; t - rhead 224 subsd xmm1, xmm3 ; -rtail 225 subsd xmm5, xmm1 ; rtail 226 227; r = rhead - rtail; 228 movapd xmm0, xmm4 229 230;HARSHA 231;xmm1=rtail 232 movapd xmm1, xmm5 233 subsd xmm0, xmm5 234 235; xmm0=r, xmm4=rhead, xmm1=rtail 236 237;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 238Lcosf_sse2_expdiff_le_15: 239 cmp rcx, 03f2h ; is r < 2^-13 ? 240 jge Lcosf_sse2_calc_sincosf_piby4 ; use taylor series if not 241 cmp rcx, 03deh ; is r < 2^-33 ? 242 jle Lcosf_sse2_r_very_small ; then cosf(r) ~ 1 or r 243 244 movapd xmm2, xmm0 245 mulsd xmm2, xmm0 ; xmm2 <-- x^2 246 247;; if region is 1 or 3 do a sinf calc. 248 and r8d, eax 249 jz Lcosf_sse2_r_small_calc_sin 250 251Lcosf_sse2_r_small_calc_cos: 252; region 1 or 3 253; use simply polynomial 254; *s = x - x*x*x*0.166666666666666666; 255 movsd xmm3, QWORD PTR L_one_sixth 256 mulsd xmm3, xmm0 ; * x 257 mulsd xmm3, xmm2 ; * x^2 258 subsd xmm0, xmm3 ; xs 259 jmp Lcosf_sse2_adjust_region 260 261ALIGN 16 262Lcosf_sse2_r_small_calc_sin: 263; region 0 or 2 264; cos = 1.0 - x*x*0.5; 265 movsd xmm0, QWORD PTR L_real_one ; 1.0 266 mulsd xmm2, QWORD PTR L_one_half ; 0.5 *x^2 267 subsd xmm0, xmm2 268 jmp Lcosf_sse2_adjust_region 269 270ALIGN 16 271Lcosf_sse2_r_very_small: 272; then sin(r) = r 273; if region is 1 or 3 do a sin calc. 274 and r8d, eax 275 jnz Lcosf_sse2_adjust_region 276 277 movsd xmm0, QWORD PTR L_real_one ; cosf(r) is a 1 278 ; By this point, calculations should already have set inexact 279 jmp Lcosf_sse2_adjust_region 280 281;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 282ALIGN 16 283Lcosf_sse2_reduce_precise: 284; Reduce abs(x) into range [-pi/4, pi/4] 285; remainder_piby2d2f(ax, &r, ®ion); 286 mov QWORD PTR p_temp[rsp], rdx ; save ux for use later 287 mov QWORD PTR p_temp1[rsp], r10 ; save ax for use later 288 289 call __remainder_piby2d2f_forAsm 290 mov rdx, QWORD PTR p_temp[rsp] ; restore ux for use later 291 mov r10, QWORD PTR p_temp1[rsp] ; restore ax for use later 292 mov r8d, 1 ; for determining region later 293 294 ; Reduced argument is in xmm0. No second word; after all, we started in 295 ; single precision. Region is in rax. 296 movapd xmm1, xmm0 297 movsd xmm5, QWORD PTR L_one_half 298 299 jmp Lcosf_sse2_calc_sincosf_piby4 300 301 302; done with reducing the argument. Now perform the sin/cos calculations. 303ALIGN 16 304Lcosf_sse2_calc_sincosf_piby4: 305 movapd xmm2, xmm0 306 mulsd xmm2, xmm0 ; x^2 307 308;; if region is 0 or 2, do a cosf calc 309 and r8d, eax 310 jz Lcosf_sse2_do_cosf_calc 311; region is 1 or 3: do a sinf calc. 312Lcosf_sse2_do_sinf_calc: 313 movsd xmm1, QWORD PTR __Lsinfarray+18h ; s4 314 mulsd xmm1, xmm2 ; s4x2 315 movsd xmm4, xmm2 ; move for x4 316 mulsd xmm4, xmm2 ; x4 317 movsd xmm5, QWORD PTR __Lsinfarray+8h ; s2 318 mulsd xmm5, xmm2 ; s2x2 319 movsd xmm3, xmm0 ; move for x3 320 mulsd xmm3, xmm2 ; x3 321 addsd xmm1, QWORD PTR __Lsinfarray+10h ; s3+s4x2 322 mulsd xmm1, xmm4 ; s3x4+s4x6 323 addsd xmm5, QWORD PTR __Lsinfarray ; s1+s2x2 324 addsd xmm1, xmm5 ; s1+s2x2+s3x4+s4x6 325 mulsd xmm1, xmm3 ; x3(s1+s2x2+s3x4+s4x6) 326 addsd xmm0, xmm1 ; x + x3(s1+s2x2+s3x4+s4x6) 327 jmp Lcosf_sse2_adjust_region 328 329ALIGN 16 330Lcosf_sse2_do_cosf_calc: 331; region 0 or 2 - do a cos calculation 332; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8; 333; zc = 1-0.5*x2+ c1*x4 +c2*x6 +c3*x8 + c4*x10 for a higher precision 334 movsd xmm1, QWORD PTR __Lcosfarray+20h ; c4 335 movsd xmm4, xmm2 ; move for x4 336 mulsd xmm1, xmm2 ; c4x2 337 movsd xmm3, QWORD PTR __Lcosfarray+10h ; c2 338 mulsd xmm4, xmm2 ; x4 339 movsd xmm0, QWORD PTR __Lcosfarray ; c0 340 mulsd xmm3, xmm2 ; c2x2 341 mulsd xmm0, xmm2 ; c0x2 (=-0.5x2) 342 addsd xmm1, QWORD PTR __Lcosfarray+18h ; c3+c4x2 343 mulsd xmm1, xmm4 ; c3x4 + c4x6 344 addsd xmm3, QWORD PTR __Lcosfarray+8h ; c1+c2x2 345 addsd xmm1, xmm3 ; c1 + c2x2 + c3x4 + c4x6 346 mulsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 347 addsd xmm0, QWORD PTR L_real_one ; 1 - 0.5x2 348 addsd xmm0, xmm1 ; 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10 349 350Lcosf_sse2_adjust_region: 351; xmm1 is cos or sin, relies on previous sections to 352; switch (region) 353 add eax, 1 354 and eax, 2 355 jz Lcosf_sse2_cleanup 356;; if region 1 or 2 then we negate the result. 357 xorpd xmm2, xmm2 358 subsd xmm2, xmm0 359 movapd xmm0, xmm2 360 361ALIGN 16 362Lcosf_sse2_cleanup: 363 cvtsd2ss xmm0, xmm0 364 StackDeallocate stack_size 365 ret 366 367 368Lcosf_sse2_naninf: 369 call fname_special 370 StackDeallocate stack_size 371 ret 372 373 374ALIGN 16 375Lcosf_fma3: 376 vmovd eax,xmm0 377 mov r8d,L_inf_mask_32 378 and eax,r8d 379 cmp eax, r8d 380 jz Lcosf_fma3_naninf 381 382 vcvtss2sd xmm5,xmm0,xmm0 383 vmovq r9,xmm5 384 btr r9,63 ;clear sign 385 386 cmp r9,L_piby2 387 jg Lcosf_fma3_range_reduce 388 cmp r9,L_two_to_neg_7 389 jge Lcosf_fma3_compute_cosf_piby_4 390 cmp r9,L_two_to_neg_13 391 jge Lcosf_fma3_compute_1_xx_5 392 393 vmovq xmm0,QWORD PTR L_real_one 394 ; Here we need to set inexact 395 vaddsd xmm0,xmm0,L__min_norm_double ; this will set inexact 396 jmp Lcosf_fma3_return 397 398ALIGN 16 399Lcosf_fma3_compute_1_xx_5: 400 vmulsd xmm0,xmm5,QWORD PTR L_one_half 401 vfnmadd213sd xmm0,xmm5,L_real_one ; xmm9 1.0 - x*x*(double2)0.5 402 jmp Lcosf_fma3_return 403 404ALIGN 16 405Lcosf_fma3_compute_cosf_piby_4: 406 movsd xmm0,xmm5 407 vmovapd xmm2,L_real_one 408 vmulsd xmm3,xmm0,xmm0 409 vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r 410 vsubsd xmm2,xmm2,xmm1 411 vmovsd xmm1,__Lcosfarray+018h 412 vfmadd231sd xmm1,xmm3,__Lcosfarray+020h 413 vfmadd213sd xmm1,xmm3,__Lcosfarray+010h 414 vfmadd213sd xmm1,xmm3,__Lcosfarray+008h 415 vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4 416 vmovdqa xmm0,xmm2 417 vfmadd231sd xmm0,xmm1,xmm3 418 jmp Lcosf_fma3_return 419 420ALIGN 16 421Lcosf_fma3_range_reduce: 422 vmovq xmm0,r9 ; xmm0 <-- |x| 423 cmp r9,L_large_x_fma3 424 jge Lcosf_reduce_precise 425 426;cosff_range_e_5_s: 427 vandpd xmm1,xmm0,L_sign_mask 428 vmovapd xmm2,L_2bypi 429 vfmadd213sd xmm2,xmm1,L_one_half 430 vcvttpd2dq xmm2,xmm2 431 vpmovsxdq xmm1,xmm2 432 vandpd xmm4,xmm1,L__int_three ; region xmm4 433 vshufps xmm1 ,xmm1,xmm1,8 434 vcvtdq2pd xmm1,xmm1 435 vmovdqa xmm2,xmm0 436 vfnmadd231sd xmm2,xmm1,L_piby2_1 ; xmm2 rhead 437 vmulsd xmm3,xmm1,L_piby2_1tail ; xmm3 rtail 438 vsubsd xmm0,xmm2,xmm3 ; r_1 xmm0 439 vsubsd xmm2,xmm2,xmm0 440 vsubsd xmm1,xmm2,xmm3 441 vmovq rax,xmm4 442 jmp Lcosf_exit_s 443 444ALIGN 16 445Lcosf_reduce_precise: 446 447 vmovq xmm0,r9 ; r9 <-- |x| 448 cmp r9,L_large_x_fma3 449 jge Lcos_remainder_piby2 450 451 ; __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl 452 ; have the following conventions: 453 ; on input 454 ; x is in xmm0 455 ; on output 456 ; r is in xmm0 457 ; rr is in xmm1 458 ; region is in rax 459 ; The _bdl routine is guaranteed not to touch r10 460 461Lcos_remainder_piby2_small: ;; unused label 462 ; Boldo-Daumas-Li reduction for reasonably small |x| 463 call __remainder_piby2_fma3_bdl 464 jmp Lcosf_exit_s 465 466ALIGN 16 467Lcos_remainder_piby2: 468 ; argument reduction for general x 469 call __remainder_piby2_fma3 470Lcosf_exit_s: 471 bt rax,0 472 jnc Lcosf_piby4_compute 473 474;sinf_piby4_compute: 475; vmovapd xmm1,__Lsinfarray+010h 476 vmovsd xmm1,__Lsinfarray+010h 477 vmulsd xmm3,xmm0,xmm0 478 vfmadd231sd xmm1,xmm3,__Lsinfarray+018h 479 vfmadd213sd xmm1,xmm3,__Lsinfarray+008h 480 vfmadd213sd xmm1,xmm3,__Lsinfarray 481 vmulsd xmm3,xmm0,xmm3 ; xmm3 <-- x^3 482 vfmadd231sd xmm0,xmm1,xmm3 483 jmp Lcosf_fma3_adjust_sign 484 485ALIGN 16 486Lcosf_piby4_compute: 487 vmovapd xmm2,L_real_one 488 vmulsd xmm3,xmm0,xmm0 489 vmulsd xmm1,xmm3,L_one_half ; xmm1 <-- r 490 vsubsd xmm2,xmm2,xmm1 491 vmovsd xmm1,__Lcosfarray+018h 492 vfmadd231sd xmm1 ,xmm3,__Lcosfarray+020h 493 vfmadd213sd xmm1 ,xmm3,__Lcosfarray+010h 494 vfmadd213sd xmm1 ,xmm3,__Lcosfarray+008h 495 vmulsd xmm3,xmm3,xmm3 ; xmm3 <-- x^4 496 vmovdqa xmm0, xmm2 497 vfmadd231sd xmm0 ,xmm1,xmm3 498 499Lcosf_fma3_adjust_sign: 500 ; assuming FMA3 ==> AVX ==> SSE4.1 501; vpcmpeqq xmm1,xmm4,XMMWORD PTR L_int_one 502; vpcmpeqq xmm2,xmm4,XMMWORD PTR L_int_two 503; vorpd xmm3,xmm2,xmm1 504 505; vandpd xmm3,xmm3,L_signbit 506 507 add rax,1 ; 1,2 --> 2,3 508 shr rax,1 ; 2,3 --> 1 509 shl rax,63 ; 1 --> sign bit 510 vmovq xmm3,rax 511 512 vxorpd xmm0,xmm0,xmm3 513 514Lcosf_fma3_return: 515 vcvtsd2ss xmm0,xmm0,xmm0 516 StackDeallocate stack_size 517 ret 518 519Lcosf_fma3_naninf: 520 call fname_special 521 StackDeallocate stack_size 522 ret 523 524fname endp 525END 526