1; 2; MIT License 3; ----------- 4; 5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 6; 7; Permission is hereby granted, free of charge, to any person obtaining a copy 8; of this Software and associated documentaon files (the "Software"), to deal 9; in the Software without restriction, including without limitation the rights 10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11; copies of the Software, and to permit persons to whom the Software is 12; furnished to do so, subject to the following conditions: 13; 14; The above copyright notice and this permission notice shall be included in 15; all copies or substantial portions of the Software. 16; 17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23; THE SOFTWARE. 24; 25; 26; An implementation of the sin function. 27; 28; Prototype: 29; 30; double sin(double x); 31; 32; Computes sin(x). 33; It will provide proper C99 return values, 34; but may not raise floating point status bits properly. 35; Based on the NAG C implementation. 36; 37; If FMA3 hardware is available, an FMA3 implementation of sin will be used. 38 39 40.const 41ALIGN 16 42L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1 43 DQ 0 44L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail 45 DQ 0 46L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2 47 DQ 0 48L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail 49 DQ 0 50ALIGN 16 51 52L_one DQ 03FF0000000000000h, 03FF0000000000000h 53L_signbit DQ 08000000000000000h, 00000000000000000h 54L_int_one DQ 00000000000000001h, 00000000000000000h 55L_int_two DQ 00000000000000002h, 00000000000000000h 56L_int_three DQ 00000000000000003h, 00000000000000000h 57 58L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi 59L_one_half DQ 03FE0000000000000h ; .5 60L_one_sixth DQ 03FC5555555555555h ; .1666... 61L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27 62L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13 63L_piby4 DQ 03FE921FB54442D18h ; pi/4 64L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW 65L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL 66 67L__inf_mask_64 DQ 07FF0000000000000h ; +Inf 68 69EXTRN __Lcosarray:QWORD 70EXTRN __Lsinarray:QWORD 71EXTRN __use_fma3_lib:DWORD 72 73; define local variable storage offsets 74p_temp EQU 030h 75p_temp1 EQU 040h 76save_r10 EQU 050h 77dummy_space EQU 060h 78stack_size EQU 078h 79 80include fm.inc 81 82fname TEXTEQU <sin> 83fname_special TEXTEQU <_sin_special> 84 85;Define name and any external functions being called 86EXTERN __remainder_piby2_forAsm : PROC 87EXTERN __remainder_piby2_fma3 : PROC 88EXTERN __remainder_piby2_fma3_bdl : PROC 89EXTERN fname_special : PROC 90 91.code 92 93PUBLIC fname 94fname PROC FRAME 95 StackAllocate stack_size 96 .ENDPROLOG 97 cmp DWORD PTR __use_fma3_lib, 0 98 jne Lsin_fma3 99 100Lsin_sse2: 101 movd rdx, xmm0 102 xorpd xmm2, xmm2 ; zeroed out for later use 103 104 mov r10,rdx 105 mov r8d, 1 ; for determining region later on 106 btr r10,63 ; r10 <-- |x| 107 cmp r10,L_piby4 108 jb Lsin_sse2_absx_lt_piby4 109 110Lsin_sse2_absx_nlt_piby4: ; common case 111 mov r11,rdx 112 shr r11,63 113 movd xmm0,r10 ; xmm0 <-- |x| 114 cmp r10, QWORD PTR L_small_arg_cw 115 jae Lsin_reduce_precise ; Note NaN/Inf will branch 116 117; At this point we have |x| < L_small_arg_cw, which is currently 500000. 118; Note that if |x| were too large, conversion of npi2 to integer would fail. 119; We reduce the argument to be in a range from -pi/4 to +pi/4 120; by subtracting multiples of pi/2 121 movapd xmm2, xmm0 122 mulsd xmm2, L_2_by_pi 123 movapd xmm4, xmm0 124 125; xexp = ax >> EXPSHIFTBITS_DP64; 126 mov r9, r10 127 shr r9, 52 ; >>EXPSHIFTBITS_DP64 128 129; How many pi/2 is |x| a multiple of? 130; npi2 = (int)(x * twobypi + 0.5); 131 addsd xmm2, L_one_half ; npi2 132 133 movsd xmm3, L_real_piby2_1 134 cvttpd2dq xmm0, xmm2 ; convert npi2 to integer 135 movsd xmm1, L_real_piby2_1tail 136 cvtdq2pd xmm2, xmm0 ; npi2 back to double 137 138; Subtract the multiple from x to get an extra-precision remainder 139; rhead = x - npi2 * piby2_1; 140 mulsd xmm3, xmm2 141 subsd xmm4, xmm3 ; rhead 142 143; rtail = npi2 * piby2_1tail; 144 mulsd xmm1, xmm2 ; rtail 145 movd eax, xmm0 ; eax <-- npi2 146 147; GET_BITS_DP64(rhead-rtail, uy); 148; originally only rhead 149 movapd xmm0, xmm4 150 subsd xmm0, xmm1 151 152 movsd xmm3, L_real_piby2_2 153 movd rcx, xmm0 ; rcx <-- rhead - rtail 154 movsd xmm5, L_real_piby2_2tail ; piby2_2tail 155 156; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, 157; xmm4=rhead, xmm5= temp for calc 158; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); 159; expdiff measures how close rhead - rtail is to |x| 160; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer) 161 shl rcx, 1 ; strip any sign bit 162 shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 163 sub r9, rcx ; expdiff 164 165;; if (expdiff > 15) 166 cmp r9, 15 167 jle Lsin_sse2_cw_reduction_done 168 169; Here the remainder is pretty small compared with x, which 170; implies that x is a near multiple of pi/2 171; (x matches the multiple to at least 15 bits) 172; So we do another stage of argument reduction. 173 174; t = rhead; 175 movapd xmm1, xmm4 176 177; rtail = npi2 * piby2_2; 178 mulsd xmm3, xmm2 179 180; rhead = t - rtail; 181 mulsd xmm5, xmm2 ; npi2 * piby2_2tail 182 subsd xmm4, xmm3 ; rhead 183 184; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); 185 subsd xmm1, xmm4 ; t - rhead 186 subsd xmm1, xmm3 ; -rtail 187 subsd xmm5, xmm1 ; rtail 188 189; r = rhead - rtail; 190 movapd xmm0, xmm4 191 192;HARSHA 193;xmm1=rtail 194 movapd xmm1, xmm5 ; xmm1 <-- copy of rtail 195 subsd xmm0, xmm5 196 197 198; xmm0=r, xmm4=rhead, xmm1=rtail 199Lsin_sse2_cw_reduction_done: 200;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 201;; if the input was close to a pi/2 multiple 202; The original NAG code missed this trick. 203; If the input is very close to n*pi/2 after reduction, so r < 2^-27, 204; then the sin is either ~ 1.0 or ~r, to within 53 bits. 205 206; Note: Unfortunately this introduces two jcc instructions close to each 207; other and to other branches. As r < 2^-13 should be rather uncommon, it 208; almost certainly costs more than it saves. - WAT 209;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 210; region = npi2 & 3; 211 212 subsd xmm4, xmm0 ; rhead-r 213 subsd xmm4, xmm1 ; rr = (rhead-r) - rtail 214 215Lsin_piby4: 216; perform taylor series to calc sinx, sinx for |x| <= pi/4 217; x2 = r * r; 218 219;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the sin path 220;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path 221 movapd xmm3, xmm0 222 movapd xmm2, xmm0 223 mulsd xmm2, xmm0 ;x2 224 225 bt eax,0 226 jc Lsin_sse2_calc_cos 227 228;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 229; region 0 or 2 do a sin calculation 230 movsd xmm3, __Lsinarray+50h ; s6 231 mulsd xmm3, xmm2 ; x2s6 232 movsd xmm5, __Lsinarray+20h ; s3 233 movsd QWORD PTR p_temp[rsp], xmm4 ; store xx 234 movapd xmm1, xmm2 ; move for x4 235 mulsd xmm1, xmm2 ; x4 236 movsd QWORD PTR p_temp1[rsp], xmm0 ; store x 237 mulsd xmm5, xmm2 ; x2s3 238 movapd xmm4, xmm0 ; move for x3 239 addsd xmm3, __Lsinarray+40h ; s5+x2s6 240 mulsd xmm1, xmm2 ; x6 241 mulsd xmm3, xmm2 ; x2(s5+x2s6) 242 mulsd xmm4, xmm2 ; x3 243 addsd xmm5, __Lsinarray+10h ; s2+x2s3 244 mulsd xmm5, xmm2 ; x2(s2+x2s3) 245 addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6) 246 mulsd xmm2, L_one_half ; 0.5 *x2 247 movsd xmm0, QWORD PTR p_temp[rsp] ; load xx 248 mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6)) 249 addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3) 250 mulsd xmm2, xmm0 ; 0.5 * x2 *xx 251 addsd xmm3, xmm5 ; zs 252 mulsd xmm4, xmm3 ; *x3 253 subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx 254 addsd xmm0, xmm4 ; +xx 255 addsd xmm0, QWORD PTR p_temp1[rsp] ; +x 256 257 jmp Lsin_sse2_adjust_region 258 259ALIGN 16 260Lsin_sse2_calc_cos: 261;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 262; region 1 or 3 - do a cos calculation 263; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6)))); 264 mulsd xmm4, xmm0 ; x*xx 265 movsd xmm5, L_one_half 266 movsd xmm1, __Lcosarray+50h ; c6 267 movsd xmm0, __Lcosarray+20h ; c3 268 mulsd xmm5, xmm2 ; r = 0.5 *x2 269 movapd xmm3, xmm2 ; copy of x2 270 movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx 271 mulsd xmm1, xmm2 ; c6*x2 272 mulsd xmm0, xmm2 ; c3*x2 273 subsd xmm5, L_one ; -t=r-1.0, trash r 274 mulsd xmm3, xmm2 ; x4 275 addsd xmm1, __Lcosarray+40h ; c5+x2c6 276 addsd xmm0, __Lcosarray+10h ; c2+x2C3 277 addsd xmm5, L_one ; 1 + (-t), trash t 278 mulsd xmm3, xmm2 ; x6 279 mulsd xmm1, xmm2 ; x2(c5+x2c6) 280 mulsd xmm0, xmm2 ; x2(c2+x2C3) 281 movapd xmm4, xmm2 ; copy of x2 282 mulsd xmm4, L_one_half ; r recalculate 283 addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6) 284 addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3) 285 mulsd xmm2, xmm2 ; x4 recalculate 286 subsd xmm5, xmm4 ; (1 + (-t)) - r 287 mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6)) 288 addsd xmm0, xmm1 ; zc 289 subsd xmm4, L_one ; t relaculate 290 subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx 291 mulsd xmm0, xmm2 ; x4 * zc 292 addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx) 293 subsd xmm0, xmm4 ; result - (-t) 294 295;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 296 297Lsin_sse2_adjust_region: 298; positive or negative 299; switch (region) 300 shr eax, 1 301 mov ecx, eax 302 and eax, r11d 303 304 not ecx 305 not r11d 306 and ecx, r11d 307 308 or eax, ecx 309 and eax, 1 310 jnz Lsin_sse2_cleanup 311 312;; if the original region 0, 1 and arg is negative, then we negate the result. 313;; if the original region 2, 3 and arg is positive, then we negate the result. 314 movapd xmm2, xmm0 315 xorpd xmm0, xmm0 316 subsd xmm0, xmm2 317 318ALIGN 16 319Lsin_sse2_cleanup: 320 StackDeallocate stack_size 321 ret 322 323ALIGN 16 324Lsin_sse2_absx_lt_piby4: 325; sin = sin_piby4(x, 0.0); 326 327; x2 = r * r; 328 movapd xmm2, xmm0 329 mulsd xmm2, xmm0 ; x2 330 331;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 332; region 0 - do a sin calculation 333; zs = (s2 + x2 * (s3 + x2 * (s4 + x2 * (s5 + x2 * s6)))); 334 movsd xmm3, __Lsinarray+50h ; s6 335 mulsd xmm3, xmm2 ; x2s6 336 movsd xmm5, __Lsinarray+20h ; s3 337 movapd xmm1, xmm2 ; move for x4 338 mulsd xmm1, xmm2 ; x4 339 mulsd xmm5, xmm2 ; x2s3 340 movapd xmm4, xmm0 ; move for x3 341 addsd xmm3, __Lsinarray+40h ; s5+x2s6 342 mulsd xmm1, xmm2 ; x6 343 mulsd xmm3, xmm2 ; x2(s5+x2s6) 344 mulsd xmm4, xmm2 ; x3 345 addsd xmm5, __Lsinarray+10h ; s2+x2s3 346 mulsd xmm5, xmm2 ; x2(s2+x2s3) 347 addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6) 348 mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6)) 349 addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3) 350 addsd xmm3, xmm5 ; zs 351 mulsd xmm4, xmm3 ; *x3 352 addsd xmm0, xmm4 ; +x 353 jmp Lsin_sse2_cleanup 354 355;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 356ALIGN 16 357Lsin_reduce_precise: 358; Reduce x into range [-pi/4, pih/4] 359 cmp r10,L__inf_mask_64 360 jae Lsin_x_naninf 361 mov QWORD PTR p_temp[rsp], r11 362 call __remainder_piby2_forAsm 363 mov r11, QWORD PTR p_temp[rsp] 364 365 ; At this point xmm0 has r, xmm1 has rr, rax has region 366 367 movapd xmm4, xmm1 ; xmm4 <-- rr 368 jmp Lsin_piby4 369 370; xmm0 = x, xmm4 = xx, eax= region 371 372 373ALIGN 16 374Lsin_x_naninf: 375 call fname_special 376 StackDeallocate stack_size 377 ret 378 379;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 380; From this point we assume that FMA3 and AVX hardware are present. 381 382ALIGN 16 383Lsin_fma3: 384 vmovq r9,xmm0 385 mov r10,r9 ; save x to get sign later 386 btr r9,63 ; r9 <-- |x| 387 cmp r9,L_piby4 388 jae Lsin_fma3_absx_nlt_piby4 ; Note that NaN will branch 389 cmp r9,L_two_to_neg_13 390 jae Lsin_fma3_calc_sin_for_absx_lt_piby4 391 cmp r9,L_two_to_neg_27 392 jae Lsin_fma3_compute_x_xxx_0_1666 393 StackDeallocate stack_size 394 ret ; sin x ~= x for |x| < 2^-27 395 396ALIGN 16 397Lsin_fma3_compute_x_xxx_0_1666: ; |x| in [2^-27,2^-13] 398 vmulsd xmm1,xmm0,xmm0 ; xmm1l <-- x*x 399 vmulsd xmm1,xmm1,xmm0 ; xmm1l <-- x*x*x 400 vfnmadd231sd xmm0,xmm1,L_one_sixth ; xmm0l <-- x - x*x*x*(1/6) 401 StackDeallocate stack_size 402 ret 403 404ALIGN 16 405Lsin_fma3_calc_sin_for_absx_lt_piby4: ; |x| in [2^-13,pi/4] 406 vmovsd xmm5,__Lsinarray+050h 407 vmulsd xmm3,xmm0,xmm0 ; xmm3l <-- x^2 408 409 vfmadd213sd xmm5,xmm3,__Lsinarray+040h 410 vfmadd213sd xmm5,xmm3,__Lsinarray+030h 411 vfmadd213sd xmm5,xmm3,__Lsinarray+020h 412 vfmadd213sd xmm5,xmm3,__Lsinarray+010h 413 414 vmulsd xmm4,xmm0,xmm3 ; xmm4l <-- x^3 415 vfmadd213sd xmm5,xmm3,__Lsinarray 416 vfmadd231sd xmm0,xmm4,xmm5 ; xmm0l <-- x + x^3 p(x^2) 417 418 StackDeallocate stack_size 419 ret 420 421ALIGN 16 422Lsin_fma3_absx_nlt_piby4: ; !(|x| < pi/4) 423 ; here r9 has |x| 424 cmp r9,L__inf_mask_64 425 jae Lsin_x_naninf 426;Lrange_reduce: ;; unused label 427 428 vmovq xmm0,r9 ; xmm0 <-- |x| 429 cmp r9,L_small_arg_bdl 430 jae Lsin_fma3_do_general_arg_reduction 431 432 ; Note that __remainder_piby2_fma3 conventions are 433 ; on input 434 ; |x| is in xmm0 435 ; on output 436 ; r is in xmm0 437 ; rr is in xmm1 438 ; region of |x| is in rax 439 440 ; Boldo-Daumas-Li reduction for reasonably small |x| 441 call __remainder_piby2_fma3_bdl 442Lsin_fma3_exit_s: 443 bt rax,0 444 vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x 445 jc Lsin_fma3_calc_cos 446 447Lsin_fma3_calc_sin: ;; unused label 448 ; region 0 or 2 449 ; compute the sine of r+rr, where this sum is in [-pi/4,pi/4] 450 vmovsd xmm5,__Lsinarray+050h 451 vfmadd213sd xmm5,xmm3,__Lsinarray+040h 452 vfmadd213sd xmm5,xmm3,__Lsinarray+030h 453 vfmadd213sd xmm5,xmm3,__Lsinarray+020h 454 vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r 455 456 vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x 457 vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r 458 vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*x*x 459 vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*x*x - x*x*x*r 460 vmulsd xmm2,xmm3,xmm2 461 vsubsd xmm2,xmm2,xmm1 462 vfnmadd231sd xmm2, xmm4,__Lsinarray 463 vsubsd xmm0,xmm0,xmm2 464 jmp Lsin_fma3_exit_s_1 465 466ALIGN 16 467Lsin_fma3_calc_cos: 468 ; region 1 or 3 469 ; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4] 470 vmovapd xmm2,L_one 471 vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r 472 vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5 473 vsubsd xmm2,xmm2,xmm4 ; 1-t 474 vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r 475 vmovsd xmm5,__Lcosarray+050h 476 vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2 477 vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1 478 vfmadd213sd xmm5,xmm3,__Lcosarray+040h 479 vfmadd213sd xmm5,xmm3,__Lcosarray+030h 480 vfmadd213sd xmm5,xmm3,__Lcosarray+020h 481 vfmadd213sd xmm5,xmm3,__Lcosarray+010h 482 vfmadd213sd xmm5,xmm3,__Lcosarray 483 vfmadd213sd xmm5,xmm1,xmm2 484 vaddsd xmm0,xmm5,xmm4 485 486Lsin_fma3_exit_s_1: 487 xor r8,r8 ; prepare r8 for cmov 488 and r10,L_signbit ; isolate original sign of x 489 bt eax,1 490 cmovc r8,L_signbit 491 xor r8,r10 492 vmovq xmm3,r8 493 vxorpd xmm0,xmm0,xmm3 494 495 StackDeallocate stack_size 496 ret 497 498ALIGN 16 499Lsin_fma3_do_general_arg_reduction: 500 ; argument reduction for general x 501 502 ; NOTE: the BDL argument reduction routine does not touch r10, 503 ; but the general-purpose reduction does. 504 mov QWORD PTR [save_r10+rsp], r10 505 call __remainder_piby2_fma3 506 mov r10, QWORD PTR [save_r10+rsp] 507 jmp Lsin_fma3_exit_s 508 509fname endp 510END 511 512