1; 2; MIT License 3; ----------- 4; 5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 6; 7; Permission is hereby granted, free of charge, to any person obtaining a copy 8; of this Software and associated documentaon files (the "Software"), to deal 9; in the Software without restriction, including without limitation the rights 10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11; copies of the Software, and to permit persons to whom the Software is 12; furnished to do so, subject to the following conditions: 13; 14; The above copyright notice and this permission notice shall be included in 15; all copies or substantial portions of the Software. 16; 17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23; THE SOFTWARE. 24; 25; 26; An implementation of the cos function. 27; 28; Prototype: 29; 30; double cos(double x); 31; 32; Computes cos(x). 33; It will provide proper C99 return values, 34; but may not raise floating point status bits properly. 35; Based on the NAG C implementation. 36; 37; If FMA3 hardware is available, an FMA3 implementation of cos will be used. 38 39.const 40ALIGN 16 41L_real_piby2_1 DQ 03ff921fb54400000h ; piby2_1 42 DQ 0 43L_real_piby2_1tail DQ 03dd0b4611a626331h ; piby2_1tail 44 DQ 0 45L_real_piby2_2 DQ 03dd0b4611a600000h ; piby2_2 46 DQ 0 47L_real_piby2_2tail DQ 03ba3198a2e037073h ; piby2_2tail 48 DQ 0 49 50ALIGN 16 51L_one DQ 03FF0000000000000h, 03FF0000000000000h 52L_signbit DQ 08000000000000000h, 00000000000000000h 53L_int_one DQ 00000000000000001h, 00000000000000000h 54L_int_two DQ 00000000000000002h, 00000000000000000h 55 56L_2_by_pi DQ 03fe45f306dc9c883h ; 2/pi 57L_one_half DQ 03FE0000000000000h ; .5 58L_neg_one_half DQ 0bfe0000000000000h ; - 0.5 59L_two_to_neg_27 DQ 03e40000000000000h ; 2^-27 60L_two_to_neg_13 DQ 03f20000000000000h ; 2^-13 61L_piby4 DQ 03FE921FB54442D18h ; pi/4 62L_small_arg_cw DQ 0411E848000000000h ; 5.e5, appropriate for CW 63L_small_arg_bdl DQ 0417312D000000000h ; 2e7, works for BDL 64L_sign_mask DQ 07FFFFFFFFFFFFFFFh 65 66L__inf_mask_64 DQ 07FF0000000000000h ; +Inf 67 68 69 70EXTRN __Lcosarray:QWORD 71EXTRN __Lsinarray:QWORD 72EXTRN __use_fma3_lib:DWORD 73 74; local storage offsets 75p_temp EQU 020h ; temporary for get/put bits operation 76p_temp1 EQU 030h ; temporary for get/put bits operation 77dummy_space EQU 040h 78stack_size EQU 068h 79 80include fm.inc 81 82fname TEXTEQU <cos> 83fname_special TEXTEQU <_cos_special> 84 85;Define name and any external functions being called 86EXTERN __remainder_piby2_forAsm : PROC 87EXTERN __remainder_piby2_fma3 : PROC 88EXTERN __remainder_piby2_fma3_bdl : PROC 89EXTERN fname_special : PROC 90 91.code 92 93PUBLIC fname 94fname PROC FRAME 95 StackAllocate stack_size 96 .ENDPROLOG 97 98 cmp DWORD PTR __use_fma3_lib, 0 99 jne L_cos_fma3 100 101Lcos_sse2: 102 movd rdx, xmm0 103 xorpd xmm2, xmm2 ; zeroed out for later use 104 105 mov r10, rdx 106 btr r10, 63 ; r10 <-- |x| 107 cmp r10, L_piby4 108 jb Lcos_sse2_absx_lt_piby4 109 110Lcos_absx_nlt_piby4: ; common case 111 112; Here rdx has x, r10 has |x| 113 movd xmm0, r10 ; xmm0 <-- |x| 114 115 cmp r10, QWORD PTR L_small_arg_cw 116 jae Lcos_reduce_precise ; Note NaN/Inf will branch 117 118; At this point we have |x| < L_small_arg_cw, which is currently 500000. 119; Note that if |x| were too large, conversion of npi2 to integer would fail. 120; We reduce the argument to be in a range from -pi/4 to +pi/4 121; by subtracting multiples of pi/2 122 movapd xmm2, xmm0 123 mulsd xmm2, L_2_by_pi 124 movapd xmm4, xmm0 125 126; xexp = ax >> EXPSHIFTBITS_DP64; 127 mov r9, r10 128 shr r9, 52 ; >>EXPSHIFTBITS_DP64 129 130; How many pi/2 is |x| a multiple of? 131; npi2 = (int)(x * twobypi + 0.5); 132 addsd xmm2, L_one_half ; npi2 133 134 movsd xmm3, L_real_piby2_1 135 cvttpd2dq xmm0, xmm2 ; convert npi2 to integer 136 movsd xmm1, L_real_piby2_1tail 137 cvtdq2pd xmm2, xmm0 ; and back to double. 138 139; Subtract the multiple from x to get an extra-precision remainder 140; rhead = x - npi2 * piby2_1; 141 mulsd xmm3, xmm2 142 subsd xmm4, xmm3 ; rhead 143 144; rtail = npi2 * piby2_1tail; 145 mulsd xmm1, xmm2 ; rtail 146 movd eax, xmm0 ; eax <-- npi2 147 148; GET_BITS_DP64(rhead-rtail, uy); 149; originally only rhead 150 movapd xmm0, xmm4 151 subsd xmm0, xmm1 152 153 movsd xmm3, L_real_piby2_2 154 movd rcx, xmm0 ; rcx <-- rhead - rtail 155 movsd xmm5, L_real_piby2_2tail ; piby2_2tail 156 157; xmm0=r, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, 158; xmm4=rhead xmm5= temp for calc 159; expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); 160; expdiff measures how close rhead - rtail is to |x| 161; (larger expdiff ==> more cancellation in |x| - (rhead-rtail) ==> closer) 162 shl rcx, 1 ; strip any sign bit 163 shr rcx, 53 ; >> EXPSHIFTBITS_DP64 +1 164 sub r9, rcx ; expdiff 165 166;; if (expdiff > 15) 167 cmp r9, 15 168 jle Lcos_sse2_cw_reduction_done 169 170; Here the remainder is pretty small compared with x, which 171; implies that x is a near multiple of pi/2 172; (x matches the multiple to at least 15 bits) 173; So we do another stage of argument reduction. 174 175; t = rhead; 176 movapd xmm1, xmm4 177 178; rtail = npi2 * piby2_2; 179 mulsd xmm3, xmm2 180 181; rhead = t - rtail; 182 mulsd xmm5, xmm2 ; npi2 * piby2_2tail 183 subsd xmm4, xmm3 ; rhead 184 185; rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); 186 subsd xmm1, xmm4 ; t - rhead 187 subsd xmm1, xmm3 ; -rtail 188 subsd xmm5, xmm1 ; rtail 189 190; r = rhead - rtail; 191 movapd xmm0, xmm4 192 193;HARSHA 194;xmm1=rtail 195 movapd xmm1, xmm5 ; xmm1 <-- copy of rtail 196 subsd xmm0, xmm5 197 198; xmm0=r, xmm4=rhead, xmm1=rtail 199Lcos_sse2_cw_reduction_done: 200;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 201;; if the input was close to a pi/2 multiple 202; The original NAG code missed this trick. 203; If the input is very close to n*pi/2 after reduction, so r < 2^-27, 204; then the cos is either ~ 1.0 or ~r, to within 53 bits. 205 206; NOTE: Unfortunately, this introduces two jcc instructions close to each 207; other and to other branches. As r < 2^-13 should be rather uncommon, 208; the problems for branch prediction outweigh the computational savings. - WAT 209;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 210; region = npi2 & 3; 211 subsd xmm4, xmm0 ; rhead-r 212 subsd xmm4, xmm1 ; rr = (rhead-r) - rtail 213 214Lcos_piby4: 215; perform taylor series to calc sinx or cosx 216; x2 = r * r; 217 218;xmm4 = a part of rr for the sin path, xmm4 is overwritten in the cos path 219;instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path 220 movapd xmm3, xmm0 221 movapd xmm2, xmm0 222 mulsd xmm2, xmm0 ;x2 223 224 bt eax,0 225 jnc Lcos_sse2_calc_cos 226 227;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 228; region 1 or 3 do a sin calculation 229 movsd xmm3, __Lsinarray+50h ; s6 230 mulsd xmm3, xmm2 ; x2s6 231 movsd xmm5, __Lsinarray+20h ; s3 232 movsd QWORD PTR p_temp[rsp], xmm4 ; store xx 233 movapd xmm1, xmm2 ; move for x4 234 mulsd xmm1, xmm2 ; x4 235 movsd QWORD PTR p_temp1[rsp], xmm0 ; store x 236 mulsd xmm5, xmm2 ; x2s3 237 movapd xmm4, xmm0 ; move for x3 238 addsd xmm3, __Lsinarray+40h ; s5+x2s6 239 mulsd xmm1, xmm2 ; x6 240 mulsd xmm3, xmm2 ; x2(s5+x2s6) 241 mulsd xmm4, xmm2 ; x3 242 addsd xmm5, __Lsinarray+10h ; s2+x2s3 243 mulsd xmm5, xmm2 ; x2(s2+x2s3) 244 addsd xmm3, __Lsinarray+30h ; s4 + x2(s5+x2s6) 245 mulsd xmm2, L_one_half ; 0.5 *x2 246 movsd xmm0, QWORD PTR p_temp[rsp] ; load xx 247 mulsd xmm3, xmm1 ; x6(s4 + x2(s5+x2s6)) 248 addsd xmm5, __Lsinarray ; s1+x2(s2+x2s3) 249 mulsd xmm2, xmm0 ; 0.5 * x2 *xx 250 addsd xmm3, xmm5 ; zs 251 mulsd xmm4, xmm3 ; *x3 252 subsd xmm4, xmm2 ; x3*zs - 0.5 * x2 *xx 253 addsd xmm0, xmm4 ; +xx 254 addsd xmm0, QWORD PTR p_temp1[rsp] ; +x 255 256 jmp Lcos_sse2_adjust_region 257 258ALIGN 16 259Lcos_sse2_calc_cos: 260;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 261; region 0 or 2 - do a cos calculation 262; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6)))); 263 mulsd xmm4, xmm0 ; x*xx 264 movsd xmm5, L_one_half 265 movsd xmm1, __Lcosarray+50h ; c6 266 movsd xmm0, __Lcosarray+20h ; c3 267 mulsd xmm5, xmm2 ; r = 0.5 *x2 268 movapd xmm3, xmm2 ; copy of x2 269 movsd QWORD PTR p_temp[rsp], xmm4 ; store x*xx 270 mulsd xmm1, xmm2 ; c6*x2 271 mulsd xmm0, xmm2 ; c3*x2 272 subsd xmm5, L_one ; -t=r-1.0, trash r 273 mulsd xmm3, xmm2 ; x4 274 addsd xmm1, __Lcosarray+40h ; c5+x2c6 275 addsd xmm0, __Lcosarray+10h ; c2+x2C3 276 addsd xmm5, L_one ; 1 + (-t), trash t 277 mulsd xmm3, xmm2 ; x6 278 mulsd xmm1, xmm2 ; x2(c5+x2c6) 279 mulsd xmm0, xmm2 ; x2(c2+x2C3) 280 movapd xmm4, xmm2 ; copy of x2 281 mulsd xmm4, L_one_half ; r recalculate 282 addsd xmm1, __Lcosarray+30h ; c4 + x2(c5+x2c6) 283 addsd xmm0, __Lcosarray ; c1+x2(c2+x2C3) 284 mulsd xmm2, xmm2 ; x4 recalculate 285 subsd xmm5, xmm4 ; (1 + (-t)) - r 286 mulsd xmm1, xmm3 ; x6(c4 + x2(c5+x2c6)) 287 addsd xmm0, xmm1 ; zc 288 subsd xmm4, L_one ; t relaculate 289 subsd xmm5, QWORD PTR p_temp[rsp] ; ((1 + (-t)) - r) - x*xx 290 mulsd xmm0, xmm2 ; x4 * zc 291 addsd xmm0, xmm5 ; x4 * zc + ((1 + (-t)) - r -x*xx) 292 subsd xmm0, xmm4 ; result - (-t) 293 294;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 295 296Lcos_sse2_adjust_region: 297; switch (region) 298 add eax, 1 299 and eax, 2 300 jz Lcos_sse2_cleanup 301 302;; if the original region 1 or 2 then we negate the result. 303 movapd xmm2, xmm0 304 xorpd xmm0, xmm0 305 subsd xmm0, xmm2 306 307ALIGN 16 308Lcos_sse2_cleanup: 309 StackDeallocate stack_size 310 ret 311 312 313 314 315 316 317ALIGN 16 318Lcos_sse2_absx_lt_piby4: 319; cos = cos_piby4(x, 0.0); 320 321; x2 = r * r; 322 cmp r10, L_two_to_neg_13 323 jb Lcos_sse2_x_small 324 movapd xmm2, xmm0 325 mulsd xmm2, xmm0 ; x2 326 327;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 328; region 0 - do a cos calculation 329; zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6)))); 330 movsd xmm1, __Lcosarray+10h ; c2 331 movapd xmm4, xmm2 ; move for x4 332 mulsd xmm4, xmm2 ; x4 333 movsd xmm3, __Lcosarray+30h ; c4 334 mulsd xmm1, xmm2 ; c2x2 335 movsd xmm5, __Lcosarray+50h ; c6 336 mulsd xmm3, xmm2 ; c4x2 337 movapd xmm0, xmm4 ; move for x8 338 mulsd xmm5, xmm2 ; c6x2 339 mulsd xmm0, xmm4 ; x8 340 addsd xmm1, __Lcosarray ; c1 + c2x2 341 mulsd xmm1, xmm4 ; c1x4 + c2x6 342 addsd xmm3, __Lcosarray+20h ; c3 + c4x2 343 mulsd xmm2, L_neg_one_half ; -0.5x2, destroy xmm2 344 addsd xmm5, __Lcosarray+40h ; c5 + c6x2 345 mulsd xmm3, xmm0 ; c3x8 + c4x10 346 mulsd xmm4, xmm0 ; x12 347 mulsd xmm4, xmm5 ; c5x12 + c6x14 348 349 movsd xmm0, L_one 350 addsd xmm1, xmm3 ; c1x4 + c2x6 + c3x8 + c4x10 351 movapd xmm3, xmm2 ; preserve -0.5x2 352 addsd xmm2, xmm0 ; t = 1 - 0.5x2 353 subsd xmm0, xmm2 ; 1-t 354 addsd xmm0, xmm3 ; (1-t) - r 355 addsd xmm1, xmm4 ; c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14 356 addsd xmm0, xmm1 ; (1-t) - r + c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14 357 addsd xmm0, xmm2 ; 1 - 0.5x2 + above 358 359 StackDeallocate stack_size 360 ret 361 362ALIGN 16 363Lcos_sse2_x_small: 364 movsd xmm2, xmm0 365 movsd xmm0, L_one 366 cmp r10, L_two_to_neg_27 367 jb Lcos_sse2_x_smaller 368 mulsd xmm2, xmm2 369 mulsd xmm2, L_one_half 370 subsd xmm0, xmm2 371 StackDeallocate stack_size 372 ret 373 374ALIGN 16 375Lcos_sse2_x_smaller: 376 movsd xmm0, L_one 377 addsd xmm0, L_int_one ; really adding smallest subnormal; set inexact 378 StackDeallocate stack_size 379 ret 380 381ALIGN 16 382Lcos_reduce_precise: 383; Reduce x into range [-pi/4, pi/4] 384 cmp r10, L__inf_mask_64 385 jae Lcos_x_naninf 386 call __remainder_piby2_forAsm 387 388 ; At this point xmm0 has r, xmm1 has rr, rax has region 389 390 movapd xmm4, xmm1 ; xmm4 <-- rr 391 jmp Lcos_piby4 392 393; xmm0 = x, xmm4 = xx, eax= region 394 395 396ALIGN 16 397Lcos_x_naninf: 398 call fname_special 399 StackDeallocate stack_size 400 ret 401 402;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 403; From this point we assume that FMA3 and AVX hardware are present. 404 405ALIGN 16 406L_cos_fma3: 407 vmovq r9,xmm0 408 mov rax,r9 409 and r9,L_sign_mask ; clear sign 410 411Lcos_early_exit_s_1: ;; unused label 412 cmp r9,L_piby4 413 jg Lcos_early_exit_s ; Note that NaN will branch 414 cmp r9,L_two_to_neg_13 415 jge Lcompute_cos_pyby_4 416 cmp r9,L_two_to_neg_27 417 jge Lcompute_1_xx_5 418 vmovq xmm0,L_one ; for tiniest args, cos is 1 419 jmp Lreturn_no_restore 420 421Lcompute_1_xx_5: 422 vmulsd xmm1,xmm0,L_one_half ; xmm1l <-- .5*x 423 vfnmadd213sd xmm0,xmm1,L_one ; xmm0l <-- 1.0 - (.5*x)*x 424 jmp Lreturn_no_restore 425 426Lcompute_cos_pyby_4: 427 ; make sure this is accurate enough 428 ; note that x^2 can't be all that close to 1 here 429 vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- xx = x*x 430 vmovapd xmm0,__Lcosarray+050h ; xmm0 <-- c5 431 vfmadd213sd xmm0,xmm3,__Lcosarray+040h ; xmm0 <-- c5*xx + c4 432 vfmadd213sd xmm0,xmm3,__Lcosarray+030h ; xmm0 <-- (c5*xx + c4)*xx + c3 433 vfmadd213sd xmm0,xmm3,__Lcosarray+020h 434 vfmadd213sd xmm0,xmm3,__Lcosarray+010h 435 vfmadd213sd xmm0,xmm3,__Lcosarray 436 vfmsub213sd xmm0,xmm3,L_one_half 437 vfmadd213sd xmm0,xmm3,L_one 438 439 jmp Lreturn_no_restore 440 441Lcos_early_exit_s: 442 mov r8,L__inf_mask_64 443 and rax,r8 444 cmp rax, r8 445 jz Lcos_x_naninf 446 447Lrange_reduce: 448 vmovq xmm0,r9 ; r9 <-- |x| 449 cmp r9,L_small_arg_bdl 450 jae Lcos_remainder_piby2 451 452 ; For __remainder_piby2_fma3 and __remainder_piby2_fma3_bdl 453 ; on input 454 ; x is in xmm0 455 ; on output 456 ; r is in xmm0 457 ; rr is in xmm1 458 ; region is in rax 459 460 ; Boldo-Daumas-Li reduction for reasonably small |x| 461 call __remainder_piby2_fma3_bdl 462 463;; if region is 0 or 2 do a cos calc. 464;; if region is 1 or 3 do a sin calc. 465Lcos_exit_s: 466 bt rax,0 467 jc Lsin_piby4_compute 468 469Lcos_piby4_compute: ;; unused label 470 ; compute the cosine of r+rr, where this sum is in [-pi/4,pi/4] 471 vmovapd xmm2,L_one 472 vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x * x 473 vmulsd xmm5,xmm3,L_one_half ; xmm5 <-- x*x*.5 == r 474 vsubsd xmm4,xmm2,xmm5 ; xmm4 <-- t = 1. - x*x*.5 475 vsubsd xmm2,xmm2,xmm4 ; 1-t 476 vsubsd xmm2,xmm2,xmm5 ; xmm2 <-- (1-t) - r 477 vmovapd xmm5,__Lcosarray+040h 478 vfnmadd231sd xmm2,xmm0,xmm1 ; (1.0 - t) - r) - x * xx) xmm2 479 vmulsd xmm1,xmm3,xmm3 ; x2 * x2 xmm1 480 vfmadd231sd xmm5,xmm3,__Lcosarray+050h 481 vfmadd213sd xmm5,xmm3,__Lcosarray+030h 482 vfmadd213sd xmm5,xmm3,__Lcosarray+020h 483 vfmadd213sd xmm5,xmm3,__Lcosarray+010h 484 vfmadd213sd xmm5,xmm3,__Lcosarray 485 vfmadd213sd xmm5,xmm1,xmm2 486 vaddsd xmm0,xmm5,xmm4 487 488 jmp Lcos_exit_s_1 489 490ALIGN 16 491Lsin_piby4_compute: 492 ; compute the sine of r+rr, where this sum is in [-pi/4,pi/4] 493 vmovapd xmm5,__Lsinarray+040h 494 vmulsd xmm3,xmm0,xmm0 ; xmm3 <-- x2 = x * x 495 vfmadd231sd xmm5,xmm3,__Lsinarray+050h 496 vfmadd213sd xmm5,xmm3,__Lsinarray+030h 497 vfmadd213sd xmm5,xmm3,__Lsinarray+020h 498 vfmadd213sd xmm5,xmm3,__Lsinarray+010h ; xmm5 <-- r 499 500 vmulsd xmm4,xmm0,xmm3 ; xmm4 <-- x3 = x*x*x 501 vmulsd xmm2,xmm4,xmm5 ; xmm2 <-- x*x*x * r 502 vmulsd xmm5,xmm1,L_one_half ; xmm5 <-- .5*x*x 503 vsubsd xmm2,xmm5,xmm2 ; xmm2 <-- .5*x*x - x*x*x*r 504 vmulsd xmm2,xmm3,xmm2 505 vsubsd xmm2,xmm2,xmm1 506 vfnmadd231sd xmm2, xmm4,__Lsinarray 507 vsubsd xmm0,xmm0,xmm2 508 509Lcos_exit_s_1: 510 xor r8,r8 511 add eax, 1 512 and eax, 2 513 cmovnz r8, L_signbit 514 vmovq xmm3,r8 515 vxorpd xmm0,xmm0,xmm3 516 517Lreturn_restore_regs: 518 StackDeallocate stack_size 519 ret 520 521Lreturn_no_restore: 522 StackDeallocate stack_size 523 ret 524 525ALIGN 16 526Lcos_remainder_piby2: 527 ; argument reduction for general x 528 call __remainder_piby2_fma3 529 jmp Lcos_exit_s 530 531 532fname endp 533END 534