1; 2; 3; MIT License 4; ----------- 5; 6; Copyright (c) 2002-2019 Advanced Micro Devices, Inc. 7; 8; Permission is hereby granted, free of charge, to any person obtaining a copy 9; of this Software and associated documentaon files (the "Software"), to deal 10; in the Software without restriction, including without limitation the rights 11; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12; copies of the Software, and to permit persons to whom the Software is 13; furnished to do so, subject to the following conditions: 14; 15; The above copyright notice and this permission notice shall be included in 16; all copies or substantial portions of the Software. 17; 18; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24; THE SOFTWARE. 25; 26; An implementation of the remainder by pi/2 function 27; This is a service routine for use by trig functions coded in asm 28; 29; On input, 30; xmm0 = x; 31; On ouput 32; xmm0 = r 33; xmm1 = rr 34; xmm2 = region 35 36.const 37ALIGN 16 38L__piby2_part3_piby2_lead DQ 03ff921fb54442d18h, 03c91a62633145c06h 39L__piby2_part1 DQ 03ff921fb50000000h, 03ff921fb50000000h 40L__piby2_part2 DQ 03e5110b460000000h, 03e5110b460000000h 41;; constants for CW reduction 42L_piby2_1 DQ 03FF921FB54400000h, 03FF921FB54400000h 43L_piby2_2 DQ 03DD0B4611A600000h, 03DD0B4611A600000h 44L_piby2_3 DQ 03BA3198A2E000000h, 03BA3198A2E000000h 45L_piby2_1tail DQ 03DD0B4611A626331h, 03DD0B4611A626331h 46L_piby2_2tail DQ 03BA3198A2E037073h, 03BA3198A2E037073h 47L_piby2_3tail DQ 0397B839A252049C1h, 0397B839A252049C1h 48L_twobypi DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h 49L_point_five DQ 03FE0000000000000h, 03FE0000000000000h 50L_int_three DQ 00000000000000003h, 00000000000000003h 51L_inf_mask_64 DQ 07FF0000000000000h, 07FF0000000000000h 52L_signbit DQ 08000000000000000h, 08000000000000000h 53L_int_1 DQ 00000000000000001h, 00000000000000001h 54L_int_15 DQ 0000000000000000Fh 55L_int_48 DQ 00000000000000030h 56L_3pio4 DQ 04002D97C7F3321D2h 57L_5pio4 DQ 0400F6A7A2955385Eh 58L_7pio4 DQ 04015FDBBE9BBA775h 59L_9pio4 DQ 0401c463abeccb2bbh 60ALIGN 16 61L__2_by_pi_bits DB 224, 241, 27, 193, 12, 88, 33, 116 62 DB 53, 126, 196, 126, 237, 175, 169, 75 63 DB 74, 41, 222, 231, 28, 244, 236, 197 64 DB 151, 175, 31, 235, 158, 212, 181, 168 65 DB 127, 121, 154, 253, 24, 61, 221, 38 66 DB 44, 159, 60, 251, 217, 180, 125, 180 67 DB 41, 104, 45, 70, 188, 188, 63, 96 68 DB 22, 120, 255, 95, 226, 127, 236, 160 69 DB 228, 247, 46, 126, 17, 114, 210, 231 70 DB 76, 13, 230, 88, 71, 230, 4, 249 71 DB 125, 209, 154, 192, 113, 166, 19, 18 72 DB 237, 186, 212, 215, 8, 162, 251, 156 73 DB 166, 196, 114, 172, 119, 248, 115, 72 74 DB 70, 39, 168, 187, 36, 25, 128, 75 75 DB 55, 9, 233, 184, 145, 220, 134, 21 76 DB 239, 122, 175, 142, 69, 249, 7, 65 77 DB 14, 241, 100, 86, 138, 109, 3, 119 78 DB 211, 212, 71, 95, 157, 240, 167, 84 79 DB 16, 57, 185, 13, 230, 139, 2, 0 80 DB 0, 0, 0, 0, 0, 0 81 82 83; local storage offsets 84region EQU 000h 85stack_size EQU 018h 86sstack_size EQU 000h ; no stack for fsname 87 88include fm.inc 89 90fname TEXTEQU <__remainder_piby2_forAsm> 91fsname TEXTEQU <__remainder_piby2_cw_forAsm> 92 93 94.code 95 96; xmm0l has |x| 97PUBLIC fname 98fname PROC FRAME 99 StackAllocate stack_size 100 .ENDPROLOG 101 102 ; This function is not using rdx, r8, and r9 as pointers; 103 ; all returns are in registers 104 105 ; get the unbiased exponent and the mantissa part of x 106 lea r9,L__2_by_pi_bits 107 108 ;xexp = (x >> 52) - 1023 109 movd r11,xmm0 110 mov rcx,r11 111 shr r11,52 112 sub r11,1023 ; r11 <-- xexp = exponent of input x 113 114 ;calculate the last byte from which to start multiplication 115 ;last = 134 - (xexp >> 3) 116 mov r10,r11 117 shr r10,3 118 sub r10,134 ; r10 <-- -last 119 neg r10 ; r10 <-- last 120 121 ; load 64 bits of 2_by_pi 122 mov rax,[r9 + r10] 123 124 ; mantissa of x = ((x << 12) >> 12) | implied bit 125 shl rcx,12 126 shr rcx,12 ; rcx <-- mantissa part of input x 127 bts rcx,52 ; add the implied bit as well 128 129 ; load next 128 bits of 2_by_pi 130 add r10,8 ;increment to next 8 bytes of 2_by_pi 131 movdqu xmm0,[r9 + r10] 132 133 ; do three 64-bit multiplications with mant of x 134 mul rcx 135 mov r8,rax ; r8 <-- last 64 bits of mul = res1[2] 136 mov r10,rdx ; r10 <-- carry 137 movd rax,xmm0 138 mul rcx 139 ; resexp = xexp & 7 140 and r11,7 ; r11 <-- resexp = xexp & 7 = last 3 bits 141 psrldq xmm0,8 142 add rax,r10 ; add the previous carry 143 adc rdx,0 144 mov r9,rax ; r9 <-- next 64 bits of mul = res1[1] 145 mov r10,rdx ; r10 <-- carry 146 movd rax,xmm0 147 mul rcx 148 add r10,rax ; r10 <-- most sig. 64 bits = res1[0] 149 ; find the region 150 ; last three bits ltb = most sig bits >> (54 - resexp)); 151 ; decimal point in last 18 bits ==> 8 lsb's in first 64 bits 152 ; and 8 msb's in next 64 bits 153 ; point_five = ltb & 01h; 154 ; region = ((ltb >> 1) + point_five) & 3; 155 mov rcx,54 156 mov rax,r10 157 sub rcx,r11 158 xor rdx,rdx ; rdx <-- sign of x 159 shr rax,cl 160 jnc L__no_point_five 161 ; if there is carry then negate the result of multiplication 162 not r10 163 not r9 164 not r8 165 mov rdx,08000000000000000h 166 167ALIGN 16 168L__no_point_five: 169 adc rax,0 170 and rax,3 ; rax now has region 171 mov QWORD PTR [region+rsp],rax 172 173 ; calculate the number of integer bits and zero them out 174 mov rcx,r11 175 add rcx,10 ; rcx = no. of integer bits 176 shl r10,cl 177 shr r10,cl ; r10 contains only mant bits 178 sub rcx,64 ; form the exponent 179 mov r11,rcx 180 181 ;find the highest set bit 182 bsr rcx,r10 183 jnz L__form_mantissa 184 mov r10,r9 185 mov r9,r8 186 mov r8,0 187 bsr rcx,r10 ; rcx = hsb 188 sub r11,64 189 190 191ALIGN 16 192L__form_mantissa: 193 add r11,rcx ; for exp of x 194 sub rcx,52 ; rcx = no. of bits to shift in r10 195 cmp rcx,0 196 jl L__hsb_below_52 197 je L__form_numbers 198 ; hsb above 52 199 mov r8,r10 ; previous contents of r8 not required 200 shr r10,cl ; r10 = mantissa of x with hsb at 52 201 shr r9,cl ; make space for bits from r10 202 sub rcx,64 203 neg rcx 204 ; rcx <-- no of bits to shift r10 to move those bits to r9 205 shl r8,cl 206 or r9,r8 ; r9 = mantissa bits of xx 207 jmp L__form_numbers 208 209ALIGN 16 210L__hsb_below_52: 211 neg rcx 212 mov rax,r9 213 shl r10,cl 214 shl r9,cl 215 sub rcx,64 216 neg rcx 217 shr rax,cl 218 or r10,rax 219 shr r8,cl 220 or r9,r8 221 222ALIGN 16 223L__form_numbers: 224 add r11,1023 225 btr r10,52 ; remove the implicit bit 226 mov rcx,r11 227 or r10,rdx ; put the sign 228 shl rcx,52 229 or r10,rcx ; r10 <-- x 230 231 movd xmm0,r10 ; xmm0 <-- x 232 movdqa xmm1,xmm0 ; xmm1 <-- x 233 psrlq xmm1,27 234 psllq xmm1,27 ; xmm1 <-- hx 235 movdqa xmm2,xmm0 ; xmm2 <-- x 236 subsd xmm2,xmm1 ; xmm2 <-- tx 237 movlhps xmm0,xmm0 ; xmm0 <-- x,x 238 movlhps xmm2,xmm1 ; xmm2 <-- hx,tx 239 240 movdqa xmm1,XMMWORD PTR L__piby2_part3_piby2_lead 241 movdqa xmm3,XMMWORD PTR L__piby2_part1 242 movdqa xmm4,XMMWORD PTR L__piby2_part2 243 244 ; form xx 245 xor rcx,rcx 246 bsr rcx,r9 247 sub rcx,64 ; to shift the implicit bit as well 248 neg rcx 249 shl r9,cl 250 shr r9,12 251 add rcx,52 252 sub r11,rcx 253 shl r11,52 254 or r9,rdx 255 or r9,r11 256 movd xmm5,r9 ; xmm5 <-- xx 257 258 mulpd xmm0,xmm1 ; xmm0 <-- piby2_part3 * x,piby2_lead * x = c 259 mulpd xmm5,xmm1 ; xmm5 <-- piby2_lead * xx 260 mulpd xmm3,xmm2 ; xmm3 <-- piby2_part1 * hx,piby2_part1 * tx 261 mulpd xmm4,xmm2 ; xmm4 <-- piby2_part2 * hx,piby2_part2 * tx 262 263 ; cc = (piby2_part1 * hx - c) + (piby2_part1 * tx) + 264 ; (piby2_part2 * hx) + (piby2_part2 * tx) + 265 ; (piby2_lead * xx + piby2_part3 * x) 266 movhlps xmm1,xmm3 ; xmm1 = piby2_part1 * hx 267 movhlps xmm2,xmm4 ; xmm2 = piby2_part2 * hx 268 subsd xmm1,xmm0 ; xmm1 = (piby2_part1 * hx - c) 269 addsd xmm1,xmm3 ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) 270 movhlps xmm3,xmm0 ; xmm3 = piby2_part3 * x 271 addsd xmm1,xmm2 272 ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + (piby2_part2 * hx) 273 addsd xmm3,xmm5 ; xmm3 = (piby2_lead * xx + piby2_part3 * x) 274 addsd xmm1,xmm4 275 ; xmm1 = (piby2_part1 * hx - c) + (piby2_part1 * tx) + 276 ; (piby2_part2 * hx) + (piby2_part2 * tx) 277 addsd xmm1,xmm3 ; xmm1 = cc 278 279 ; xmm0 <-- c, xmm1 <-- cc 280 ; r = c + cc 281 ; rr = (c - r) + cc 282 283 movdqa xmm2,xmm0 ; xmm2 <-- copy of c 284 addsd xmm0,xmm1 ; xmm0 <-- r = c + cc 285 subsd xmm2,xmm0 ; xmm2 <-- c - r 286 addsd xmm1,xmm2 ; xmm1 <-- rr = cc + (c - r) 287 mov rax, QWORD PTR[region+rsp] ; rax <-- region 288 289 StackDeallocate stack_size 290 ret 291 292fname endp 293 294; NOTE: If this is not going to be used, should probably remove it. - WAT 295ALIGN 16 296PUBLIC fsname 297fsname PROC FRAME 298 StackAllocate sstack_size 299 .ENDPROLOG 300 301; xmm0l has |x| 302; r9 also has |x| 303; ASSUMPTION: if we call this function, |x| > pi/4 304 305 xor r8d,r8d 306 cmp r9, QWORD PTR L_5pio4 307 ja Lax_gt_5pio4 308 cmp r9, QWORD PTR L_3pio4 309 seta r8b 310 inc r8d 311 jmp Lstage_npi2 312Lax_gt_5pio4: 313 cmp r9, QWORD PTR L_9pio4 314 ja Lnpi2_full_computation 315 cmp r9, QWORD PTR L_7pio4 316 seta r8b 317 add r8d,3 318Lstage_npi2: 319 movd xmm2, r8d 320 cvtdq2pd xmm4, xmm2 321 jmp Lnpi2_known 322 323Lnpi2_full_computation: 324; movapd xmm1, L_twobypi 325; movapd xmm3, L_point_five 326 movapd xmm5,xmm0 327; mulsd xmm5,xmm1 328; addsd xmm5,xmm3 ; xmm5 <-- |x|*2/pi + .5 329 mulsd xmm5, L_twobypi 330 addsd xmm5, L_point_five 331 332 cvttpd2dq xmm5,xmm5 ; xmm5 < npi2 = int part 333 movapd xmm2,xmm5 334 andpd xmm2,L_int_three 335 cvtdq2pd xmm4,xmm5 336 337Lnpi2_known: 338 movapd xmm5,xmm4 339 mulsd xmm5,QWORD PTR L_piby2_1 ; xmm5 <-- npi2*piby2_1 340 xorpd xmm5,L_signbit ; xmm5 <-- -npi2*piby2_1 341 addpd xmm5,xmm0 ; xmm5 <-- rhead = x - npi2*piby2_1 342 movapd xmm3,xmm4 343 mulsd xmm3,QWORD PTR L_piby2_1tail ; xmm3 <-- rtail = npi2*piby2_1tail 344 345 ; If x is nearly a multiple of pi/2, rhead will be small compared to |x| 346 ; we check this by checking exponent difference. 347 348 ; Note that both the unbiased exponents are positive, and that of rhead 349 ; must be <= that of |x| 350 movapd xmm1,xmm5 ; xmm1l <-- rhead 351 subpd xmm1,xmm3 ; xmm1l <-- r = rhead - rtail 352 andpd xmm1,L_inf_mask_64 353 psubq xmm0,xmm1 ; xmm0 <-- |x| - r 354 psrlq xmm0,52 355 comisd xmm0,L_int_15 356 357; movd rax, xmm5 ; really a movq 358; shr rax, 52 359; shr rdx, 52 ; get exponent of |x| (no and needed) 360; sub rdx, rax 361; cmp rdx, 15 362 jbe Lcw_get_r_rr 363 364 ; here expdiff > 15, so x is nearly a multiple of pi/2 and things are hard 365 ; we use another piece of pi/2 in the reduction 366 367 movapd xmm1,xmm5 368 movapd xmm3,xmm4 369 mulsd xmm3,QWORD PTR L_piby2_2 ; xmm3 <--- rtail = npi2*piby2_2 370 subsd xmm5,xmm3 ; xmm5 <-- rhead = t - rtail 371 372 ; now rtail = npi2*piby2_2tail - ((t-rhead) - rtail) 373 subsd xmm1,xmm5 374 subsd xmm1,xmm3 375 movapd xmm3,xmm4 376 mulsd xmm3,QWORD PTR L_piby2_2tail 377 subsd xmm3,xmm1 ; xmm3 <-- rtail 378 379 comisd xmm0,L_int_48 380; cmp rdx, 48 381 jbe Lcw_get_r_rr 382 383 ; here expdiff > 48, so x is REALLY close to a multiple of pi/2 384 ; and we use yet another piece of pi/2 in the reduction 385 386 movapd xmm0,xmm5 ; xmm0 <-- t = rhead 387 movapd xmm3,xmm4 388 mulsd xmm3,QWORD PTR L_piby2_3 ; xmm3 <-- rtail = npi2 * piby2_3 389 movapd xmm5,xmm0 390 subsd xmm5,xmm3 ; xmm5 <-- rhead = t - rtail 391 392 ; now rtail = npi2 * piby2_3tail - ((t - rhead) - rtail) 393 movapd xmm1,xmm0 394 subsd xmm1,xmm5 395 subsd xmm1,xmm3 396 movapd xmm3,xmm4 397 mulsd xmm3,QWORD PTR L_piby2_3tail 398 subsd xmm3,xmm1 ; xmm3 <-- rtail 399 400Lcw_get_r_rr: 401 ; We have a satisfactory rhead in xmm5 and rtail in xmm3 402 ; We now produce r in xmm0 and rr in xmm1, where the actual reduced argument 403 ; is the sum of r and rr, and rr is insignificant 404 ; with respect to r under addition (i.e., r + rr == r). 405 movapd xmm0,xmm5 ; xmm0 <-- rhead 406 subsd xmm0,xmm3 ; xmm0 <-- r = rhead - rtail 407 movapd xmm1,xmm5 ; xmm1 <-- rhead 408 subsd xmm1,xmm0 ; xmm1 <-- (rhead - r) 409 subsd xmm1,xmm3 ; xmm1 <-- rr = (rhead - r) - rtail 410 movd rax,xmm2 ; rax <-- region 411 StackDeallocate sstack_size 412 ret 413fsname endp 414 415END 416