;
;
; MIT License
; -----------
;
; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
;
; Permission is hereby granted, free of charge, to any person obtaining a copy
; of this Software and associated documentation files (the "Software"), to deal
; in the Software without restriction, including without limitation the rights
; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
; copies of the Software, and to permit persons to whom the Software is
; furnished to do so, subject to the following conditions:
;
; The above copyright notice and this permission notice shall be included in
; all copies or substantial portions of the Software.
;
; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
; THE SOFTWARE.
;
; An implementation of the remainder by pi/2 function using fma3
; This is a service routine for use by trig functions coded in asm that use fma3
;
; On input,
;    xmm0 = x;
; On output
;    xmm0 = r      (head of the reduced argument)
;    xmm1 = rr     (tail of the reduced argument)
;    rax  = region (quadrant of the reduction, 0..3)

.const
ALIGN 16
; Each constant is a pair of identical quadwords so the same symbol works
; for both scalar (sd) and packed (pd) instruction forms.
L_piby2_lead DQ 03ff921fb54442d18h, 03ff921fb54442d18h ; pi/2 rounded to double
L_fff800 DQ 0fffffffff8000000h, 0fffffffff8000000h ; mask: keep sign/exp + top 29 mantissa bits
L_piby2_part1 DQ 03ff921fb50000000h, 03ff921fb50000000h ; pi/2 three-way split: high part
L_piby2_part2 DQ 03e5110b460000000h, 03e5110b460000000h ; pi/2 three-way split: middle part
L_piby2_part3 DQ 03c91a62633145c06h, 03c91a62633145c06h ; pi/2 three-way split: low part
L_piby2_1 DQ 03FF921FB54400000h, 03FF921FB54400000h ; pi/2 truncated split, word 1
L_piby2_2 DQ 03DD0B4611A600000h, 03DD0B4611A600000h ; pi/2 truncated split, word 2
L_piby2_3 DQ 03BA3198A2E000000h, 03BA3198A2E000000h ; pi/2 truncated split, word 3
L_piby2_1tail DQ 03DD0B4611A626331h, 03DD0B4611A626331h ; tail of L_piby2_1
L_piby2_2tail DQ 03BA3198A2E037073h, 03BA3198A2E037073h ; tail of L_piby2_2
L_piby2_3tail DQ 0397B839A252049C1h, 0397B839A252049C1h ; tail of L_piby2_3
L_sign_mask DQ 07FFFFFFFFFFFFFFFh, 07FFFFFFFFFFFFFFFh ; AND-mask that clears the sign bit
L_twobypi DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h ; 2/pi
L_point_five DQ 03FE0000000000000h, 03FE0000000000000h ; 0.5
L_int_three DQ 00000000000000003h, 00000000000000003h ; integer 3 (region mask)
L_inf_mask_64 DQ 07FF0000000000000h, 07FF0000000000000h ; exponent field mask (Inf/NaN test)
L_signbit DQ 08000000000000000h, 08000000000000000h ; sign bit only
;; constants for BDL (Boldo-Daumas-Li) reduction
L_r DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h ; 2/pi
L_xc1 DQ 03FF921FB54442D18H, 03FF921FB54442D18h ; pi/2 (L_piby2_lead)
L_xc2 DQ 03C91A62633145C00H, 03C91A62633145C00h ; pi/2 part 2
L_xc3 DQ 0397B839A252049C0H, 0397B839A252049C0h ; pi/2 part 3
; sigma is 3*2^(p-n-2) where n is 0 and p is 53.
L_sigma DQ 04338000000000000h, 04338000000000000h ; 6755399441055744.
EXTRN __L_2_by_pi_bits:BYTE    ; byte table of 2/pi bits, defined elsewhere

region EQU 020h                ; stack offset of the saved region (quadrant)
stack_size EQU 038h            ; local stack: keeps rsp 16-aligned under Win64

include fm.inc

fname TEXTEQU <__remainder_piby2_fma3>
fbname TEXTEQU <__remainder_piby2_fma3_bdl>

.code

;-----------------------------------------------------------------------
; __remainder_piby2_fma3
; Large-argument reduction of x by pi/2 using the bits of 2/pi
; (Payne-Hanek style), for use by fma3 trig routines.
; In:    xmm0 = x
; Out:   xmm0 = r (head), xmm1 = rr (tail), rax = region (0..3)
; Clobbers: rcx, rdx, r8-r11, xmm0-xmm5, flags (all volatile under Win64;
;           callee-saved xmm6-xmm15 and gp regs are untouched)
;-----------------------------------------------------------------------
PUBLIC fname
fname PROC FRAME
    StackAllocate stack_size
    .ENDPROLOG

    ; This function is not using rdx, r8, and r9 as pointers;
    ; all returns are in registers

    ; get the unbiased exponent and the mantissa part of x
    lea r9,__L_2_by_pi_bits

    ; xexp = (x >> 52) - 1023
    vmovq r11,xmm0
    mov rcx,r11
    shr r11,52
    sub r11,1023                ; r11 <-- xexp = exponent of input x

    ; calculate the last byte from which to start multiplication
    ; last = 134 - (xexp >> 3)
    mov r10,r11
    shr r10,3
    sub r10,134                 ; r10 <-- -last
    neg r10                     ; r10 <-- last

    ; load 64 bits of 2_by_pi
    mov rax,[r9 + r10]

    ; mantissa of x = ((x << 12) >> 12) | implied bit
    shl rcx,12
    shr rcx,12                  ; rcx <-- mantissa part of input x
    bts rcx,52                  ; add the implied bit as well

    ; load next 128 bits of 2_by_pi
    add r10,8                   ; increment to next 8 bytes of 2_by_pi
    vmovdqu xmm0,XMMWORD PTR[r9 + r10]

    ; do three 64-bit multiplications with mant of x,
    ; accumulating a 192-bit product res1[0..2] in r10:r9:r8
    mul rcx
    mov r8,rax                  ; r8 <-- last 64 bits of mul = res1[2]
    mov r10,rdx                 ; r10 <-- carry
    vmovq rax,xmm0
    mul rcx
    ; resexp = xexp & 7
    and r11,7                   ; r11 <-- resexp = last 3 bits of xexp
    vpsrldq xmm0,xmm0,8         ; expose the next 64 bits of 2/pi
    add rax,r10                 ; add the previous carry
    adc rdx,0
    mov r9,rax                  ; r9 <-- next 64 bits of mul = res1[1]
    mov r10,rdx                 ; r10 <-- carry
    vmovq rax,xmm0
    mul rcx
    add r10,rax                 ; r10 <-- most sig. 64 bits = res1[0]

    ; find the region
    ; last three bits ltb = most sig bits >> (54 - resexp));
    ; decimal point in last 18 bits ==> 8 lsb's in first 64 bits
    ; and 8 msb's in next 64 bits
    ; point_five = ltb & 01h;
    ; region = ((ltb >> 1) + point_five) & 3;
    mov rcx,54
    mov rax,r10
    sub rcx,r11
    xor rdx,rdx                 ; rdx <-- sign of x (0 for now)
    shr rax,cl                  ; CF = point_five bit shifted out
    jnc L__no_point_five
    ; if there is carry then negate the result of multiplication
    ; (fraction >= 0.5, so reduce toward the next multiple of pi/2)
    not r10
    not r9
    not r8
    mov rdx,08000000000000000h  ; rdx <-- sign bit for the negated result

ALIGN 16
L__no_point_five:
    adc rax,0                   ; round region using the carry from shr above
    and rax,3                   ; rax now has region
    mov QWORD PTR [region+rsp], rax

    ; calculate the number of integer bits and zero them out
    mov rcx,r11
    add rcx,10                  ; rcx = no. of integer bits
    shl r10,cl
    shr r10,cl                  ; r10 contains only mant bits
    sub rcx,64                  ; form the exponent
    mov r11,rcx

    ; find the highest set bit; if r10 is all zero (bsr sets ZF),
    ; shift the 192-bit fraction left by one whole quadword
    bsr rcx,r10
    jnz L__form_mantissa
    mov r10,r9
    mov r9,r8
    mov r8,0
    bsr rcx,r10                 ; rcx = hsb
    sub r11,64                  ; account for the 64-bit shift

ALIGN 16
L__form_mantissa:
    add r11,rcx                 ; for exp of x
    sub rcx,52                  ; rcx = no. of bits to shift in r10
    cmp rcx,0
    jl L__hsb_below_52
    je L__form_numbers
    ; hsb above 52: shift right, pushing low bits of r10 into r9
    mov r8,r10                  ; previous r8 not required
    shr r10,cl                  ; r10 = mantissa of x with hsb at 52
    shr r9,cl                   ; make space for bits from r10
    sub rcx,64
    neg rcx
    ; rcx <-- no of bits to shift r10 to move those bits to r9
    shl r8,cl
    or r9,r8                    ; r9 = mantissa bits of xx
    jmp L__form_numbers

ALIGN 16
L__hsb_below_52:
    ; rcx has shift count (< 0): shift left, pulling bits up from r9/r8
    neg rcx
    mov rax,r9
    shl r10,cl
    shl r9,cl
    sub rcx,64
    neg rcx                     ; rcx <-- 64 - original shift count
    shr rax,cl
    or r10,rax                  ; top bits of r9 complete the x mantissa
    shr r8,cl
    or r9,r8                    ; top bits of r8 complete the xx mantissa

ALIGN 16
; Here r11 has unbiased exponent
; r10 has mantissa, with implicit bit possibly set
; rdx has the sign bit
L__form_numbers:
    add r11,1023                ; r11 <-- biased exponent
    btr r10,52                  ; remove the implicit bit
    mov rcx,r11                 ; rcx <-- copy of biased exponent
    or r10,rdx                  ; put the sign
    shl rcx,52                  ; shift biased exponent into place
    or r10,rcx                  ; r10 <-- x
    vmovq xmm2,r10              ; xmm2 <-- x, reassembled as a double

    ; form xx (the extended-precision tail) the same way
;    xor rcx,rcx                ; Why is this necessary???
    bsr rcx,r9                  ; scan for high bit of xx mantissa
    sub rcx,64                  ; to shift the implied bit as well
    neg rcx
    shl r9,cl
    shr r9,12                   ; drop sign/exponent space
    add rcx,52
    sub r11,rcx                 ; biased exponent of xx
    shl r11,52
    or r9,rdx                   ; apply the sign
    or r9,r11                   ; apply the exponent
    vmovq xmm1,r9               ; xmm1 <-- xx

    ; multiply (x, xx) by pi/2 in extended precision:
    ; split x into hx + tx, then accumulate the partial products
    vandpd xmm4,xmm2,L_fff800           ; xmm4 <-- hx (high 29 mantissa bits of x)
    vsubsd xmm0,xmm2,xmm4               ; xmm0 <-- tx (low-order remainder of x)
    vmulsd xmm5,xmm2,L_piby2_lead       ; xmm5 <-- c = x * pi/2 (leading term)
    vmulsd xmm3,xmm4,L_piby2_part1
    vsubsd xmm3,xmm3,xmm5               ; xmm3 <-- hx*part1 - c (error of c)
    vfmadd231sd xmm3,xmm0,L_piby2_part1 ; + tx*part1
    vfmadd231sd xmm3,xmm4,L_piby2_part2 ; + hx*part2
    vfmadd231sd xmm3,xmm0,L_piby2_part2 ; + tx*part2
    vmulsd xmm4,xmm1,L_piby2_lead       ; xx * pi/2
    vfmadd231sd xmm4,xmm2,L_piby2_part3 ; + x*part3
    vaddsd xmm3,xmm3,xmm4               ; xmm3 <-- cc (correction term)
    vaddsd xmm0,xmm5,xmm3               ; xmm0 <-- r = c + cc
    vsubsd xmm1,xmm5,xmm0               ; rr = (c - r) + cc, the rounding error of r
    vaddsd xmm1,xmm1,xmm3               ; xmm1 <-- rr
    mov rax, QWORD PTR [region+rsp]     ; rax <-- region saved earlier

    StackDeallocate stack_size
    ret
fname endp

;-----------------------------------------------------------------------
; __remainder_piby2_fma3_bdl
; Reduction by pi/2 for moderate arguments using the fma-based
; error-free algorithm of Boldo, Daumas, and Li (leaf routine, no stack).
; In:    xmm0 = x
; Out:   xmm0 = arghead, xmm1 = argtail, rax = region (0..3)
; Clobbers: rax, xmm0-xmm5, flags
;-----------------------------------------------------------------------
ALIGN 16
PUBLIC fbname
fbname PROC FRAME
    .ENDPROLOG
    ; Boldo, Daumas, and Li, "Formally Verified Argument
    ; Reduction With a Fused Multiply-Add,"
    ; IEEE Trans. Comp., vol. 58, #8, Aug. 2009
    ; coefficients are from table 1, mutatis mutandis
    ; algorithm is their formula 3.1 (for getting z from sigma) and
    ; algorithm 5.1 (and extended version) for actual reduction
    vmovapd xmm1,xmm0
    vmovapd xmm4,L_xc2              ; xmm4 <-- xc2
    vmovapd xmm2,L_sigma
    vfmadd132sd xmm1,xmm2,L_r       ; z = arg*r + sigma (rounds arg*2/pi to integer)
    vsubsd xmm1,xmm1,xmm2           ; xmm1 <-- z -= sigma
    vcvttpd2dq xmm5,xmm1            ; truncating convert z to int for the region
    vmovq rax, xmm5
    vmovapd xmm2,xmm1
    vfnmadd132sd xmm2,xmm0,L_xc1    ; xmm2 <-- u = arg - z*xc1 (exact by BDL)
    vmulsd xmm3,xmm1,xmm4           ; xmm3 <-- p1 = z*xc2
    vmovapd xmm0,xmm1               ; xmm0 <-- copy of z
    vfmsub213sd xmm0,xmm4,xmm3      ; xmm0 <-- p2 = z*xc2 - p1 (error of p1)
    vsubsd xmm5,xmm2,xmm3           ; xmm5 <-- t1 = u - p1
    ; We really don't want to spill in this code, so we're commandeering xmm4
    vsubsd xmm4,xmm2,xmm5           ; xmm4 <-- temp = u - t1
    vsubsd xmm4,xmm4,xmm3           ; xmm4 <-- t2 = temp - p1
    ; used to use xmm4 here for L_xc2
    vfnmadd231sd xmm2,xmm1,L_xc2    ; xmm2 <-- v1 = -xc2*z + u
    vsubsd xmm5,xmm5,xmm2           ; xmm5 <-- v2 = t1 - v1
    vaddsd xmm5,xmm5,xmm4           ; xmm5 <-- v2 += t2
    vsubsd xmm5,xmm5,xmm0           ; xmm5 <-- v2 -= p2
    vmovapd xmm0,xmm2               ; xmm0 <-- arghead = v1
    vfnmadd132sd xmm1,xmm5,L_xc3    ; xmm1 <-- argtail = -xc3*z + v2
    and rax, 3                      ; rax <-- region
    ret
fbname endp
END