1;
2;
3; MIT License
4; -----------
5;
6; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
7;
8; Permission is hereby granted, free of charge, to any person obtaining a copy
9; of this Software and associated documentaon files (the "Software"), to deal
10; in the Software without restriction, including without limitation the rights
11; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12; copies of the Software, and to permit persons to whom the Software is
13; furnished to do so, subject to the following conditions:
14;
15; The above copyright notice and this permission notice shall be included in
16; all copies or substantial portions of the Software.
17;
18; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24; THE SOFTWARE.
25;
26; An implementation of the remainder by pi/2 function using fma3
27; This is a service routine for use by trig functions coded in asm that use fma3
28;
29; On input,
30;   xmm0 = x;
31; On ouput
32;   xmm0 = r
33;   xmm1 = rr
34;   rax = region
35
36.const
37ALIGN 16
38L_piby2_lead   DQ 03ff921fb54442d18h, 03ff921fb54442d18h
39L_fff800       DQ 0fffffffff8000000h, 0fffffffff8000000h
40L_piby2_part1  DQ 03ff921fb50000000h, 03ff921fb50000000h
41L_piby2_part2  DQ 03e5110b460000000h, 03e5110b460000000h
42L_piby2_part3  DQ 03c91a62633145c06h, 03c91a62633145c06h
43L_piby2_1      DQ 03FF921FB54400000h, 03FF921FB54400000h
44L_piby2_2      DQ 03DD0B4611A600000h, 03DD0B4611A600000h
45L_piby2_3      DQ 03BA3198A2E000000h, 03BA3198A2E000000h
46L_piby2_1tail  DQ 03DD0B4611A626331h, 03DD0B4611A626331h
47L_piby2_2tail  DQ 03BA3198A2E037073h, 03BA3198A2E037073h
48L_piby2_3tail  DQ 0397B839A252049C1h, 0397B839A252049C1h
49L_sign_mask    DQ 07FFFFFFFFFFFFFFFh, 07FFFFFFFFFFFFFFFh
50L_twobypi      DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h
51L_point_five   DQ 03FE0000000000000h, 03FE0000000000000h
52L_int_three    DQ 00000000000000003h, 00000000000000003h
53L_inf_mask_64  DQ 07FF0000000000000h, 07FF0000000000000h
54L_signbit      DQ 08000000000000000h, 08000000000000000h
55;; constants for BDL reduction
56L_r            DQ 03FE45F306DC9C883h, 03FE45F306DC9C883h  ; 2/pi
57L_xc1          DQ 03FF921FB54442D18H, 03FF921FB54442D18h  ; pi/2 (L_piby2_lead)
58L_xc2          DQ 03C91A62633145C00H, 03C91A62633145C00h  ; pi/2 part 2
59L_xc3          DQ 0397B839A252049C0H, 0397B839A252049C0h  ; pi/2 part 3
60; sigma is 3*2^(p-n-2) where n is 0 and p is 53.
61L_sigma        DQ 04338000000000000h, 04338000000000000h  ; 6755399441055744.
62
63EXTRN __L_2_by_pi_bits:BYTE
64
65region      EQU        020h
66stack_size  EQU        038h
67
68include fm.inc
69
70fname TEXTEQU  <__remainder_piby2_fma3>
71fbname TEXTEQU <__remainder_piby2_fma3_bdl>
72
73.code
74
75PUBLIC fname
76fname PROC FRAME
77    StackAllocate stack_size
78    .ENDPROLOG
79
80    ; This function is not using rdx, r8, and r9 as pointers;
81    ; all returns are in registers
82
83    ; get the unbiased exponent and the mantissa part of x
84    lea          r9,__L_2_by_pi_bits
85
86    ; xexp = (x >> 52) - 1023
87    vmovq        r11,xmm0
88    mov          rcx,r11
89    shr          r11,52
90    sub          r11,1023                 ; r11 <-- xexp = exponent of input x
91
92    ; calculate the last byte from which to start multiplication
93    ; last = 134 - (xexp >> 3)
94    mov          r10,r11
95    shr          r10,3
96    sub          r10,134                  ; r10 <-- -last
97    neg          r10                      ; r10 <-- last
98
99    ; load 64 bits of 2_by_pi
100    mov          rax,[r9 + r10]
101
102    ; mantissa of x = ((x << 12) >> 12) | implied bit
103    shl          rcx,12
104    shr          rcx,12                   ; rcx <-- mantissa part of input x
105    bts          rcx,52                   ; add the implied bit as well
106
107    ; load next 128 bits of 2_by_pi
108    add          r10,8                    ; increment to next 8 bytes of 2_by_pi
109    vmovdqu      xmm0,XMMWORD PTR[r9 + r10]
110
111    ; do three 64-bit multiplications with mant of x
112    mul          rcx
113    mov          r8,rax                   ; r8 <-- last 64 bits of mul = res1[2]
114    mov          r10,rdx                  ; r10 <-- carry
115    vmovq        rax,xmm0
116    mul          rcx
117    ; resexp = xexp & 7
118    and          r11,7                    ; r11 <-- resexp = last 3 bits of xexp
119    vpsrldq      xmm0,xmm0,8
120    add          rax,r10                  ; add the previous carry
121    adc          rdx,0
122    mov          r9,rax                   ; r9 <-- next 64 bits of mul = res1[1]
123    mov          r10,rdx                  ; r10 <-- carry
124    vmovq        rax,xmm0
125    mul          rcx
126    add          r10,rax                  ; r10 <-- most sig. 64 bits = res1[0]
127
128    ; find the region
129    ; last three bits ltb = most sig bits >> (54 - resexp));
130    ;   decimal point in last 18 bits ==> 8 lsb's in first 64 bits
131    ;   and 8 msb's in next 64 bits
132    ; point_five = ltb & 01h;
133    ; region = ((ltb >> 1) + point_five) & 3;
134    mov          rcx,54
135    mov          rax,r10
136    sub          rcx,r11
137    xor          rdx,rdx                  ; rdx <-- sign of x
138    shr          rax,cl
139    jnc          L__no_point_five
140    ; if there is carry then negate the result of multiplication
141    not          r10
142    not          r9
143    not          r8
144    mov          rdx,08000000000000000h
145
146ALIGN  16
147L__no_point_five:
148    adc          rax,0
149    and          rax,3                    ; rax now has region
150    mov          QWORD PTR [region+rsp], rax
151
152    ; calculate the number of integer bits and zero them out
153    mov          rcx,r11
154    add          rcx,10                   ; rcx = no. of integer bits
155    shl          r10,cl
156    shr          r10,cl                   ; r10 contains only mant bits
157    sub          rcx,64                   ; form the exponent
158    mov          r11,rcx
159
160    ; find the highest set bit
161    bsr          rcx,r10
162    jnz          L__form_mantissa
163    mov          r10,r9
164    mov          r9,r8
165    mov          r8,0
166    bsr          rcx,r10                  ; rcx = hsb
167    sub          r11,64
168
169ALIGN  16
170L__form_mantissa:
171    add          r11,rcx                  ; for exp of x
172    sub          rcx,52                   ; rcx = no. of bits to shift in r10
173    cmp          rcx,0
174    jl           L__hsb_below_52
175    je           L__form_numbers
176    ; hsb above 52
177    mov          r8,r10                   ; previous r8 not required
178    shr          r10,cl                   ; r10 = mantissa of x with hsb at 52
179    shr          r9,cl                    ; make space for bits from r10
180    sub          rcx,64
181    neg          rcx
182    ; rcx <-- no of bits to shift r10 to move those bits to r9
183    shl          r8,cl
184    or           r9,r8                    ; r9 = mantissa bits of xx
185    jmp          L__form_numbers
186
187ALIGN  16
188L__hsb_below_52:
189    ; rcx has shift count (< 0)
190    neg          rcx
191    mov          rax,r9
192    shl          r10,cl
193    shl          r9,cl
194    sub          rcx,64
195    neg          rcx
196    shr          rax,cl
197    or           r10,rax
198    shr          r8,cl
199    or           r9,r8
200
201ALIGN  16
202;   Here r11 has unbiased exponent
203;   r10 has mantissa, with implicit bit possibly set
204;   rdx has the sign bit
205L__form_numbers:
206    add          r11,1023                 ; r11 <-- biased exponent
207    btr          r10,52                   ; remove the implicit bit
208    mov          rcx,r11                  ; rcx <-- copy of biased exponent
209    or           r10,rdx                  ; put the sign
210    shl          rcx,52                   ; shift biased exponent into place
211    or           r10,rcx                  ; r10 <-- x
212    vmovq        xmm2,r10                 ; xmm1l <-- x
213
214    ; form xx
215;   xor          rcx,rcx ; Why is this necessary???
216    bsr          rcx,r9                   ; scan for high bit of xx mantissa
217    sub          rcx,64                   ; to shift the implied bit as well
218    neg          rcx
219    shl          r9,cl
220    shr          r9,12
221    add          rcx,52
222    sub          r11,rcx
223    shl          r11,52
224    or           r9,rdx
225    or           r9,r11
226    vmovq        xmm1,r9                  ; xmm1 <-- xx
227    vandpd       xmm4,xmm2,L_fff800       ; xmm4 <-- hx
228    vsubsd       xmm0,xmm2,xmm4           ; xmm0 <-- tx
229    vmulsd       xmm5,xmm2,L_piby2_lead   ; xmm5 <-- c
230    vmulsd       xmm3,xmm4,L_piby2_part1
231    vsubsd       xmm3,xmm3,xmm5
232    vfmadd231sd  xmm3,xmm0,L_piby2_part1
233    vfmadd231sd  xmm3,xmm4,L_piby2_part2
234    vfmadd231sd  xmm3,xmm0,L_piby2_part2
235    vmulsd       xmm4,xmm1,L_piby2_lead
236    vfmadd231sd  xmm4,xmm2,L_piby2_part3
237    vaddsd       xmm3,xmm3,xmm4           ; xmm3 <-- cc
238    vaddsd       xmm0,xmm5,xmm3           ; xmm0 <--r
239    vsubsd       xmm1,xmm5,xmm0
240    vaddsd       xmm1,xmm1,xmm3           ; xmm1 <-- rr
241    mov          rax, QWORD PTR [region+rsp]
242
243    StackDeallocate stack_size
244    ret
245fname endp
246
247ALIGN 16
248PUBLIC fbname
249fbname PROC FRAME
250    .ENDPROLOG
251    ; Boldo, Daumas, annd Li, "Formally Verified Argument
252    ; Reduction With a Fused Multiply-Add,"
253    ; IEEE Trans. Comp., vol. 58, #8, Aug. 2009
254    ; coefficients are from table 1, mutatis mutandis
255    ; algorithm is their formula 3.1 (for getting z from sigma) and
256    ; algorithm 5.1 (and extended version) for actual reduction
257    vmovapd      xmm1,xmm0
258    vmovapd      xmm4,L_xc2               ; xmm4 <-- xc2
259    vmovapd      xmm2,L_sigma
260    vfmadd132sd  xmm1,xmm2,L_r            ; z = arg*r + sigma
261    vsubsd       xmm1,xmm1,xmm2           ; xmm1 <-- z -= sigma
262    vcvttpd2dq   xmm5,xmm1
263    vmovq        rax, xmm5
264    vmovapd      xmm2,xmm1
265    vfnmadd132sd xmm2,xmm0,L_xc1          ; xmm2 <-- u = arg - z*xc1
266    vmulsd       xmm3,xmm1,xmm4           ; xmm3 <-- p1 = z*xc2
267    vmovapd      xmm0,xmm1                ; xmm0 <-- copy of z
268    vfmsub213sd  xmm0,xmm4,xmm3           ; xmm0 <-- p2 = z*xc2 - p1
269    vsubsd       xmm5,xmm2,xmm3           ; xmm5 <-- t1 = u - p1
270    ; We really don't want to spill in this code, so we're commandeering xmm4
271    vsubsd       xmm4,xmm2,xmm5           ; xmm4 <-- temp = u - t1
272    vsubsd       xmm4,xmm4,xmm3           ; xmm4 <-- t2 = temp - p1
273    ; used to use xmm4 here for L_xc2
274    vfnmadd231sd xmm2,xmm1,L_xc2          ; xmm2 <-- v1 = -xc2*z + u
275    vsubsd       xmm5,xmm5,xmm2           ; xmm5 <-- v2 = t1 - v1
276    vaddsd       xmm5,xmm5,xmm4           ; xmm5 <-- v2 += t2
277    vsubsd       xmm5,xmm5,xmm0           ; xmm5 <-- v2 -= p2
278    vmovapd      xmm0,xmm2                ; xmm0 <-- arghead = v1
279    vfnmadd132sd xmm1,xmm5,L_xc3          ; xmm1 <-- argtail = -xc3*z + v2
280    and          rax, 3                   ; rax <-- region
281    ret
282fbname        endp
283END
284