xref: /reactos/sdk/lib/crt/math/libm_sse2/exp.asm (revision 4afb647c)
1;
2; MIT License
3; -----------
4;
5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6;
7; Permission is hereby granted, free of charge, to any person obtaining a copy
8; of this Software and associated documentaon files (the "Software"), to deal
9; in the Software without restriction, including without limitation the rights
10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11; copies of the Software, and to permit persons to whom the Software is
12; furnished to do so, subject to the following conditions:
13;
14; The above copyright notice and this permission notice shall be included in
15; all copies or substantial portions of the Software.
16;
17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23; THE SOFTWARE.
24;
25; exp.asm
26;
27; An implementation of the exp libm function.
28;
29; Prototype:
30;
31;     double exp(double x);
32;
33
34;
35;   Algorithm:
36;
37;   e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
38;
39;   x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
40;   n = 64*m + j,   0 <= j < 64
41;
42;   e^x = 2^((64*m + j + f)/64)
43;       = (2^m) * (2^(j/64)) * 2^(f/64)
44;       = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
45;
46;   f = x*(64/ln(2)) - n
47;   r = f*(ln(2)/64) = x - n*(ln(2)/64)
48;
49;   e^x = (2^m) * (2^(j/64)) * e^r
50;
51;   (2^(j/64)) is precomputed
52;
53;   e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5!
54;   e^r = 1 + q
55;
56;   q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^5)/5!
57;
58
59.const
60ALIGN 16
61; these codes and the ones in the corresponding .c file have to match
62__flag_x_nan            DD 00000001
63__flag_y_zero           DD 00000002
64__flag_y_inf            DD 00000003
65
66ALIGN 16
67
68L__real_1_by_720              DQ 03f56c16c16c16c17h
69                              DQ 03f56c16c16c16c17h   ; 1/720
70L__real_1_by_120              DQ 03f81111111111111h
71                              DQ 03f81111111111111h   ; 1/120
72L__real_1_by_6                DQ 03fc5555555555555h
73                              DQ 03fc5555555555555h   ; 1/6
74L__real_1_by_2                DQ 03fe0000000000000h
75                              DQ 03fe0000000000000h   ; 1/2
76L__real_1_by_24               DQ 03fa5555555555555h
77                              DQ 03fa5555555555555h   ; 1/24
78
79ALIGN 16
80L__log2_by_64_mtail_mhead     DQ 0bf862e42fefa0000h, 0bd1cf79abc9e3b39h
81L__ln_of_smallest_normal      DQ 0C086232BDD7ABCD2h
82L__zero                       DQ 00000000000000000h
83L__max_exp_arg                DQ 040862e42fefa39efh   ;  709.78271289338397
84L__denormal_tiny_threshold    DQ 0c0874046dfefd9d0h   ; -744.03460681327306
85L__min_exp_arg                DQ 0c0874910d52d3051h   ; -745.13321910194111
86L__real_64_by_log2            DQ 040571547652b82feh   ; 64/ln(2)
87L__positive_infinity          DQ 07ff0000000000000h
88L__negative_infinity          DQ 0fff0000000000000h
89L__real_qnanbit               DQ 0008000000000000h    ; qnan set bit
90L__real_x_near0_threshold     DQ 3c00000000000000h
91L__log2_by_64_mhead           DQ 0bf862e42fefa0000h
92L__log2_by_64_mtail           DQ 0bd1cf79abc9e3b39h
93L__real_smallest_denormal     DQ 00000000000000001h
94L__real_one                   DQ 03ff0000000000000h
95L__2_to_neg_26                DQ 03E50000000000000h   ; 2^-26
96L__min_normal                 DQ 00010000000000000h   ; smallest normal
97
98
99EXTRN __two_to_jby64_table:QWORD
100EXTRN __two_to_jby64_head_table:QWORD
101EXTRN __two_to_jby64_tail_table:QWORD
102EXTRN __use_fma3_lib:DWORD
103
104; make room for fname_special to save things
105dummy_space     EQU    020h
106stack_size      EQU    038h
107
108include fm.inc
109
110fname           TEXTEQU <exp>
111fname_special   TEXTEQU <_exp_special>
112
113;Define name and any external functions being called
114EXTERN       fname_special      : PROC
115
116.code
117PUBLIC fname
118fname PROC FRAME
119    StackAllocate stack_size
120    .ENDPROLOG
121
122    ; We need to avoid unwanted exceptions from a NaN argument.
123    ; It could be argued that a signaling NaN should raise an exception,
124    ; but the existing library doesn't.  At any rate, the comparison operations
125    ; don't seem to like quiet NaN either, so...
126    movd         rdx, xmm0
127    btr          rdx, 63
128    cmp          rdx, L__positive_infinity
129    jge          Lexp_x_is_nan_or_inf
130
131    cmp          DWORD PTR __use_fma3_lib, 0
132    jne          Lexp_fma3
133
134    movapd       xmm2, xmm0
135    movapd       xmm3, xmm0
136
137    ; Some hardware has problems with too many branches in a single
138    ; 16- or 32-byte window, so let's peel off the common case into
139    ; a single branch.
140    cmplesd      xmm2, L__max_exp_arg  ; xmm2 <-- 0xFFFFFFFF is x is not too big positive
141    cmpnltsd     xmm3, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative
142    andps        xmm2, xmm3     ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise
143    ucomisd      xmm2, xmm2   ; note that FFF... is NaN, so this comparison should set PF for in-range x
144    jp           Lexp_y_is_finite
145
146    ucomisd      xmm0,   L__max_exp_arg
147    ja           Lexp_y_is_inf
148    ; Since we peeled off the cases with normal result,
149    ; there is only one possibility remaining:
150    jmp          Lexp_y_is_denormal_or_zero
151
152ALIGN 16
153Lexp_y_is_finite:
154    ; x * (64/ln(2))
155    movapd       xmm1,   xmm0
156    btr          rdx, 63                  ; rdx <-- |x|
157    cmp          rdx, L__2_to_neg_26
158    jbe          Lexp_return_1_plus_x
159    mulsd        xmm1,   L__real_64_by_log2
160
161    ; n = int( x * (64/ln(2)) )
162    cvttpd2dq    xmm2, xmm1               ; xmm2 = (int)n
163    cvtdq2pd     xmm1, xmm2               ; xmm1 = (double)n
164    movd         ecx, xmm2
165    movapd       xmm2, xmm1
166
167    ; r1 = x - n * ln(2)/64 head
168    mulsd        xmm1, L__log2_by_64_mhead
169
170    ; j = n & 0x3f
171    mov          rax, 03fh
172    and          eax, ecx                 ; eax = j
173    ; m = (n - j) / 64
174    sar          ecx,    6                ; ecx = m
175
176
177    ; r2 = - n * ln(2)/64 tail
178    mulsd        xmm2, L__log2_by_64_mtail
179    addsd        xmm0, xmm1               ; xmm0 = r1
180
181    ; r1+r2
182    addsd        xmm2, xmm0               ; xmm2 = r
183
184    ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
185    ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
186    movapd       xmm3, L__real_1_by_720   ; xmm3 = 1/720
187    mulsd        xmm3, xmm2               ; xmm3 = r*1/720
188    movapd       xmm0, L__real_1_by_6     ; xmm0 = 1/6
189    movapd       xmm1, xmm2               ; xmm1 = r
190    mulsd        xmm0, xmm2               ; xmm0 = r*1/6
191    addsd        xmm3, L__real_1_by_120   ; xmm3 = 1/120 + (r*1/720)
192    mulsd        xmm1, xmm2               ; xmm1 = r*r
193    addsd        xmm0, L__real_1_by_2     ; xmm0 = 1/2 + (r*1/6)
194    movapd       xmm4, xmm1               ; xmm4 = r*r
195    mulsd        xmm4, xmm1               ; xmm4 = (r*r) * (r*r)
196    mulsd        xmm3, xmm2               ; xmm3 = r * (1/120 + (r*1/720))
197    mulsd        xmm0, xmm1               ; xmm0 = (r*r)*(1/2 + (r*1/6))
198    addsd        xmm3, L__real_1_by_24    ; xmm3 = 1/24 + (r * (1/120 + (r*1/720)))
199    addsd        xmm0, xmm2               ; xmm0 = r + ((r*r)*(1/2 + (r*1/6)))
200    mulsd        xmm3, xmm4               ; xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
201    addsd        xmm0, xmm3               ; xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
202
203    ;(f)*(q) + f2 + f1
204    cmp          ecx, 0fffffc02h          ; -1022
205    lea          rdx,  __two_to_jby64_table
206    lea          r11,  __two_to_jby64_tail_table
207    lea          r10,  __two_to_jby64_head_table
208    mulsd        xmm0, QWORD PTR [rdx+rax * 8 ]
209    addsd        xmm0, QWORD PTR [r11+rax * 8 ]
210    addsd        xmm0, QWORD PTR [r10+rax * 8 ]
211
212    jle          Lexp_process_denormal
213Lexp_process_normal:
214    shl          rcx,    52
215    movd         xmm2,   rcx
216    paddq        xmm0,   xmm2
217    StackDeallocate stack_size
218    ret
219
220ALIGN 16
221Lexp_process_denormal:
222    jl           Lexp_process_true_denormal
223    ucomisd      xmm0,   L__real_one
224    jae          Lexp_process_normal
225Lexp_process_true_denormal:
226    ; here ( e^r < 1 and m = -1022 ) or m <= -1023
227    add          ecx, 1074
228    mov          rax, 1
229    shl          rax, cl
230    movd         xmm2, rax
231    mulsd        xmm0, xmm2
232    jmp          Lexp_finish
233
234Lexp_y_is_one:
235    movsd        xmm0, L__real_one
236    jmp          Lexp_finish
237
238ALIGN 16
239Lexp_x_is_nan_or_inf:
240    movd         rax, xmm0
241    cmp          rax, L__positive_infinity
242    je           Lexp_finish
243    cmp          rax, L__negative_infinity
244    je           Lexp_return_zero_without_exception
245    or           rax, L__real_qnanbit
246    movd         xmm1, rax
247    mov          r8d, __flag_x_nan
248    call         fname_special
249    jmp          Lexp_finish
250
251ALIGN 16
252Lexp_y_is_inf:
253    mov          rax, 07ff0000000000000h
254    movd         xmm1, rax
255    mov          r8d, __flag_y_inf
256    call         fname_special
257    jmp          Lexp_finish
258
259ALIGN 16
260Lexp_y_is_denormal_or_zero:
261    ucomisd      xmm0, L__min_exp_arg
262    jbe          Lexp_y_is_zero
263    movapd       xmm0, L__real_smallest_denormal
264    jmp          Lexp_finish
265
266ALIGN 16
267Lexp_y_is_zero:
268    pxor         xmm1, xmm1
269    mov          r8d, __flag_y_zero
270    call         fname_special
271    jmp          Lexp_finish
272
273ALIGN 16
274Lexp_return_1_plus_x:
275    cmp          rdx, L__min_normal
276    jbe          Lexp_return_1_plus_eps
277    addsd        xmm0, L__real_one
278    StackDeallocate stack_size
279    ret          0
280
281; Some hardware really does not like subnormals.  Try to avoid them.
282ALIGN 16
283Lexp_return_1_plus_eps:
284    movsd        xmm0, L__real_one
285    addsd        xmm0, L__min_normal         ; make sure inexact is set
286    StackDeallocate stack_size
287    ret          0
288
289ALIGN 16
290Lexp_return_zero_without_exception:
291    pxor         xmm0,xmm0
292    StackDeallocate stack_size
293    ret          0
294
295
296ALIGN 16
297Lexp_finish:
298    StackDeallocate stack_size
299    ret          0
300
301ALIGN 16
302Lexp_fma3:
303    ; Some hardware has problems with too many branches in a single
304    ; 16- or 32-byte window, so let's peel off the common case into
305    ; a single branch.
306    vcmplesd     xmm2, xmm0, L__max_exp_arg  ; xmm2 <-- 0xFFFFFFFF is x is not too big positive
307    vcmpnltsd    xmm3, xmm0, L__denormal_tiny_threshold ; xmm3 <-- 0xFFFFFFFF if x is not too big negative
308    vandps       xmm2, xmm3, xmm2  ; xmm2 <-- 0xFFFFFFFF if x is in range, 0 otherwise
309    vucomisd     xmm2, xmm2   ; note that FFF... is NaN, so this comparison should set PF for in-range x
310    jp           Lexp_fma3_y_is_finite
311
312    vucomisd     xmm0,L__max_exp_arg
313    ja           Lexp_fma3_y_is_inf
314    ; Since we peeled off the cases with normal result,
315    ; there is only one possibility remaining:
316    jmp          Lexp_fma3_y_is_zero
317
318;   vpsllq       xmm1, xmm0, 1
319;   vpsrlq       xmm1, xmm1, 1
320;   vucomisd     xmm1, L__real_x_near0_threshold   ; 2^-63
321;   jb           Lexp_fma3_y_is_one
322
323ALIGN 16
324Lexp_fma3_y_is_finite:
325    vmovq        rdx, xmm0
326    btr          rdx, 63                  ; rdx <-- |x|
327    cmp          rdx, L__2_to_neg_26
328    jbe          Lexp_fma3_return_1_plus_x
329
330    ; x * (64/ln(2))
331    vmulsd       xmm1,xmm0,L__real_64_by_log2
332
333    ; n = int( x * (64/ln(2)) )
334    vcvttpd2dq   xmm2,xmm1 ;xmm2 = (int)n
335    vcvtdq2pd    xmm1,xmm2 ;xmm1 = (double)n ;can use round
336    vmovd        ecx,xmm2
337
338    ; r1 = x - n * ln(2)/64 head
339    ; r2 = - n * ln(2)/64 tail
340    ; r = r1+r2
341    vmovlhps     xmm1,xmm1,xmm1 ;xmm1 = (double (double)n,)n
342    vmovq        xmm0,xmm0 ;xmm0 = 0,x ;zero out the upper part
343    vfmadd132pd  xmm1,xmm0,L__log2_by_64_mtail_mhead
344    vhaddpd      xmm2,xmm1,xmm1 ;xmm2 = r,r
345
346    ;j = n & 03fh
347    mov          rax,03fh
348    and          eax,ecx ;eax = j
349    ; m = (n - j) / 64
350    sar          ecx,6 ;ecx = m
351
352    ; q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
353    ; q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
354    vmovapd      xmm3,L__real_1_by_720
355    vfmadd213sd  xmm3,xmm2,L__real_1_by_120
356    vfmadd213sd  xmm3,xmm2,L__real_1_by_24
357    vfmadd213sd  xmm3,xmm2,L__real_1_by_6
358    vfmadd213sd  xmm3,xmm2,L__real_1_by_2
359    vmulsd       xmm0,xmm2,xmm2
360    vfmadd213sd  xmm0,xmm3,xmm2
361
362    ; (f)*(q) + f2 + f1
363    cmp          ecx,0fffffc02h ; -1022
364    lea          rdx,__two_to_jby64_table
365    lea          r11,__two_to_jby64_tail_table
366    lea          r10,__two_to_jby64_head_table
367    vmulsd       xmm2,xmm0,QWORD PTR[rdx + rax * 8]
368    vaddsd       xmm1,xmm2,QWORD PTR[r11 + rax * 8]
369    vaddsd       xmm0,xmm1,QWORD PTR[r10 + rax * 8]
370
371    jle          Lexp_fma3_process_denormal
372Lexp_fma3_process_normal:
373    shl          rcx,52
374    vmovq        xmm2,rcx
375    vpaddq       xmm0,xmm0,xmm2
376    StackDeallocate stack_size
377    ret
378
379ALIGN 16
380Lexp_fma3_process_denormal:
381    jl           Lexp_fma3_process_true_denormal
382    vucomisd     xmm0,L__real_one
383    jae          Lexp_fma3_process_normal
384Lexp_fma3_process_true_denormal:
385    ; here ( e^r < 1 and m = -1022 ) or m <= -1023
386    add          ecx,1074
387    mov          rax,1
388    shl          rax,cl
389    vmovq        xmm2,rax
390    vmulsd       xmm0,xmm0,xmm2
391    jmp          Lexp_fma3_finish
392
393Lexp_fma3_y_is_one:
394    vmovsd       xmm0, L__real_one
395    jmp          Lexp_fma3_finish
396
397
398ALIGN 16
399Lexp_fma3_y_is_inf:
400    mov          rax,07ff0000000000000h
401    vmovq        xmm1,rax
402    mov          r8d,__flag_y_inf
403    call         fname_special
404    jmp          Lexp_fma3_finish
405
406ALIGN 16
407Lexp_fma3_return_1_plus_x:
408    cmp          rdx, L__min_normal
409    jbe          Lexp_fma3_return_1_plus_eps
410    vaddsd       xmm0, xmm0, L__real_one
411    StackDeallocate stack_size
412    ret          0
413
414; Some hardware really does not like subnormals.  Try to avoid them.
415ALIGN 16
416Lexp_fma3_return_1_plus_eps:
417    vmovsd       xmm0, L__real_one
418    vaddsd       xmm0, xmm0, L__min_normal         ; make sure inexact is set
419    StackDeallocate stack_size
420    ret          0
421
422ALIGN 16
423Lexp_fma3_y_is_zero:
424    vpxor        xmm1,xmm1,xmm1
425    mov          r8d,__flag_y_zero
426    call         fname_special
427    jmp          Lexp_fma3_finish
428
429ALIGN 16
430Lexp_fma3_return_zero_without_exception:
431    vpxor        xmm0,xmm0,xmm0
432
433ALIGN 16
434Lexp_fma3_finish:
435    StackDeallocate stack_size
436    ret
437
438fname            endp
439END
440