xref: /reactos/sdk/lib/crt/math/libm_sse2/log.asm (revision 5140a990)
1;
2; MIT License
3; -----------
4;
5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6;
7; Permission is hereby granted, free of charge, to any person obtaining a copy
8; of this Software and associated documentaon files (the "Software"), to deal
9; in the Software without restriction, including without limitation the rights
10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11; copies of the Software, and to permit persons to whom the Software is
12; furnished to do so, subject to the following conditions:
13;
14; The above copyright notice and this permission notice shall be included in
15; all copies or substantial portions of the Software.
16;
17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23; THE SOFTWARE.
24;
25; log.asm
26;
27; An implementation of the log libm function.
28;
29; Prototype:
30;
31;     double log(double x);
32;
33
34;
35;   Algorithm:
36;
37;   Based on:
38;   Ping-Tak Peter Tang
39;   "Table-driven implementation of the logarithm function in IEEE
40;   floating-point arithmetic"
41;   ACM Transactions on Mathematical Software (TOMS)
42;   Volume 16, Issue 4 (December 1990)
43;
44;
45;   x very close to 1.0 is handled differently, for x everywhere else
46;   a brief explanation is given below
47;
48;   x = (2^m)*A
49;   x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-9))
50;   x = (2^m)*2*(G/2+g/2)
51;   x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-10))
52;
53;   Y = (2^(-1))*(2^(-m))*(2^m)*A
54;   Now, range of Y is: 0.5 <= Y < 1
55;
56;   F = 0x100 + (first 8 mantissa bits) + (9th mantissa bit)
57;   Now, range of F is: 256 <= F <= 512
58;   F = F / 512
59;   Now, range of F is: 0.5 <= F <= 1
60;
61;   f = -(Y-F), with (f <= 2^(-10))
62;
63;   log(x) = m*log(2) + log(2) + log(F-f)
64;   log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
65;   log(x) = m*log(2) + log(2*F) + log(1-r)
66;
67;   r = (f/F), with (r <= 2^(-9))
68;   r = f*(1/F) with (1/F) precomputed to avoid division
69;
70;   log(x) = m*log(2) + log(G) - poly
71;
72;   log(G) is precomputed
73;   poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4) + (r^5)/5) + (r^6)/6))
74;
75;   log(2) and log(G) need to be maintained in extra precision
76;   to avoid losing precision in the calculations
77;
78
79.const
80ALIGN 16
81
82__real_ninf         DQ 0fff0000000000000h   ; -inf
83                    DQ 0000000000000000h
84__real_inf          DQ 7ff0000000000000h    ; +inf
85                    DQ 0000000000000000h
86__real_neg_qnan     DQ 0fff8000000000000h   ; neg qNaN
87                    DQ 0000000000000000h
88__real_qnanbit      DQ 0008000000000000h
89                    DQ 0000000000000000h
90__real_min_norm     DQ 0010000000000000h
91                    DQ 0000000000000000h
92__real_mant         DQ 000FFFFFFFFFFFFFh    ; mantissa bits
93                    DQ 0000000000000000h
94__mask_1023         DQ 00000000000003ffh
95                    DQ 0000000000000000h
96__mask_001          DQ 0000000000000001h
97                    DQ 0000000000000000h
98
99__mask_mant_all8    DQ 000ff00000000000h
100                    DQ 0000000000000000h
101__mask_mant9        DQ 0000080000000000h
102                    DQ 0000000000000000h
103
104__real_two          DQ 4000000000000000h ; 2
105                    DQ 0000000000000000h
106
107__real_one          DQ 3ff0000000000000h ; 1
108                    DQ 0000000000000000h
109
110__real_near_one_lt  DQ 3fee000000000000h ; .9375
111                    DQ 0000000000000000h
112
113__real_near_one_gt  DQ 3ff1000000000000h ; 1.0625
114                    DQ 0000000000000000h
115
116__real_half         DQ 3fe0000000000000h ; 1/2
117                    DQ 0000000000000000h
118
119__mask_100          DQ 0000000000000100h
120                    DQ 0000000000000000h
121
122__real_1_over_512   DQ 3f60000000000000h
123                    DQ 0000000000000000h
124
125__real_1_over_2     DQ 3fe0000000000000h
126                    DQ 0000000000000000h
127__real_1_over_3     DQ 3fd5555555555555h
128                    DQ 0000000000000000h
129__real_1_over_4     DQ 3fd0000000000000h
130                    DQ 0000000000000000h
131__real_1_over_5     DQ 3fc999999999999ah
132                    DQ 0000000000000000h
133__real_1_over_6     DQ 3fc5555555555555h
134                    DQ 0000000000000000h
135
136__mask_1023_f       DQ 0c08ff80000000000h
137                    DQ 0000000000000000h
138
139__mask_2045         DQ 00000000000007fdh
140                    DQ 0000000000000000h
141
142__real_threshold    DQ 3fb0000000000000h ; .0625
143                    DQ 0000000000000000h
144
145__real_notsign      DQ 7ffFFFFFFFFFFFFFh ; ^sign bit
146                    DQ 0000000000000000h
147
148__real_ca1          DQ 3fb55555555554e6h ; 8.33333333333317923934e-02
149                    DQ 0000000000000000h
150__real_ca2          DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02
151                    DQ 0000000000000000h
152__real_ca3          DQ 3f62492307f1519fh ; 2.23213998791944806202e-03
153                    DQ 0000000000000000h
154__real_ca4          DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04
155                    DQ 0000000000000000h
156__real_log2_lead    DQ 03fe62e42e0000000h ; 6.93147122859954833984e-01
157                    DQ 00000000000000000h
158__real_log2_tail    DQ 03e6efa39ef35793ch ; 5.76999904754328540596e-08
159                    DQ 00000000000000000h
160
161; these codes and the ones in the corresponding .c file have to match
162__flag_x_zero          DD 00000001
163__flag_x_neg           DD 00000002
164__flag_x_nan           DD 00000003
165
166
167EXTRN __log_256_lead:QWORD
168EXTRN __log_256_tail:QWORD
169EXTRN __log_F_inv_qword:QWORD
170EXTRN __use_fma3_lib:DWORD
171
172
173fname           TEXTEQU <log>
174fname_special   TEXTEQU <_log_special>
175
176; define local variable storage offsets
177
178save_xmm6       EQU     20h
179dummy_space     EQU     40h
180
181stack_size      EQU     58h
182
183include fm.inc
184
185; external function
186EXTERN fname_special:PROC
187
188.code
189ALIGN 16
190PUBLIC fname
191fname PROC FRAME
192    StackAllocate stack_size
193    SaveXmm      xmm6, save_xmm6
194    .ENDPROLOG
195
196    cmp          DWORD PTR __use_fma3_lib, 0
197    jne          Llog_fma3
198
199Llog_sse2:
200
201    ; compute exponent part
202    movdqa      xmm3, xmm0
203    movapd      xmm4, xmm0
204    psrlq       xmm3, 52
205    movd        rax, xmm0
206    psubq       xmm3, XMMWORD PTR __mask_1023
207
208    ;  NaN or inf
209    mov         rcx, rax
210    btr         rcx, 63
211    cmp         rcx, QWORD PTR __real_inf
212    jae         __x_is_inf_or_nan
213
214    movdqa      xmm2, xmm0
215    cvtdq2pd    xmm6, xmm3 ; xexp
216
217
218    pand        xmm2, XMMWORD PTR __real_mant
219    subsd       xmm4, QWORD PTR __real_one
220
221    comisd      xmm6, QWORD PTR __mask_1023_f
222    je          __denormal_adjust
223
224__continue_common:
225
226    andpd       xmm4, XMMWORD PTR __real_notsign
227    ; compute index into the log tables
228    mov         r9, rax
229    and         rax, QWORD PTR __mask_mant_all8
230    and         r9, QWORD PTR __mask_mant9
231    shl         r9, 1
232    add         rax, r9
233    movd        xmm1, rax
234
235    ; near one codepath
236    comisd      xmm4, QWORD PTR __real_threshold
237    jb          __near_one
238
239    ; F, Y
240    shr         rax, 44
241    por         xmm2, XMMWORD PTR __real_half
242    por         xmm1, XMMWORD PTR __real_half
243    lea         r9, __log_F_inv_qword
244
245    ; check for negative numbers or zero
246    xorpd       xmm5, xmm5
247    comisd      xmm0, xmm5
248    jbe         __x_is_zero_or_neg
249
250    ; f = F - Y, r = f * inv
251    subsd       xmm1, xmm2                       ; xmm1 <-- f = F - Y
252    mulsd       xmm1, QWORD PTR [r9+rax*8]       ; xmm1 <-- r = f * inv
253
254    movapd      xmm2, xmm1                       ; xmm2 <-- copy of r
255    movapd      xmm0, xmm1                       ; xmm0 <-- copy of r
256    lea         r9, QWORD PTR __log_256_lead
257
258    ; poly
259    movsd       xmm3, QWORD PTR __real_1_over_6
260    movsd       xmm1, QWORD PTR __real_1_over_3
261    mulsd       xmm3, xmm2                      ; xmm3 <-- r/6
262    mulsd       xmm1, xmm2                      ; xmm1 <-- r/3
263    mulsd       xmm0, xmm2                      ; xmm0 <-- r*r
264    movapd      xmm4, xmm0                      ; xmm4 <-- copy of r*r
265    addsd       xmm3, QWORD PTR __real_1_over_5 ; xmm3 <-- r/6 + 1/5
266    addsd       xmm1, QWORD PTR __real_1_over_2 ; xmm1 <-- r/3 + 1/2
267    mulsd       xmm4, xmm0                      ; xmm4 <-- r^4
268    mulsd       xmm3, xmm2                      ; xmm3 <-- (r/6 + 1/5)*r
269    mulsd       xmm1, xmm0                      ; xmm1 <-- (r/3 + 1/2)*r^2
270    addsd       xmm3, QWORD PTR __real_1_over_4 ; xmm3 <-- (r/6 + 1/5)*r + 1/4
271    addsd       xmm1, xmm2                      ; xmm1 <-- (r/3 + 1/2)*r^2 + r
272    mulsd       xmm3, xmm4                      ; xmm3 <-- ((r/6+1/5)*r+1/4)*r^4
273    addsd       xmm1, xmm3                      ; xmm1 <-- poly
274
275    ; m*log(2)_tail + log(G)_tail - poly
276    movsd       xmm5, QWORD PTR __real_log2_tail
277    mulsd       xmm5, xmm6                      ; xmm5 <-- m*log2_tail
278    subsd       xmm5, xmm1                      ; xmm5 <-- m*log2_tail - poly
279
280    movsd       xmm0, QWORD PTR [r9+rax*8]      ; xmm0 <-- log(G)_lead
281    lea         rdx, QWORD PTR __log_256_tail
282    movsd       xmm2, QWORD PTR [rdx+rax*8]     ; xmm2 <-- log(G)_tail
283    addsd       xmm2, xmm5                      ; xmm2 <-- (m*log2_tail - poly) + log(G)_tail
284
285    movsd       xmm4, QWORD PTR __real_log2_lead
286    mulsd       xmm4, xmm6                      ; xmm4 <-- m*log2_lead
287    addsd       xmm0, xmm4                      ; xmm0 <-- m*log2_lead + log(G)_lead
288
289    addsd       xmm0, xmm2        ; xmm0 <-- m*log(2)_tail + log(G)_tail - poly
290
291    RestoreXmm  xmm6, save_xmm6
292    StackDeallocate stack_size
293    ret
294
295ALIGN 16
296__near_one:
297
298    ; r = x - 1.0
299    movsd       xmm2, QWORD PTR __real_two
300    subsd       xmm0, QWORD PTR __real_one ; r
301
302    addsd       xmm2, xmm0
303    movsd       xmm1, xmm0
304    divsd       xmm1, xmm2 ; r/(2+r) = u/2
305
306    movsd       xmm4, QWORD PTR __real_ca2
307    movsd       xmm5, QWORD PTR __real_ca4
308
309    movsd       xmm6, xmm0
310    mulsd       xmm6, xmm1 ; correction
311
312    addsd       xmm1, xmm1 ; u
313    movsd       xmm2, xmm1
314
315    mulsd       xmm2, xmm1 ; u^2
316
317    mulsd       xmm4, xmm2
318    mulsd       xmm5, xmm2
319
320    addsd       xmm4, __real_ca1
321    addsd       xmm5, __real_ca3
322
323    mulsd       xmm2, xmm1 ; u^3
324    mulsd       xmm4, xmm2
325
326    mulsd       xmm2, xmm2
327    mulsd       xmm2, xmm1 ; u^7
328    mulsd       xmm5, xmm2
329
330    addsd       xmm4, xmm5
331    subsd       xmm4, xmm6
332    addsd       xmm0, xmm4
333
334    RestoreXmm  xmm6, save_xmm6
335    StackDeallocate stack_size
336    ret
337
338ALIGN 16
339__denormal_adjust:
340    por         xmm2, XMMWORD PTR __real_one
341    subsd       xmm2, QWORD PTR __real_one
342    movsd       xmm5, xmm2
343    pand        xmm2, XMMWORD PTR __real_mant
344    movd        rax, xmm2
345    psrlq       xmm5, 52
346    psubd       xmm5, XMMWORD PTR __mask_2045
347    cvtdq2pd    xmm6, xmm5
348    jmp         __continue_common
349
350ALIGN 16
351__x_is_zero_or_neg:
352    jne         __x_is_neg
353
354    movsd       xmm1, QWORD PTR __real_ninf
355    mov         r8d, DWORD PTR __flag_x_zero
356    call        fname_special
357    jmp         __finish
358
359ALIGN 16
360__x_is_neg:
361
362    movsd       xmm1, QWORD PTR __real_neg_qnan
363    mov         r8d, DWORD PTR __flag_x_neg
364    call        fname_special
365    jmp         __finish
366
367ALIGN 16
368__x_is_inf_or_nan:
369
370    cmp         rax, QWORD PTR __real_inf
371    je          __finish
372
373    cmp         rax, QWORD PTR __real_ninf
374    je          __x_is_neg
375
376    or          rax, QWORD PTR __real_qnanbit
377    movd        xmm1, rax
378    mov         r8d, DWORD PTR __flag_x_nan
379    call        fname_special
380    jmp         __finish
381
382ALIGN 16
383__finish:
384    RestoreXmm  xmm6, save_xmm6
385    StackDeallocate stack_size
386    ret
387
388ALIGN 16
389Llog_fma3:
390    ; compute exponent part
391    xor          rax,rax
392    vpsrlq       xmm3,xmm0,52
393    vmovq        rax,xmm0
394    vpsubq       xmm3,xmm3,XMMWORD PTR __mask_1023
395    vcvtdq2pd    xmm6,xmm3 ; xexp
396
397    ;  NaN or inf
398    vpand        xmm5,xmm0,XMMWORD PTR __real_inf
399    vcomisd      xmm5,QWORD PTR __real_inf
400    je           Llog_fma3_x_is_inf_or_nan
401
402    ; check for negative numbers or zero
403    vpxor        xmm5,xmm5,xmm5
404    vcomisd      xmm0,xmm5
405    jbe          Llog_fma3_x_is_zero_or_neg
406
407    vpand        xmm2,xmm0,XMMWORD PTR __real_mant
408    vsubsd       xmm4,xmm0,QWORD PTR __real_one
409
410    vcomisd      xmm6,QWORD PTR __mask_1023_f
411    je           Llog_fma3_denormal_adjust
412
413Llog_fma3_continue_common:
414    ; compute index into the log tables
415    vpand        xmm1,xmm0,XMMWORD PTR __mask_mant_all8
416    vpand        xmm3,xmm0,XMMWORD PTR __mask_mant9
417    vpsllq       xmm3,xmm3,1
418    vpaddq       xmm1,xmm3,xmm1
419    vmovq        rax,xmm1
420
421    ; near one codepath
422    vpand        xmm4,xmm4,XMMWORD PTR __real_notsign
423    vcomisd      xmm4,QWORD PTR __real_threshold
424    jb           Llog_fma3_near_one
425
426    ; F,Y
427    shr          rax,44
428    vpor         xmm2,xmm2,XMMWORD PTR __real_half
429    vpor         xmm1,xmm1,XMMWORD PTR __real_half
430    lea          r9,QWORD PTR __log_F_inv_qword
431
432    ; f = F - Y,r = f * inv
433    vsubsd       xmm1,xmm1,xmm2
434    vmulsd       xmm1,xmm1,QWORD PTR[r9 + rax * 8]
435
436    lea          r9,QWORD PTR __log_256_lead
437
438    ; poly
439    vmulsd       xmm0,xmm1,xmm1           ; r*r
440    vmovsd       xmm3,QWORD PTR __real_1_over_6
441    vmovsd       xmm5,QWORD PTR __real_1_over_3
442    vfmadd213sd  xmm3,xmm1,QWORD PTR __real_1_over_5 ; r*1/6 + 1/5
443    vfmadd213sd  xmm5,xmm1,QWORD PTR __real_1_over_2 ; 1/2+r*1/3
444    vmovsd       xmm4,xmm0,xmm0
445    vfmadd213sd  xmm3,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6)
446
447    vmulsd       xmm4,xmm0,xmm0           ; r*r*r*r
448    vfmadd231sd  xmm1,xmm5,xmm0           ; r*r*(1/2+r*1/3) + r
449    vfmadd231sd  xmm1,xmm3,xmm4
450
451    ; m*log(2) + log(G) - poly
452    vmovsd       xmm5,QWORD PTR __real_log2_tail
453    vfmsub213sd  xmm5,xmm6,xmm1
454
455    vmovsd       xmm0,QWORD PTR[r9 + rax * 8]
456    lea          rdx,QWORD PTR __log_256_tail
457    vmovsd       xmm1,QWORD PTR[rdx + rax * 8]
458    vaddsd       xmm1,xmm1,xmm5
459
460    vfmadd231sd  xmm0,xmm6,QWORD PTR __real_log2_lead
461
462    vaddsd       xmm0,xmm0,xmm1
463    AVXRestoreXmm   xmm6, save_xmm6
464    StackDeallocate stack_size
465    ret
466
467
468ALIGN  16
469Llog_fma3_near_one:
470
471    ; r = x - 1.0
472    vmovsd       xmm3,QWORD PTR __real_two
473    vsubsd       xmm0,xmm0,QWORD PTR __real_one ; r
474
475    vaddsd       xmm3,xmm3,xmm0
476    vdivsd       xmm1,xmm0,xmm3           ; r/(2+r) = u/2
477
478    vmovsd       xmm4,QWORD PTR __real_ca2
479    vmovsd       xmm5,QWORD PTR __real_ca4
480
481    vmulsd       xmm3,xmm0,xmm1           ; correction
482    vaddsd       xmm1,xmm1,xmm1           ; u
483
484    vmulsd       xmm2,xmm1,xmm1           ; u^2
485    vfmadd213sd  xmm4,xmm2,QWORD PTR __real_ca1
486    vfmadd213sd  xmm5,xmm2,QWORD PTR __real_ca3
487
488    vmulsd       xmm2,xmm2,xmm1           ; u^3
489    vmulsd       xmm4,xmm4,xmm2
490
491    vmulsd       xmm2,xmm2,xmm2
492    vmulsd       xmm2,xmm2,xmm1           ; u^7
493
494    vfmadd231sd  xmm4,xmm5,xmm2
495    vsubsd       xmm4,xmm4,xmm3
496    vaddsd       xmm0,xmm0,xmm4
497
498    AVXRestoreXmm   xmm6, save_xmm6
499    StackDeallocate stack_size
500    ret
501
502
503Llog_fma3_denormal_adjust:
504    vpor         xmm2,xmm2,XMMWORD PTR __real_one
505    vsubsd       xmm2,xmm2,QWORD PTR __real_one
506    vpsrlq       xmm5,xmm2,52
507    vpand        xmm2,xmm2,XMMWORD PTR __real_mant
508    vmovapd      xmm0,xmm2
509    vpsubd       xmm5,xmm5,XMMWORD PTR __mask_2045
510    vcvtdq2pd    xmm6,xmm5
511    jmp          Llog_fma3_continue_common
512
513ALIGN  16
514Llog_fma3_x_is_zero_or_neg:
515    jne          Llog_fma3_x_is_neg
516    vmovsd       xmm1,QWORD PTR __real_ninf
517    mov          r8d,DWORD PTR __flag_x_zero
518    call         fname_special
519
520    AVXRestoreXmm   xmm6, save_xmm6
521    StackDeallocate stack_size
522    ret
523
524ALIGN  16
525Llog_fma3_x_is_neg:
526
527    vmovsd       xmm1,QWORD PTR __real_neg_qnan
528    mov          r8d,DWORD PTR __flag_x_neg
529    call         fname_special
530
531    AVXRestoreXmm   xmm6, save_xmm6
532    StackDeallocate stack_size
533    ret
534
535ALIGN  16
536Llog_fma3_x_is_inf_or_nan:
537
538    cmp          rax,QWORD PTR __real_inf
539    je           Llog_fma3_finish
540
541    cmp          rax,QWORD PTR __real_ninf
542    je           Llog_fma3_x_is_neg
543
544    or           rax,QWORD PTR __real_qnanbit
545    vmovq        xmm1,rax
546    mov          r8d,DWORD PTR __flag_x_nan
547    call         fname_special
548
549ALIGN  16
550Llog_fma3_finish:
551    AVXRestoreXmm   xmm6, save_xmm6
552    StackDeallocate stack_size
553    ret
554fname       endp
555
556END
557
558