xref: /reactos/sdk/lib/crt/math/libm_sse2/log10.asm (revision 84344399)
1;
2; MIT License
3; -----------
4;
5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6;
7; Permission is hereby granted, free of charge, to any person obtaining a copy
8; of this Software and associated documentaon files (the "Software"), to deal
9; in the Software without restriction, including without limitation the rights
10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11; copies of the Software, and to permit persons to whom the Software is
12; furnished to do so, subject to the following conditions:
13;
14; The above copyright notice and this permission notice shall be included in
15; all copies or substantial portions of the Software.
16;
17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23; THE SOFTWARE.
24;
25; log10.asm
26;
27; An implementation of the log10 libm function.
28;
29; Prototype:
30;
31;     double log10(double x);
32;
33
34;
35;   Algorithm:
36;       Similar to one presnted in log.asm
37;
38
39.const
40
41ALIGN 16
42
43__real_ninf     DQ 0fff0000000000000h   ; -inf
44                DQ 0000000000000000h
45__real_inf      DQ 7ff0000000000000h    ; +inf
46                DQ 0000000000000000h
47__real_neg_qnan DQ 0fff8000000000000h   ; neg qNaN
48                DQ 0000000000000000h
49__real_qnanbit  DQ 0008000000000000h
50                DQ 0000000000000000h
51__int_1023      DQ 00000000000003ffh
52                DQ 0000000000000000h
53__mask_001      DQ 0000000000000001h
54                DQ 0000000000000000h
55
56__mask_mant     DQ 000FFFFFFFFFFFFFh    ; mask for mantissa bits
57                DQ 0000000000000000h
58
59__mask_mant_top8 DQ 000ff00000000000h   ; mask for top 8 mantissa bits
60                DQ 0000000000000000h
61
62__mask_mant9    DQ 0000080000000000h    ; mask for 9th mantissa bit
63                DQ 0000000000000000h
64
65__real_log10_e      DQ 3fdbcb7b1526e50eh
66                    DQ 0000000000000000h
67
68__real_log10_e_lead DQ 3fdbcb7800000000h ; log10e_lead 4.34293746948242187500e-01
69                    DQ 0000000000000000h
70__real_log10_e_tail DQ 3ea8a93728719535h ; log10e_tail 7.3495500964015109100644e-7
71                    DQ 0000000000000000h
72
73__real_log10_2_lead DQ 3fd3441350000000h
74                    DQ 0000000000000000h
75__real_log10_2_tail DQ 3e03ef3fde623e25h
76                    DQ 0000000000000000h
77
78__real_two          DQ 4000000000000000h ; 2
79                    DQ 0000000000000000h
80
81__real_one          DQ 3ff0000000000000h ; 1
82                    DQ 0000000000000000h
83
84__real_half         DQ 3fe0000000000000h ; 1/2
85                    DQ 0000000000000000h
86
87__mask_100          DQ 0000000000000100h
88                    DQ 0000000000000000h
89__real_1_over_512   DQ 3f60000000000000h
90                    DQ 0000000000000000h
91
92__real_1_over_2     DQ 3fe0000000000000h
93                    DQ 0000000000000000h
94__real_1_over_3     DQ 3fd5555555555555h
95                    DQ 0000000000000000h
96__real_1_over_4     DQ 3fd0000000000000h
97                    DQ 0000000000000000h
98__real_1_over_5     DQ 3fc999999999999ah
99                    DQ 0000000000000000h
100__real_1_over_6     DQ 3fc5555555555555h
101                    DQ 0000000000000000h
102
103__real_neg_1023     DQ 0c08ff80000000000h
104                    DQ 0000000000000000h
105
106__mask_2045         DQ 00000000000007fdh
107                    DQ 0000000000000000h
108
109__real_threshold    DQ 3fb0000000000000h ; .0625
110                    DQ 0000000000000000h
111
112__real_near_one_lt  DQ 3fee000000000000h ; .9375
113                    DQ 0000000000000000h
114
115__real_near_one_gt  DQ 3ff1000000000000h ; 1.0625
116                    DQ 0000000000000000h
117
118__real_min_norm     DQ 0010000000000000h
119                    DQ 0000000000000000h
120
121__real_notsign      DQ 7ffFFFFFFFFFFFFFh ; ^sign bit
122                    DQ 0000000000000000h
123
124__real_ca1          DQ 3fb55555555554e6h ; 8.33333333333317923934e-02
125                    DQ 0000000000000000h
126__real_ca2          DQ 3f89999999bac6d4h ; 1.25000000037717509602e-02
127                    DQ 0000000000000000h
128__real_ca3          DQ 3f62492307f1519fh ; 2.23213998791944806202e-03
129                    DQ 0000000000000000h
130__real_ca4          DQ 3f3c8034c85dfff0h ; 4.34887777707614552256e-04
131                    DQ 0000000000000000h
132
133__mask_lower        DQ 0ffffffff00000000h
134                    DQ 0000000000000000h
135
136; these codes and the ones in the corresponding .c file have to match
137__flag_x_zero           DD 00000001
138__flag_x_neg            DD 00000002
139__flag_x_nan            DD 00000003
140
141
142EXTRN __log10_256_lead:QWORD
143EXTRN __log10_256_tail:QWORD
144EXTRN __log_F_inv_qword:QWORD
145EXTRN __use_fma3_lib:DWORD
146
147
148; local variable storage offsets
149save_xmm6       EQU     20h
150dummy_space     EQU     30h
151stack_size      EQU     058h
152
153include fm.inc
154
155fname           TEXTEQU <log10>
156fname_special   TEXTEQU <_log10_special>
157
158EXTERN fname_special:PROC
159
160.code
161ALIGN 16
162PUBLIC fname
163fname PROC FRAME
164    StackAllocate stack_size
165    SaveXmm xmm6, save_xmm6
166    .ENDPROLOG
167
168    cmp          DWORD PTR __use_fma3_lib, 0
169    jne          Llog10_fma3
170
171Llog10_sse2:
172
173    ; compute exponent part
174    movapd      xmm3, xmm0
175    movapd      xmm4, xmm0
176    psrlq       xmm3, 52
177    movd        rax, xmm0
178    psubq       xmm3, XMMWORD PTR __int_1023 ; xmm3 <-- unbiased exponent
179
180    ;  NaN or inf
181    movapd      xmm5, xmm0
182    andpd       xmm5, XMMWORD PTR __real_inf
183    comisd      xmm5, QWORD PTR __real_inf
184    je          Llog10_sse2_x_is_inf_or_nan
185
186    movapd      xmm2, xmm0
187    cvtdq2pd    xmm6, xmm3                   ; xmm6 <-- unbiased exp as double
188
189
190    pand        xmm2, XMMWORD PTR __mask_mant
191    subsd       xmm4, QWORD PTR __real_one
192
193    comisd      xmm6, QWORD PTR __real_neg_1023
194    je          Llog10_sse2_denormal_adjust
195
196Llog10_sse2_continue_common:
197
198    andpd       xmm4, XMMWORD PTR __real_notsign
199    ; compute index into the log tables
200    mov         r9, rax
201    and         rax, QWORD PTR __mask_mant_top8
202    and         r9, QWORD PTR __mask_mant9
203    shl         r9, 1
204    add         rax, r9
205    movd        xmm1, rax
206
207    ; near one codepath
208    comisd      xmm4, QWORD PTR __real_threshold
209    jb          Llog10_sse2_near_one
210
211    ; F, Y
212    shr         rax, 44
213    por         xmm2, XMMWORD PTR __real_half
214    por         xmm1, XMMWORD PTR __real_half
215    lea         r9, QWORD PTR __log_F_inv_qword
216
217    ; check for negative numbers or zero
218    xorpd       xmm5, xmm5
219    comisd      xmm0, xmm5
220    jbe         Llog10_sse2_x_is_zero_or_neg
221
222    ; f = F - Y, r = f * inv
223    subsd       xmm1, xmm2
224    mulsd       xmm1, QWORD PTR [r9+rax*8]
225
226    movapd      xmm2, xmm1
227    movapd      xmm0, xmm1
228    lea         r9, QWORD PTR __log10_256_lead
229
230    ; poly
231    movsd       xmm3, QWORD PTR __real_1_over_6
232    movsd       xmm1, QWORD PTR __real_1_over_3
233    mulsd       xmm3, xmm2
234    mulsd       xmm1, xmm2
235    mulsd       xmm0, xmm2
236    movapd      xmm4, xmm0
237    addsd       xmm3, QWORD PTR __real_1_over_5
238    addsd       xmm1, QWORD PTR __real_1_over_2
239    mulsd       xmm4, xmm0
240    mulsd       xmm3, xmm2
241    mulsd       xmm1, xmm0
242    addsd       xmm3, QWORD PTR __real_1_over_4
243    addsd       xmm1, xmm2
244    mulsd       xmm3, xmm4
245    addsd       xmm1, xmm3
246
247    movsd       xmm5, QWORD PTR __real_log10_2_tail
248    mulsd       xmm1, QWORD PTR __real_log10_e
249
250    ; m*log(10) + log10(G) - poly
251    mulsd       xmm5, xmm6
252    subsd       xmm5, xmm1
253
254    movsd       xmm0, QWORD PTR [r9+rax*8]
255    lea         rdx, QWORD PTR __log10_256_tail
256    movsd       xmm2, QWORD PTR [rdx+rax*8]
257
258    movsd       xmm4, QWORD PTR __real_log10_2_lead
259    mulsd       xmm4, xmm6
260    addsd       xmm0, xmm4
261    addsd       xmm2, xmm5
262
263    addsd       xmm0, xmm2
264
265    RestoreXmm  xmm6, save_xmm6
266    StackDeallocate stack_size
267    ret
268
269ALIGN 16
270Llog10_sse2_near_one:
271
272    ; r = x - 1.0
273    movsd       xmm2, QWORD PTR __real_two
274    subsd       xmm0, QWORD PTR __real_one ; r
275
276    addsd       xmm2, xmm0
277    movapd      xmm1, xmm0
278    divsd       xmm1, xmm2 ; r/(2+r) = u/2
279
280    movsd       xmm4, QWORD PTR __real_ca2
281    movsd       xmm5, QWORD PTR __real_ca4
282
283    movapd       xmm6, xmm0
284    mulsd       xmm6, xmm1 ; correction
285
286    addsd       xmm1, xmm1 ; u
287    movapd      xmm2, xmm1
288
289    mulsd       xmm2, xmm1 ; u^2
290
291    mulsd       xmm4, xmm2
292    mulsd       xmm5, xmm2
293
294    addsd       xmm4, QWORD PTR __real_ca1
295    addsd       xmm5, QWORD PTR __real_ca3
296
297    mulsd       xmm2, xmm1 ; u^3
298    mulsd       xmm4, xmm2
299
300    mulsd       xmm2, xmm2
301    mulsd       xmm2, xmm1 ; u^7
302    mulsd       xmm5, xmm2
303
304    movsd       xmm2, QWORD PTR __real_log10_e_tail
305    addsd       xmm4, xmm5
306    subsd       xmm4, xmm6
307    movsd       xmm6, QWORD PTR __real_log10_e_lead
308
309    movapd      xmm3, xmm0
310    pand        xmm3, XMMWORD PTR __mask_lower
311    subsd       xmm0, xmm3
312    addsd       xmm4, xmm0
313
314    movapd      xmm0, xmm3
315    movapd      xmm1, xmm4
316
317    mulsd       xmm4, xmm2
318    mulsd       xmm0, xmm2
319    mulsd       xmm1, xmm6
320    mulsd       xmm3, xmm6
321
322    addsd       xmm0, xmm4
323    addsd       xmm0, xmm1
324    addsd       xmm0, xmm3
325
326    RestoreXmm  xmm6, save_xmm6
327    StackDeallocate stack_size
328    ret
329
330Llog10_sse2_denormal_adjust:
331    por         xmm2, XMMWORD PTR __real_one
332    subsd       xmm2, QWORD PTR __real_one
333    movsd       xmm5, xmm2
334    pand        xmm2, XMMWORD PTR __mask_mant
335    movd        rax, xmm2
336    psrlq       xmm5, 52
337    psubd       xmm5, XMMWORD PTR __mask_2045
338    cvtdq2pd    xmm6, xmm5
339    jmp         Llog10_sse2_continue_common
340
341ALIGN 16
342Llog10_sse2_x_is_zero_or_neg:
343    jne         Llog10_sse2_x_is_neg
344
345    movsd       xmm1, QWORD PTR __real_ninf
346    mov         r8d, DWORD PTR __flag_x_zero
347    call        fname_special
348    jmp         Llog10_sse2_finish
349
350ALIGN 16
351Llog10_sse2_x_is_neg:
352
353    movsd       xmm1, QWORD PTR __real_neg_qnan
354    mov         r8d, DWORD PTR __flag_x_neg
355    call        fname_special
356    jmp         Llog10_sse2_finish
357
358ALIGN 16
359Llog10_sse2_x_is_inf_or_nan:
360
361    cmp         rax, QWORD PTR __real_inf
362    je          Llog10_sse2_finish
363
364    cmp         rax, QWORD PTR __real_ninf
365    je          Llog10_sse2_x_is_neg
366
367    or          rax, QWORD PTR __real_qnanbit
368    movd        xmm1, rax
369    mov         r8d, DWORD PTR __flag_x_nan
370    call        fname_special
371    jmp         Llog10_sse2_finish
372
373ALIGN 16
374Llog10_sse2_finish:
375    RestoreXmm  xmm6, save_xmm6
376    StackDeallocate stack_size
377    ret
378
379ALIGN 16
380Llog10_fma3:
381    ; compute exponent part
382    xor          rax,rax
383    vpsrlq       xmm3,xmm0,52
384    vmovq        rax,xmm0
385    vpsubq       xmm3,xmm3,XMMWORD PTR __int_1023
386    vcvtdq2pd    xmm6,xmm3                ; xmm6 <-- (double)xexp
387
388    ;  NaN or Inf?
389    vpand        xmm5,xmm0,__real_inf
390    vcomisd      xmm5,QWORD PTR __real_inf
391    je           Llog10_fma3_x_is_inf_or_nan
392
393    ; negative number or zero?
394    vpxor        xmm5,xmm5,xmm5
395    vcomisd      xmm0,xmm5
396    jbe          Llog10_fma3_x_is_zero_or_neg
397
398    vpand        xmm2,xmm0,__mask_mant
399    vsubsd       xmm4,xmm0,QWORD PTR __real_one
400
401    ; Subnormal?
402    vcomisd      xmm6,QWORD PTR __real_neg_1023
403    je           Llog10_fma3_denormal_adjust
404
405Llog10_fma3_continue_common:
406    ; compute index into the log tables
407    vpand        xmm1,xmm0,XMMWORD PTR __mask_mant_top8
408    vpand        xmm3,xmm0,XMMWORD PTR __mask_mant9
409    vpsllq       xmm3,xmm3,1
410    vpaddq       xmm1,xmm3,xmm1
411    vmovq        rax,xmm1
412
413    ; near one codepath
414    vpand        xmm4,xmm4,XMMWORD PTR __real_notsign
415    vcomisd      xmm4,QWORD PTR __real_threshold
416    jb           Llog10_fma3_near_one
417
418    ; F,Y
419    shr          rax,44
420    vpor         xmm2,xmm2,XMMWORD PTR __real_half
421    vpor         xmm1,xmm1,XMMWORD PTR __real_half
422    lea          r9,DWORD PTR __log_F_inv_qword
423
424    ; f = F - Y,r = f * inv
425    vsubsd       xmm1,xmm1,xmm2
426    vmulsd       xmm1,xmm1,QWORD PTR [r9 + rax * 8]
427
428    lea          r9,DWORD PTR __log10_256_lead
429
430    ; poly
431    vmulsd       xmm0,xmm1,xmm1 ; r*r
432    vmovsd       xmm3,QWORD PTR __real_1_over_6
433    vmovsd       xmm5,QWORD PTR __real_1_over_3
434    vfmadd213sd  xmm3,xmm1,QWORD PTR __real_1_over_5  ; r*1/6 + 1/5
435    vfmadd213sd  xmm5,xmm1,QWORD PTR __real_half      ; 1/2+r*1/3
436    movsd        xmm4,xmm0                             ; r*r
437    vfmadd213sd  xmm3 ,xmm1,QWORD PTR __real_1_over_4 ; 1/4+(1/5*r+r*r*1/6)
438
439    vmulsd       xmm4,xmm0,xmm0                        ; r*r*r*r
440    vfmadd231sd  xmm1,xmm5,xmm0                        ; r*r*(1/2+r*1/3) + r
441    vfmadd231sd  xmm1,xmm3,xmm4
442
443    vmulsd       xmm1,xmm1,QWORD PTR __real_log10_e
444    ; m*log(2) + log(G) - poly*log10_e
445    vmovsd       xmm5,QWORD PTR __real_log10_2_tail
446    vfmsub213sd  xmm5,xmm6,xmm1
447
448    movsd        xmm0,QWORD PTR [r9 + rax * 8]
449    lea          rdx,DWORD PTR __log10_256_tail
450    movsd        xmm2,QWORD PTR [rdx + rax * 8]
451    vaddsd       xmm2,xmm2,xmm5
452
453    vfmadd231sd  xmm0,xmm6,QWORD PTR __real_log10_2_lead
454
455    vaddsd       xmm0,xmm0,xmm2
456    AVXRestoreXmm  xmm6, save_xmm6
457    StackDeallocate stack_size
458    ret
459
460
461ALIGN  16
462Llog10_fma3_near_one:
463    ; r = x - 1.0
464    vmovsd       xmm2,QWORD PTR __real_two
465    vsubsd       xmm0,xmm0,QWORD PTR __real_one ; r
466
467    vaddsd       xmm2,xmm2,xmm0
468    vdivsd       xmm1,xmm0,xmm2 ; r/(2+r) = u/2
469
470    vmovsd       xmm4,QWORD PTR __real_ca2
471    vmovsd       xmm5,QWORD PTR __real_ca4
472
473    vmulsd       xmm6,xmm0,xmm1 ; correction
474    vaddsd       xmm1,xmm1,xmm1 ; u
475
476    vmulsd       xmm2,xmm1,xmm1 ; u^2
477    vfmadd213sd  xmm4,xmm2,QWORD PTR __real_ca1
478    vfmadd213sd  xmm5,xmm2,QWORD PTR __real_ca3
479
480    vmulsd       xmm2,xmm2,xmm1 ; u^3
481    vmulsd       xmm4,xmm4,xmm2
482
483    vmulsd       xmm2,xmm2,xmm2
484    vmulsd       xmm2,xmm2,xmm1 ; u^7
485
486    vmulsd       xmm5,xmm5,xmm2
487    vaddsd       xmm4,xmm4,xmm5
488    vsubsd       xmm4,xmm4,xmm6
489    vpand        xmm3,xmm0,XMMWORD PTR __mask_lower
490    vsubsd       xmm0,xmm0,xmm3
491    vaddsd       xmm4,xmm4,xmm0
492
493    vmulsd       xmm1,xmm4,QWORD PTR __real_log10_e_lead
494    vmulsd       xmm4,xmm4,QWORD PTR __real_log10_e_tail
495    vmulsd       xmm0,xmm3,QWORD PTR __real_log10_e_tail
496    vmulsd       xmm3,xmm3,QWORD PTR __real_log10_e_lead
497
498    vaddsd       xmm0,xmm0,xmm4
499    vaddsd       xmm0,xmm0,xmm1
500    vaddsd       xmm0,xmm0,xmm3
501
502    AVXRestoreXmm  xmm6, save_xmm6
503    StackDeallocate stack_size
504    ret
505
506
507Llog10_fma3_denormal_adjust:
508    vpor         xmm2,xmm2,XMMWORD PTR __real_one
509    vsubsd       xmm2,xmm2,QWORD PTR __real_one
510    vpsrlq       xmm5,xmm2,52
511    vpand        xmm2,xmm2,XMMWORD PTR __mask_mant
512    vmovapd      xmm0,xmm2
513    vpsubd       xmm5,xmm5,XMMWORD PTR __mask_2045
514    vcvtdq2pd    xmm6,xmm5
515    jmp          Llog10_fma3_continue_common
516
517ALIGN  16
518Llog10_fma3_x_is_zero_or_neg:
519    jne          Llog10_fma3_x_is_neg
520    vmovsd       xmm1,QWORD PTR __real_ninf
521    mov          r8d,DWORD PTR __flag_x_zero
522    call         fname_special
523
524    AVXRestoreXmm  xmm6, save_xmm6
525    StackDeallocate stack_size
526    ret
527
528
529ALIGN  16
530Llog10_fma3_x_is_neg:
531
532    vmovsd       xmm1,QWORD PTR __real_neg_qnan
533    mov          r8d,DWORD PTR __flag_x_neg
534    call         fname_special
535
536    AVXRestoreXmm  xmm6, save_xmm6
537    StackDeallocate stack_size
538    ret
539
540
541ALIGN  16
542Llog10_fma3_x_is_inf_or_nan:
543
544    cmp          rax,QWORD PTR __real_inf
545    je           Llog10_fma3_finish
546
547    cmp          rax,QWORD PTR __real_ninf
548    je           Llog10_fma3_x_is_neg
549
550    or           rax,QWORD PTR __real_qnanbit
551    movd         xmm1,rax
552    mov          r8d,DWORD PTR __flag_x_nan
553    call         fname_special
554    jmp          Llog10_fma3_finish
555
556ALIGN  16
557Llog10_fma3_finish:
558
559    AVXRestoreXmm  xmm6, save_xmm6
560    StackDeallocate stack_size
561    ret
562fname            endp
563
564END
565
566