xref: /reactos/sdk/lib/crt/math/libm_sse2/logf.asm (revision 83e13630)
1;
2; MIT License
3; -----------
4;
5; Copyright (c) 2002-2019 Advanced Micro Devices, Inc.
6;
7; Permission is hereby granted, free of charge, to any person obtaining a copy
8; of this Software and associated documentaon files (the "Software"), to deal
9; in the Software without restriction, including without limitation the rights
10; to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11; copies of the Software, and to permit persons to whom the Software is
12; furnished to do so, subject to the following conditions:
13;
14; The above copyright notice and this permission notice shall be included in
15; all copies or substantial portions of the Software.
16;
17; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23; THE SOFTWARE.
24;
25;
26; logf.asm
27;
28; An implementation of the logf libm function.
29;
30; Prototype:
31;
32;     float logf(float x);
33;
34
35;
36;   Algorithm:
37;       Similar to one presnted in log.asm
38;
39.const
40
41
42ALIGN 16
43
44L_real_one              DQ 0000000003f800000h   ; 1.0
45                        DQ 0000000000000000h
46L_real_two              DQ 00000000040000000h   ; 1.0
47                        DQ 00000000000000000h
48L_real_ninf             DQ 000000000ff800000h   ; -inf
49                        DQ 0000000000000000h
50L_real_inf              DQ 0000000007f800000h   ; +inf
51                        DQ 0000000000000000h
52L_real_nan              DQ 0000000007fc00000h   ; NaN
53                        DQ 0000000000000000h
54L_real_neg_qnan         DQ 000000000ffc00000h
55                        DQ 0000000000000000h
56L_real_notsign          DQ 0000000007ffFFFFFh   ; ^sign bit
57                        DQ 0000000000000000h
58L_real_mant             DQ 0007FFFFF007FFFFFh   ; mantissa bits
59                        DQ 0007FFFFF007FFFFFh
60L_mask_127              DQ 00000007f0000007fh   ;
61                        DQ 00000007f0000007fh
62L_mask_253              DQ 000000000000000fdh
63                        DQ 00000000000000000h
64L_mask_mant_all7        DQ 00000000007f0000h
65                        DQ 00000000007f0000h
66L_mask_mant8            DQ 0000000000008000h
67                        DQ 0000000000000000h
68L_real_ca1              DQ 0000000003DAAAAABh   ; 8.33333333333317923934e-02
69                        DQ 00000000000000000h
70L_real_ca2              DQ 0000000003C4CCCCDh   ; 1.25000000037717509602e-02
71                        DQ 00000000000000000h
72L_real_log2_lead        DQ 03F3170003F317000h   ; 0.693115234375
73                        DQ 00000000000000000h
74L_real_log2_tail        DQ 0000000003805FDF4h   ; 0.000031946183
75                        DQ 00000000000000000h
76L_real_half             DQ 0000000003f000000h   ; 1/2
77                        DQ 00000000000000000h
78L_real_1_over_3         DQ 0000000003eaaaaabh
79                        DQ 00000000000000000h
80
81L_real_1_over_2         DD 03f000000h
82L_real_neg127           DD 0c2fe0000h
83L_real_qnanbit          DD 000400000h   ; quiet nan bit
84L_real_threshold        DD 03d800000h
85
86; these codes and the ones in the corresponding .c file have to match
87L_flag_x_zero           DD 00000001
88L_flag_x_neg            DD 00000002
89L_flag_x_nan            DD 00000003
90
91EXTRN __log_128_lead:DWORD
92EXTRN __log_128_tail:DWORD
93EXTRN __log_F_inv_dword:DWORD
94EXTRN __use_fma3_lib:DWORD
95
96fname           TEXTEQU <logf>
97fname_special   TEXTEQU <_logf_special>
98
99; define local variable storage offsets
100
101dummy_space     EQU     020h
102stack_size      EQU     038h
103
104include fm.inc
105
106; external function
107EXTERN fname_special:PROC
108
109.code
110
111PUBLIC fname
112fname PROC FRAME
113    StackAllocate stack_size
114    .ENDPROLOG
115    cmp         DWORD PTR __use_fma3_lib, 0
116    jne         Llogf_fma3
117
118    ; Some of the placement of instructions below iwll be odd.
119    ; We are attempting to have no more than one branch per 32-byte block.
120Llogf_sse2:
121    ; Zero the high bits of rax because it will be used as an index later.
122    xor         rax, rax
123    movdqa      xmm3, xmm0
124    movaps      xmm4, xmm0
125
126    ; This computation of the expoonent of x will produce nonsenes if x <= 0.,
127    ; but those cases are eliminated below, so it does no harm.
128    psrld       xmm3, 23                         ; xmm3 <-- biased exp if x > 0.
129
130    ; Is x Inf or NaN?
131    movd        eax, xmm0                        ; eax <-- x
132    mov         ecx, eax
133    btr         ecx, 31                          ; ecx <-- |x|
134    cmp         ecx, DWORD PTR L_real_inf
135    jae         Llogf_sse2_x_is_inf_or_nan
136
137    ; Finish computing exponent.
138    psubd       xmm3, XMMWORD PTR L_mask_127     ; xmm3 <-- xexp (unbiased)
139    movdqa      xmm2, xmm0
140    cvtdq2ps    xmm5, xmm3                       ; (float)xexp, unless x <= 0.
141
142    ; Is x negative or zero?
143    xorps       xmm1, xmm1
144    comiss      xmm0, xmm1
145    jbe         Llogf_sse2_x_is_zero_or_neg
146
147    pand        xmm2, XMMWORD PTR L_real_mant    ; xmm2 <-- x mantissa for later
148    subss       xmm4, DWORD PTR L_real_one       ; xmm4 <-- x - 1. for later
149
150    comiss      xmm5, DWORD PTR L_real_neg127    ; x!=0, xexp==0 ==> subnormal
151    je          Llogf_sse2_subnormal_adjust
152
153Llogf_sse2_continue_common:
154    ; At this point we need |x| (possibly adjusted) in eax
155    ; and m = xexpx (possibly adjusted) in xmm5
156    ; We also need the value of x - 1. computed above.
157
158    ; compute the index into the log tables
159    mov         r9d, eax
160    and         eax, DWORD PTR L_mask_mant_all7  ; eax <-- 7 bits of x mantissa
161    and         r9d, DWORD PTR L_mask_mant8      ; r9d <-- 8th bit
162    shl         r9d, 1
163    add         eax, r9d                         ; use 8th bit to round up
164    movd        xmm1, eax
165
166    ; Is x near 1.0 ?
167    ; Note that if x is subnormal it is perforce not near one.
168    andps       xmm4, XMMWORD PTR L_real_notsign ; xmm4 <-- |x-1|
169    comiss      xmm4, DWORD PTR L_real_threshold ; is |x-1| < 1/16?
170    jb          Llogf_sse2_near_one              ; if so, handle elsewhere
171
172    ; F, Y
173    ; F is a number in [.5,1) scaled from the rounded mantissa bits computed
174    ; above by oring in the exponent of .5.
175    ; Y is all of the mantissa bits of X scaled to [.5,1.) similarly
176    shr         eax, 16                          ; shift eax to use as index
177    por         xmm2, XMMWORD PTR L_real_half    ; xmm2 <-- Y
178    por         xmm1, XMMWORD PTR L_real_half    ; xmm2 <-- F
179    lea         r9, QWORD PTR __log_F_inv_dword
180
181
182    ; f = F - Y, r = f * inv
183    subss       xmm1, xmm2                       ; xmm1 <-- f
184    mulss       xmm1, DWORD PTR [r9+rax*4]       ; xmm1 <-- r = f*inv (tabled)
185
186    movaps      xmm2, xmm1
187    movaps      xmm0, xmm1
188
189    ; poly
190    mulss       xmm2, DWORD PTR L_real_1_over_3  ; xmm2 <-- r/3
191    mulss       xmm0, xmm1                       ; xmm0 <-- r^2
192    addss       xmm2, DWORD PTR L_real_1_over_2
193    movaps      xmm3, XMMWORD PTR L_real_log2_tail
194
195    lea         r9, QWORD PTR __log_128_tail
196    lea         r10, QWORD PTR __log_128_lead
197
198    mulss       xmm2, xmm0                       ; xmm2 <-- r^2 * (r/3 + 1/2)
199    mulss       xmm3, xmm5                       ; xmm3 <-- (m=xexp)*log2_tail
200    addss       xmm1, xmm2                       ; xmm1 <-- poly
201
202    ; m*log(2) + log(G) - poly, where G is just 2*F
203    ; log(G) is precomputed to extra precision.
204    ; small pieces and large pieces are separated until the final add,
205    ; to preserve accuracy
206    movaps      xmm0, XMMWORD PTR L_real_log2_lead
207    subss       xmm3, xmm1                       ; xmm3 <-- m*log2_tail - poly
208    mulss       xmm0, xmm5                       ; xmm0 <-- m*log1_lead
209    addss       xmm3, DWORD PTR [r9+rax*4]       ; xmm3 += log(G) tail
210    addss       xmm0, DWORD PTR [r10+rax*4]      ; xmm0 += log(G) lead
211
212    addss       xmm0, xmm3                       ; xmm0 <-- m*log(2)+log(G)-poly
213
214    StackDeallocate stack_size
215    ret
216
217ALIGN 16
218Llogf_sse2_near_one:
219    ; Computation of the log for x near one requires special techniques.
220    movaps      xmm2, DWORD PTR L_real_two
221    subss       xmm0, DWORD PTR L_real_one       ; xmm0 <-- r = x - 1.0
222    addss       xmm2, xmm0
223    movaps      xmm1, xmm0
224    divss       xmm1, xmm2                       ; xmm1 <-- u = r/(2.0+r)
225    movaps      xmm4, xmm0
226    mulss       xmm4, xmm1                       ; xmm4 <-- correction = r*u
227    addss       xmm1, xmm1                       ; xmm1 <-- u = 2.*u
228    movaps      xmm2, xmm1
229    mulss       xmm2, xmm2                       ; xmm2 <-- u^2
230
231    ; r2 = (u^3 * (ca_1 + u^2 * ca_2) - correction)
232    movaps      xmm3, xmm1
233    mulss       xmm3, xmm2                       ; xmm3 <-- u^3
234    mulss       xmm2, DWORD PTR L_real_ca2       ; xmm2 <-- ca2*u^2
235    addss       xmm2, DWORD PTR L_real_ca1       ; xmm2 <-- ca2*u^2 + ca1
236    mulss       xmm2, xmm3                       ; xmm2 <-- u^3*(ca1+u^2*ca2)
237    subss       xmm2, xmm4                       ; xmm2 <-- r2
238
239    ; return r + r2
240    addss       xmm0, xmm2
241    StackDeallocate stack_size
242    ret
243
244ALIGN 16
245Llogf_sse2_subnormal_adjust:
246    ; This code adjusts eax and xmm5.
247    ; It must preserve xmm4.
248    por         xmm2, XMMWORD PTR L_real_one
249    subss       xmm2, DWORD PTR L_real_one
250    movdqa      xmm5, xmm2
251    pand        xmm2, XMMWORD PTR L_real_mant
252    movd        eax, xmm2
253    psrld       xmm5, 23
254    psubd       xmm5, XMMWORD PTR L_mask_253
255    cvtdq2ps    xmm5, xmm5
256    jmp         Llogf_sse2_continue_common
257
258; Until we get to the FMA3 code, the rest of this is special case handling.
259Llogf_sse2_x_is_zero_or_neg:
260    jne         Llogf_sse2_x_is_neg
261
262    movaps      xmm1, XMMWORD PTR L_real_ninf
263    mov         r8d, DWORD PTR L_flag_x_zero
264    call        fname_special
265    jmp         Llogf_sse2_finish
266
267Llogf_sse2_x_is_neg:
268
269    movaps      xmm1, XMMWORD PTR L_real_neg_qnan
270    mov         r8d, DWORD PTR L_flag_x_neg
271    call        fname_special
272    jmp         Llogf_sse2_finish
273
274Llogf_sse2_x_is_inf_or_nan:
275
276    cmp         eax, DWORD PTR L_real_inf
277    je          Llogf_sse2_finish
278
279    cmp         eax, DWORD PTR L_real_ninf
280    je          Llogf_sse2_x_is_neg
281
282    or          eax, DWORD PTR L_real_qnanbit
283    movd        xmm1, eax
284    mov         r8d, DWORD PTR L_flag_x_nan
285    call        fname_special
286    jmp         Llogf_sse2_finish
287
288Llogf_sse2_finish:
289    StackDeallocate stack_size
290    ret
291
292ALIGN 16
293Llogf_fma3:
294    ; compute exponent part
295    vmovaps      xmm4,XMMWORD PTR L_real_inf ; preload for inf/nan test
296    xor          rax,rax
297    vpsrld       xmm3,xmm0,23             ; xmm3 <-- (ux>>23)
298    vmovd        eax,xmm0  ;eax = x
299    vpsubd       xmm3,xmm3,DWORD PTR L_mask_127 ; xmm3 <-- (ux>>23) - 127
300    vcvtdq2ps    xmm5,xmm3                ; xmm5 <-- float((ux>>23)-127) = xexp
301
302    ;  NaN or inf
303    vpand        xmm1,xmm0,xmm4           ; xmm1 <-- (ux & 07f800000h)
304    vcomiss      xmm1,xmm4
305    je           Llogf_fma3_x_is_inf_or_nan
306
307    ; check for negative numbers or zero
308    vpxor        xmm1,xmm1,xmm1
309    vcomiss      xmm0,xmm1
310    jbe          Llogf_fma3_x_is_zero_or_neg
311
312    vpand        xmm2,xmm0,DWORD PTR L_real_mant  ; xmm2 <-- ux & 0007FFFFFh
313    vsubss       xmm4,xmm0,DWORD PTR L_real_one   ; xmm4 <-- x - 1.0
314
315    vcomiss      xmm5,DWORD PTR L_real_neg127
316    je           Llogf_fma3_subnormal_adjust
317
318Llogf_fma3_continue_common:
319
320    ; compute the index into the log tables
321    vpand        xmm1,xmm0,DWORD PTR L_mask_mant_all7 ; xmm1 = ux & 0007f0000h
322    vpand        xmm3,xmm0,DWORD PTR L_mask_mant8     ; xmm3 = ux & 000008000h
323    vpslld       xmm3,xmm3,1              ; xmm3  = (ux & 000008000h) << 1
324    vpaddd       xmm1,xmm3,xmm1
325    ; eax = (ux & 0007f0000h) + ((ux & 000008000h) << 1)
326    ; eax <-- x/127., rounded to nearest
327    vmovd        eax,xmm1
328
329    ; near one codepath
330    vandps       xmm4,xmm4,DWORD PTR L_real_notsign   ; xmm4 <-- fabs (x - 1.0)
331    vcomiss      xmm4,DWORD PTR L_real_threshold
332    jb           Llogf_fma3_near_one
333
334    ; F,Y
335    shr          eax,16
336    vpor         xmm2,xmm2,DWORD PTR L_real_half ; xmm2 <-- Y
337    vpor         xmm1,xmm1,DWORD PTR L_real_half ; xmm1 <-- F
338    lea          r9,QWORD PTR __log_F_inv_dword
339
340    ; f = F - Y
341    vsubss       xmm1,xmm1,xmm2           ; f = F - Y
342    ; r = f * log_F_inv_dword[index]
343    vmulss       xmm1,xmm1,DWORD PTR [r9 + rax * 4]
344
345    ; poly
346    vmovaps      xmm2,XMMWORD PTR L_real_1_over_3
347    vfmadd213ss  xmm2,xmm1,DWORD PTR L_real_1_over_2 ; 1/3*r + 1/2
348    vmulss       xmm0,xmm1,xmm1           ; r*r
349    vmovaps      xmm3,DWORD PTR L_real_log2_tail;
350
351    lea          r9,DWORD PTR __log_128_tail
352    lea          r10,DWORD PTR __log_128_lead
353
354    vfmadd231ss  xmm1,xmm2,xmm0           ; poly = r + 1/2*r*r + 1/3*r*r*r
355    vfmsub213ss  xmm3,xmm5,xmm1           ; (xexp * log2_tail) - poly
356
357    ; m*log(2) + log(G) - poly
358    vmovaps      xmm0,DWORD PTR L_real_log2_lead
359    vfmadd213ss  xmm0,xmm5,[r10 + rax * 4]
360    ; z2 = (xexp * log2_tail) - poly + log_128_tail[index]
361    vaddss       xmm3,xmm3,DWORD PTR [r9 + rax * 4]
362    vaddss       xmm0,xmm0,xmm3           ; return z1 + z2
363
364    StackDeallocate stack_size
365    ret
366
367ALIGN  16
368Llogf_fma3_near_one:
369    ; r = x - 1.0;
370    vmovaps      xmm2,DWORD PTR L_real_two
371    vsubss       xmm0,xmm0,DWORD PTR L_real_one  ; xmm0 = r = = x - 1.0
372
373    ; u = r / (2.0 + r)
374    vaddss       xmm2,xmm2,xmm0           ; (r+2.0)
375    vdivss       xmm1,xmm0,xmm2           ; u = r / (2.0 + r)
376
377    ; correction = r * u
378    vmulss       xmm4,xmm0,xmm1           ; correction = u*r
379
380    ; u = u + u;
381    vaddss       xmm1,xmm1,xmm1           ; u = u+u
382    vmulss       xmm2,xmm1,xmm1           ; v = u^2
383
384    ; r2 = (u * v * (ca_1 + v * ca_2) - correction)
385    vmulss       xmm3,xmm1,xmm2           ; u^3
386    vmovaps      xmm5,DWORD PTR L_real_ca2
387    vfmadd213ss  xmm2,xmm5,DWORD PTR L_real_ca1
388    vfmsub213ss  xmm2,xmm3,xmm4       ; r2 = (ca1 + ca2 * v) * u^3 - correction
389
390    ; r + r2
391    vaddss       xmm0,xmm0,xmm2
392    StackDeallocate stack_size
393    ret
394
395
396ALIGN  16
397Llogf_fma3_subnormal_adjust:
398    vmovaps      xmm3,DWORD PTR L_real_one
399    vpor         xmm2,xmm2,xmm3  ; xmm2 = temp = ((ux &0007FFFFFh) | 03f800000h)
400    vsubss       xmm2,xmm2,xmm3  ; xmm2 = temp -1.0
401    vpsrld       xmm5,xmm2,23                 ; xmm5 = (utemp >> 23)
402    vpand        xmm2,xmm2,DWORD PTR L_real_mant ; xmm2 = (utemp & 0007FFFFFh)
403    vmovaps      xmm0,xmm2
404    vpsubd       xmm5,xmm5,DWORD PTR L_mask_253  ; xmm5 = (utemp >> 23) - 253
405    vcvtdq2ps    xmm5,xmm5               ; xmm5 = (float) ((utemp >> 23) - 253)
406    jmp          Llogf_fma3_continue_common
407
408Llogf_fma3_x_is_zero_or_neg:
409    jne          Llogf_fma3_x_is_neg
410
411    vmovaps      xmm1,DWORD PTR L_real_ninf
412    mov          r8d,DWORD PTR L_flag_x_zero
413    call         fname_special
414
415    StackDeallocate stack_size
416    ret
417
418
419Llogf_fma3_x_is_neg:
420
421    vmovaps      xmm1,DWORD PTR L_real_neg_qnan
422    mov          r8d,DWORD PTR L_flag_x_neg
423    call         fname_special
424
425    StackDeallocate stack_size
426    ret
427
428Llogf_fma3_x_is_inf_or_nan:
429
430    cmp          eax,DWORD PTR L_real_inf
431    je           Llogf_fma3_finish
432
433    cmp          eax,DWORD PTR L_real_ninf
434    je           Llogf_fma3_x_is_neg
435
436    or           eax,DWORD PTR L_real_qnanbit
437    vmovd        xmm1,eax
438    mov          r8d,DWORD PTR L_flag_x_nan
439    call         fname_special
440
441    StackDeallocate stack_size
442    ret
443
444Llogf_fma3_finish:
445
446    StackDeallocate stack_size
447    ret
448
449
450fname endp
451END
452