1 /* MIPS implementation of exp
2  *
3  *   Inspired by Intel Approximate Math library, and based on the
4  *   corresponding algorithms of the cephes math library
5  */
6 
7 /* Copyright (C) 2020 Leo <leo@nullptr.com.cn>. All rights reserved.
8  *
9  *  This software is provided 'as-is', without any express or implied
10  *  warranty.  In no event will the authors be held liable for any damages
11  *  arising from the use of this software.
12  *
13  *  Permission is granted to anyone to use this software for any purpose,
14  *  including commercial applications, and to alter it and redistribute it
15  *  freely, subject to the following restrictions:
16  *
17  *  1. The origin of this software must not be misrepresented; you must not
18  *     claim that you wrote the original software. If you use this software
19  *     in a product, an acknowledgment in the product documentation would be
20  *     appreciated but is not required.
21  *  2. Altered source versions must be plainly marked as such, and must not be
22  *     misrepresented as being the original software.
23  *  3. This notice may not be removed or altered from any source distribution.
24  *
25  *  (this is the zlib license)
26  */
27 
28 #ifndef LAYER_MIPS_MATHFUN_H
29 #define LAYER_MIPS_MATHFUN_H
30 
31 #include "mips_common.h"
32 
33 #include <msa.h>
34 
35 _MIPS_FLOAT_CONST(c_1, 1.0f);
36 _MIPS_FLOAT_CONST(c_2, 2.0f);
37 _MIPS_FLOAT_CONST(c_n1, -1.0f);
38 _MIPS_FLOAT_CONST(c_0p5, 0.5f);
39 
40 _MIPS_FLOAT_CONST(c_exp_hi, 88.3762626647949f);
41 _MIPS_FLOAT_CONST(c_exp_lo, -88.3762626647949f);
42 
43 _MIPS_FLOAT_CONST(c_cephes_LOG2EF, 1.44269504088896341);
44 _MIPS_FLOAT_CONST(c_cephes_exp_C1, 0.693359375);
45 _MIPS_FLOAT_CONST(c_cephes_exp_C2, -2.12194440e-4);
46 
47 _MIPS_FLOAT_CONST(c_cephes_exp_p0, 1.9875691500E-4);
48 _MIPS_FLOAT_CONST(c_cephes_exp_p1, 1.3981999507E-3);
49 _MIPS_FLOAT_CONST(c_cephes_exp_p2, 8.3334519073E-3);
50 _MIPS_FLOAT_CONST(c_cephes_exp_p3, 4.1665795894E-2);
51 _MIPS_FLOAT_CONST(c_cephes_exp_p4, 1.6666665459E-1);
52 _MIPS_FLOAT_CONST(c_cephes_exp_p5, 5.0000001201E-1);
53 
54 /* exp() computed for 4 float at once */
exp_ps(v4f32 x)55 static inline v4f32 exp_ps(v4f32 x)
56 {
57     v4f32 tmp, fx;
58 
59     v4f32 one = (v4f32)__msa_fill_w(c_1.i);
60     x = __msa_fmin_w(x, (v4f32)__msa_fill_w(c_exp_hi.i));
61     x = __msa_fmax_w(x, (v4f32)__msa_fill_w(c_exp_lo.i));
62 
63     /* express exp(x) as exp(g + n*log(2)) */
64     fx = __msa_fmul_w(x, (v4f32)__msa_fill_w(c_cephes_LOG2EF.i));
65     fx = __msa_fadd_w(fx, (v4f32)__msa_fill_w(c_0p5.i));
66 
67     /* perform a floorf */
68     tmp = __msa_ffint_s_w(__msa_ftrunc_s_w(fx));
69 
70     /* if greater, substract 1 */
71     v4i32_w mask = __msa_fslt_w(fx, tmp);
72     mask = (v4i32_w)__msa_and_v((v16u8)mask, (v16u8)one);
73 
74     fx = __msa_fsub_w(tmp, (v4f32)mask);
75 
76     tmp = __msa_fmul_w(fx, (v4f32)__msa_fill_w(c_cephes_exp_C1.i));
77     v4f32 z = __msa_fmul_w(fx, (v4f32)__msa_fill_w(c_cephes_exp_C2.i));
78     x = __msa_fsub_w(x, tmp);
79     x = __msa_fsub_w(x, z);
80 
81     v4f32 y = (v4f32)__msa_fill_w(c_cephes_exp_p0.i);
82     v4f32 c1 = (v4f32)__msa_fill_w(c_cephes_exp_p1.i);
83     v4f32 c2 = (v4f32)__msa_fill_w(c_cephes_exp_p2.i);
84     v4f32 c3 = (v4f32)__msa_fill_w(c_cephes_exp_p3.i);
85     v4f32 c4 = (v4f32)__msa_fill_w(c_cephes_exp_p4.i);
86     v4f32 c5 = (v4f32)__msa_fill_w(c_cephes_exp_p5.i);
87 
88     y = __msa_fmul_w(y, x);
89     z = __msa_fmul_w(x, x);
90 
91     y = __msa_fadd_w(y, c1);
92     y = __msa_fmul_w(y, x);
93     y = __msa_fadd_w(y, c2);
94     y = __msa_fmul_w(y, x);
95     y = __msa_fadd_w(y, c3);
96     y = __msa_fmul_w(y, x);
97     y = __msa_fadd_w(y, c4);
98     y = __msa_fmul_w(y, x);
99     y = __msa_fadd_w(y, c5);
100 
101     y = __msa_fmul_w(y, z);
102     y = __msa_fadd_w(y, x);
103     y = __msa_fadd_w(y, one);
104 
105     /* build 2^n */
106     v4i32 mm;
107     mm = __msa_ftrunc_s_w(fx);
108     mm = __msa_addv_w(mm, __msa_fill_w(0x7f));
109     mm = __msa_sll_w(mm, __msa_fill_w(23));
110 
111     y = __msa_fmul_w(y, (v4f32)mm);
112     return y;
113 }
114 
115 _MIPS_FLOAT_CONST(c_cephes_HALFMAXLOGF, 44.014845935754205f);
116 _MIPS_FLOAT_CONST(c_cephes_tanh_C1, 0.625f);
117 
118 _MIPS_FLOAT_CONST(c_cephes_tanh_p0, -5.70498872745E-3);
119 _MIPS_FLOAT_CONST(c_cephes_tanh_p1, +2.06390887954E-2);
120 _MIPS_FLOAT_CONST(c_cephes_tanh_p2, -5.37397155531E-2);
121 _MIPS_FLOAT_CONST(c_cephes_tanh_p3, +1.33314422036E-1);
122 _MIPS_FLOAT_CONST(c_cephes_tanh_p4, -3.33332819422E-1);
123 
124 /* tanh() computed for 4 float at once */
tanh_ps(v4f32 x)125 static inline v4f32 tanh_ps(v4f32 x)
126 {
127     v4f32 x2 = (v4f32)__msa_bclri_w((v4u32)x, 31);
128 
129     v4i32_w mask_l = __msa_fclt_w((v4f32)__msa_fill_w(c_cephes_tanh_C1.i), x2);
130     v4i32_w mask_l2 = __msa_fcle_w((v4f32)__msa_fill_w(c_cephes_HALFMAXLOGF.i), x2);
131 
132     // abs(x) >= 0.625
133     // tanh(x) = 1 − 2 / (exp(2x) + 1)
134     v4f32 _one = (v4f32)__msa_fill_w(c_1.i);
135     v4f32 _two = (v4f32)__msa_fill_w(c_2.i);
136     v4f32 exp_x_x = exp_ps(__msa_fadd_w(x, x));
137     v4f32 y0 = __msa_fsub_w(_one, __msa_fdiv_w(_two, __msa_fadd_w(exp_x_x, _one)));
138 
139     // abs(x) < 0.625
140     /*
141         z = x2 * x2;
142         z =
143         (((( -5.70498872745E-3 * z
144         + 2.06390887954E-2) * z
145         - 5.37397155531E-2) * z
146         + 1.33314422036E-1) * z
147         - 3.33332819422E-1) * z * x
148         + x;
149     */
150     v4f32 y = (v4f32)__msa_fill_w(c_cephes_tanh_p0.i);
151     v4f32 c1 = (v4f32)__msa_fill_w(c_cephes_tanh_p1.i);
152     v4f32 c2 = (v4f32)__msa_fill_w(c_cephes_tanh_p2.i);
153     v4f32 c3 = (v4f32)__msa_fill_w(c_cephes_tanh_p3.i);
154     v4f32 c4 = (v4f32)__msa_fill_w(c_cephes_tanh_p4.i);
155 
156     v4f32 z = __msa_fmul_w(x, x);
157 
158     y = __msa_fmul_w(y, z);
159     y = __msa_fadd_w(y, c1);
160     y = __msa_fmul_w(y, z);
161     y = __msa_fadd_w(y, c2);
162     y = __msa_fmul_w(y, z);
163     y = __msa_fadd_w(y, c3);
164     y = __msa_fmul_w(y, z);
165     y = __msa_fadd_w(y, c4);
166 
167     y = __msa_fmul_w(y, z);
168     y = __msa_fmul_w(y, x);
169     y = __msa_fadd_w(y, x);
170 
171     // abs(x) > HALFMAXLOGF
172     // return 1.0 or -1.0
173     v4i32_w mask_pos = __msa_fcle_w((v4f32)__msa_fill_w(0), x2);
174     v4f32 y1 = (v4f32)__msa_bsel_v((v16u8)mask_pos, (v16u8)__msa_fill_w(c_1.i), (v16u8)__msa_fill_w(c_n1.i));
175 
176     y = (v4f32)__msa_bsel_v((v16u8)mask_l, (v16u8)y0, (v16u8)y);
177     y = (v4f32)__msa_bsel_v((v16u8)mask_l2, (v16u8)y1, (v16u8)y);
178     return y;
179 }
180 
181 #endif // LAYER_MIPS_MATHFUN_H
182