1 /* MIPS implementation of exp
2 *
3 * Inspired by Intel Approximate Math library, and based on the
4 * corresponding algorithms of the cephes math library
5 */
6
7 /* Copyright (C) 2020 Leo <leo@nullptr.com.cn>. All rights reserved.
8 *
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the authors be held liable for any damages
11 * arising from the use of this software.
12 *
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute it
15 * freely, subject to the following restrictions:
16 *
17 * 1. The origin of this software must not be misrepresented; you must not
18 * claim that you wrote the original software. If you use this software
19 * in a product, an acknowledgment in the product documentation would be
20 * appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must not be
22 * misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source distribution.
24 *
25 * (this is the zlib license)
26 */
27
28 #ifndef LAYER_MIPS_MATHFUN_H
29 #define LAYER_MIPS_MATHFUN_H
30
31 #include "mips_common.h"
32
33 #include <msa.h>
34
35 _MIPS_FLOAT_CONST(c_1, 1.0f);
36 _MIPS_FLOAT_CONST(c_2, 2.0f);
37 _MIPS_FLOAT_CONST(c_n1, -1.0f);
38 _MIPS_FLOAT_CONST(c_0p5, 0.5f);
39
40 _MIPS_FLOAT_CONST(c_exp_hi, 88.3762626647949f);
41 _MIPS_FLOAT_CONST(c_exp_lo, -88.3762626647949f);
42
43 _MIPS_FLOAT_CONST(c_cephes_LOG2EF, 1.44269504088896341);
44 _MIPS_FLOAT_CONST(c_cephes_exp_C1, 0.693359375);
45 _MIPS_FLOAT_CONST(c_cephes_exp_C2, -2.12194440e-4);
46
47 _MIPS_FLOAT_CONST(c_cephes_exp_p0, 1.9875691500E-4);
48 _MIPS_FLOAT_CONST(c_cephes_exp_p1, 1.3981999507E-3);
49 _MIPS_FLOAT_CONST(c_cephes_exp_p2, 8.3334519073E-3);
50 _MIPS_FLOAT_CONST(c_cephes_exp_p3, 4.1665795894E-2);
51 _MIPS_FLOAT_CONST(c_cephes_exp_p4, 1.6666665459E-1);
52 _MIPS_FLOAT_CONST(c_cephes_exp_p5, 5.0000001201E-1);
53
54 /* exp() computed for 4 float at once */
exp_ps(v4f32 x)55 static inline v4f32 exp_ps(v4f32 x)
56 {
57 v4f32 tmp, fx;
58
59 v4f32 one = (v4f32)__msa_fill_w(c_1.i);
60 x = __msa_fmin_w(x, (v4f32)__msa_fill_w(c_exp_hi.i));
61 x = __msa_fmax_w(x, (v4f32)__msa_fill_w(c_exp_lo.i));
62
63 /* express exp(x) as exp(g + n*log(2)) */
64 fx = __msa_fmul_w(x, (v4f32)__msa_fill_w(c_cephes_LOG2EF.i));
65 fx = __msa_fadd_w(fx, (v4f32)__msa_fill_w(c_0p5.i));
66
67 /* perform a floorf */
68 tmp = __msa_ffint_s_w(__msa_ftrunc_s_w(fx));
69
70 /* if greater, substract 1 */
71 v4i32_w mask = __msa_fslt_w(fx, tmp);
72 mask = (v4i32_w)__msa_and_v((v16u8)mask, (v16u8)one);
73
74 fx = __msa_fsub_w(tmp, (v4f32)mask);
75
76 tmp = __msa_fmul_w(fx, (v4f32)__msa_fill_w(c_cephes_exp_C1.i));
77 v4f32 z = __msa_fmul_w(fx, (v4f32)__msa_fill_w(c_cephes_exp_C2.i));
78 x = __msa_fsub_w(x, tmp);
79 x = __msa_fsub_w(x, z);
80
81 v4f32 y = (v4f32)__msa_fill_w(c_cephes_exp_p0.i);
82 v4f32 c1 = (v4f32)__msa_fill_w(c_cephes_exp_p1.i);
83 v4f32 c2 = (v4f32)__msa_fill_w(c_cephes_exp_p2.i);
84 v4f32 c3 = (v4f32)__msa_fill_w(c_cephes_exp_p3.i);
85 v4f32 c4 = (v4f32)__msa_fill_w(c_cephes_exp_p4.i);
86 v4f32 c5 = (v4f32)__msa_fill_w(c_cephes_exp_p5.i);
87
88 y = __msa_fmul_w(y, x);
89 z = __msa_fmul_w(x, x);
90
91 y = __msa_fadd_w(y, c1);
92 y = __msa_fmul_w(y, x);
93 y = __msa_fadd_w(y, c2);
94 y = __msa_fmul_w(y, x);
95 y = __msa_fadd_w(y, c3);
96 y = __msa_fmul_w(y, x);
97 y = __msa_fadd_w(y, c4);
98 y = __msa_fmul_w(y, x);
99 y = __msa_fadd_w(y, c5);
100
101 y = __msa_fmul_w(y, z);
102 y = __msa_fadd_w(y, x);
103 y = __msa_fadd_w(y, one);
104
105 /* build 2^n */
106 v4i32 mm;
107 mm = __msa_ftrunc_s_w(fx);
108 mm = __msa_addv_w(mm, __msa_fill_w(0x7f));
109 mm = __msa_sll_w(mm, __msa_fill_w(23));
110
111 y = __msa_fmul_w(y, (v4f32)mm);
112 return y;
113 }
114
115 _MIPS_FLOAT_CONST(c_cephes_HALFMAXLOGF, 44.014845935754205f);
116 _MIPS_FLOAT_CONST(c_cephes_tanh_C1, 0.625f);
117
118 _MIPS_FLOAT_CONST(c_cephes_tanh_p0, -5.70498872745E-3);
119 _MIPS_FLOAT_CONST(c_cephes_tanh_p1, +2.06390887954E-2);
120 _MIPS_FLOAT_CONST(c_cephes_tanh_p2, -5.37397155531E-2);
121 _MIPS_FLOAT_CONST(c_cephes_tanh_p3, +1.33314422036E-1);
122 _MIPS_FLOAT_CONST(c_cephes_tanh_p4, -3.33332819422E-1);
123
124 /* tanh() computed for 4 float at once */
tanh_ps(v4f32 x)125 static inline v4f32 tanh_ps(v4f32 x)
126 {
127 v4f32 x2 = (v4f32)__msa_bclri_w((v4u32)x, 31);
128
129 v4i32_w mask_l = __msa_fclt_w((v4f32)__msa_fill_w(c_cephes_tanh_C1.i), x2);
130 v4i32_w mask_l2 = __msa_fcle_w((v4f32)__msa_fill_w(c_cephes_HALFMAXLOGF.i), x2);
131
132 // abs(x) >= 0.625
133 // tanh(x) = 1 − 2 / (exp(2x) + 1)
134 v4f32 _one = (v4f32)__msa_fill_w(c_1.i);
135 v4f32 _two = (v4f32)__msa_fill_w(c_2.i);
136 v4f32 exp_x_x = exp_ps(__msa_fadd_w(x, x));
137 v4f32 y0 = __msa_fsub_w(_one, __msa_fdiv_w(_two, __msa_fadd_w(exp_x_x, _one)));
138
139 // abs(x) < 0.625
140 /*
141 z = x2 * x2;
142 z =
143 (((( -5.70498872745E-3 * z
144 + 2.06390887954E-2) * z
145 - 5.37397155531E-2) * z
146 + 1.33314422036E-1) * z
147 - 3.33332819422E-1) * z * x
148 + x;
149 */
150 v4f32 y = (v4f32)__msa_fill_w(c_cephes_tanh_p0.i);
151 v4f32 c1 = (v4f32)__msa_fill_w(c_cephes_tanh_p1.i);
152 v4f32 c2 = (v4f32)__msa_fill_w(c_cephes_tanh_p2.i);
153 v4f32 c3 = (v4f32)__msa_fill_w(c_cephes_tanh_p3.i);
154 v4f32 c4 = (v4f32)__msa_fill_w(c_cephes_tanh_p4.i);
155
156 v4f32 z = __msa_fmul_w(x, x);
157
158 y = __msa_fmul_w(y, z);
159 y = __msa_fadd_w(y, c1);
160 y = __msa_fmul_w(y, z);
161 y = __msa_fadd_w(y, c2);
162 y = __msa_fmul_w(y, z);
163 y = __msa_fadd_w(y, c3);
164 y = __msa_fmul_w(y, z);
165 y = __msa_fadd_w(y, c4);
166
167 y = __msa_fmul_w(y, z);
168 y = __msa_fmul_w(y, x);
169 y = __msa_fadd_w(y, x);
170
171 // abs(x) > HALFMAXLOGF
172 // return 1.0 or -1.0
173 v4i32_w mask_pos = __msa_fcle_w((v4f32)__msa_fill_w(0), x2);
174 v4f32 y1 = (v4f32)__msa_bsel_v((v16u8)mask_pos, (v16u8)__msa_fill_w(c_1.i), (v16u8)__msa_fill_w(c_n1.i));
175
176 y = (v4f32)__msa_bsel_v((v16u8)mask_l, (v16u8)y0, (v16u8)y);
177 y = (v4f32)__msa_bsel_v((v16u8)mask_l2, (v16u8)y1, (v16u8)y);
178 return y;
179 }
180
181 #endif // LAYER_MIPS_MATHFUN_H
182