1 /******************************************************************************
2 Copyright (c) 2007-2011, Intel Corp.
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice,
9 this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of Intel Corporation nor the names of its contributors
14 may be used to endorse or promote products derived from this software
15 without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27 THE POSSIBILITY OF SUCH DAMAGE.
28 ******************************************************************************/
29
30 #define BASE_NAME cbrt
31 #include "dpml_ux.h"
32
33 #if !defined(MAKE_INCLUDE)
34 # include STR(BUILD_FILE_NAME)
35 #endif
36
37 /*
38 ** The algorithms used for the cbrt function are detailed in the X_FLOAT_NOTES
39 ** file (notes 18.*).
40 **
41 ** The basic approach is to factor the input x into f * 2^n, where
42 ** 1 <= f < 2 and n = 3*m + i, where i = 0, 1, or 2. Then
43 **
44 ** cbrt(x) = cbrt(2^n * f)
45 ** = cbrt(2^(3*m+i) * f)
46 ** = 2^m * cbrt(2^i) * cbrt(f).
47 **
48 ** To get cbrt(f), we do a poly approx y = P(f) good to about 15 bits, then
49 ** perform one Newton's iterations in double precision to get 45 bits and then
50 ** one Newton's iteration in unpacked format good to about 135 bits. We fetch
51 ** 2^(i/3) from a table in double precision and incorporate during the
52 ** double precision Newton's iteration. The result of the unpacked Newton's
53 ** iteration is scaled by m and has its sign bit adjusted to get the final
54 ** result.
55 **
56 ** The poly coefficients and a small table of the roots, 2^(i/3), is generated
57 ** from dpml_cbrt.c and is shared between this file and the routines generated
58 ** form dpml_cbrt.c
59 **
60 ** Given z, an approximation to 1/cbrt(f)^2, the double precision Newton's
61 ** iteration is of the form:
62 **
63 ** y <-- z * f * (14 - 7 * z^3 * f^2 + 2 * z^6 * f^4 ) * 1/9
64 **
65 ** and the unpacked iteration is:
66 **
67 ** y y^3 + 2*x
68 ** y <-- --- * ---------
69 ** 2 y^3 + x/2
70 **
71 **
72 ** Instead of unbiasing the exponent right away, we add and later subtract
73 ** small corrective quantities (ADD_ADJUST, SUB_ADJUST) to get rid of the
74 ** BIAS/3 exactly:
75 **
76 ** (true_expon + BIAS + ADD_ADJUST)*(1/3) - SUB_ADJUST = true_expon/3
77 **
78 ** true_expon + BIAS >= 0, so we can do unsigned arithmetic, which has
79 ** better performance.
80 */
81
82 #define SUB_ADJUST (F_PRECISION + F_EXP_BIAS + 2)/3
83 #define ADD_ADJUST (3*(SUB_ADJUST))
84
85 /*
86 ** Instead of doing integer division, we can multiply by an integer that
87 ** corresponds to 1/3 in "fixed point".
88 **
89 ** If the number is small enough and in the right form, the compiler may
90 ** optimize the multiply into shifts and adds.
91 */
92
93 #define ONE_THIRD 0x1111
94 #define SHIFT_PROD 17
95 #define DIV_BY_3(num) (( (10 * num) * ONE_THIRD + num) >> SHIFT_PROD)
96
97 #if !defined(F_ENTRY_NAME)
98 # define F_ENTRY_NAME F_CBRT_NAME
99 #endif
100
X_X_PROTO(F_ENTRY_NAME,packed_result,packed_argument)101 X_X_PROTO(F_ENTRY_NAME, packed_result, packed_argument)
102 {
103 DECLARE_X_FLOAT(packed_result)
104 WORD fp_class;
105 UX_UNSIGNED_EXPONENT_TYPE m, i, j;
106 UX_FRACTION_DIGIT_TYPE msd, tmp_digit;
107 UX_FLOAT unpacked_argument, unpacked_result, y_cubed, tmp[2];
108 D_UNION u;
109 double y, f, z, f2, z2, z4;
110
111 EXCEPTION_INFO_DECL
112
113 INIT_EXCEPTION_INFO;
114 fp_class = UNPACK(
115 PASS_ARG_X_FLOAT(packed_argument),
116 & unpacked_argument,
117 CBRT_CLASS_TO_ACTION_MAP,
118 PASS_RET_X_FLOAT(packed_result)
119 OPT_EXCEPTION_INFO );
120
121 if (0 >= fp_class)
122 RETURN_X_FLOAT(packed_result);
123
124 /*
125 ** Get f as a double precision value z ~ 1/cbrt(f)^2 by a polynomial
126 ** approximation
127 */
128
129 msd = G_UX_MSD(&unpacked_argument);
130 u.D_HI_WORD = ((WORD)(D_EXP_BIAS-1) << D_EXP_POS) + (msd >> D_EXP_WIDTH);
131
132 # if (BITS_PER_UX_FRACTION_DIGIT_TYPE == 32)
133
134 tmp_digit = G_UX_2nd_MSD(&unpacked_argument);
135 u.F_LO_WORD = (msd << (BITS_PER_UX_FRACTION_DIGIT_TYPE - D_EXP_WIDTH))
136 | (lsd >> D_EXP_WIDTH);
137
138 # endif
139
140 f = u.f;
141 z = RECIP_CBRT_POLY(f);
142
143 /* Get m and i */
144
145 j = G_UX_EXPONENT(&unpacked_argument) + (ADD_ADJUST - 1);
146 m = DIV_BY_3(j);
147 i = j - 3*m;
148
149 /*
150 ** Now evaluate the Newton's iterations and incorporate the factor of
151 ** 2^(i/3). The grouping chosen here is an attempt to maximize parallelism
152 ** and is probably not a good choice on a sequential machine
153 */
154
155 z2 = z*z;
156 z4 = z2*z2;
157 f2 = f*f;
158 y = POW_CBRT_2_TABLE[i]*((((FOURTEEN_NINTHS*f)*z)
159 - z4*((SEVEN_NINTHS*f)*f2))
160 + (z4*(z2*z))*((TWO_NINTHS*f)*(f2*f2)));
161
162 /* Convert the double precision result to unpacked x_float */
163
164 u.f = y;
165 msd = u.D_HI_WORD;
166
167 P_UX_EXPONENT(&unpacked_result, (msd >> D_EXP_POS) + m
168 - (D_EXP_BIAS + SUB_ADJUST - 1));
169 P_UX_SIGN(&unpacked_result, G_UX_SIGN(&unpacked_argument));
170 msd = (msd << D_EXP_WIDTH) | UX_MSB;
171
172 # if (BITS_PER_UX_FRACTION_DIGIT_TYPE == 32)
173
174 tmp_digit = u.F_LO_WORD;
175 P_UX_2nd_MSD(&unpacked_result, tmp_digit << D_EXP_POS);
176 msd |= (tmp_digit >> (BITS_PER_WORD - D_EXP_POS));
177 P_UX_2nd_LSD(&unpacked_result, 0);
178 # endif
179
180 P_UX_MSD(&unpacked_result, msd);
181 P_UX_LSD(&unpacked_result, 0);
182
183 /* Do the Newton's iteration */
184
185 MULTIPLY(&unpacked_result, &unpacked_result, &y_cubed);
186 MULTIPLY(&unpacked_result, &y_cubed, &y_cubed);
187
188 UX_INCR_EXPONENT(&unpacked_argument, 1); /* 2*x */
189 ADDSUB(&y_cubed, &unpacked_argument, ADD, &tmp[0]);
190
191 UX_DECR_EXPONENT(&unpacked_argument, 2); /* x/2 */
192 ADDSUB(&y_cubed, &unpacked_argument, ADD, &tmp[1]);
193
194 DIVIDE(&tmp[0], &tmp[1], FULL_PRECISION, &tmp[0]);
195
196 MULTIPLY(&unpacked_result, &tmp[0], &unpacked_result);
197 UX_DECR_EXPONENT(&unpacked_result, 1);
198
199 PACK(
200 &unpacked_result,
201 PASS_RET_X_FLOAT(packed_result),
202 NOT_USED,
203 NOT_USED
204 OPT_EXCEPTION_INFO );
205
206 RETURN_X_FLOAT(packed_result);
207
208 }
209
210 #if defined(MAKE_INCLUDE)
211
212 @divert -append divertText
213
214 function recip_cbrt(z)
215 {
216 auto t;
217
218 t = cbrt(z);
219 return 1/(t * t);
220 }
221
222
223 # undef TABLE_NAME
224
225 START_TABLE;
226
227 TABLE_COMMENT("Cbrt root class-to-action-mapping");
228 PRINT_CLASS_TO_ACTION_TBL_DEF( "CBRT_CLASS_TO_ACTION_MAP");
229 PRINT_64_TBL_ITEM( CLASS_TO_ACTION_DISP(1) +
230 CLASS_TO_ACTION( F_C_SIG_NAN, RETURN_QUIET_NAN, 0) +
231 CLASS_TO_ACTION( F_C_QUIET_NAN, RETURN_VALUE, 0) +
232 CLASS_TO_ACTION( F_C_POS_INF, RETURN_VALUE, 0) +
233 CLASS_TO_ACTION( F_C_NEG_INF, RETURN_VALUE, 0) +
234 CLASS_TO_ACTION( F_C_POS_DENORM, RETURN_UNPACKED, 0) +
235 CLASS_TO_ACTION( F_C_NEG_DENORM, RETURN_UNPACKED, 0) +
236 CLASS_TO_ACTION( F_C_POS_NORM, RETURN_UNPACKED, 0) +
237 CLASS_TO_ACTION( F_C_NEG_NORM, RETURN_UNPACKED, 0) +
238 CLASS_TO_ACTION( F_C_POS_ZERO, RETURN_VALUE, 0) +
239 CLASS_TO_ACTION( F_C_NEG_ZERO , RETURN_VALUE, 0) );
240
241 /* Generate coefficients and polynomial form for 1/cbrt(f)^2 */
242
243 PRINT_R_TBL_COM_ADEF("coefs to approx 1/cbrt(f)^2", "COEFS\t\t\t");
244 remes(REMES_FIND_POLYNOMIAL + REMES_ABSOLUTE_WEIGHT + REMES_LINEAR_ARG,
245 1.0, 2.0, recip_cbrt, 15, °ree, &poly_coefs);
246 for (i = 0; i <= degree ; i++)
247 { PRINT_R_TBL_ITEM( poly_coefs[i] ); }
248 GENPOLY(COEFS[%%d], RECIP_CBRT_POLY(x), degree);
249
250 /* Now get powers of cbrt(2) */
251
252 PRINT_R_TBL_COM_ADEF("cube roots of 2^i, i = 0, 1, 2","POW_CBRT_2_TABLE\t");
253
254 c = cbrt(2);
255 for( i = 0; i <= 2; i++)
256 { PRINT_R_TBL_ITEM(c^i); }
257
258 /* Last but not least, the Newton's iteration constants */
259
260 TABLE_COMMENT("14/9, 7/9 and 2/9 in double precision");
261
262 PRINT_R_TBL_VDEF_ITEM( "FOURTEEN_NINTHS\t\t", 14/9);
263 PRINT_R_TBL_VDEF_ITEM( "SEVEN_NINTHS\t\t", 7/9);
264 PRINT_R_TBL_VDEF_ITEM( "TWO_NINTHS\t\t", 2/9);
265
266 END_TABLE;
267
268 @end_divert
269 @eval my $tableText; \
270 my $outText = MphocEval( GetStream( "divertText" ) ); \
271 my $defineText = Egrep( "#define", $outText, \$tableText ); \
272 my $polyText = Egrep( STR(GENPOLY_EXECUTABLE), $tableText, \
273 \$tableText ); \
274 $polyText = GenPoly( $polyText ); \
275 $outText = "$tableText\n\n$defineText\n\n$polyText"; \
276 my $headerText = GetHeaderText( STR(BUILD_FILE_NAME), \
277 "Definitions and constants cbrt", \
278 __FILE__ ); \
279 print "$headerText\n$outText";
280
281 #endif
282
283