1 /******************************************************************************
2   Copyright (c) 2007-2011, Intel Corp.
3   All rights reserved.
4 
5   Redistribution and use in source and binary forms, with or without
6   modification, are permitted provided that the following conditions are met:
7 
8     * Redistributions of source code must retain the above copyright notice,
9       this list of conditions and the following disclaimer.
10     * Redistributions in binary form must reproduce the above copyright
11       notice, this list of conditions and the following disclaimer in the
12       documentation and/or other materials provided with the distribution.
13     * Neither the name of Intel Corporation nor the names of its contributors
14       may be used to endorse or promote products derived from this software
15       without specific prior written permission.
16 
17   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18   AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23   SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24   INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25   CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26   ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
27   THE POSSIBILITY OF SUCH DAMAGE.
28 ******************************************************************************/
29 
30 #define BASE_NAME	cbrt
31 #include "dpml_ux.h"
32 
33 #if !defined(MAKE_INCLUDE)
34 #   include STR(BUILD_FILE_NAME)
35 #endif
36 
37 /*
38 **  The algorithms used for the cbrt function are detailed in the X_FLOAT_NOTES
39 **  file (notes 18.*).
40 **
41 **  The basic approach is to factor the input x into f * 2^n, where
42 **  1 <= f < 2  and  n = 3*m + i, where i = 0, 1, or 2.  Then
43 **
44 **		cbrt(x) = cbrt(2^n * f)
45 **		        = cbrt(2^(3*m+i) * f)
46 **		        = 2^m * cbrt(2^i) * cbrt(f).
47 **
48 ** To get cbrt(f), we do a poly approx y = P(f) good to about 15 bits, then
49 ** perform one Newton's iterations in double precision to get 45 bits and then
50 ** one Newton's iteration in unpacked format good to about 135 bits. We fetch
51 ** 2^(i/3) from a table in double precision and incorporate during the
52 ** double precision Newton's iteration.  The result of the unpacked Newton's
53 ** iteration is scaled by m and has its sign bit adjusted to get the final
54 ** result.
55 **
56 ** The poly coefficients and a small table of the roots, 2^(i/3), is generated
57 ** from dpml_cbrt.c and is shared between this file and the routines generated
58 ** form dpml_cbrt.c
59 **
60 ** Given z, an approximation to 1/cbrt(f)^2, the double precision Newton's
61 ** iteration is of the form:
62 **
63 **  	y <--  z * f * (14 -  7 * z^3 * f^2  +  2 * z^6 * f^4 ) * 1/9
64 **
65 ** and the unpacked iteration is:
66 **
67 **	       y    y^3 + 2*x
68 **	y <-- --- * ---------
69 **	       2    y^3 + x/2
70 **
71 **
72 **  Instead of unbiasing the exponent right away, we add and later subtract
73 **  small corrective quantities (ADD_ADJUST, SUB_ADJUST) to get rid of the
74 **  BIAS/3 exactly:
75 **
76 **    (true_expon + BIAS + ADD_ADJUST)*(1/3) - SUB_ADJUST =  true_expon/3
77 **
78 **  true_expon + BIAS >= 0, so we can do unsigned arithmetic, which has
79 **  better performance.
80 */
81 
82 #define SUB_ADJUST  (F_PRECISION + F_EXP_BIAS + 2)/3
83 #define ADD_ADJUST  (3*(SUB_ADJUST))
84 
85 /*
86 ** Instead of doing integer division, we can multiply by an integer that
87 ** corresponds to 1/3 in "fixed point".
88 **
89 ** If the number is small enough and in the right form, the compiler may
90 ** optimize the multiply into shifts and adds.
91 */
92 
93 #define ONE_THIRD  0x1111
94 #define SHIFT_PROD 17
95 #define DIV_BY_3(num)   (( (10 * num) * ONE_THIRD  +  num) >> SHIFT_PROD)
96 
97 #if !defined(F_ENTRY_NAME)
98 #   define F_ENTRY_NAME F_CBRT_NAME
99 #endif
100 
X_X_PROTO(F_ENTRY_NAME,packed_result,packed_argument)101 X_X_PROTO(F_ENTRY_NAME, packed_result, packed_argument)
102     {
103     DECLARE_X_FLOAT(packed_result)
104     WORD fp_class;
105     UX_UNSIGNED_EXPONENT_TYPE m, i, j;
106     UX_FRACTION_DIGIT_TYPE msd, tmp_digit;
107     UX_FLOAT unpacked_argument, unpacked_result, y_cubed, tmp[2];
108     D_UNION u;
109     double y, f, z, f2, z2, z4;
110 
111     EXCEPTION_INFO_DECL
112 
113     INIT_EXCEPTION_INFO;
114     fp_class  = UNPACK(
115         PASS_ARG_X_FLOAT(packed_argument),
116         & unpacked_argument,
117         CBRT_CLASS_TO_ACTION_MAP,
118         PASS_RET_X_FLOAT(packed_result)
119         OPT_EXCEPTION_INFO );
120 
121     if (0 >= fp_class)
122        RETURN_X_FLOAT(packed_result);
123 
124     /*
125     **  Get f as a double precision value z ~ 1/cbrt(f)^2 by a polynomial
126     **  approximation
127     */
128 
129     msd = G_UX_MSD(&unpacked_argument);
130     u.D_HI_WORD = ((WORD)(D_EXP_BIAS-1) << D_EXP_POS) + (msd >> D_EXP_WIDTH);
131 
132 #   if (BITS_PER_UX_FRACTION_DIGIT_TYPE == 32)
133 
134         tmp_digit = G_UX_2nd_MSD(&unpacked_argument);
135         u.F_LO_WORD = (msd << (BITS_PER_UX_FRACTION_DIGIT_TYPE - D_EXP_WIDTH))
136                        | (lsd >> D_EXP_WIDTH);
137 
138 #   endif
139 
140     f = u.f;
141     z = RECIP_CBRT_POLY(f);
142 
143     /* Get m and i */
144 
145     j = G_UX_EXPONENT(&unpacked_argument) + (ADD_ADJUST - 1);
146     m = DIV_BY_3(j);
147     i = j - 3*m;
148 
149     /*
150     ** Now evaluate the Newton's iterations and incorporate the factor of
151     ** 2^(i/3).  The grouping chosen here is an attempt to maximize parallelism
152     ** and is probably not a good choice on a sequential machine
153     */
154 
155     z2 = z*z;
156     z4 = z2*z2;
157     f2 = f*f;
158     y = POW_CBRT_2_TABLE[i]*((((FOURTEEN_NINTHS*f)*z)
159           - z4*((SEVEN_NINTHS*f)*f2))
160             + (z4*(z2*z))*((TWO_NINTHS*f)*(f2*f2)));
161 
162     /* Convert the double precision result to unpacked x_float */
163 
164     u.f = y;
165     msd = u.D_HI_WORD;
166 
167     P_UX_EXPONENT(&unpacked_result, (msd >> D_EXP_POS) + m
168                                       - (D_EXP_BIAS + SUB_ADJUST - 1));
169     P_UX_SIGN(&unpacked_result, G_UX_SIGN(&unpacked_argument));
170     msd = (msd << D_EXP_WIDTH) | UX_MSB;
171 
172 #   if (BITS_PER_UX_FRACTION_DIGIT_TYPE == 32)
173 
174         tmp_digit = u.F_LO_WORD;
175         P_UX_2nd_MSD(&unpacked_result, tmp_digit << D_EXP_POS);
176         msd |= (tmp_digit >> (BITS_PER_WORD - D_EXP_POS));
177         P_UX_2nd_LSD(&unpacked_result, 0);
178 #   endif
179 
180     P_UX_MSD(&unpacked_result, msd);
181     P_UX_LSD(&unpacked_result, 0);
182 
183     /* Do the Newton's iteration */
184 
185     MULTIPLY(&unpacked_result, &unpacked_result, &y_cubed);
186     MULTIPLY(&unpacked_result, &y_cubed,         &y_cubed);
187 
188     UX_INCR_EXPONENT(&unpacked_argument, 1);	/* 2*x */
189     ADDSUB(&y_cubed, &unpacked_argument, ADD, &tmp[0]);
190 
191     UX_DECR_EXPONENT(&unpacked_argument, 2);	/* x/2 */
192     ADDSUB(&y_cubed, &unpacked_argument, ADD, &tmp[1]);
193 
194     DIVIDE(&tmp[0], &tmp[1], FULL_PRECISION, &tmp[0]);
195 
196     MULTIPLY(&unpacked_result, &tmp[0], &unpacked_result);
197     UX_DECR_EXPONENT(&unpacked_result, 1);
198 
199     PACK(
200         &unpacked_result,
201         PASS_RET_X_FLOAT(packed_result),
202         NOT_USED,
203         NOT_USED
204         OPT_EXCEPTION_INFO );
205 
206     RETURN_X_FLOAT(packed_result);
207 
208     }
209 
210 #if defined(MAKE_INCLUDE)
211 
212     @divert -append divertText
213 
214     function recip_cbrt(z)
215         {
216         auto t;
217 
218         t = cbrt(z);
219         return 1/(t * t);
220         }
221 
222 
223 #   undef TABLE_NAME
224 
225     START_TABLE;
226 
227     TABLE_COMMENT("Cbrt root class-to-action-mapping");
228     PRINT_CLASS_TO_ACTION_TBL_DEF( "CBRT_CLASS_TO_ACTION_MAP");
229     PRINT_64_TBL_ITEM( CLASS_TO_ACTION_DISP(1) +
230               CLASS_TO_ACTION( F_C_SIG_NAN,    RETURN_QUIET_NAN, 0) +
231               CLASS_TO_ACTION( F_C_QUIET_NAN,  RETURN_VALUE,     0) +
232               CLASS_TO_ACTION( F_C_POS_INF,    RETURN_VALUE,     0) +
233               CLASS_TO_ACTION( F_C_NEG_INF,    RETURN_VALUE,     0) +
234               CLASS_TO_ACTION( F_C_POS_DENORM, RETURN_UNPACKED,  0) +
235               CLASS_TO_ACTION( F_C_NEG_DENORM, RETURN_UNPACKED,  0) +
236               CLASS_TO_ACTION( F_C_POS_NORM,   RETURN_UNPACKED,  0) +
237               CLASS_TO_ACTION( F_C_NEG_NORM,   RETURN_UNPACKED,  0) +
238               CLASS_TO_ACTION( F_C_POS_ZERO,   RETURN_VALUE,     0) +
239               CLASS_TO_ACTION( F_C_NEG_ZERO ,  RETURN_VALUE,     0) );
240 
241     /* Generate coefficients and polynomial form for 1/cbrt(f)^2 */
242 
243     PRINT_R_TBL_COM_ADEF("coefs to approx 1/cbrt(f)^2", "COEFS\t\t\t");
244     remes(REMES_FIND_POLYNOMIAL + REMES_ABSOLUTE_WEIGHT + REMES_LINEAR_ARG,
245         1.0, 2.0, recip_cbrt, 15, &degree,  &poly_coefs);
246     for (i = 0; i <= degree ; i++)
247         { PRINT_R_TBL_ITEM( poly_coefs[i] ); }
248     GENPOLY(COEFS[%%d], RECIP_CBRT_POLY(x), degree);
249 
250     /* Now get powers of cbrt(2) */
251 
252     PRINT_R_TBL_COM_ADEF("cube roots of 2^i, i = 0, 1, 2","POW_CBRT_2_TABLE\t");
253 
254     c = cbrt(2);
255     for( i = 0; i <= 2; i++)
256         { PRINT_R_TBL_ITEM(c^i); }
257 
258     /* Last but not least, the Newton's iteration constants */
259 
260     TABLE_COMMENT("14/9, 7/9 and 2/9 in double precision");
261 
262     PRINT_R_TBL_VDEF_ITEM( "FOURTEEN_NINTHS\t\t", 14/9);
263     PRINT_R_TBL_VDEF_ITEM( "SEVEN_NINTHS\t\t",     7/9);
264     PRINT_R_TBL_VDEF_ITEM( "TWO_NINTHS\t\t",       2/9);
265 
266     END_TABLE;
267 
268     @end_divert
269     @eval my $tableText;						\
270           my $outText = MphocEval( GetStream( "divertText" ) );		\
271           my $defineText = Egrep( "#define", $outText, \$tableText );	\
272           my $polyText   = Egrep( STR(GENPOLY_EXECUTABLE), $tableText,	\
273                                      \$tableText );			\
274              $polyText   = GenPoly( $polyText );			\
275              $outText    = "$tableText\n\n$defineText\n\n$polyText";	\
276           my $headerText = GetHeaderText( STR(BUILD_FILE_NAME),         \
277                            "Definitions and constants cbrt",  		\
278                               __FILE__ );				\
279              print "$headerText\n$outText";
280 
281 #endif
282 
283