/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "nir.h"
#include "nir_builder.h"
#include "c99_math.h"

#include <float.h>

/*
 * Lowers some unsupported double operations, using only:
 *
 * - pack/unpackDouble2x32
 * - conversion to/from single-precision
 * - double add, mul, and fma
 * - conditional select
 * - 32-bit integer and floating point arithmetic
 */

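/* An IEEE-754 double is laid out as: bit 63 = sign, bits 52-62 = biased
 * exponent (bias 1023), bits 0-51 = mantissa.  The helpers below manipulate
 * the exponent field through 32-bit pack/unpack operations.
 */
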
/* Creates a double with the exponent bits set to a given integer value */
static nir_ssa_def *
set_exponent(nir_builder *b, nir_ssa_def *src, nir_ssa_def *exp)
{
   /* Split into bits 0-31 and 32-63 */
   nir_ssa_def *lo = nir_unpack_64_2x32_split_x(b, src);
   nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);

   /* The exponent is bits 52-62, or bits 20-30 of the high word, so insert
    * the new exponent there.
    */
   nir_ssa_def *new_hi = nir_bitfield_insert(b, hi, exp,
                                             nir_imm_int(b, 20),
                                             nir_imm_int(b, 11));
   /* recombine */
   return nir_pack_64_2x32_split(b, lo, new_hi);
}

static nir_ssa_def *
get_exponent(nir_builder *b, nir_ssa_def *src)
{
   /* get bits 32-63 */
   nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src);

   /* extract bits 20-30 of the high word */
   return nir_ubitfield_extract(b, hi, nir_imm_int(b, 20), nir_imm_int(b, 11));
}

/* Return infinity with the sign of the given source, which is +/-0 */

static nir_ssa_def *
get_signed_inf(nir_builder *b, nir_ssa_def *zero)
{
   nir_ssa_def *zero_hi = nir_unpack_64_2x32_split_y(b, zero);

   /* The bit pattern for infinity is 0x7ff0000000000000, where the sign bit
    * is the highest bit. Only the sign bit can be non-zero in the passed in
    * source. So we essentially need to OR the infinity and the zero, except
    * the low 32 bits are always 0 so we can construct the correct high 32
    * bits and then pack it together with zero low 32 bits.
    */
   nir_ssa_def *inf_hi = nir_ior(b, nir_imm_int(b, 0x7ff00000), zero_hi);
   return nir_pack_64_2x32_split(b, nir_imm_int(b, 0), inf_hi);
}

/*
 * Generates the correctly-signed infinity if the source was zero, and flushes
 * the result to 0 if the source was infinity or the calculated exponent was
 * too small to be representable.
 */

static nir_ssa_def *
fix_inv_result(nir_builder *b, nir_ssa_def *res, nir_ssa_def *src,
               nir_ssa_def *exp)
{
   /* If the exponent is too small or the original input was infinity/NaN,
    * force the result to 0 (flush denorms) to avoid the work of handling
    * denorms properly. Note that this doesn't preserve positive/negative
    * zeros, but GLSL doesn't require it.
    */
   res = nir_bcsel(b, nir_ior(b, nir_ige(b, nir_imm_int(b, 0), exp),
                              nir_feq(b, nir_fabs(b, src),
                                      nir_imm_double(b, INFINITY))),
                   nir_imm_double(b, 0.0f), res);

   /* If the original input was 0, generate the correctly-signed infinity */
   res = nir_bcsel(b, nir_fneu(b, src, nir_imm_double(b, 0.0f)),
                   res, get_signed_inf(b, src));

   return res;
}

static nir_ssa_def *
lower_rcp(nir_builder *b, nir_ssa_def *src)
{
   /* normalize the input to avoid range issues */
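   /* With the exponent forced to the bias (1023), a normal input is scaled
    * into [1, 2), which keeps the single-precision rcp below well inside
    * float range.
    */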
   nir_ssa_def *src_norm = set_exponent(b, src, nir_imm_int(b, 1023));

   /* cast to float, do an rcp, and then cast back to get an approximate
    * result
    */
   nir_ssa_def *ra = nir_f2f64(b, nir_frcp(b, nir_f2f32(b, src_norm)));

   /* Fixup the exponent of the result - note that we check if this is too
    * small below.
    */
   nir_ssa_def *new_exp = nir_isub(b, get_exponent(b, ra),
                                   nir_isub(b, get_exponent(b, src),
                                            nir_imm_int(b, 1023)));

   ra = set_exponent(b, ra, new_exp);

   /* Do a few Newton-Raphson steps to improve precision.
    *
    * Each step doubles the precision, and we started off with around 24 bits,
    * so we only need to do 2 steps to get to full precision. The step is:
    *
    * x_new = x * (2 - x*src)
    *
    * But we can re-arrange this to improve precision by using another fused
    * multiply-add:
    *
    * x_new = x + x * (1 - x*src)
    *
    * See https://en.wikipedia.org/wiki/Division_algorithm for more details.
    */

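   /* Each nir_ffma below computes ra + (-ra) * (ra*src - 1), i.e. the
    * x + x * (1 - x*src) form of the step above.
    */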
   ra = nir_ffma(b, nir_fneg(b, ra), nir_ffma(b, ra, src, nir_imm_double(b, -1)), ra);
   ra = nir_ffma(b, nir_fneg(b, ra), nir_ffma(b, ra, src, nir_imm_double(b, -1)), ra);

   return fix_inv_result(b, ra, src, new_exp);
}

static nir_ssa_def *
lower_sqrt_rsq(nir_builder *b, nir_ssa_def *src, bool sqrt)
{
   /* We want to compute:
    *
    * 1/sqrt(m * 2^e)
    *
    * When the exponent is even, this is equivalent to:
    *
    * 1/sqrt(m) * 2^(-e/2)
    *
    * and when the exponent is odd, this is equal to:
    *
    * 1/sqrt(m * 2) * 2^(-(e - 1)/2)
    *
    * where the m * 2 is absorbed into the exponent. So we want the exponent
    * inside the square root to be 1 if e is odd and 0 if e is even, and we
    * want to subtract off e/2 from the final exponent, rounded to negative
    * infinity. We can do the former by first computing the unbiased exponent,
    * and then AND'ing it with 1 to get 0 or 1, and we can do the latter by
    * shifting right by 1.
    */

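   /* For example, with e = -3: e & 1 = 1 and e >> 1 = -2, so we take the
    * rsqrt of m * 2^1 and then add 2 to the result's exponent, which matches
    * 1/sqrt(m * 2^-3) = 1/sqrt(2m) * 2^2.
    */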
   nir_ssa_def *unbiased_exp = nir_isub(b, get_exponent(b, src),
                                        nir_imm_int(b, 1023));
   nir_ssa_def *even = nir_iand_imm(b, unbiased_exp, 1);
   nir_ssa_def *half = nir_ishr_imm(b, unbiased_exp, 1);

   nir_ssa_def *src_norm = set_exponent(b, src,
                                        nir_iadd(b, nir_imm_int(b, 1023),
                                                 even));

   nir_ssa_def *ra = nir_f2f64(b, nir_frsq(b, nir_f2f32(b, src_norm)));
   nir_ssa_def *new_exp = nir_isub(b, get_exponent(b, ra), half);
   ra = set_exponent(b, ra, new_exp);

   /*
    * The following implements an iterative algorithm that's very similar
    * between sqrt and rsqrt. We start with an iteration of Goldschmidt's
    * algorithm, which looks like:
    *
    * a = the source
    * y_0 = initial (single-precision) rsqrt estimate
    *
    * h_0 = .5 * y_0
    * g_0 = a * y_0
    * r_0 = .5 - h_0 * g_0
    * g_1 = g_0 * r_0 + g_0
    * h_1 = h_0 * r_0 + h_0
    *
    * Now g_1 ~= sqrt(a), and h_1 ~= 1/(2 * sqrt(a)). We could continue
    * applying another round of Goldschmidt, but since we would never refer
    * back to a (the original source), we would add too much rounding error.
    * So instead, we do one last round of Newton-Raphson, which has better
    * rounding characteristics, to get the final rounding correct. This is
    * split into two cases:
    *
    * 1. sqrt
    *
    * Normally, doing a round of Newton-Raphson for sqrt involves taking a
    * reciprocal of the original estimate, which is slow since it isn't
    * supported in HW. But we can take advantage of the fact that we already
    * computed a good estimate of 1/(2 * g_1) by rearranging it like so:
    *
    * g_2 = .5 * (g_1 + a / g_1)
    *     = g_1 + .5 * (a / g_1 - g_1)
    *     = g_1 + (.5 / g_1) * (a - g_1^2)
    *     = g_1 + h_1 * (a - g_1^2)
    *
    * The second term represents the error, and by splitting it out we can get
    * better precision by computing it as part of a fused multiply-add. Since
    * both Newton-Raphson and Goldschmidt approximately double the precision of
    * the result, these two steps should be enough.
    *
    * 2. rsqrt
    *
    * First off, note that the first round of the Goldschmidt algorithm is
    * really just a Newton-Raphson step in disguise:
    *
    * h_1 = h_0 * (.5 - h_0 * g_0) + h_0
    *     = h_0 * (1.5 - h_0 * g_0)
    *     = h_0 * (1.5 - .5 * a * y_0^2)
    *     = (.5 * y_0) * (1.5 - .5 * a * y_0^2)
    *
    * which is the standard formula multiplied by .5. Unlike in the sqrt case,
    * we don't need the inverse to do a Newton-Raphson step; we just need h_1,
    * so we can skip the calculation of g_1. Instead, we simply do another
    * Newton-Raphson step:
    *
    * y_1 = 2 * h_1
    * r_1 = .5 - h_1 * y_1 * a
    * y_2 = y_1 * r_1 + y_1
    *
    * Where the difference from Goldschmidt is that we calculate y_1 * a
    * instead of using g_1. Doing it this way should be as fast as computing
    * y_1 up front instead of h_1, and it lets us share the code for the
    * initial Goldschmidt step with the sqrt case.
    *
    * Putting it together, the computations are:
    *
    * h_0 = .5 * y_0
    * g_0 = a * y_0
    * r_0 = .5 - h_0 * g_0
    * h_1 = h_0 * r_0 + h_0
    * if sqrt:
    *    g_1 = g_0 * r_0 + g_0
    *    r_1 = a - g_1 * g_1
    *    g_2 = h_1 * r_1 + g_1
    * else:
    *    y_1 = 2 * h_1
    *    r_1 = .5 - y_1 * (h_1 * a)
    *    y_2 = y_1 * r_1 + y_1
    *
    * For more on the ideas behind this, see "Software Division and Square
    * Root Using Goldschmidt's Algorithms" by Markstein and the Wikipedia page
    * on square roots
    * (https://en.wikipedia.org/wiki/Methods_of_computing_square_roots).
    */

   nir_ssa_def *one_half = nir_imm_double(b, 0.5);
   nir_ssa_def *h_0 = nir_fmul(b, one_half, ra);
   nir_ssa_def *g_0 = nir_fmul(b, src, ra);
   nir_ssa_def *r_0 = nir_ffma(b, nir_fneg(b, h_0), g_0, one_half);
   nir_ssa_def *h_1 = nir_ffma(b, h_0, r_0, h_0);
   nir_ssa_def *res;
   if (sqrt) {
      nir_ssa_def *g_1 = nir_ffma(b, g_0, r_0, g_0);
      nir_ssa_def *r_1 = nir_ffma(b, nir_fneg(b, g_1), g_1, src);
      res = nir_ffma(b, h_1, r_1, g_1);
   } else {
      nir_ssa_def *y_1 = nir_fmul(b, nir_imm_double(b, 2.0), h_1);
      nir_ssa_def *r_1 = nir_ffma(b, nir_fneg(b, y_1), nir_fmul(b, h_1, src),
                                  one_half);
      res = nir_ffma(b, y_1, r_1, y_1);
   }

   if (sqrt) {
      /* Here, the special cases we need to handle are
       * 0 -> 0 and
       * +inf -> +inf
       */
      const bool preserve_denorms =
         b->shader->info.float_controls_execution_mode &
         FLOAT_CONTROLS_DENORM_PRESERVE_FP64;
      nir_ssa_def *src_flushed = src;
      if (!preserve_denorms) {
         src_flushed = nir_bcsel(b,
                                 nir_flt(b, nir_fabs(b, src),
                                         nir_imm_double(b, DBL_MIN)),
                                 nir_imm_double(b, 0.0),
                                 src);
      }
      res = nir_bcsel(b, nir_ior(b, nir_feq(b, src_flushed, nir_imm_double(b, 0.0)),
                                 nir_feq(b, src, nir_imm_double(b, INFINITY))),
                      src_flushed, res);
   } else {
      res = fix_inv_result(b, res, src, new_exp);
   }

   return res;
}

static nir_ssa_def *
lower_trunc(nir_builder *b, nir_ssa_def *src)
{
   nir_ssa_def *unbiased_exp = nir_isub(b, get_exponent(b, src),
                                        nir_imm_int(b, 1023));

   nir_ssa_def *frac_bits = nir_isub(b, nir_imm_int(b, 52), unbiased_exp);

   /*
    * Decide the operation to apply depending on the unbiased exponent:
    *
    * if (unbiased_exp < 0)
    *    return 0
    * else if (unbiased_exp > 52)
    *    return src
    * else
    *    return src & (~0 << frac_bits)
    *
    * Notice that the else branch is a 64-bit integer operation that we need
    * to implement in terms of 32-bit integer arithmetic (at least until we
    * support 64-bit integer arithmetic).
    */

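   /* For example, for src = 3.5 the unbiased exponent is 1, so frac_bits = 51
    * and the mask clears the 51 mantissa bits below the binary point,
    * yielding 3.0.
    */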
   /* Compute "~0 << frac_bits" in terms of hi/lo 32-bit integer math */
   nir_ssa_def *mask_lo =
      nir_bcsel(b,
                nir_ige(b, frac_bits, nir_imm_int(b, 32)),
                nir_imm_int(b, 0),
                nir_ishl(b, nir_imm_int(b, ~0), frac_bits));

   nir_ssa_def *mask_hi =
      nir_bcsel(b,
                nir_ilt(b, frac_bits, nir_imm_int(b, 33)),
                nir_imm_int(b, ~0),
                nir_ishl(b,
                         nir_imm_int(b, ~0),
                         nir_isub(b, frac_bits, nir_imm_int(b, 32))));

   nir_ssa_def *src_lo = nir_unpack_64_2x32_split_x(b, src);
   nir_ssa_def *src_hi = nir_unpack_64_2x32_split_y(b, src);

   return
      nir_bcsel(b,
                nir_ilt(b, unbiased_exp, nir_imm_int(b, 0)),
                nir_imm_double(b, 0.0),
                nir_bcsel(b, nir_ige(b, unbiased_exp, nir_imm_int(b, 53)),
                          src,
                          nir_pack_64_2x32_split(b,
                                                 nir_iand(b, mask_lo, src_lo),
                                                 nir_iand(b, mask_hi, src_hi))));
}

static nir_ssa_def *
lower_floor(nir_builder *b, nir_ssa_def *src)
{
   /*
    * For x >= 0, floor(x) = trunc(x)
    * For x < 0,
    *    - if x is integer, floor(x) = x
    *    - otherwise, floor(x) = trunc(x) - 1
    */
   nir_ssa_def *tr = nir_ftrunc(b, src);
   nir_ssa_def *positive = nir_fge(b, src, nir_imm_double(b, 0.0));
   return nir_bcsel(b,
                    nir_ior(b, positive, nir_feq(b, src, tr)),
                    tr,
                    nir_fsub(b, tr, nir_imm_double(b, 1.0)));
}

static nir_ssa_def *
lower_ceil(nir_builder *b, nir_ssa_def *src)
{
   /* if x < 0,                    ceil(x) = trunc(x)
    * else if (x - trunc(x) == 0), ceil(x) = x
    * else,                        ceil(x) = trunc(x) + 1
    */
   nir_ssa_def *tr = nir_ftrunc(b, src);
   nir_ssa_def *negative = nir_flt(b, src, nir_imm_double(b, 0.0));
   return nir_bcsel(b,
                    nir_ior(b, negative, nir_feq(b, src, tr)),
                    tr,
                    nir_fadd(b, tr, nir_imm_double(b, 1.0)));
}

static nir_ssa_def *
lower_fract(nir_builder *b, nir_ssa_def *src)
{
   return nir_fsub(b, src, nir_ffloor(b, src));
}

static nir_ssa_def *
lower_round_even(nir_builder *b, nir_ssa_def *src)
{
   /* Add and subtract 2**52 to round off any fractional bits. */
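   /* Adding 2^52 to any |x| < 2^52 leaves no room for fraction bits in the
    * result, so the hardware's round-to-nearest-even mode does the rounding
    * for us; subtracting 2^52 then recovers the rounded integer. The b->exact
    * guard below keeps the optimizer from folding the add/sub away.
    */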
   nir_ssa_def *two52 = nir_imm_double(b, (double)(1ull << 52));
   nir_ssa_def *sign = nir_iand(b, nir_unpack_64_2x32_split_y(b, src),
                                nir_imm_int(b, 1ull << 31));

   b->exact = true;
   nir_ssa_def *res = nir_fsub(b, nir_fadd(b, nir_fabs(b, src), two52), two52);
   b->exact = false;

   return nir_bcsel(b, nir_flt(b, nir_fabs(b, src), two52),
                    nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, res),
                                           nir_ior(b, nir_unpack_64_2x32_split_y(b, res), sign)), src);
}

static nir_ssa_def *
lower_mod(nir_builder *b, nir_ssa_def *src0, nir_ssa_def *src1)
{
   /* mod(x, y) = x - y * floor(x/y)
    *
    * If the division is lowered, it could add some rounding errors that make
    * floor() return the quotient minus one when x = N * y. If this is the
    * case, we should return zero because the output of mod(x, y) lies in
    * [0, y). But fortunately the Vulkan spec allows this kind of error; from
    * the Vulkan spec, appendix A (Precision and Operation of SPIR-V
    * Instructions):
    *
    *   "The OpFRem and OpFMod instructions use cheap approximations of
    *   remainder, and the error can be large due to the discontinuity in
    *   trunc() and floor(). This can produce mathematically unexpected
    *   results in some cases, such as FMod(x,x) computing x rather than 0,
    *   and can also cause the result to have a different sign than the
    *   infinitely precise result."
    *
    * In practice this means the output value is actually in the interval
    * [0, y].
    *
    * While Vulkan states this behaviour explicitly, OpenGL does not, and thus
    * we would need to assume that the value should be in the range [0, y);
    * but on the other hand, mod(a, b) is defined as "a - b * floor(a/b)" and
    * OpenGL allows for some error in division, so a/a could actually end up
    * being 1.0 - 1ULP; in that case floor(a/a) would end up as 0, and hence
    * mod(a, a) == a.
    *
    * In summary, in practice mod(a, a) can be "a" for both OpenGL and Vulkan.
    */
   nir_ssa_def *floor = nir_ffloor(b, nir_fdiv(b, src0, src1));

   return nir_fsub(b, src0, nir_fmul(b, src1, floor));
}

static nir_ssa_def *
lower_doubles_instr_to_soft(nir_builder *b, nir_alu_instr *instr,
                            const nir_shader *softfp64,
                            nir_lower_doubles_options options)
{
   if (!(options & nir_lower_fp64_full_software))
      return NULL;

   assert(instr->dest.dest.is_ssa);

   const char *name;
   const struct glsl_type *return_type = glsl_uint64_t_type();

   switch (instr->op) {
   case nir_op_f2i64:
      if (instr->src[0].src.ssa->bit_size != 64)
         return NULL;
      name = "__fp64_to_int64";
      return_type = glsl_int64_t_type();
      break;
   case nir_op_f2u64:
      if (instr->src[0].src.ssa->bit_size != 64)
         return NULL;
      name = "__fp64_to_uint64";
      break;
   case nir_op_f2f64:
      name = "__fp32_to_fp64";
      break;
   case nir_op_f2f32:
      name = "__fp64_to_fp32";
      return_type = glsl_float_type();
      break;
   case nir_op_f2i32:
      name = "__fp64_to_int";
      return_type = glsl_int_type();
      break;
   case nir_op_f2u32:
      name = "__fp64_to_uint";
      return_type = glsl_uint_type();
      break;
   case nir_op_f2b1:
   case nir_op_f2b32:
      name = "__fp64_to_bool";
      return_type = glsl_bool_type();
      break;
   case nir_op_b2f64:
      name = "__bool_to_fp64";
      break;
   case nir_op_i2f64:
      if (instr->src[0].src.ssa->bit_size == 64)
         name = "__int64_to_fp64";
      else
         name = "__int_to_fp64";
      break;
   case nir_op_u2f64:
      if (instr->src[0].src.ssa->bit_size == 64)
         name = "__uint64_to_fp64";
      else
         name = "__uint_to_fp64";
      break;
   case nir_op_fabs:
      name = "__fabs64";
      break;
   case nir_op_fneg:
      name = "__fneg64";
      break;
   case nir_op_fround_even:
      name = "__fround64";
      break;
   case nir_op_ftrunc:
      name = "__ftrunc64";
      break;
   case nir_op_ffloor:
      name = "__ffloor64";
      break;
   case nir_op_ffract:
      name = "__ffract64";
      break;
   case nir_op_fsign:
      name = "__fsign64";
      break;
   case nir_op_feq:
      name = "__feq64";
      return_type = glsl_bool_type();
      break;
   case nir_op_fneu:
      name = "__fneu64";
      return_type = glsl_bool_type();
      break;
   case nir_op_flt:
      name = "__flt64";
      return_type = glsl_bool_type();
      break;
   case nir_op_fge:
      name = "__fge64";
      return_type = glsl_bool_type();
      break;
   case nir_op_fmin:
      name = "__fmin64";
      break;
   case nir_op_fmax:
      name = "__fmax64";
      break;
   case nir_op_fadd:
      name = "__fadd64";
      break;
   case nir_op_fmul:
      name = "__fmul64";
      break;
   case nir_op_ffma:
      name = "__ffma64";
      break;
   case nir_op_fsat:
      name = "__fsat64";
      break;
   default:
      return NULL;
   }

   nir_function *func = NULL;
   nir_foreach_function(function, softfp64) {
      if (strcmp(function->name, name) == 0) {
         func = function;
         break;
      }
   }
   if (!func || !func->impl) {
      fprintf(stderr, "Cannot find function \"%s\"\n", name);
      assert(func);
   }

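   /* The softfp64 helpers return their result through their first parameter:
    * allocate a local of the expected return type, pass a deref to it as
    * params[0], and load the value back out after inlining the callee.
    */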
   nir_ssa_def *params[4] = { NULL, };

   nir_variable *ret_tmp =
      nir_local_variable_create(b->impl, return_type, "return_tmp");
   nir_deref_instr *ret_deref = nir_build_deref_var(b, ret_tmp);
   params[0] = &ret_deref->dest.ssa;

   assert(nir_op_infos[instr->op].num_inputs + 1 == func->num_params);
   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      assert(i + 1 < ARRAY_SIZE(params));
      params[i + 1] = nir_mov_alu(b, instr->src[i], 1);
   }

   nir_inline_function_impl(b, func->impl, params, NULL);

   return nir_load_deref(b, ret_deref);
}

nir_lower_doubles_options
nir_lower_doubles_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_frcp:          return nir_lower_drcp;
   case nir_op_fsqrt:         return nir_lower_dsqrt;
   case nir_op_frsq:          return nir_lower_drsq;
   case nir_op_ftrunc:        return nir_lower_dtrunc;
   case nir_op_ffloor:        return nir_lower_dfloor;
   case nir_op_fceil:         return nir_lower_dceil;
   case nir_op_ffract:        return nir_lower_dfract;
   case nir_op_fround_even:   return nir_lower_dround_even;
   case nir_op_fmod:          return nir_lower_dmod;
   case nir_op_fsub:          return nir_lower_dsub;
   case nir_op_fdiv:          return nir_lower_ddiv;
   default:                   return 0;
   }
}

struct lower_doubles_data {
   const nir_shader *softfp64;
   nir_lower_doubles_options options;
};

static bool
should_lower_double_instr(const nir_instr *instr, const void *_data)
{
   const struct lower_doubles_data *data = _data;
   const nir_lower_doubles_options options = data->options;

   if (instr->type != nir_instr_type_alu)
      return false;

   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   assert(alu->dest.dest.is_ssa);
   bool is_64 = alu->dest.dest.ssa.bit_size == 64;

   unsigned num_srcs = nir_op_infos[alu->op].num_inputs;
   for (unsigned i = 0; i < num_srcs; i++) {
      is_64 |= (nir_src_bit_size(alu->src[i].src) == 64);
   }

   if (!is_64)
      return false;

   if (options & nir_lower_fp64_full_software)
      return true;

   return options & nir_lower_doubles_op_to_options_mask(alu->op);
}

static nir_ssa_def *
lower_doubles_instr(nir_builder *b, nir_instr *instr, void *_data)
{
   const struct lower_doubles_data *data = _data;
   const nir_lower_doubles_options options = data->options;
   nir_alu_instr *alu = nir_instr_as_alu(instr);

   nir_ssa_def *soft_def =
      lower_doubles_instr_to_soft(b, alu, data->softfp64, options);
   if (soft_def)
      return soft_def;

   if (!(options & nir_lower_doubles_op_to_options_mask(alu->op)))
      return NULL;

   nir_ssa_def *src = nir_mov_alu(b, alu->src[0],
                                  alu->dest.dest.ssa.num_components);

   switch (alu->op) {
   case nir_op_frcp:
      return lower_rcp(b, src);
   case nir_op_fsqrt:
      return lower_sqrt_rsq(b, src, true);
   case nir_op_frsq:
      return lower_sqrt_rsq(b, src, false);
   case nir_op_ftrunc:
      return lower_trunc(b, src);
   case nir_op_ffloor:
      return lower_floor(b, src);
   case nir_op_fceil:
      return lower_ceil(b, src);
   case nir_op_ffract:
      return lower_fract(b, src);
   case nir_op_fround_even:
      return lower_round_even(b, src);

   case nir_op_fdiv:
   case nir_op_fsub:
   case nir_op_fmod: {
      nir_ssa_def *src1 = nir_mov_alu(b, alu->src[1],
                                      alu->dest.dest.ssa.num_components);
      switch (alu->op) {
      case nir_op_fdiv:
         return nir_fmul(b, src, nir_frcp(b, src1));
      case nir_op_fsub:
         return nir_fadd(b, src, nir_fneg(b, src1));
      case nir_op_fmod:
         return lower_mod(b, src, src1);
      default:
         unreachable("unhandled opcode");
      }
   }
   default:
      unreachable("unhandled opcode");
   }
}

static bool
nir_lower_doubles_impl(nir_function_impl *impl,
                       const nir_shader *softfp64,
                       nir_lower_doubles_options options)
{
   struct lower_doubles_data data = {
      .softfp64 = softfp64,
      .options = options,
   };

   bool progress =
      nir_function_impl_lower_instructions(impl,
                                           should_lower_double_instr,
                                           lower_doubles_instr,
                                           &data);

   if (progress && (options & nir_lower_fp64_full_software)) {
      /* SSA and register indices are completely messed up now */
      nir_index_ssa_defs(impl);
      nir_index_local_regs(impl);

      nir_metadata_preserve(impl, nir_metadata_none);

      /* And we have deref casts we need to clean up thanks to function
       * inlining.
       */
      nir_opt_deref_impl(impl);
   } else if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

bool
nir_lower_doubles(nir_shader *shader,
                  const nir_shader *softfp64,
                  nir_lower_doubles_options options)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl) {
         progress |= nir_lower_doubles_impl(function->impl, softfp64, options);
      }
   }

   return progress;
}