/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

#define COND_LOWER_OP(b, name, ...)                                   \
        (b->shader->options->lower_int64_options &                    \
         nir_lower_int64_op_to_options_mask(nir_op_##name)) ?         \
        lower_##name##64(b, __VA_ARGS__) : nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CMP(b, name, ...)                                  \
        (b->shader->options->lower_int64_options &                    \
         nir_lower_int64_op_to_options_mask(nir_op_##name)) ?         \
        lower_int64_compare(b, nir_op_##name, __VA_ARGS__) :          \
        nir_##name(b, __VA_ARGS__)

#define COND_LOWER_CAST(b, name, ...)                                 \
        (b->shader->options->lower_int64_options &                    \
         nir_lower_int64_op_to_options_mask(nir_op_##name)) ?         \
        lower_##name(b, __VA_ARGS__) :                                \
        nir_##name(b, __VA_ARGS__)

static nir_ssa_def *
lower_b2i64(nir_builder *b, nir_ssa_def *x)
{
   return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2b(nir_builder *b, nir_ssa_def *x)
{
   return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x),
                                nir_unpack_64_2x32_split_y(b, x)),
                     nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2i8(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i16(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x));
}


static nir_ssa_def *
lower_i2i32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_i2i64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_i2i32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_ishr(b, x32, nir_imm_int(b, 31)));
}

static nir_ssa_def *
lower_u2u8(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u16(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_u2u64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_bcsel64(nir_builder *b, nir_ssa_def *cond, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo),
                                    nir_bcsel(b, cond, x_hi, y_hi));
}

static nir_ssa_def *
lower_inot64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   return nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi));
}

static nir_ssa_def *
lower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo),
                                    nir_iand(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo),
                                    nir_ior(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo),
                                    nir_ixor(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t lshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo << c;
    *       uint32_t hi_shifted = hi << c;
    *       uint32_t lo_shifted_hi = lo >> abs(32 - c);
    *       return pack_64(lo_shifted, hi_shifted | lo_shifted_hi);
    *    } else {
    *       uint32_t lo_shifted_hi = lo << abs(32 - c);
    *       return pack_64(0, lo_shifted_hi);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y);
   nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, lo_shifted,
                                nir_ior(b, hi_shifted, lo_shifted_hi));
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_imm_int(b, 0),
                                nir_ishl(b, x_lo, reverse_count));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                                 res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t arshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x);
    *    int32_t  hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted = hi >> 31;
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count),
                                nir_ishr(b, x_hi, nir_imm_int(b, 31)));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                                 res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t rshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(0, hi_shifted_lo);
    *    }
    * }
    */

   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count),
                                nir_imm_int(b, 0));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                                 res_if_ge_32, res_if_lt_32));
}

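/* 64-bit addition as two 32-bit adds.  The carry out of the low half is
 * recovered with an unsigned compare: the 32-bit sum wrapped around (and
 * therefore carried) exactly when res_lo < x_lo.
 */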
static nir_ssa_def *
lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo);
   nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo));
   nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

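/* 64-bit subtraction as two 32-bit subtracts.  A borrow out of the low half
 * occurred exactly when x_lo < y_lo, in which case one more is subtracted
 * from the high half.
 */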
static nir_ssa_def *
lower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo);
   nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo)));
   nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow);

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static nir_ssa_def *
lower_ineg64(nir_builder *b, nir_ssa_def *x)
{
   /* Since isub is the same number of instructions (with better dependencies)
    * as iadd, subtraction is actually more efficient for ineg than the usual
    * 2's complement "flip the bits and add one".
    */
   return lower_isub64(b, nir_imm_int64(b, 0), x);
}

static nir_ssa_def *
lower_iabs64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0));
   return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x);
}

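/* 64-bit comparisons in terms of 32-bit ones: equality checks both halves,
 * while the ordered comparisons are decided by the high words and fall back
 * to an unsigned compare of the low words when the high words are equal.
 */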
static nir_ssa_def *
lower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   switch (op) {
   case nir_op_ieq:
      return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo));
   case nir_op_ine:
      return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo));
   case nir_op_ult:
      return nir_ior(b, nir_ult(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
   case nir_op_ilt:
      return nir_ior(b, nir_ilt(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
   case nir_op_uge:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y));
   case nir_op_ige:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y));
   default:
      unreachable("Invalid comparison");
   }
}

static nir_ssa_def *
lower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x);
}

static nir_ssa_def *
lower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x);
}

static nir_ssa_def *
lower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y);
}

static nir_ssa_def *
lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
}

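/* 32x32 -> 64-bit multiply built from the 32-bit low product (imul) and the
 * 32-bit high product (imul_high or umul_high depending on signedness).
 */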
static nir_ssa_def *
lower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                  bool sign_extend)
{
   nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y)
                                     : nir_umul_high(b, x, y);

   return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
}

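/* 64x64 -> 64-bit multiply.  Writing x = x_hi * 2^32 + x_lo (and likewise
 * for y), the low 64 bits of the product are
 *
 *    x_lo * y_lo + ((x_lo * y_hi + x_hi * y_lo) << 32)
 *
 * since the x_hi * y_hi term only contributes to bits 64 and above.
 */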
static nir_ssa_def *
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
   nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
                         nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                     nir_imul(b, x_hi, y_lo)));

   return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
                                 res_hi);
}

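/* High 64 bits of a 64x64 multiply, computed by schoolbook long
 * multiplication on 32-bit digits.  Each operand is sign- or zero-extended
 * to four digits, the 128-bit product is accumulated digit by digit, and
 * digits 2 and 3 (bits 64..127) form the result.
 */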
static nir_ssa_def *
lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                 bool sign_extend)
{
   nir_ssa_def *x32[4], *y32[4];
   x32[0] = nir_unpack_64_2x32_split_x(b, x);
   x32[1] = nir_unpack_64_2x32_split_y(b, x);
   if (sign_extend) {
      x32[2] = x32[3] = nir_ishr(b, x32[1], nir_imm_int(b, 31));
   } else {
      x32[2] = x32[3] = nir_imm_int(b, 0);
   }

   y32[0] = nir_unpack_64_2x32_split_x(b, y);
   y32[1] = nir_unpack_64_2x32_split_y(b, y);
   if (sign_extend) {
      y32[2] = y32[3] = nir_ishr(b, y32[1], nir_imm_int(b, 31));
   } else {
      y32[2] = y32[3] = nir_imm_int(b, 0);
   }

   nir_ssa_def *res[8] = { NULL, };

   /* Yes, the following generates a pile of code.  However, we throw res[0]
    * and res[1] away in the end and, if we're in the umul case, four of our
    * eight dword operands will be constant zero and opt_algebraic will clean
    * this up nicely.
    */
   for (unsigned i = 0; i < 4; i++) {
      nir_ssa_def *carry = NULL;
      for (unsigned j = 0; j < 4; j++) {
         /* The maximum values of x32[i] and y32[j] are UINT32_MAX so the
          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
          * value that will fit in tmp is
          *
          *    UINT64_MAX = UINT32_MAX << 32 + UINT32_MAX
          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
          *
          * so we're guaranteed that we can add in two more 32-bit values
          * without overflowing tmp.
          */
         nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[j]);

         if (res[i + j])
            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
         if (carry)
            tmp = nir_iadd(b, tmp, carry);
         res[i + j] = nir_u2u32(b, tmp);
         carry = nir_ushr(b, tmp, nir_imm_int(b, 32));
      }
      res[i + 4] = nir_u2u32(b, carry);
   }

   return nir_pack_64_2x32_split(b, res[2], res[3]);
}

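/* isign(x) is -1, 0 or 1.  Arithmetic-shifting the high word by 31 yields
 * ~0 for negative inputs and 0 otherwise; OR-ing a "non-zero" bit into the
 * low word turns that into the correct two's-complement -1, 0 or 1.
 */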
static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
   nir_ssa_def *res_hi = nir_ishr(b, x_hi, nir_imm_int(b, 31));
   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

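/* Unsigned 64-bit division and modulo via restoring shift-subtract long
 * division.  A first (conditional) loop recovers the high 32 quotient bits
 * by dividing the numerator's high word by the denominator's low word; a
 * second loop over the full 64-bit values produces the low quotient bits.
 * Whatever is left of the numerator at the end is the remainder.
 */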
static void
lower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
                   nir_ssa_def **q, nir_ssa_def **r)
{
   /* TODO: We should specially handle the case where the denominator is a
    * constant.  In that case, we should be able to reduce it to a multiply by
    * a constant, some shifts, and an add.
    */
   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *q_lo = nir_imm_zero(b, n->num_components, 32);
   nir_ssa_def *q_hi = nir_imm_zero(b, n->num_components, 32);

   nir_ssa_def *n_hi_before_if = n_hi;
   nir_ssa_def *q_hi_before_if = q_hi;

   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
    * denom == 0.
    */
   nir_ssa_def *need_high_div =
      nir_iand(b, nir_ieq(b, d_hi, nir_imm_int(b, 0)), nir_uge(b, n_hi, d_lo));
   nir_push_if(b, nir_bany(b, need_high_div));
   {
      /* If we only have one component, then the bany above goes away and
       * this is always true within the if statement.
       */
      if (n->num_components == 1)
         need_high_div = nir_imm_true(b);

      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);

      for (int i = 31; i >= 0; i--) {
         /* if ((d.x << i) <= n.y) {
          *    n.y -= d.x << i;
          *    quot.y |= 1U << i;
          * }
          */
         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
         nir_ssa_def *cond = nir_iand(b, need_high_div,
                                         nir_uge(b, n_hi, d_shift));
         if (i != 0) {
            /* log2_d_lo is always <= 31, so we don't need to bother with it
             * in the last iteration.
             */
            cond = nir_iand(b, cond,
                               nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
         }
         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
      }
   }
   nir_pop_if(b, NULL);
   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);

   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);

   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
   for (int i = 31; i >= 0; i--) {
      /* if ((d64 << i) <= n64) {
       *    n64 -= d64 << i;
       *    quot.x |= 1U << i;
       * }
       */
      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
      nir_ssa_def *cond = nir_uge(b, n, d_shift);
      if (i != 0) {
         /* log2_denom is always <= 31, so we don't need to bother with it
          * in the last iteration.
          */
         cond = nir_iand(b, cond,
                            nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
      }
      n = nir_bcsel(b, cond, new_n, n);
      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
   }

   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
   *r = n;
}

static nir_ssa_def *
lower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return q;
}

static nir_ssa_def *
lower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
                                    nir_ilt(b, d_hi, nir_imm_int(b, 0)));
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, negate, nir_ineg(b, q), q);
}

static nir_ssa_def *
lower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return r;
}

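/* Signed modulo with the sign convention of nir_op_imod: a non-zero result
 * takes the sign of the divisor (flooring modulo), unlike irem below which
 * takes the sign of the numerator (truncating remainder).
 */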
static nir_ssa_def *
lower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);

   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);

   return nir_bcsel(b, nir_ieq(b, r, nir_imm_int64(b, 0)), nir_imm_int64(b, 0),
          nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
                       nir_iadd(b, rem, d)));
}

static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}

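/* extract_[ui]{8,16} on 64-bit sources.  The chunk index is taken as a
 * constant here (nir_src_as_uint), so we can statically pick the 32-bit
 * half that contains the chunk, do a 32-bit extract, and then sign- or
 * zero-extend the result back to 64 bits.
 */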
static nir_ssa_def *
lower_extract(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *c)
{
   assert(op == nir_op_extract_u8 || op == nir_op_extract_i8 ||
          op == nir_op_extract_u16 || op == nir_op_extract_i16);

   const int chunk = nir_src_as_uint(nir_src_for_ssa(c));
   const int chunk_bits =
      (op == nir_op_extract_u8 || op == nir_op_extract_i8) ? 8 : 16;
   const int num_chunks_in_32 = 32 / chunk_bits;

   nir_ssa_def *extract32;
   if (chunk < num_chunks_in_32) {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_x(b, x),
                                   nir_imm_int(b, chunk),
                                   NULL, NULL);
   } else {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_y(b, x),
                                   nir_imm_int(b, chunk - num_chunks_in_32),
                                   NULL, NULL);
   }

   if (op == nir_op_extract_i8 || op == nir_op_extract_i16)
      return lower_i2i64(b, extract32);
   else
      return lower_u2u64(b, extract32);
}

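/* ufind_msb on a 64-bit value: if any of the high 32 bits are set the
 * answer is 32 plus the MSB of the high word, otherwise it is the MSB of
 * the low word (which is also -1 when the whole value is zero, matching
 * ufind_msb's convention).
 */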
static nir_ssa_def *
lower_ufind_msb64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *lo_count = nir_ufind_msb(b, x_lo);
   nir_ssa_def *hi_count = nir_ufind_msb(b, x_hi);
   nir_ssa_def *valid_hi_bits = nir_ine(b, x_hi, nir_imm_int(b, 0));
   nir_ssa_def *hi_res = nir_iadd(b, nir_imm_intN_t(b, 32, 32), hi_count);
   return nir_bcsel(b, valid_hi_bits, hi_res, lo_count);
}

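/* Lower [iu]2f of a 64-bit integer.  The strategy: take the absolute value
 * (remembering the sign for the signed case), find the MSB, shift away the
 * bits that cannot be represented in the destination's significand using
 * round-to-nearest-even, convert the remaining 32-bit significand to float,
 * scale it back up by 2^discard, and finally reapply the sign.
 */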
static nir_ssa_def *
lower_2f(nir_builder *b, nir_ssa_def *x, unsigned dest_bit_size,
         bool src_is_signed)
{
   nir_ssa_def *x_sign = NULL;

   if (src_is_signed) {
      x_sign = nir_bcsel(b, COND_LOWER_CMP(b, ilt, x, nir_imm_int64(b, 0)),
                         nir_imm_floatN_t(b, -1, dest_bit_size),
                         nir_imm_floatN_t(b, 1, dest_bit_size));
      x = COND_LOWER_OP(b, iabs, x);
   }

   nir_ssa_def *exp = COND_LOWER_OP(b, ufind_msb, x);
   unsigned significand_bits;

   switch (dest_bit_size) {
   case 32:
      significand_bits = 23;
      break;
   case 16:
      significand_bits = 10;
      break;
   default:
      unreachable("Invalid dest_bit_size");
   }

   nir_ssa_def *discard =
      nir_imax(b, nir_isub(b, exp, nir_imm_int(b, significand_bits)),
                  nir_imm_int(b, 0));
   nir_ssa_def *significand =
      COND_LOWER_CAST(b, u2u32, COND_LOWER_OP(b, ushr, x, discard));

   /* Round-to-nearest-even implementation:
    * - if the non-representable part of the significand is higher than half
    *   the minimum representable significand, we round-up
    * - if the non-representable part of the significand is equal to half the
    *   minimum representable significand and the representable part of the
    *   significand is odd, we round-up
    * - in any other case, we round-down
    */
   nir_ssa_def *lsb_mask = COND_LOWER_OP(b, ishl, nir_imm_int64(b, 1), discard);
   nir_ssa_def *rem_mask = COND_LOWER_OP(b, isub, lsb_mask, nir_imm_int64(b, 1));
   nir_ssa_def *half = COND_LOWER_OP(b, ishr, lsb_mask, nir_imm_int(b, 1));
   nir_ssa_def *rem = COND_LOWER_OP(b, iand, x, rem_mask);
   nir_ssa_def *halfway = nir_iand(b, COND_LOWER_CMP(b, ieq, rem, half),
                                   nir_ine(b, discard, nir_imm_int(b, 0)));
   nir_ssa_def *is_odd = nir_i2b(b, nir_iand(b, significand, nir_imm_int(b, 1)));
   nir_ssa_def *round_up = nir_ior(b, COND_LOWER_CMP(b, ilt, half, rem),
                                   nir_iand(b, halfway, is_odd));
   significand = nir_iadd(b, significand, nir_b2i32(b, round_up));

   nir_ssa_def *res;

   if (dest_bit_size == 32)
      res = nir_fmul(b, nir_u2f32(b, significand),
                     nir_fexp2(b, nir_u2f32(b, discard)));
   else
      res = nir_fmul(b, nir_u2f16(b, significand),
                     nir_fexp2(b, nir_u2f16(b, discard)));

   if (src_is_signed)
      res = nir_fmul(b, res, x_sign);

   return res;
}

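/* Lower f2[iu]64 for 16- and 32-bit sources.  After clamping and truncating,
 * the value is split into high and low 32-bit halves by dividing and taking
 * the remainder with respect to 2^32, and the sign is reapplied at the end
 * for the signed case.
 */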
static nir_ssa_def *
lower_f2(nir_builder *b, nir_ssa_def *x, bool dst_is_signed)
{
   assert(x->bit_size == 16 || x->bit_size == 32);
   nir_ssa_def *x_sign = NULL;

   if (dst_is_signed)
      x_sign = nir_fsign(b, x);
   else
      x = nir_fmin(b, x, nir_imm_floatN_t(b, UINT64_MAX, x->bit_size));

   x = nir_ftrunc(b, x);

   if (dst_is_signed) {
      x = nir_fmin(b, x, nir_imm_floatN_t(b, INT64_MAX, x->bit_size));
      x = nir_fmax(b, x, nir_imm_floatN_t(b, INT64_MIN, x->bit_size));
      x = nir_fabs(b, x);
   }

   nir_ssa_def *div = nir_imm_floatN_t(b, 1ULL << 32, x->bit_size);
   nir_ssa_def *res_hi = nir_f2u32(b, nir_fdiv(b, x, div));
   nir_ssa_def *res_lo = nir_f2u32(b, nir_frem(b, x, div));
   nir_ssa_def *res = nir_pack_64_2x32_split(b, res_lo, res_hi);

   if (dst_is_signed)
      res = nir_bcsel(b, nir_flt(b, x_sign, nir_imm_float(b, 0)),
                      nir_ineg(b, res), res);

   return res;
}

nir_lower_int64_options
nir_lower_int64_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_imul:
   case nir_op_amul:
      return nir_lower_imul64;
   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      return nir_lower_imul_2x32_64;
   case nir_op_imul_high:
   case nir_op_umul_high:
      return nir_lower_imul_high64;
   case nir_op_isign:
      return nir_lower_isign64;
   case nir_op_udiv:
   case nir_op_idiv:
   case nir_op_umod:
   case nir_op_imod:
   case nir_op_irem:
      return nir_lower_divmod64;
   case nir_op_b2i64:
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_i2i64:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_u2u64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_bcsel:
      return nir_lower_mov64;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return nir_lower_icmp64;
   case nir_op_iadd:
   case nir_op_isub:
      return nir_lower_iadd64;
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
   case nir_op_imin3:
   case nir_op_imax3:
   case nir_op_umin3:
   case nir_op_umax3:
   case nir_op_imed3:
   case nir_op_umed3:
      return nir_lower_minmax64;
   case nir_op_iabs:
      return nir_lower_iabs64;
   case nir_op_ineg:
      return nir_lower_ineg64;
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_inot:
      return nir_lower_logic64;
   case nir_op_ishl:
   case nir_op_ishr:
   case nir_op_ushr:
      return nir_lower_shift64;
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return nir_lower_extract64;
   case nir_op_ufind_msb:
      return nir_lower_ufind_msb64;
   default:
      return 0;
   }
}

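/* nir_shader_lower_instructions callback: dispatches each 64-bit ALU
 * instruction accepted by should_lower_int64_alu_instr to the matching
 * lowering helper above and returns the replacement SSA value.
 */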
static nir_ssa_def *
lower_int64_alu_instr(nir_builder *b, nir_instr *instr, void *_state)
{
   nir_alu_instr *alu = nir_instr_as_alu(instr);

   nir_ssa_def *src[4];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
      src[i] = nir_ssa_for_alu_src(b, alu, i);

   switch (alu->op) {
   case nir_op_imul:
   case nir_op_amul:
      return lower_imul64(b, src[0], src[1]);
   case nir_op_imul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], true);
   case nir_op_umul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], false);
   case nir_op_imul_high:
      return lower_mul_high64(b, src[0], src[1], true);
   case nir_op_umul_high:
      return lower_mul_high64(b, src[0], src[1], false);
   case nir_op_isign:
      return lower_isign64(b, src[0]);
   case nir_op_udiv:
      return lower_udiv64(b, src[0], src[1]);
   case nir_op_idiv:
      return lower_idiv64(b, src[0], src[1]);
   case nir_op_umod:
      return lower_umod64(b, src[0], src[1]);
   case nir_op_imod:
      return lower_imod64(b, src[0], src[1]);
   case nir_op_irem:
      return lower_irem64(b, src[0], src[1]);
   case nir_op_b2i64:
      return lower_b2i64(b, src[0]);
   case nir_op_i2b1:
      return lower_i2b(b, src[0]);
   case nir_op_i2i8:
      return lower_i2i8(b, src[0]);
   case nir_op_i2i16:
      return lower_i2i16(b, src[0]);
   case nir_op_i2i32:
      return lower_i2i32(b, src[0]);
   case nir_op_i2i64:
      return lower_i2i64(b, src[0]);
   case nir_op_u2u8:
      return lower_u2u8(b, src[0]);
   case nir_op_u2u16:
      return lower_u2u16(b, src[0]);
   case nir_op_u2u32:
      return lower_u2u32(b, src[0]);
   case nir_op_u2u64:
      return lower_u2u64(b, src[0]);
   case nir_op_bcsel:
      return lower_bcsel64(b, src[0], src[1], src[2]);
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return lower_int64_compare(b, alu->op, src[0], src[1]);
   case nir_op_iadd:
      return lower_iadd64(b, src[0], src[1]);
   case nir_op_isub:
      return lower_isub64(b, src[0], src[1]);
   case nir_op_imin:
      return lower_imin64(b, src[0], src[1]);
   case nir_op_imax:
      return lower_imax64(b, src[0], src[1]);
   case nir_op_umin:
      return lower_umin64(b, src[0], src[1]);
   case nir_op_umax:
      return lower_umax64(b, src[0], src[1]);
   case nir_op_imin3:
      return lower_imin64(b, src[0], lower_imin64(b, src[1], src[2]));
   case nir_op_imax3:
      return lower_imax64(b, src[0], lower_imax64(b, src[1], src[2]));
   case nir_op_umin3:
      return lower_umin64(b, src[0], lower_umin64(b, src[1], src[2]));
   case nir_op_umax3:
      return lower_umax64(b, src[0], lower_umax64(b, src[1], src[2]));
   case nir_op_imed3:
      return lower_imax64(b, lower_imin64(b, lower_imax64(b, src[0], src[1]), src[2]), lower_imin64(b, src[0], src[1]));
   case nir_op_umed3:
      return lower_umax64(b, lower_umin64(b, lower_umax64(b, src[0], src[1]), src[2]), lower_umin64(b, src[0], src[1]));
   case nir_op_iabs:
      return lower_iabs64(b, src[0]);
   case nir_op_ineg:
      return lower_ineg64(b, src[0]);
   case nir_op_iand:
      return lower_iand64(b, src[0], src[1]);
   case nir_op_ior:
      return lower_ior64(b, src[0], src[1]);
   case nir_op_ixor:
      return lower_ixor64(b, src[0], src[1]);
   case nir_op_inot:
      return lower_inot64(b, src[0]);
   case nir_op_ishl:
      return lower_ishl64(b, src[0], src[1]);
   case nir_op_ishr:
      return lower_ishr64(b, src[0], src[1]);
   case nir_op_ushr:
      return lower_ushr64(b, src[0], src[1]);
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return lower_extract(b, alu->op, src[0], src[1]);
   case nir_op_ufind_msb:
      return lower_ufind_msb64(b, src[0]);
   case nir_op_i2f64:
   case nir_op_i2f32:
   case nir_op_i2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), true);
   case nir_op_u2f64:
   case nir_op_u2f32:
   case nir_op_u2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), false);
   case nir_op_f2i64:
   case nir_op_f2u64:
      /* We don't support f64toi64 (yet?). */
      if (src[0]->bit_size > 32)
         return false;

      return lower_f2(b, src[0], alu->op == nir_op_f2i64);
   default:
      unreachable("Invalid ALU opcode to lower");
   }
}

static bool
should_lower_int64_alu_instr(const nir_instr *instr, const void *_data)
{
   const nir_shader_compiler_options *options =
      (const nir_shader_compiler_options *)_data;

   if (instr->type != nir_instr_type_alu)
      return false;

   const nir_alu_instr *alu = nir_instr_as_alu(instr);

   switch (alu->op) {
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_bcsel:
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[2].src.is_ssa);
      assert(alu->src[1].src.ssa->bit_size ==
             alu->src[2].src.ssa->bit_size);
      if (alu->src[1].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      assert(alu->src[0].src.is_ssa);
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[0].src.ssa->bit_size ==
             alu->src[1].src.ssa->bit_size);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_ufind_msb:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_amul:
      assert(alu->dest.dest.is_ssa);
      if (options->has_imul24)
         return false;
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   case nir_op_i2f64:
   case nir_op_u2f64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_f2u64:
   case nir_op_f2i64:
      /* fall-through */
   default:
      assert(alu->dest.dest.is_ssa);
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   }

   unsigned mask = nir_lower_int64_op_to_options_mask(alu->op);
   return (options->lower_int64_options & mask) != 0;
}

bool
nir_lower_int64(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader,
                                        should_lower_int64_alu_instr,
                                        lower_int64_alu_instr,
                                        (void *)shader->options);
}