1 /*
2  * Copyright © 2014-2015 Broadcom
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "compiler/nir/nir.h"
25 #include "compiler/nir/nir_deref.h"
26 #include "nir/nir_to_tgsi.h"
27 #include "pipe/p_screen.h"
28 #include "pipe/p_state.h"
29 #include "tgsi/tgsi_dump.h"
30 #include "tgsi/tgsi_from_mesa.h"
31 #include "tgsi/tgsi_info.h"
32 #include "tgsi/tgsi_ureg.h"
33 #include "util/debug.h"
34 #include "util/u_math.h"
35 #include "util/u_memory.h"
36 
37 struct ntt_compile {
38    nir_shader *s;
39    nir_function_impl *impl;
40    struct pipe_screen *screen;
41    struct ureg_program *ureg;
42 
43    bool needs_texcoord_semantic;
44    bool any_reg_as_address;
45    bool native_integers;
46    bool has_txf_lz;
47 
48    int next_addr_reg;
49    bool addr_declared[2];
50    struct ureg_dst addr_reg[2];
51 
52    /* if condition set up at the end of a block, for ntt_emit_if(). */
53    struct ureg_src if_cond;
54 
55    /* TGSI temps for our NIR SSA and register values. */
56    struct ureg_dst *reg_temp;
57    struct ureg_src *ssa_temp;
58 
59    nir_instr_liveness *liveness;
60 
61    /* Mappings from driver_location to TGSI input/output number.
62     *
63     * We'll be declaring TGSI input/outputs in an arbitrary order, and they get
64     * their numbers assigned incrementally, unlike inputs or constants.
65     */
66    struct ureg_src *input_index_map;
67    uint64_t centroid_inputs;
68 
69    uint32_t first_ubo;
70 
71    struct ureg_src images[PIPE_MAX_SHADER_IMAGES];
72 };
73 
74 static void ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list);
75 
76 /**
77  * Interprets a nir_load_const used as a NIR src as a uint.
78  *
79  * For non-native-integer drivers, nir_load_const_instrs used by an integer ALU
80  * instruction (or in a phi-web used by an integer ALU instruction) were
81  * converted to floats and the ALU instruction swapped to the float equivalent.
82  * However, this means that integer load_consts used by intrinsics (which don't
83  * normally get that conversion) may have been reformatted to be floats.  Given
84  * that all of our intrinsic nir_src_as_uint() calls are expected to be small,
85  * we can just look and see if they look like floats and convert them back to
86  * ints.
87  */
88 static uint32_t
89 ntt_src_as_uint(struct ntt_compile *c, nir_src src)
90 {
91    uint32_t val = nir_src_as_uint(src);
92    if (!c->native_integers && val >= fui(1.0))
93       val = (uint32_t)uif(val);
94    return val;
95 }
96 
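/* Expands a NIR write mask for an (up to 2-component) 64-bit value into the
 * corresponding 4-channel 32-bit TGSI write mask: .x -> .xy, .y -> .zw.
 */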
97 static unsigned
98 ntt_64bit_write_mask(unsigned write_mask)
99 {
100    return ((write_mask & 1) ? 0x3 : 0) | ((write_mask & 2) ? 0xc : 0);
101 }
102 
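/* Returns an immediate holding double-precision 1.0 in both 64-bit channels
 * (each pair of 32-bit words is the little-endian encoding of 1.0).
 */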
103 static struct ureg_src
104 ntt_64bit_1f(struct ntt_compile *c)
105 {
106    return ureg_imm4u(c->ureg,
107                      0x00000000, 0x3ff00000,
108                      0x00000000, 0x3ff00000);
109 }
110 
111 static const struct glsl_type *
112 ntt_shader_input_type(struct ntt_compile *c,
113                       struct nir_variable *var)
114 {
115    switch (c->s->info.stage) {
116    case MESA_SHADER_GEOMETRY:
117    case MESA_SHADER_TESS_EVAL:
118    case MESA_SHADER_TESS_CTRL:
119       if (glsl_type_is_array(var->type))
120          return glsl_get_array_element(var->type);
121       else
122          return var->type;
123    default:
124       return var->type;
125    }
126 }
127 
128 static void
129 ntt_get_gl_varying_semantic(struct ntt_compile *c, unsigned location,
130                             unsigned *semantic_name, unsigned *semantic_index)
131 {
132    /* We want to use most of tgsi_get_gl_varying_semantic(), but the
133     * !texcoord shifting has already been applied, so avoid that.
134     */
135    if (!c->needs_texcoord_semantic &&
136        (location >= VARYING_SLOT_VAR0 && location < VARYING_SLOT_PATCH0)) {
137       *semantic_name = TGSI_SEMANTIC_GENERIC;
138       *semantic_index = location - VARYING_SLOT_VAR0;
139       return;
140    }
141 
142    tgsi_get_gl_varying_semantic(location, true,
143                                 semantic_name, semantic_index);
144 }
145 
146 /* TGSI varying declarations have a component usage mask associated (used by
147  * r600 and svga).
148  */
149 static uint32_t
150 ntt_tgsi_usage_mask(unsigned start_component, unsigned num_components,
151                     bool is_64)
152 {
153    uint32_t usage_mask =
154       u_bit_consecutive(start_component, num_components);
155 
156    if (is_64) {
157       if (start_component >= 2)
158          usage_mask >>= 2;
159 
160       uint32_t tgsi_usage_mask = 0;
161 
162       if (usage_mask & TGSI_WRITEMASK_X)
163          tgsi_usage_mask |= TGSI_WRITEMASK_XY;
164       if (usage_mask & TGSI_WRITEMASK_Y)
165          tgsi_usage_mask |= TGSI_WRITEMASK_ZW;
166 
167       return tgsi_usage_mask;
168    } else {
169       return usage_mask;
170    }
171 }
172 
173 /* TGSI varying declarations have a component usage mask associated (used by
174  * r600 and svga).
175  */
176 static uint32_t
177 ntt_tgsi_var_usage_mask(const struct nir_variable *var)
178 {
179    const struct glsl_type *type_without_array =
180       glsl_without_array(var->type);
181    unsigned num_components = glsl_get_vector_elements(type_without_array);
182    if (num_components == 0) /* structs */
183       num_components = 4;
184 
185    return ntt_tgsi_usage_mask(var->data.location_frac, num_components,
186                               glsl_type_is_64bit(type_without_array));
187 }
188 
189 static struct ureg_dst
190 ntt_output_decl(struct ntt_compile *c, nir_intrinsic_instr *instr, uint32_t *frac)
191 {
192    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
193    int base = nir_intrinsic_base(instr);
194    *frac = nir_intrinsic_component(instr);
195    bool is_64 = nir_src_bit_size(instr->src[0]) == 64;
196 
197    struct ureg_dst out;
198    if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
199       unsigned semantic_name, semantic_index;
200       tgsi_get_gl_frag_result_semantic(semantics.location,
201                                        &semantic_name, &semantic_index);
202       semantic_index += semantics.dual_source_blend_index;
203 
204       switch (semantics.location) {
205       case FRAG_RESULT_DEPTH:
206          *frac = 2; /* z write is to the .z channel in TGSI */
207          break;
208       case FRAG_RESULT_STENCIL:
209          *frac = 1;
210          break;
211       default:
212          break;
213       }
214 
215       out = ureg_DECL_output(c->ureg, semantic_name, semantic_index);
216    } else {
217       unsigned semantic_name, semantic_index;
218 
219       ntt_get_gl_varying_semantic(c, semantics.location,
220                                   &semantic_name, &semantic_index);
221 
222       uint32_t usage_mask = ntt_tgsi_usage_mask(*frac,
223                                                 instr->num_components,
224                                                 is_64);
225       uint32_t gs_streams = semantics.gs_streams;
226       for (int i = 0; i < 4; i++) {
227          if (!(usage_mask & (1 << i)))
228             gs_streams &= ~(0x3 << 2 * i);
229       }
230 
231       /* No driver appears to use array_id of outputs. */
232       unsigned array_id = 0;
233 
234       /* This bit is lost in the i/o semantics, but it's unused in in-tree
235        * drivers.
236        */
237       bool invariant = false;
238 
239       out = ureg_DECL_output_layout(c->ureg,
240                                     semantic_name, semantic_index,
241                                     gs_streams,
242                                     base,
243                                     usage_mask,
244                                     array_id,
245                                     semantics.num_slots,
246                                     invariant);
247    }
248 
249    unsigned write_mask;
250    if (nir_intrinsic_has_write_mask(instr))
251       write_mask = nir_intrinsic_write_mask(instr);
252    else
253       write_mask = ((1 << instr->num_components) - 1) << *frac;
254 
255    if (is_64) {
256       write_mask = ntt_64bit_write_mask(write_mask);
257       if (*frac >= 2)
258          write_mask = write_mask << 2;
259    } else {
260       write_mask = write_mask << *frac;
261    }
262    return ureg_writemask(out, write_mask);
263 }
264 
265 /* If this reg or SSA def is used only for storing an output, then in the simple
266  * cases we can write directly to the TGSI output instead of having store_output
267  * emit its own MOV.
268  */
269 static bool
270 ntt_try_store_in_tgsi_output(struct ntt_compile *c, struct ureg_dst *dst,
271                              struct list_head *uses, struct list_head *if_uses)
272 {
273    *dst = ureg_dst_undef();
274 
275    switch (c->s->info.stage) {
276    case MESA_SHADER_FRAGMENT:
277    case MESA_SHADER_VERTEX:
278       break;
279    default:
280       /* tgsi_exec (at least) requires that output stores happen per vertex
281        * emitted, you don't get to reuse a previous output value for the next
282        * vertex.
283        */
284       return false;
285    }
286 
287    if (!list_is_empty(if_uses) || !list_is_singular(uses))
288       return false;
289 
290    nir_src *src = list_first_entry(uses, nir_src, use_link);
291 
292    if (src->parent_instr->type != nir_instr_type_intrinsic)
293       return false;
294 
295    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(src->parent_instr);
296    if (intr->intrinsic != nir_intrinsic_store_output ||
297        !nir_src_is_const(intr->src[1])) {
298       return false;
299    }
300 
301    uint32_t frac;
302    *dst = ntt_output_decl(c, intr, &frac);
303    dst->Index += ntt_src_as_uint(c, intr->src[1]);
304 
305    return frac == 0;
306 }
307 
308 static void
309 ntt_setup_inputs(struct ntt_compile *c)
310 {
311    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
312       return;
313 
314    unsigned num_inputs = 0;
315    int num_input_arrays = 0;
316 
317    nir_foreach_shader_in_variable(var, c->s) {
318       const struct glsl_type *type = ntt_shader_input_type(c, var);
319       unsigned array_len =
320          glsl_count_attribute_slots(type, false);
321 
322       num_inputs = MAX2(num_inputs, var->data.driver_location + array_len);
323    }
324 
325    c->input_index_map = ralloc_array(c, struct ureg_src, num_inputs);
326 
327    nir_foreach_shader_in_variable(var, c->s) {
328       const struct glsl_type *type = ntt_shader_input_type(c, var);
329       unsigned array_len =
330          glsl_count_attribute_slots(type, false);
331 
332       unsigned interpolation = TGSI_INTERPOLATE_CONSTANT;
333       unsigned sample_loc;
334       struct ureg_src decl;
335 
336       if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
337          interpolation =
338             tgsi_get_interp_mode(var->data.interpolation,
339                                  var->data.location == VARYING_SLOT_COL0 ||
340                                  var->data.location == VARYING_SLOT_COL1);
341 
342          if (var->data.location == VARYING_SLOT_POS)
343             interpolation = TGSI_INTERPOLATE_LINEAR;
344       }
345 
346       unsigned semantic_name, semantic_index;
347       ntt_get_gl_varying_semantic(c, var->data.location,
348                                   &semantic_name, &semantic_index);
349 
350       if (var->data.sample) {
351          sample_loc = TGSI_INTERPOLATE_LOC_SAMPLE;
352       } else if (var->data.centroid) {
353          sample_loc = TGSI_INTERPOLATE_LOC_CENTROID;
354          c->centroid_inputs |= (BITSET_MASK(array_len) <<
355                                 var->data.driver_location);
356       } else {
357          sample_loc = TGSI_INTERPOLATE_LOC_CENTER;
358       }
359 
360       unsigned array_id = 0;
361       if (glsl_type_is_array(type))
362          array_id = ++num_input_arrays;
363 
364       uint32_t usage_mask = ntt_tgsi_var_usage_mask(var);
365 
366       decl = ureg_DECL_fs_input_centroid_layout(c->ureg,
367                                                 semantic_name,
368                                                 semantic_index,
369                                                 interpolation,
370                                                 sample_loc,
371                                                 var->data.driver_location,
372                                                 usage_mask,
373                                                 array_id, array_len);
374 
375       if (semantic_name == TGSI_SEMANTIC_FACE) {
376          struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
377          /* NIR is ~0 front and 0 back, while TGSI is +1 front */
378          ureg_SGE(c->ureg, temp, decl, ureg_imm1f(c->ureg, 0));
379          decl = ureg_src(temp);
380       }
381 
382       for (unsigned i = 0; i < array_len; i++) {
383          c->input_index_map[var->data.driver_location + i] = decl;
384          c->input_index_map[var->data.driver_location + i].Index += i;
385       }
386    }
387 }
388 
389 static int
390 ntt_sort_by_location(const nir_variable *a, const nir_variable *b)
391 {
392    return a->data.location - b->data.location;
393 }
394 
395 /**
396  * Workaround for virglrenderer requiring that TGSI FS output color variables
397  * are declared in order.  Besides, it's a lot nicer to read the TGSI this way.
398  */
399 static void
400 ntt_setup_outputs(struct ntt_compile *c)
401 {
402    if (c->s->info.stage != MESA_SHADER_FRAGMENT)
403       return;
404 
405    nir_sort_variables_with_modes(c->s, ntt_sort_by_location, nir_var_shader_out);
406 
407    nir_foreach_shader_out_variable(var, c->s) {
408       if (var->data.location == FRAG_RESULT_COLOR)
409          ureg_property(c->ureg, TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS, 1);
410 
411       unsigned semantic_name, semantic_index;
412       tgsi_get_gl_frag_result_semantic(var->data.location,
413                                        &semantic_name, &semantic_index);
414 
415       (void)ureg_DECL_output(c->ureg, semantic_name, semantic_index);
416    }
417 }
418 
419 static enum tgsi_texture_type
420 tgsi_texture_type_from_sampler_dim(enum glsl_sampler_dim dim, bool is_array, bool is_shadow)
421 {
422    switch (dim) {
423    case GLSL_SAMPLER_DIM_1D:
424       if (is_shadow)
425          return is_array ? TGSI_TEXTURE_SHADOW1D_ARRAY : TGSI_TEXTURE_SHADOW1D;
426       else
427          return is_array ? TGSI_TEXTURE_1D_ARRAY : TGSI_TEXTURE_1D;
428    case GLSL_SAMPLER_DIM_2D:
429    case GLSL_SAMPLER_DIM_EXTERNAL:
430       if (is_shadow)
431          return is_array ? TGSI_TEXTURE_SHADOW2D_ARRAY : TGSI_TEXTURE_SHADOW2D;
432       else
433          return is_array ? TGSI_TEXTURE_2D_ARRAY : TGSI_TEXTURE_2D;
434    case GLSL_SAMPLER_DIM_3D:
435       return TGSI_TEXTURE_3D;
436    case GLSL_SAMPLER_DIM_CUBE:
437       if (is_shadow)
438          return is_array ? TGSI_TEXTURE_SHADOWCUBE_ARRAY : TGSI_TEXTURE_SHADOWCUBE;
439       else
440          return is_array ? TGSI_TEXTURE_CUBE_ARRAY : TGSI_TEXTURE_CUBE;
441    case GLSL_SAMPLER_DIM_RECT:
442       if (is_shadow)
443          return TGSI_TEXTURE_SHADOWRECT;
444       else
445          return TGSI_TEXTURE_RECT;
446    case GLSL_SAMPLER_DIM_MS:
447       return is_array ? TGSI_TEXTURE_2D_ARRAY_MSAA : TGSI_TEXTURE_2D_MSAA;
448    case GLSL_SAMPLER_DIM_BUF:
449       return TGSI_TEXTURE_BUFFER;
450    default:
451       unreachable("unknown sampler dim");
452    }
453 }
454 
455 static enum tgsi_return_type
456 tgsi_return_type_from_base_type(enum glsl_base_type type)
457 {
458    switch (type) {
459    case GLSL_TYPE_INT:
460       return TGSI_RETURN_TYPE_SINT;
461    case GLSL_TYPE_UINT:
462       return TGSI_RETURN_TYPE_UINT;
463    case GLSL_TYPE_FLOAT:
464      return TGSI_RETURN_TYPE_FLOAT;
465    default:
466       unreachable("unexpected texture type");
467    }
468 }
469 
470 static void
471 ntt_setup_uniforms(struct ntt_compile *c)
472 {
473    nir_foreach_uniform_variable(var, c->s) {
474       int image_count = glsl_type_get_image_count(var->type);
475 
476       if (glsl_type_is_sampler(glsl_without_array(var->type))) {
477          /* Don't use this size for the check for samplers -- arrays of structs
478           * containing samplers should be ignored, and just the separate lowered
479           * sampler uniform decl used.
480           */
481          int size = glsl_type_get_sampler_count(var->type);
482 
483          const struct glsl_type *stype = glsl_without_array(var->type);
484          enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(stype),
485                                                                             glsl_sampler_type_is_array(stype),
486                                                                             glsl_sampler_type_is_shadow(stype));
487          enum tgsi_return_type ret_type = tgsi_return_type_from_base_type(glsl_get_sampler_result_type(stype));
488          for (int i = 0; i < size; i++) {
489             ureg_DECL_sampler_view(c->ureg, var->data.binding + i,
490                target, ret_type, ret_type, ret_type, ret_type);
491             ureg_DECL_sampler(c->ureg, var->data.binding + i);
492          }
493       } else if (image_count) {
494          const struct glsl_type *itype = glsl_without_array(var->type);
495          enum tgsi_texture_type tex_type =
496              tgsi_texture_type_from_sampler_dim(glsl_get_sampler_dim(itype),
497                                                 glsl_sampler_type_is_array(itype), false);
498 
499          for (int i = 0; i < image_count; i++) {
500             c->images[var->data.binding] = ureg_DECL_image(c->ureg,
501                                                            var->data.binding + i,
502                                                            tex_type,
503                                                            var->data.image.format,
504                                                            !(var->data.access & ACCESS_NON_WRITEABLE),
505                                                            false);
506          }
507       } else if (glsl_contains_atomic(var->type)) {
508          uint32_t offset = var->data.offset / 4;
509          uint32_t size = glsl_atomic_size(var->type) / 4;
510          ureg_DECL_hw_atomic(c->ureg, offset, offset + size - 1, var->data.binding, 0);
511       }
512 
513       /* lower_uniforms_to_ubo lowered non-sampler uniforms to UBOs, so CB0
514        * size declaration happens with other UBOs below.
515        */
516    }
517 
518    c->first_ubo = ~0;
519 
520    unsigned ubo_sizes[PIPE_MAX_CONSTANT_BUFFERS] = {0};
521    nir_foreach_variable_with_modes(var, c->s, nir_var_mem_ubo) {
522       int ubo = var->data.driver_location;
523       if (ubo == -1)
524          continue;
525 
526       if (!(ubo == 0 && c->s->info.first_ubo_is_default_ubo))
527          c->first_ubo = MIN2(c->first_ubo, ubo);
528 
529       unsigned size = glsl_get_explicit_size(var->interface_type, false);
530 
531       int array_size = 1;
532       if (glsl_type_is_interface(glsl_without_array(var->type)))
533          array_size = MAX2(1, glsl_array_size(var->type));
534       for (int i = 0; i < array_size; i++) {
535          /* Even if multiple NIR variables are in the same uniform block, their
536           * explicit size is the size of the block.
537           */
538          if (ubo_sizes[ubo + i])
539             assert(ubo_sizes[ubo + i] == size);
540 
541          ubo_sizes[ubo + i] = size;
542       }
543    }
544 
545    for (int i = 0; i < ARRAY_SIZE(ubo_sizes); i++) {
546       if (ubo_sizes[i])
547          ureg_DECL_constant2D(c->ureg, 0, DIV_ROUND_UP(ubo_sizes[i], 16) - 1, i);
548    }
549 
550    for (int i = 0; i < c->s->info.num_ssbos; i++) {
551       /* XXX: nv50 uses the atomic flag to set caching for (lowered) atomic
552        * counters
553        */
554       bool atomic = false;
555       ureg_DECL_buffer(c->ureg, i, atomic);
556    }
557 }
558 
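/* Declares TGSI temporaries (or array temporaries) backing the NIR registers,
 * writing straight to a TGSI output instead when the register's only use is a
 * store_output.
 */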
559 static void
560 ntt_setup_registers(struct ntt_compile *c, struct exec_list *list)
561 {
562    foreach_list_typed(nir_register, nir_reg, node, list) {
563       struct ureg_dst decl;
564       if (nir_reg->num_array_elems == 0) {
565          uint32_t write_mask = BITFIELD_MASK(nir_reg->num_components);
566          if (!ntt_try_store_in_tgsi_output(c, &decl, &nir_reg->uses, &nir_reg->if_uses)) {
567             if (nir_reg->bit_size == 64) {
568                if (nir_reg->num_components > 2) {
569                   fprintf(stderr, "NIR-to-TGSI: error: %d-component NIR r%d\n",
570                         nir_reg->num_components, nir_reg->index);
571                }
572 
573                write_mask = ntt_64bit_write_mask(write_mask);
574             }
575 
576             decl = ureg_writemask(ureg_DECL_temporary(c->ureg), write_mask);
577          }
578       } else {
579          decl = ureg_DECL_array_temporary(c->ureg, nir_reg->num_array_elems,
580                                           true);
581       }
582       c->reg_temp[nir_reg->index] = decl;
583    }
584 }
585 
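/* Declares a TGSI immediate for a load_const.  Non-native-integer drivers get
 * float immediates, and 64-bit values are expanded into pairs of 32-bit words.
 */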
586 static struct ureg_src
587 ntt_get_load_const_src(struct ntt_compile *c, nir_load_const_instr *instr)
588 {
589    int num_components = instr->def.num_components;
590 
591    if (!c->native_integers) {
592       float values[4];
593       assert(instr->def.bit_size == 32);
594       for (int i = 0; i < num_components; i++)
595          values[i] = uif(instr->value[i].u32);
596 
597       return ureg_DECL_immediate(c->ureg, values, num_components);
598    } else {
599       uint32_t values[4];
600 
601       if (instr->def.bit_size == 32) {
602          for (int i = 0; i < num_components; i++)
603             values[i] = instr->value[i].u32;
604       } else {
605          assert(num_components <= 2);
606          for (int i = 0; i < num_components; i++) {
607             values[i * 2 + 0] = instr->value[i].u64 & 0xffffffff;
608             values[i * 2 + 1] = instr->value[i].u64 >> 32;
609          }
610          num_components *= 2;
611       }
612 
613       return ureg_DECL_immediate_uint(c->ureg, values, num_components);
614    }
615 }
616 
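/* Sets up "addr" for use as an indirect-addressing source.  On drivers that
 * can use any register as an address (any_reg_as_address), the value is used
 * directly; otherwise it is copied into one of the two ADDR registers with
 * ARL/UARL.  Uses are refcounted through next_addr_reg and released with
 * ntt_put_reladdr()/ntt_reladdr_dst_put().
 */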
617 static struct ureg_src
618 ntt_reladdr(struct ntt_compile *c, struct ureg_src addr)
619 {
620    if (c->any_reg_as_address) {
621       /* Make sure we're getting the refcounting right even on any_reg
622        * drivers.
623        */
624       c->next_addr_reg++;
625 
626       return ureg_scalar(addr, 0);
627    }
628 
629    assert(c->next_addr_reg < ARRAY_SIZE(c->addr_reg));
630 
631    if (!c->addr_declared[c->next_addr_reg]) {
632       c->addr_reg[c->next_addr_reg] = ureg_writemask(ureg_DECL_address(c->ureg),
633                                                      TGSI_WRITEMASK_X);
634       c->addr_declared[c->next_addr_reg] = true;
635    }
636 
637    if (c->native_integers)
638       ureg_UARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
639    else
640       ureg_ARL(c->ureg, c->addr_reg[c->next_addr_reg], addr);
641    return ureg_scalar(ureg_src(c->addr_reg[c->next_addr_reg++]), 0);
642 }
643 
644 static void
645 ntt_put_reladdr(struct ntt_compile *c)
646 {
647    c->next_addr_reg--;
648    assert(c->next_addr_reg >= 0);
649 }
650 
651 static void
652 ntt_reladdr_dst_put(struct ntt_compile *c, struct ureg_dst dst)
653 {
654    if (c->any_reg_as_address)
655       return;
656 
657    if (dst.Indirect)
658       ntt_put_reladdr(c);
659    if (dst.DimIndirect)
660       ntt_put_reladdr(c);
661 }
662 
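/* Returns the TGSI source for a NIR src: an immediate for load_const SSA
 * defs, the recorded temp for other SSA defs, or the (possibly indirectly
 * addressed) register temporary.
 */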
663 static struct ureg_src
664 ntt_get_src(struct ntt_compile *c, nir_src src)
665 {
666    if (src.is_ssa) {
667       if (src.ssa->parent_instr->type == nir_instr_type_load_const)
668          return ntt_get_load_const_src(c, nir_instr_as_load_const(src.ssa->parent_instr));
669 
670       return c->ssa_temp[src.ssa->index];
671    } else {
672       nir_register *reg = src.reg.reg;
673       struct ureg_dst reg_temp = c->reg_temp[reg->index];
674       reg_temp.Index += src.reg.base_offset;
675 
676       if (src.reg.indirect) {
677          struct ureg_src offset = ntt_get_src(c, *src.reg.indirect);
678          return ureg_src_indirect(ureg_src(reg_temp),
679                                   ntt_reladdr(c, offset));
680       } else {
681          return ureg_src(reg_temp);
682       }
683    }
684 }
685 
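/* Fetches an ALU source, applying the NIR swizzle (doubled up into 32-bit
 * channel pairs for 64-bit sources) and the abs/negate source modifiers.
 */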
686 static struct ureg_src
687 ntt_get_alu_src(struct ntt_compile *c, nir_alu_instr *instr, int i)
688 {
689    nir_alu_src src = instr->src[i];
690    struct ureg_src usrc = ntt_get_src(c, src.src);
691 
692    if (nir_src_bit_size(src.src) == 64) {
693       int chan0 = 0, chan1 = 1;
694       if (nir_op_infos[instr->op].input_sizes[i] == 0) {
695          chan0 = ffs(instr->dest.write_mask) - 1;
696          chan1 = ffs(instr->dest.write_mask & ~(1 << chan0)) - 1;
697          if (chan1 == -1)
698             chan1 = chan0;
699       }
700       usrc = ureg_swizzle(usrc,
701                           src.swizzle[chan0] * 2,
702                           src.swizzle[chan0] * 2 + 1,
703                           src.swizzle[chan1] * 2,
704                           src.swizzle[chan1] * 2 + 1);
705    } else {
706       usrc = ureg_swizzle(usrc,
707                           src.swizzle[0],
708                           src.swizzle[1],
709                           src.swizzle[2],
710                           src.swizzle[3]);
711    }
712 
713    if (src.abs)
714       usrc = ureg_abs(usrc);
715    if (src.negate)
716       usrc = ureg_negate(usrc);
717 
718    return usrc;
719 }
720 
721 /* Reswizzles a source so that the unset channels in the write mask still refer
722  * to one of the channels present in the write mask.
723  */
724 static struct ureg_src
725 ntt_swizzle_for_write_mask(struct ureg_src src, uint32_t write_mask)
726 {
727    assert(write_mask);
728    int first_chan = ffs(write_mask) - 1;
729    return ureg_swizzle(src,
730                        (write_mask & TGSI_WRITEMASK_X) ? TGSI_SWIZZLE_X : first_chan,
731                        (write_mask & TGSI_WRITEMASK_Y) ? TGSI_SWIZZLE_Y : first_chan,
732                        (write_mask & TGSI_WRITEMASK_Z) ? TGSI_SWIZZLE_Z : first_chan,
733                        (write_mask & TGSI_WRITEMASK_W) ? TGSI_SWIZZLE_W : first_chan);
734 }
735 
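/* Allocates the TGSI storage backing an SSA def (a fresh temporary, or the
 * TGSI output it will be stored to) and records the swizzled read source in
 * ssa_temp[].
 */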
736 static struct ureg_dst
737 ntt_get_ssa_def_decl(struct ntt_compile *c, nir_ssa_def *ssa)
738 {
739    uint32_t writemask = BITSET_MASK(ssa->num_components);
740    if (ssa->bit_size == 64)
741       writemask = ntt_64bit_write_mask(writemask);
742 
743    struct ureg_dst dst;
744    if (!ntt_try_store_in_tgsi_output(c, &dst, &ssa->uses, &ssa->if_uses))
745       dst = ureg_DECL_temporary(c->ureg);
746 
747    c->ssa_temp[ssa->index] = ntt_swizzle_for_write_mask(ureg_src(dst), writemask);
748 
749    return ureg_writemask(dst, writemask);
750 }
751 
752 static struct ureg_dst
753 ntt_get_dest_decl(struct ntt_compile *c, nir_dest *dest)
754 {
755    if (dest->is_ssa)
756       return ntt_get_ssa_def_decl(c, &dest->ssa);
757    else
758       return c->reg_temp[dest->reg.reg->index];
759 }
760 
761 static struct ureg_dst
762 ntt_get_dest(struct ntt_compile *c, nir_dest *dest)
763 {
764    struct ureg_dst dst = ntt_get_dest_decl(c, dest);
765 
766    if (!dest->is_ssa) {
767       dst.Index += dest->reg.base_offset;
768 
769       if (dest->reg.indirect) {
770          struct ureg_src offset = ntt_get_src(c, *dest->reg.indirect);
771          dst = ureg_dst_indirect(dst, ntt_reladdr(c, offset));
772       }
773    }
774 
775    return dst;
776 }
777 
778 /* For an SSA dest being populated by a constant src, replace the storage with
779  * a copy of the ureg_src.
780  */
781 static void
782 ntt_store_def(struct ntt_compile *c, nir_ssa_def *def, struct ureg_src src)
783 {
784    if (!src.Indirect && !src.DimIndirect) {
785       switch (src.File) {
786       case TGSI_FILE_IMMEDIATE:
787       case TGSI_FILE_INPUT:
788       case TGSI_FILE_CONSTANT:
789       case TGSI_FILE_SYSTEM_VALUE:
790          c->ssa_temp[def->index] = src;
791          return;
792       }
793    }
794 
795    ureg_MOV(c->ureg, ntt_get_ssa_def_decl(c, def), src);
796 }
797 
798 static void
799 ntt_store(struct ntt_compile *c, nir_dest *dest, struct ureg_src src)
800 {
801    if (dest->is_ssa)
802       ntt_store_def(c, &dest->ssa, src);
803    else {
804       struct ureg_dst dst = ntt_get_dest(c, dest);
805       ureg_MOV(c->ureg, dst, src);
806    }
807 }
808 
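/* TGSI scalar opcodes (RCP, RSQ, EX2, LG2, SIN, COS, POW) consume one source
 * channel and replicate the result, so emit one instruction per enabled
 * write-mask channel with the sources swizzled to that channel.
 */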
809 static void
810 ntt_emit_scalar(struct ntt_compile *c, unsigned tgsi_op,
811                 struct ureg_dst dst,
812                 struct ureg_src src0,
813                 struct ureg_src src1)
814 {
815    unsigned i;
816    int num_src;
817 
818    /* POW is the only 2-operand scalar op. */
819    if (tgsi_op == TGSI_OPCODE_POW) {
820       num_src = 2;
821    } else {
822       num_src = 1;
823       src1 = src0;
824    }
825 
826    for (i = 0; i < 4; i++) {
827       if (dst.WriteMask & (1 << i)) {
828          struct ureg_dst this_dst = dst;
829          struct ureg_src srcs[2] = {
830             ureg_scalar(src0, i),
831             ureg_scalar(src1, i),
832          };
833          this_dst.WriteMask = (1 << i);
834 
835          ureg_insn(c->ureg, tgsi_op, &this_dst, 1, srcs, num_src, false);
836       }
837    }
838 }
839 
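/* Translates a NIR ALU instruction to TGSI, using the op_map table below for
 * the common cases and special-casing conversions, 64-bit ops, and the scalar
 * math opcodes.
 */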
840 static void
841 ntt_emit_alu(struct ntt_compile *c, nir_alu_instr *instr)
842 {
843    struct ureg_src src[4];
844    struct ureg_dst dst;
845    unsigned i;
846    int dst_64 = nir_dest_bit_size(instr->dest.dest) == 64;
847    int src_64 = nir_src_bit_size(instr->src[0].src) == 64;
848    int num_srcs = nir_op_infos[instr->op].num_inputs;
849 
850    assert(num_srcs <= ARRAY_SIZE(src));
851    for (i = 0; i < num_srcs; i++)
852       src[i] = ntt_get_alu_src(c, instr, i);
853    dst = ntt_get_dest(c, &instr->dest.dest);
854 
855    if (instr->dest.saturate)
856       dst.Saturate = true;
857 
858    if (dst_64)
859       dst = ureg_writemask(dst, ntt_64bit_write_mask(instr->dest.write_mask));
860    else
861       dst = ureg_writemask(dst, instr->dest.write_mask);
862 
863    static enum tgsi_opcode op_map[][2] = {
864       [nir_op_mov] = { TGSI_OPCODE_MOV, TGSI_OPCODE_MOV },
865 
866       /* fabs/fneg 32-bit are special-cased below. */
867       [nir_op_fabs] = { 0, TGSI_OPCODE_DABS },
868       [nir_op_fneg] = { 0, TGSI_OPCODE_DNEG },
869 
870       [nir_op_fdot2] = { TGSI_OPCODE_DP2 },
871       [nir_op_fdot3] = { TGSI_OPCODE_DP3 },
872       [nir_op_fdot4] = { TGSI_OPCODE_DP4 },
873       [nir_op_ffloor] = { TGSI_OPCODE_FLR, TGSI_OPCODE_DFLR },
874       [nir_op_ffract] = { TGSI_OPCODE_FRC, TGSI_OPCODE_DFRAC },
875       [nir_op_fceil] = { TGSI_OPCODE_CEIL, TGSI_OPCODE_DCEIL },
876       [nir_op_fround_even] = { TGSI_OPCODE_ROUND, TGSI_OPCODE_DROUND },
877       [nir_op_fdiv] = { TGSI_OPCODE_DIV, TGSI_OPCODE_DDIV },
878       [nir_op_idiv] = { TGSI_OPCODE_IDIV, TGSI_OPCODE_I64DIV },
879       [nir_op_udiv] = { TGSI_OPCODE_UDIV, TGSI_OPCODE_U64DIV },
880 
881       [nir_op_frcp] = { 0, TGSI_OPCODE_DRCP },
882       [nir_op_frsq] = { 0, TGSI_OPCODE_DRSQ },
883       [nir_op_fsqrt] = { 0, TGSI_OPCODE_DSQRT },
884 
885       /* The conversions will have one combination of src and dst bitsize. */
886       [nir_op_f2f32] = { 0, TGSI_OPCODE_D2F },
887       [nir_op_f2f64] = { TGSI_OPCODE_F2D },
888       [nir_op_i2i64] = { TGSI_OPCODE_I2I64 },
889 
890       [nir_op_f2i32] = { TGSI_OPCODE_F2I, TGSI_OPCODE_D2I },
891       [nir_op_f2i64] = { TGSI_OPCODE_F2I64, TGSI_OPCODE_D2I64 },
892       [nir_op_f2u32] = { TGSI_OPCODE_F2U, TGSI_OPCODE_D2U },
893       [nir_op_f2u64] = { TGSI_OPCODE_F2U64, TGSI_OPCODE_D2U64 },
894       [nir_op_i2f32] = { TGSI_OPCODE_I2F, TGSI_OPCODE_I642F },
895       [nir_op_i2f64] = { TGSI_OPCODE_I2D, TGSI_OPCODE_I642D },
896       [nir_op_u2f32] = { TGSI_OPCODE_U2F, TGSI_OPCODE_U642F },
897       [nir_op_u2f64] = { TGSI_OPCODE_U2D, TGSI_OPCODE_U642D },
898 
899       [nir_op_slt] = { TGSI_OPCODE_SLT },
900       [nir_op_sge] = { TGSI_OPCODE_SGE },
901       [nir_op_seq] = { TGSI_OPCODE_SEQ },
902       [nir_op_sne] = { TGSI_OPCODE_SNE },
903 
904       [nir_op_flt32] = { TGSI_OPCODE_FSLT, TGSI_OPCODE_DSLT },
905       [nir_op_fge32] = { TGSI_OPCODE_FSGE, TGSI_OPCODE_DSGE },
906       [nir_op_feq32] = { TGSI_OPCODE_FSEQ, TGSI_OPCODE_DSEQ },
907       [nir_op_fneu32] = { TGSI_OPCODE_FSNE, TGSI_OPCODE_DSNE },
908 
909       [nir_op_ilt32] = { TGSI_OPCODE_ISLT, TGSI_OPCODE_I64SLT },
910       [nir_op_ige32] = { TGSI_OPCODE_ISGE, TGSI_OPCODE_I64SGE },
911       [nir_op_ieq32] = { TGSI_OPCODE_USEQ, TGSI_OPCODE_U64SEQ },
912       [nir_op_ine32] = { TGSI_OPCODE_USNE, TGSI_OPCODE_U64SNE },
913 
914       [nir_op_ult32] = { TGSI_OPCODE_USLT, TGSI_OPCODE_U64SLT },
915       [nir_op_uge32] = { TGSI_OPCODE_USGE, TGSI_OPCODE_U64SGE },
916 
917       [nir_op_iabs] = { TGSI_OPCODE_IABS, TGSI_OPCODE_I64ABS },
918       [nir_op_ineg] = { TGSI_OPCODE_INEG, TGSI_OPCODE_I64NEG },
919       [nir_op_fsign] = { TGSI_OPCODE_SSG },
920       [nir_op_isign] = { TGSI_OPCODE_ISSG },
921       [nir_op_ftrunc] = { TGSI_OPCODE_TRUNC, TGSI_OPCODE_DTRUNC },
922       [nir_op_fddx] = { TGSI_OPCODE_DDX },
923       [nir_op_fddy] = { TGSI_OPCODE_DDY },
924       [nir_op_fddx_coarse] = { TGSI_OPCODE_DDX },
925       [nir_op_fddy_coarse] = { TGSI_OPCODE_DDY },
926       [nir_op_fddx_fine] = { TGSI_OPCODE_DDX_FINE },
927       [nir_op_fddy_fine] = { TGSI_OPCODE_DDY_FINE },
928       [nir_op_pack_half_2x16] = { TGSI_OPCODE_PK2H },
929       [nir_op_unpack_half_2x16] = { TGSI_OPCODE_UP2H },
930       [nir_op_ibitfield_extract] = { TGSI_OPCODE_IBFE },
931       [nir_op_ubitfield_extract] = { TGSI_OPCODE_UBFE },
932       [nir_op_bitfield_insert] = { TGSI_OPCODE_BFI },
933       [nir_op_bitfield_reverse] = { TGSI_OPCODE_BREV },
934       [nir_op_bit_count] = { TGSI_OPCODE_POPC },
935       [nir_op_ifind_msb] = { TGSI_OPCODE_IMSB },
936       [nir_op_ufind_msb] = { TGSI_OPCODE_UMSB },
937       [nir_op_find_lsb] = { TGSI_OPCODE_LSB },
938       [nir_op_fadd] = { TGSI_OPCODE_ADD, TGSI_OPCODE_DADD },
939       [nir_op_iadd] = { TGSI_OPCODE_UADD, TGSI_OPCODE_U64ADD },
940       [nir_op_fmul] = { TGSI_OPCODE_MUL, TGSI_OPCODE_DMUL },
941       [nir_op_imul] = { TGSI_OPCODE_UMUL, TGSI_OPCODE_U64MUL },
942       [nir_op_imod] = { TGSI_OPCODE_MOD, TGSI_OPCODE_I64MOD },
943       [nir_op_umod] = { TGSI_OPCODE_UMOD, TGSI_OPCODE_U64MOD },
944       [nir_op_imul_high] = { TGSI_OPCODE_IMUL_HI },
945       [nir_op_umul_high] = { TGSI_OPCODE_UMUL_HI },
946       [nir_op_ishl] = { TGSI_OPCODE_SHL, TGSI_OPCODE_U64SHL },
947       [nir_op_ishr] = { TGSI_OPCODE_ISHR, TGSI_OPCODE_I64SHR },
948       [nir_op_ushr] = { TGSI_OPCODE_USHR, TGSI_OPCODE_U64SHR },
949 
950       /* These bitwise ops don't care about 32 vs 64 types, so they have the
951        * same TGSI op.
952        */
953       [nir_op_inot] = { TGSI_OPCODE_NOT, TGSI_OPCODE_NOT },
954       [nir_op_iand] = { TGSI_OPCODE_AND, TGSI_OPCODE_AND },
955       [nir_op_ior] = { TGSI_OPCODE_OR, TGSI_OPCODE_OR },
956       [nir_op_ixor] = { TGSI_OPCODE_XOR, TGSI_OPCODE_XOR },
957 
958       [nir_op_fmin] = { TGSI_OPCODE_MIN, TGSI_OPCODE_DMIN },
959       [nir_op_imin] = { TGSI_OPCODE_IMIN, TGSI_OPCODE_I64MIN },
960       [nir_op_umin] = { TGSI_OPCODE_UMIN, TGSI_OPCODE_U64MIN },
961       [nir_op_fmax] = { TGSI_OPCODE_MAX, TGSI_OPCODE_DMAX },
962       [nir_op_imax] = { TGSI_OPCODE_IMAX, TGSI_OPCODE_I64MAX },
963       [nir_op_umax] = { TGSI_OPCODE_UMAX, TGSI_OPCODE_U64MAX },
964       [nir_op_ffma] = { TGSI_OPCODE_MAD, TGSI_OPCODE_DMAD },
965       [nir_op_ldexp] = { TGSI_OPCODE_LDEXP, 0 },
966    };
967 
968    /* TGSI's 64 bit compares storing to 32-bit are weird and write .xz instead
969     * of .xy.  Store to a temp and move it to the real dst.
970     */
971    bool tgsi_64bit_compare = src_64 && !dst_64 &&
972       (num_srcs == 2 ||
973         nir_op_infos[instr->op].output_type == nir_type_bool32) &&
974       (dst.WriteMask != TGSI_WRITEMASK_X);
975 
976    /* TGSI 64bit-to-32-bit conversions only generate results in the .xy
977     * channels and will need to get fixed up.
978     */
979    bool tgsi_64bit_downconvert = (src_64 && !dst_64 &&
980                                   num_srcs == 1 && !tgsi_64bit_compare &&
981                                   (dst.WriteMask & ~TGSI_WRITEMASK_XY));
982 
983    struct ureg_dst real_dst = ureg_dst_undef();
984    if (tgsi_64bit_compare || tgsi_64bit_downconvert) {
985       real_dst = dst;
986       dst = ureg_DECL_temporary(c->ureg);
987    }
988 
989    bool table_op64 = src_64;
990    if (instr->op < ARRAY_SIZE(op_map) && op_map[instr->op][table_op64] != 0) {
991       /* The normal path for NIR to TGSI ALU op translation */
992       ureg_insn(c->ureg, op_map[instr->op][table_op64],
993                 &dst, 1, src, num_srcs, false);
994    } else {
995       /* Special cases for NIR to TGSI ALU op translation. */
996 
997       /* TODO: Use something like the ntt_store() path for the MOV calls so we
998        * don't emit extra MOVs for swizzles/srcmods of inputs/const/imm.
999        */
1000 
1001       switch (instr->op) {
1002       case nir_op_u2u64:
1003          ureg_AND(c->ureg, dst, ureg_swizzle(src[0],
1004                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1005                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1006                   ureg_imm4u(c->ureg, ~0, 0, ~0, 0));
1007          break;
1008 
1009       case nir_op_i2i32:
1010       case nir_op_u2u32:
1011          assert(src_64);
1012          ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
1013                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1014                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_X));
1015          break;
1016 
1017       case nir_op_fabs:
1018          ureg_MOV(c->ureg, dst, ureg_abs(src[0]));
1019          break;
1020 
1021       case nir_op_fsat:
1022          if (dst_64) {
1023             ureg_MIN(c->ureg, dst, src[0], ntt_64bit_1f(c));
1024             ureg_MAX(c->ureg, dst, ureg_src(dst), ureg_imm1u(c->ureg, 0));
1025          } else {
1026             ureg_MOV(c->ureg, ureg_saturate(dst), src[0]);
1027          }
1028          break;
1029 
1030       case nir_op_fneg:
1031          ureg_MOV(c->ureg, dst, ureg_negate(src[0]));
1032          break;
1033 
1034          /* NOTE: TGSI 32-bit math ops have the old "one source channel
1035           * replicated to all dst channels" behavior, while 64 is normal mapping
1036           * of src channels to dst.
1037           */
1038       case nir_op_frcp:
1039          assert(!dst_64);
1040          ntt_emit_scalar(c, TGSI_OPCODE_RCP, dst, src[0], src[1]);
1041          break;
1042 
1043       case nir_op_frsq:
1044          assert(!dst_64);
1045          ntt_emit_scalar(c, TGSI_OPCODE_RSQ, dst, src[0], src[1]);
1046          break;
1047 
1048       case nir_op_fsqrt:
1049          assert(!dst_64);
1050          ntt_emit_scalar(c, TGSI_OPCODE_SQRT, dst, src[0], src[1]);
1051          break;
1052 
1053       case nir_op_fexp2:
1054          assert(!dst_64);
1055          ntt_emit_scalar(c, TGSI_OPCODE_EX2, dst, src[0], src[1]);
1056          break;
1057 
1058       case nir_op_flog2:
1059          assert(!dst_64);
1060          ntt_emit_scalar(c, TGSI_OPCODE_LG2, dst, src[0], src[1]);
1061          break;
1062 
1063       case nir_op_b2f32:
1064          ureg_AND(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 1.0));
1065          break;
1066 
1067       case nir_op_b2f64:
1068          ureg_AND(c->ureg, dst,
1069                   ureg_swizzle(src[0],
1070                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1071                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1072                   ntt_64bit_1f(c));
1073          break;
1074 
1075       case nir_op_f2b32:
1076          if (src_64)
1077             ureg_DSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
1078          else
1079             ureg_FSNE(c->ureg, dst, src[0], ureg_imm1f(c->ureg, 0));
1080          break;
1081 
1082       case nir_op_i2b32:
1083          if (src_64) {
1084             ureg_U64SNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
1085          } else
1086             ureg_USNE(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 0));
1087          break;
1088 
1089       case nir_op_b2i32:
1090          ureg_AND(c->ureg, dst, src[0], ureg_imm1u(c->ureg, 1));
1091          break;
1092 
1093       case nir_op_b2i64:
1094          ureg_AND(c->ureg, dst,
1095                   ureg_swizzle(src[0],
1096                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1097                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1098                   ureg_imm4u(c->ureg, 1, 0, 1, 0));
1099          break;
1100 
1101       case nir_op_fsin:
1102          ntt_emit_scalar(c, TGSI_OPCODE_SIN, dst, src[0], src[1]);
1103          break;
1104 
1105       case nir_op_fcos:
1106          ntt_emit_scalar(c, TGSI_OPCODE_COS, dst, src[0], src[1]);
1107          break;
1108 
1109       case nir_op_fsub:
1110          assert(!dst_64);
1111          ureg_ADD(c->ureg, dst, src[0], ureg_negate(src[1]));
1112          break;
1113 
1114       case nir_op_isub:
1115          assert(!dst_64);
1116          ureg_UADD(c->ureg, dst, src[0], ureg_negate(src[1]));
1117          break;
1118 
1119       case nir_op_fmod:
1120          unreachable("should be handled by .lower_fmod = true");
1121          break;
1122 
1123       case nir_op_fpow:
1124          ntt_emit_scalar(c, TGSI_OPCODE_POW, dst, src[0], src[1]);
1125          break;
1126 
1127       case nir_op_flrp:
1128          ureg_LRP(c->ureg, dst, src[2], src[1], src[0]);
1129          break;
1130 
1131       case nir_op_pack_64_2x32_split:
1132          ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_XZ),
1133                   ureg_swizzle(src[0],
1134                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1135                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1136          ureg_MOV(c->ureg, ureg_writemask(dst, TGSI_WRITEMASK_YW),
1137                   ureg_swizzle(src[1],
1138                                TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1139                                TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1140          break;
1141 
1142       case nir_op_unpack_64_2x32_split_x:
1143          ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
1144                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z,
1145                                              TGSI_SWIZZLE_X, TGSI_SWIZZLE_Z));
1146          break;
1147 
1148       case nir_op_unpack_64_2x32_split_y:
1149          ureg_MOV(c->ureg, dst, ureg_swizzle(src[0],
1150                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W,
1151                                              TGSI_SWIZZLE_Y, TGSI_SWIZZLE_W));
1152          break;
1153 
1154       case nir_op_b32csel:
1155          if (nir_src_bit_size(instr->src[1].src) == 64) {
1156             ureg_UCMP(c->ureg, dst, ureg_swizzle(src[0],
1157                                                  TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1158                                                  TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y),
1159                       src[1], src[2]);
1160          } else {
1161             ureg_UCMP(c->ureg, dst, src[0], src[1], src[2]);
1162          }
1163          break;
1164 
1165       case nir_op_fcsel:
1166          /* NIR is src0 != 0 ? src1 : src2.
1167           * TGSI is src0 < 0 ? src1 : src2.
1168           *
1169           * However, fcsel so far as I can find only appears on bools-as-floats
1170           * (1.0 or 0.0), so we can just negate it for the TGSI op.  It's
1171           * important to not have an abs here, as i915g has to make extra
1172           * instructions to do the abs.
1173           */
1174          ureg_CMP(c->ureg, dst, ureg_negate(src[0]), src[1], src[2]);
1175          break;
1176 
1177          /* It would be nice if we could get this left as scalar in NIR, since
1178           * the TGSI op is scalar.
1179           */
1180       case nir_op_frexp_sig:
1181       case nir_op_frexp_exp: {
1182          assert(src_64);
1183          struct ureg_dst temp = ureg_DECL_temporary(c->ureg);
1184 
1185          for (int chan = 0; chan < 2; chan++) {
1186             int wm = 1 << chan;
1187 
1188             if (!(instr->dest.write_mask & wm))
1189                continue;
1190 
1191             struct ureg_dst dsts[2] = { temp, temp };
1192             if (instr->op == nir_op_frexp_sig) {
1193                dsts[0] = ureg_writemask(dst, ntt_64bit_write_mask(wm));
1194             } else {
1195                dsts[1] = ureg_writemask(dst, wm);
1196             }
1197 
1198             struct ureg_src chan_src = ureg_swizzle(src[0],
1199                                                     chan * 2, chan * 2 + 1,
1200                                                     chan * 2, chan * 2 + 1);
1201 
1202             ureg_insn(c->ureg, TGSI_OPCODE_DFRACEXP,
1203                       dsts, 2,
1204                       &chan_src, 1, false);
1205          }
1206 
1207          ureg_release_temporary(c->ureg, temp);
1208          break;
1209       }
1210 
1211       case nir_op_ldexp:
1212          assert(dst_64); /* 32bit handled in table. */
1213          ureg_DLDEXP(c->ureg, dst, src[0],
1214                      ureg_swizzle(src[1],
1215                                   TGSI_SWIZZLE_X, TGSI_SWIZZLE_X,
1216                                   TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Y));
1217          break;
1218 
1219       case nir_op_vec4:
1220       case nir_op_vec3:
1221       case nir_op_vec2:
1222          unreachable("covered by nir_lower_vec_to_movs()");
1223 
1224       default:
1225          fprintf(stderr, "Unknown NIR opcode: %s\n", nir_op_infos[instr->op].name);
1226          unreachable("Unknown NIR opcode");
1227       }
1228    }
1229 
1230    /* 64-bit op fixup movs */
1231    if (!ureg_dst_is_undef(real_dst)) {
1232       if (tgsi_64bit_compare) {
1233          ureg_MOV(c->ureg, real_dst,
1234                   ureg_swizzle(ureg_src(dst), 0, 2, 0, 2));
1235       } else {
1236          assert(tgsi_64bit_downconvert);
1237          uint8_t swizzle[] = {0, 0, 0, 0};
1238          uint32_t second_bit = real_dst.WriteMask & ~(1 << (ffs(real_dst.WriteMask) - 1));
1239          if (second_bit)
1240             swizzle[ffs(second_bit) - 1] = 1;
1241          ureg_MOV(c->ureg, real_dst, ureg_swizzle(ureg_src(dst),
1242                                                   swizzle[0],
1243                                                   swizzle[1],
1244                                                   swizzle[2],
1245                                                   swizzle[3]));
1246       }
1247       ureg_release_temporary(c->ureg, dst);
1248    }
1249 }
1250 
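/* Applies a NIR src as an offset to a ureg_src: folded into .Index when the
 * src is constant, otherwise turned into indirect addressing via ntt_reladdr().
 */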
1251 static struct ureg_src
1252 ntt_ureg_src_indirect(struct ntt_compile *c, struct ureg_src usrc,
1253                       nir_src src)
1254 {
1255    if (nir_src_is_const(src)) {
1256       usrc.Index += ntt_src_as_uint(c, src);
1257       return usrc;
1258    } else {
1259       return ureg_src_indirect(usrc, ntt_reladdr(c, ntt_get_src(c, src)));
1260    }
1261 }
1262 
1263 static struct ureg_dst
1264 ntt_ureg_dst_indirect(struct ntt_compile *c, struct ureg_dst dst,
1265                       nir_src src)
1266 {
1267    if (nir_src_is_const(src)) {
1268       dst.Index += ntt_src_as_uint(c, src);
1269       return dst;
1270    } else {
1271       return ureg_dst_indirect(dst, ntt_reladdr(c, ntt_get_src(c, src)));
1272    }
1273 }
1274 
1275 static struct ureg_src
1276 ntt_ureg_src_dimension_indirect(struct ntt_compile *c, struct ureg_src usrc,
1277                          nir_src src)
1278 {
1279    if (nir_src_is_const(src)) {
1280       return ureg_src_dimension(usrc, ntt_src_as_uint(c, src));
1281    }
1282    else
1283    {
1284       return ureg_src_dimension_indirect(usrc,
1285                                          ntt_reladdr(c, ntt_get_src(c, src)),
1286                                          0);
1287    }
1288 }
1289 
1290 static struct ureg_dst
1291 ntt_ureg_dst_dimension_indirect(struct ntt_compile *c, struct ureg_dst udst,
1292                                 nir_src src)
1293 {
1294    if (nir_src_is_const(src)) {
1295       return ureg_dst_dimension(udst, ntt_src_as_uint(c, src));
1296    } else {
1297       return ureg_dst_dimension_indirect(udst,
1298                                          ntt_reladdr(c, ntt_get_src(c, src)),
1299                                          0);
1300    }
1301 }
1302 /* Some load operations in NIR will have a fractional offset that we need to
1303  * swizzle down before storing to the result register.
1304  */
1305 static struct ureg_src
1306 ntt_shift_by_frac(struct ureg_src src, unsigned frac, unsigned num_components)
1307 {
1308    return ureg_swizzle(src,
1309                        frac,
1310                        frac + MIN2(num_components - 1, 1),
1311                        frac + MIN2(num_components - 1, 2),
1312                        frac + MIN2(num_components - 1, 3));
1313 }
1314 
1315 
1316 static void
1317 ntt_emit_load_ubo(struct ntt_compile *c, nir_intrinsic_instr *instr)
1318 {
1319    int bit_size = nir_dest_bit_size(instr->dest);
1320    assert(bit_size == 32 || instr->num_components <= 2);
1321 
1322    struct ureg_src src = ureg_src_register(TGSI_FILE_CONSTANT, 0);
1323 
1324    struct ureg_dst addr_temp = ureg_dst_undef();
1325 
1326    if (nir_src_is_const(instr->src[0])) {
1327       src = ureg_src_dimension(src, ntt_src_as_uint(c, instr->src[0]));
1328    } else {
1329       /* virglrenderer requires that indirect UBO references have the UBO
1330        * array's base index in the Index field, not added to the indirect
1331        * address.
1332        *
1333        * Many nir intrinsics have a base address const value for the start of
1334        * their array indirection, but load_ubo doesn't.  We fake it by
1335        * subtracting it off here.
1336        */
1337       addr_temp = ureg_DECL_temporary(c->ureg);
1338       ureg_UADD(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, -c->first_ubo));
1339       src = ureg_src_dimension_indirect(src,
1340                                          ntt_reladdr(c, ureg_src(addr_temp)),
1341                                          c->first_ubo);
1342    }
1343 
1344    if (instr->intrinsic == nir_intrinsic_load_ubo_vec4) {
1345       /* !PIPE_CAP_LOAD_CONSTBUF: Just emit it as a vec4 reference to the const
1346        * file.
1347        */
1348 
1349       if (nir_src_is_const(instr->src[1])) {
1350          src.Index += ntt_src_as_uint(c, instr->src[1]);
1351       } else {
1352          src = ureg_src_indirect(src, ntt_reladdr(c, ntt_get_src(c, instr->src[1])));
1353       }
1354 
1355       int start_component = nir_intrinsic_component(instr);
1356       if (bit_size == 64)
1357          start_component *= 2;
1358 
1359       src = ntt_shift_by_frac(src, start_component,
1360                               instr->num_components * bit_size / 32);
1361 
1362       ntt_store(c, &instr->dest, src);
1363    } else {
1364       /* PIPE_CAP_LOAD_CONSTBUF: Not necessarily vec4 aligned, emit a
1365        * TGSI_OPCODE_LOAD instruction from the const file.
1366        */
1367       struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
1368       struct ureg_src srcs[2] = {
1369           src,
1370           ntt_get_src(c, instr->src[1]),
1371       };
1372       ureg_memory_insn(c->ureg, TGSI_OPCODE_LOAD,
1373                        &dst, 1,
1374                        srcs, ARRAY_SIZE(srcs),
1375                        0 /* qualifier */,
1376                        0 /* tex target */,
1377                        0 /* format: unused */
1378       );
1379    }
1380 
1381    ureg_release_temporary(c->ureg, addr_temp);
1382 }
1383 
1384 static unsigned
1385 ntt_get_access_qualifier(nir_intrinsic_instr *instr)
1386 {
1387    enum gl_access_qualifier access = nir_intrinsic_access(instr);
1388    unsigned qualifier = 0;
1389 
1390    if (access & ACCESS_COHERENT)
1391       qualifier |= TGSI_MEMORY_COHERENT;
1392    if (access & ACCESS_VOLATILE)
1393       qualifier |= TGSI_MEMORY_VOLATILE;
1394    if (access & ACCESS_RESTRICT)
1395       qualifier |= TGSI_MEMORY_RESTRICT;
1396 
1397    return qualifier;
1398 }
1399 
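/* Emits TGSI memory opcodes for SSBO, shared-memory, and HW atomic counter
 * intrinsics: picks the memory resource register (with any indirect
 * addressing), gathers the offset/data operands, and maps the NIR intrinsic
 * to the matching TGSI opcode.
 */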
1400 static void
1401 ntt_emit_mem(struct ntt_compile *c, nir_intrinsic_instr *instr,
1402              nir_variable_mode mode)
1403 {
1404    bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
1405                     instr->intrinsic == nir_intrinsic_store_shared);
1406    bool is_load = (instr->intrinsic == nir_intrinsic_atomic_counter_read ||
1407                     instr->intrinsic == nir_intrinsic_load_ssbo ||
1408                     instr->intrinsic == nir_intrinsic_load_shared);
1409    unsigned opcode;
1410    struct ureg_src src[4];
1411    int num_src = 0;
1412    int nir_src;
1413    struct ureg_dst addr_temp = ureg_dst_undef();
1414 
1415    struct ureg_src memory;
1416    switch (mode) {
1417    case nir_var_mem_ssbo:
1418       memory = ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_BUFFER, 0),
1419                                      instr->src[is_store ? 1 : 0]);
1420       nir_src = 1;
1421       break;
1422    case nir_var_mem_shared:
1423       memory = ureg_src_register(TGSI_FILE_MEMORY, 0);
1424       nir_src = 0;
1425       break;
1426    case nir_var_uniform: { /* HW atomic buffers */
1427       memory = ureg_src_register(TGSI_FILE_HW_ATOMIC, 0);
1428       /* ntt_ureg_src_indirect, except dividing by 4 */
1429       if (nir_src_is_const(instr->src[0])) {
1430          memory.Index += nir_src_as_uint(instr->src[0]) / 4;
1431       } else {
1432          addr_temp = ureg_DECL_temporary(c->ureg);
1433          ureg_USHR(c->ureg, addr_temp, ntt_get_src(c, instr->src[0]), ureg_imm1i(c->ureg, 2));
1434          memory = ureg_src_indirect(memory, ntt_reladdr(c, ureg_src(addr_temp)));
1435       }
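      /* The NIR counter offset is in bytes, while TGSI_FILE_HW_ATOMIC is
       * indexed per 4-byte counter, hence the divide by four above (USHR by 2
       * in the indirect case).
       */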
1436       memory = ureg_src_dimension(memory, nir_intrinsic_base(instr));
1437       nir_src = 0;
1438       break;
1439    }
1440 
1441    default:
1442       unreachable("unknown memory type");
1443    }
1444 
1445    if (is_store) {
1446       src[num_src++] = ntt_get_src(c, instr->src[nir_src + 1]); /* offset */
1447       src[num_src++] = ntt_get_src(c, instr->src[0]); /* value */
1448    } else {
1449       src[num_src++] = memory;
1450       if (instr->intrinsic != nir_intrinsic_get_ssbo_size) {
1451          src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* offset */
1452          switch (instr->intrinsic) {
1453          case nir_intrinsic_atomic_counter_inc:
1454             src[num_src++] = ureg_imm1i(c->ureg, 1);
1455             break;
1456          case nir_intrinsic_atomic_counter_post_dec:
1457             src[num_src++] = ureg_imm1i(c->ureg, -1);
1458             break;
1459          default:
1460             if (!is_load)
1461                src[num_src++] = ntt_get_src(c, instr->src[nir_src++]); /* value */
1462             break;
1463          }
1464       }
1465    }
1466 
1467 
1468    switch (instr->intrinsic) {
1469    case nir_intrinsic_atomic_counter_add:
1470    case nir_intrinsic_atomic_counter_inc:
1471    case nir_intrinsic_atomic_counter_post_dec:
1472    case nir_intrinsic_ssbo_atomic_add:
1473    case nir_intrinsic_shared_atomic_add:
1474       opcode = TGSI_OPCODE_ATOMUADD;
1475       break;
1476    case nir_intrinsic_ssbo_atomic_fadd:
1477    case nir_intrinsic_shared_atomic_fadd:
1478       opcode = TGSI_OPCODE_ATOMFADD;
1479       break;
1480    case nir_intrinsic_atomic_counter_min:
1481    case nir_intrinsic_ssbo_atomic_imin:
1482    case nir_intrinsic_shared_atomic_imin:
1483       opcode = TGSI_OPCODE_ATOMIMIN;
1484       break;
1485    case nir_intrinsic_atomic_counter_max:
1486    case nir_intrinsic_ssbo_atomic_imax:
1487    case nir_intrinsic_shared_atomic_imax:
1488       opcode = TGSI_OPCODE_ATOMIMAX;
1489       break;
1490    case nir_intrinsic_ssbo_atomic_umin:
1491    case nir_intrinsic_shared_atomic_umin:
1492       opcode = TGSI_OPCODE_ATOMUMIN;
1493       break;
1494    case nir_intrinsic_ssbo_atomic_umax:
1495    case nir_intrinsic_shared_atomic_umax:
1496       opcode = TGSI_OPCODE_ATOMUMAX;
1497       break;
1498    case nir_intrinsic_atomic_counter_and:
1499    case nir_intrinsic_ssbo_atomic_and:
1500    case nir_intrinsic_shared_atomic_and:
1501       opcode = TGSI_OPCODE_ATOMAND;
1502       break;
1503    case nir_intrinsic_atomic_counter_or:
1504    case nir_intrinsic_ssbo_atomic_or:
1505    case nir_intrinsic_shared_atomic_or:
1506       opcode = TGSI_OPCODE_ATOMOR;
1507       break;
1508    case nir_intrinsic_atomic_counter_xor:
1509    case nir_intrinsic_ssbo_atomic_xor:
1510    case nir_intrinsic_shared_atomic_xor:
1511       opcode = TGSI_OPCODE_ATOMXOR;
1512       break;
1513    case nir_intrinsic_atomic_counter_exchange:
1514    case nir_intrinsic_ssbo_atomic_exchange:
1515    case nir_intrinsic_shared_atomic_exchange:
1516       opcode = TGSI_OPCODE_ATOMXCHG;
1517       break;
1518    case nir_intrinsic_atomic_counter_comp_swap:
1519    case nir_intrinsic_ssbo_atomic_comp_swap:
1520    case nir_intrinsic_shared_atomic_comp_swap:
1521       opcode = TGSI_OPCODE_ATOMCAS;
1522       src[num_src++] = ntt_get_src(c, instr->src[nir_src++]);
1523       break;
1524    case nir_intrinsic_atomic_counter_read:
1525    case nir_intrinsic_load_ssbo:
1526    case nir_intrinsic_load_shared:
1527       opcode = TGSI_OPCODE_LOAD;
1528       break;
1529    case nir_intrinsic_store_ssbo:
1530    case nir_intrinsic_store_shared:
1531       opcode = TGSI_OPCODE_STORE;
1532       break;
1533    case nir_intrinsic_get_ssbo_size:
1534       opcode = TGSI_OPCODE_RESQ;
1535       break;
1536    default:
1537       unreachable("unknown memory op");
1538    }
1539 
1540    unsigned qualifier = 0;
1541    if (mode == nir_var_mem_ssbo &&
1542        instr->intrinsic != nir_intrinsic_get_ssbo_size) {
1543       qualifier = ntt_get_access_qualifier(instr);
1544    }
1545 
1546    struct ureg_dst dst;
1547    if (is_store) {
1548       dst = ureg_dst(memory);
1549 
1550       unsigned write_mask = nir_intrinsic_write_mask(instr);
1551       if (nir_src_bit_size(instr->src[0]) == 64)
1552          write_mask = ntt_64bit_write_mask(write_mask);
1553       dst = ureg_writemask(dst, write_mask);
1554    } else {
1555       dst = ntt_get_dest(c, &instr->dest);
1556    }
1557 
1558    ureg_memory_insn(c->ureg, opcode,
1559                     &dst, 1,
1560                     src, num_src,
1561                     qualifier,
1562                     TGSI_TEXTURE_BUFFER,
1563                     0 /* format: unused */);
1564 
1565    ureg_release_temporary(c->ureg, addr_temp);
1566 }
1567 
1568 static void
1569 ntt_emit_image_load_store(struct ntt_compile *c, nir_intrinsic_instr *instr)
1570 {
1571    unsigned op;
1572    struct ureg_src srcs[4];
1573    int num_src = 0;
1574    enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
1575    bool is_array = nir_intrinsic_image_array(instr);
1576 
1577    struct ureg_dst temp = ureg_dst_undef();
1578 
1579    enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(dim, is_array, false);
1580 
1581    struct ureg_src resource =
1582       ntt_ureg_src_indirect(c, ureg_src_register(TGSI_FILE_IMAGE, 0),
1583                             instr->src[0]);
1584 
1585    struct ureg_dst dst;
1586    if (instr->intrinsic == nir_intrinsic_image_store) {
1587       dst = ureg_dst(resource);
1588    } else {
1589       srcs[num_src++] = resource;
1590       dst = ntt_get_dest(c, &instr->dest);
1591    }
1592 
1593    if (instr->intrinsic != nir_intrinsic_image_size) {
1594       struct ureg_src coord = ntt_get_src(c, instr->src[1]);
1595 
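      /* For multisample images, TGSI wants the sample index in the channel
       * right after the coordinates (.z for 2D MS, .w for 2D MS arrays), so
       * build a coordinate copy with src[2].x spliced into that channel.
       */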
1596       if (dim == GLSL_SAMPLER_DIM_MS) {
1597          temp = ureg_DECL_temporary(c->ureg);
1598          ureg_MOV(c->ureg, temp, coord);
1599          ureg_MOV(c->ureg, ureg_writemask(temp, 1 << (is_array ? 3 : 2)),
1600                   ureg_scalar(ntt_get_src(c, instr->src[2]), TGSI_SWIZZLE_X));
1601          coord = ureg_src(temp);
1602       }
1603       srcs[num_src++] = coord;
1604 
1605       if (instr->intrinsic != nir_intrinsic_image_load) {
1606          srcs[num_src++] = ntt_get_src(c, instr->src[3]); /* data */
1607          if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
1608             srcs[num_src++] = ntt_get_src(c, instr->src[4]); /* data2 */
1609       }
1610    }
1611 
1612    switch (instr->intrinsic) {
1613    case nir_intrinsic_image_load:
1614       op = TGSI_OPCODE_LOAD;
1615       break;
1616    case nir_intrinsic_image_store:
1617       op = TGSI_OPCODE_STORE;
1618       break;
1619    case nir_intrinsic_image_size:
1620       op = TGSI_OPCODE_RESQ;
1621       break;
1622    case nir_intrinsic_image_atomic_add:
1623       op = TGSI_OPCODE_ATOMUADD;
1624       break;
1625    case nir_intrinsic_image_atomic_fadd:
1626       op = TGSI_OPCODE_ATOMFADD;
1627       break;
1628    case nir_intrinsic_image_atomic_imin:
1629       op = TGSI_OPCODE_ATOMIMIN;
1630       break;
1631    case nir_intrinsic_image_atomic_umin:
1632       op = TGSI_OPCODE_ATOMUMIN;
1633       break;
1634    case nir_intrinsic_image_atomic_imax:
1635       op = TGSI_OPCODE_ATOMIMAX;
1636       break;
1637    case nir_intrinsic_image_atomic_umax:
1638       op = TGSI_OPCODE_ATOMUMAX;
1639       break;
1640    case nir_intrinsic_image_atomic_and:
1641       op = TGSI_OPCODE_ATOMAND;
1642       break;
1643    case nir_intrinsic_image_atomic_or:
1644       op = TGSI_OPCODE_ATOMOR;
1645       break;
1646    case nir_intrinsic_image_atomic_xor:
1647       op = TGSI_OPCODE_ATOMXOR;
1648       break;
1649    case nir_intrinsic_image_atomic_exchange:
1650       op = TGSI_OPCODE_ATOMXCHG;
1651       break;
1652    case nir_intrinsic_image_atomic_comp_swap:
1653       op = TGSI_OPCODE_ATOMCAS;
1654       break;
1655    default:
1656       unreachable("bad op");
1657    }
1658 
1659    ureg_memory_insn(c->ureg, op, &dst, 1, srcs, num_src,
1660                     ntt_get_access_qualifier(instr),
1661                     target,
1662                     nir_intrinsic_format(instr));
1663 
1664    if (!ureg_dst_is_undef(temp))
1665       ureg_release_temporary(c->ureg, temp);
1666 }
1667 
1668 static void
1669 ntt_emit_load_input(struct ntt_compile *c, nir_intrinsic_instr *instr)
1670 {
1671    uint32_t frac = nir_intrinsic_component(instr);
1672    uint32_t num_components = instr->num_components;
1673    unsigned base = nir_intrinsic_base(instr);
1674    struct ureg_src input;
1675    nir_io_semantics semantics = nir_intrinsic_io_semantics(instr);
1676    bool is_64 = nir_dest_bit_size(instr->dest) == 64;
1677 
1678    if (c->s->info.stage == MESA_SHADER_VERTEX) {
1679       input = ureg_DECL_vs_input(c->ureg, base);
1680       for (int i = 1; i < semantics.num_slots; i++)
1681          ureg_DECL_vs_input(c->ureg, base + i);
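      /* Multi-slot inputs (matrices or 64-bit vectors, for example) occupy
       * consecutive vertex attributes, each of which has to be declared.
       */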
1682    } else if (c->s->info.stage != MESA_SHADER_FRAGMENT) {
1683       unsigned semantic_name, semantic_index;
1684       ntt_get_gl_varying_semantic(c, semantics.location,
1685                                   &semantic_name, &semantic_index);
1686 
1687       /* XXX: ArrayID is used in r600 gs inputs */
1688       uint32_t array_id = 0;
1689 
1690       input = ureg_DECL_input_layout(c->ureg,
1691                                      semantic_name,
1692                                      semantic_index,
1693                                      base,
1694                                      ntt_tgsi_usage_mask(frac,
1695                                                          instr->num_components,
1696                                                          is_64),
1697                                      array_id,
1698                                      semantics.num_slots);
1699    } else {
1700       input = c->input_index_map[base];
1701    }
1702 
1703    if (is_64)
1704       num_components *= 2;
1705 
1706    input = ntt_shift_by_frac(input, frac, num_components);
1707 
1708    switch (instr->intrinsic) {
1709    case nir_intrinsic_load_input:
1710       input = ntt_ureg_src_indirect(c, input, instr->src[0]);
1711       ntt_store(c, &instr->dest, input);
1712       break;
1713 
1714    case nir_intrinsic_load_per_vertex_input:
1715       input = ntt_ureg_src_indirect(c, input, instr->src[1]);
1716       input = ntt_ureg_src_dimension_indirect(c, input, instr->src[0]);
1717       ntt_store(c, &instr->dest, input);
1718       break;
1719 
1720    case nir_intrinsic_load_interpolated_input: {
1721       input = ntt_ureg_src_indirect(c, input, instr->src[1]);
1722 
1723       nir_intrinsic_instr *bary_instr =
1724          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
1725 
1726       switch (bary_instr->intrinsic) {
1727       case nir_intrinsic_load_barycentric_pixel:
1728       case nir_intrinsic_load_barycentric_sample:
1729          /* For these, we know that the barycentric load matches the
1730           * interpolation on the input declaration, so we can use it directly.
1731           */
1732          ntt_store(c, &instr->dest, input);
1733          break;
1734 
1735       case nir_intrinsic_load_barycentric_centroid:
1736          /* If the input was declared centroid, then there's no need to
1737           * emit the extra TGSI interp instruction, we can just read the
1738           * input.
1739           */
1740          if (c->centroid_inputs & (1ull << nir_intrinsic_base(instr))) {
1741             ntt_store(c, &instr->dest, input);
1742          } else {
1743             ureg_INTERP_CENTROID(c->ureg, ntt_get_dest(c, &instr->dest),
1744                                  input);
1745          }
1746          break;
1747 
1748       case nir_intrinsic_load_barycentric_at_sample:
1749          /* We stored the sample in the fake "bary" dest. */
1750          ureg_INTERP_SAMPLE(c->ureg, ntt_get_dest(c, &instr->dest), input,
1751                             ntt_get_src(c, instr->src[0]));
1752          break;
1753 
1754       case nir_intrinsic_load_barycentric_at_offset:
1755          /* We stored the offset in the fake "bary" dest. */
1756          ureg_INTERP_OFFSET(c->ureg, ntt_get_dest(c, &instr->dest), input,
1757                             ntt_get_src(c, instr->src[0]));
1758          break;
1759 
1760       default:
1761          unreachable("bad barycentric interp intrinsic\n");
1762       }
1763       break;
1764    }
1765 
1766    default:
1767       unreachable("bad load input intrinsic\n");
1768    }
1769 }
1770 
1771 static void
1772 ntt_emit_store_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
1773 {
1774    struct ureg_src src = ntt_get_src(c, instr->src[0]);
1775 
1776    if (src.File == TGSI_FILE_OUTPUT) {
1777       /* If our src is the output file, that's an indication that we were able
1778        * to emit the output stores in the generating instructions and we have
1779        * nothing to do here.
1780        */
1781       return;
1782    }
1783 
1784    uint32_t frac;
1785    struct ureg_dst out = ntt_output_decl(c, instr, &frac);
1786 
1787    if (instr->intrinsic == nir_intrinsic_store_per_vertex_output) {
1788       out = ntt_ureg_dst_indirect(c, out, instr->src[2]);
1789       out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[1]);
1790    } else {
1791       out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
1792    }
1793 
1794    uint8_t swizzle[4] = { 0, 0, 0, 0 };
1795    for (int i = frac; i < 4; i++) {
1796       if (out.WriteMask & (1 << i))
1797          swizzle[i] = i - frac;
1798    }
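   /* The value to store starts at channel .x while the output may begin at
    * component "frac"; the swizzle built above shifts the value up into the
    * write-masked channels.
    */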
1799 
1800    src = ureg_swizzle(src, swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1801 
1802    ureg_MOV(c->ureg, out, src);
1803    ntt_reladdr_dst_put(c, out);
1804 }
1805 
1806 static void
1807 ntt_emit_load_output(struct ntt_compile *c, nir_intrinsic_instr *instr)
1808 {
1809    /* ntt_try_store_in_tgsi_output() optimization is not valid if load_output
1810     * is present.
1811     */
1812    assert(c->s->info.stage != MESA_SHADER_VERTEX &&
1813           c->s->info.stage != MESA_SHADER_FRAGMENT);
1814 
1815    uint32_t frac;
1816    struct ureg_dst out = ntt_output_decl(c, instr, &frac);
1817 
1818    if (instr->intrinsic == nir_intrinsic_load_per_vertex_output) {
1819       out = ntt_ureg_dst_indirect(c, out, instr->src[1]);
1820       out = ntt_ureg_dst_dimension_indirect(c, out, instr->src[0]);
1821    } else {
1822       out = ntt_ureg_dst_indirect(c, out, instr->src[0]);
1823    }
1824 
1825    ureg_MOV(c->ureg, ntt_get_dest(c, &instr->dest), ureg_src(out));
1826    ntt_reladdr_dst_put(c, out);
1827 }
1828 
1829 static void
1830 ntt_emit_load_sysval(struct ntt_compile *c, nir_intrinsic_instr *instr)
1831 {
1832    gl_system_value sysval = nir_system_value_from_intrinsic(instr->intrinsic);
1833    enum tgsi_semantic semantic = tgsi_get_sysval_semantic(sysval);
1834    struct ureg_src sv = ureg_DECL_system_value(c->ureg, semantic, 0);
1835 
1836    /* virglrenderer doesn't like references to channels of the sysval that
1837     * aren't defined, even if they aren't really read.  (GLSL compile fails on
1838     * gl_NumWorkGroups.w, for example).
1839     */
1840    uint32_t write_mask = BITSET_MASK(nir_dest_num_components(instr->dest));
1841    sv = ntt_swizzle_for_write_mask(sv, write_mask);
1842 
1843    /* TGSI and NIR define these intrinsics as always loading ints, but they can
1844     * still appear on hardware with non-native-integers fragment shaders using
1845     * the draw path (i915g).  In that case, having called nir_lower_int_to_float
1846     * means that we actually want floats instead.
1847     */
1848    if (!c->native_integers) {
1849       switch (instr->intrinsic) {
1850       case nir_intrinsic_load_vertex_id:
1851       case nir_intrinsic_load_instance_id:
1852          ureg_U2F(c->ureg, ntt_get_dest(c, &instr->dest), sv);
1853          return;
1854 
1855       default:
1856          break;
1857       }
1858    }
1859 
1860    ntt_store(c, &instr->dest, sv);
1861 }
1862 
1863 static void
1864 ntt_emit_intrinsic(struct ntt_compile *c, nir_intrinsic_instr *instr)
1865 {
1866    switch (instr->intrinsic) {
1867    case nir_intrinsic_load_ubo:
1868    case nir_intrinsic_load_ubo_vec4:
1869       ntt_emit_load_ubo(c, instr);
1870       break;
1871 
1872       /* Vertex */
1873    case nir_intrinsic_load_vertex_id:
1874    case nir_intrinsic_load_vertex_id_zero_base:
1875    case nir_intrinsic_load_base_vertex:
1876    case nir_intrinsic_load_base_instance:
1877    case nir_intrinsic_load_instance_id:
1878    case nir_intrinsic_load_draw_id:
1879    case nir_intrinsic_load_invocation_id:
1880    case nir_intrinsic_load_frag_coord:
1881    case nir_intrinsic_load_point_coord:
1882    case nir_intrinsic_load_front_face:
1883    case nir_intrinsic_load_sample_id:
1884    case nir_intrinsic_load_sample_pos:
1885    case nir_intrinsic_load_sample_mask_in:
1886    case nir_intrinsic_load_helper_invocation:
1887    case nir_intrinsic_load_tess_coord:
1888    case nir_intrinsic_load_patch_vertices_in:
1889    case nir_intrinsic_load_primitive_id:
1890    case nir_intrinsic_load_tess_level_outer:
1891    case nir_intrinsic_load_tess_level_inner:
1892    case nir_intrinsic_load_local_invocation_id:
1893    case nir_intrinsic_load_workgroup_id:
1894    case nir_intrinsic_load_num_workgroups:
1895    case nir_intrinsic_load_workgroup_size:
1896    case nir_intrinsic_load_subgroup_size:
1897    case nir_intrinsic_load_subgroup_invocation:
1898    case nir_intrinsic_load_subgroup_eq_mask:
1899    case nir_intrinsic_load_subgroup_ge_mask:
1900    case nir_intrinsic_load_subgroup_gt_mask:
1901    case nir_intrinsic_load_subgroup_lt_mask:
1902       ntt_emit_load_sysval(c, instr);
1903       break;
1904 
1905    case nir_intrinsic_load_input:
1906    case nir_intrinsic_load_per_vertex_input:
1907    case nir_intrinsic_load_interpolated_input:
1908       ntt_emit_load_input(c, instr);
1909       break;
1910 
1911    case nir_intrinsic_store_output:
1912    case nir_intrinsic_store_per_vertex_output:
1913       ntt_emit_store_output(c, instr);
1914       break;
1915 
1916    case nir_intrinsic_load_output:
1917    case nir_intrinsic_load_per_vertex_output:
1918       ntt_emit_load_output(c, instr);
1919       break;
1920 
1921    case nir_intrinsic_discard:
1922       ureg_KILL(c->ureg);
1923       break;
1924 
1925    case nir_intrinsic_discard_if: {
1926       struct ureg_src cond = ureg_scalar(ntt_get_src(c, instr->src[0]), 0);
1927 
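      /* With native integers the NIR bool is 0 or ~0; AND with 1.0f turns it
       * into 0.0 or 1.0, and KILL_IF on the negated value kills exactly when
       * the condition is true.
       */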
1928       if (c->native_integers) {
1929          struct ureg_dst temp = ureg_writemask(ureg_DECL_temporary(c->ureg), 1);
1930          ureg_AND(c->ureg, temp, cond, ureg_imm1f(c->ureg, 1.0));
1931          ureg_KILL_IF(c->ureg, ureg_scalar(ureg_negate(ureg_src(temp)), 0));
1932          ureg_release_temporary(c->ureg, temp);
1933       } else {
1934          /* For !native_integers, the bool got lowered to 1.0 or 0.0. */
1935          ureg_KILL_IF(c->ureg, ureg_negate(cond));
1936       }
1937       break;
1938    }
1939 
1940    case nir_intrinsic_load_ssbo:
1941    case nir_intrinsic_store_ssbo:
1942    case nir_intrinsic_ssbo_atomic_add:
1943    case nir_intrinsic_ssbo_atomic_fadd:
1944    case nir_intrinsic_ssbo_atomic_imin:
1945    case nir_intrinsic_ssbo_atomic_imax:
1946    case nir_intrinsic_ssbo_atomic_umin:
1947    case nir_intrinsic_ssbo_atomic_umax:
1948    case nir_intrinsic_ssbo_atomic_and:
1949    case nir_intrinsic_ssbo_atomic_or:
1950    case nir_intrinsic_ssbo_atomic_xor:
1951    case nir_intrinsic_ssbo_atomic_exchange:
1952    case nir_intrinsic_ssbo_atomic_comp_swap:
1953    case nir_intrinsic_get_ssbo_size:
1954       ntt_emit_mem(c, instr, nir_var_mem_ssbo);
1955       break;
1956 
1957    case nir_intrinsic_load_shared:
1958    case nir_intrinsic_store_shared:
1959    case nir_intrinsic_shared_atomic_add:
1960    case nir_intrinsic_shared_atomic_fadd:
1961    case nir_intrinsic_shared_atomic_imin:
1962    case nir_intrinsic_shared_atomic_imax:
1963    case nir_intrinsic_shared_atomic_umin:
1964    case nir_intrinsic_shared_atomic_umax:
1965    case nir_intrinsic_shared_atomic_and:
1966    case nir_intrinsic_shared_atomic_or:
1967    case nir_intrinsic_shared_atomic_xor:
1968    case nir_intrinsic_shared_atomic_exchange:
1969    case nir_intrinsic_shared_atomic_comp_swap:
1970       ntt_emit_mem(c, instr, nir_var_mem_shared);
1971       break;
1972 
1973    case nir_intrinsic_atomic_counter_read:
1974    case nir_intrinsic_atomic_counter_add:
1975    case nir_intrinsic_atomic_counter_inc:
1976    case nir_intrinsic_atomic_counter_post_dec:
1977    case nir_intrinsic_atomic_counter_min:
1978    case nir_intrinsic_atomic_counter_max:
1979    case nir_intrinsic_atomic_counter_and:
1980    case nir_intrinsic_atomic_counter_or:
1981    case nir_intrinsic_atomic_counter_xor:
1982    case nir_intrinsic_atomic_counter_exchange:
1983    case nir_intrinsic_atomic_counter_comp_swap:
1984       ntt_emit_mem(c, instr, nir_var_uniform);
1985       break;
1986    case nir_intrinsic_atomic_counter_pre_dec:
1987       unreachable("Should be lowered by ntt_lower_atomic_pre_dec()");
1988       break;
1989 
1990    case nir_intrinsic_image_load:
1991    case nir_intrinsic_image_store:
1992    case nir_intrinsic_image_size:
1993    case nir_intrinsic_image_atomic_add:
1994    case nir_intrinsic_image_atomic_fadd:
1995    case nir_intrinsic_image_atomic_imin:
1996    case nir_intrinsic_image_atomic_umin:
1997    case nir_intrinsic_image_atomic_imax:
1998    case nir_intrinsic_image_atomic_umax:
1999    case nir_intrinsic_image_atomic_and:
2000    case nir_intrinsic_image_atomic_or:
2001    case nir_intrinsic_image_atomic_xor:
2002    case nir_intrinsic_image_atomic_exchange:
2003    case nir_intrinsic_image_atomic_comp_swap:
2004       ntt_emit_image_load_store(c, instr);
2005       break;
2006 
2007    case nir_intrinsic_control_barrier:
2008    case nir_intrinsic_memory_barrier_tcs_patch:
2009       ureg_BARRIER(c->ureg);
2010       break;
2011 
2012    case nir_intrinsic_memory_barrier:
2013       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
2014                                       TGSI_MEMBAR_SHADER_BUFFER |
2015                                       TGSI_MEMBAR_ATOMIC_BUFFER |
2016                                       TGSI_MEMBAR_SHADER_IMAGE |
2017                                       TGSI_MEMBAR_SHARED));
2018       break;
2019 
2020    case nir_intrinsic_memory_barrier_atomic_counter:
2021       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_ATOMIC_BUFFER));
2022       break;
2023 
2024    case nir_intrinsic_memory_barrier_buffer:
2025       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_BUFFER));
2026       break;
2027 
2028    case nir_intrinsic_memory_barrier_image:
2029       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHADER_IMAGE));
2030       break;
2031 
2032    case nir_intrinsic_memory_barrier_shared:
2033       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg, TGSI_MEMBAR_SHARED));
2034       break;
2035 
2036    case nir_intrinsic_group_memory_barrier:
2037       ureg_MEMBAR(c->ureg, ureg_imm1u(c->ureg,
2038                                       TGSI_MEMBAR_SHADER_BUFFER |
2039                                       TGSI_MEMBAR_ATOMIC_BUFFER |
2040                                       TGSI_MEMBAR_SHADER_IMAGE |
2041                                       TGSI_MEMBAR_SHARED |
2042                                       TGSI_MEMBAR_THREAD_GROUP));
2043       break;
2044 
2045    case nir_intrinsic_end_primitive:
2046       ureg_ENDPRIM(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2047       break;
2048 
2049    case nir_intrinsic_emit_vertex:
2050       ureg_EMIT(c->ureg, ureg_imm1u(c->ureg, nir_intrinsic_stream_id(instr)));
2051       break;
2052 
2053       /* In TGSI we don't generate the barycentric coords; interpolation
2054        * opcodes are emitted at the load_interpolated_input instead.  But we
2055        * do need to store the load_barycentric_at_* argument for use there.
2056        */
2057    case nir_intrinsic_load_barycentric_pixel:
2058    case nir_intrinsic_load_barycentric_centroid:
2059    case nir_intrinsic_load_barycentric_sample:
2060       break;
2061    case nir_intrinsic_load_barycentric_at_sample:
2062    case nir_intrinsic_load_barycentric_at_offset:
2063       ntt_store(c, &instr->dest, ntt_get_src(c, instr->src[0]));
2064       break;
2065 
2066    default:
2067       fprintf(stderr, "Unknown intrinsic: ");
2068       nir_print_instr(&instr->instr, stderr);
2069       fprintf(stderr, "\n");
2070       break;
2071    }
2072 }
2073 
2074 struct ntt_tex_operand_state {
2075    struct ureg_src srcs[4];
2076    unsigned i;
2077 };
2078 
2079 static void
2080 ntt_push_tex_arg(struct ntt_compile *c,
2081                  nir_tex_instr *instr,
2082                  nir_tex_src_type tex_src_type,
2083                  struct ntt_tex_operand_state *s)
2084 {
2085    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2086    if (tex_src < 0)
2087       return;
2088 
2089    s->srcs[s->i++] = ntt_get_src(c, instr->src[tex_src].src);
2090 }
2091 
2092 static void
2093 ntt_emit_texture(struct ntt_compile *c, nir_tex_instr *instr)
2094 {
2095    struct ureg_dst dst = ntt_get_dest(c, &instr->dest);
2096    enum tgsi_texture_type target = tgsi_texture_type_from_sampler_dim(instr->sampler_dim, instr->is_array, instr->is_shadow);
2097    unsigned tex_opcode;
2098 
2099    struct ureg_src sampler = ureg_DECL_sampler(c->ureg, instr->sampler_index);
2100    int sampler_src = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset);
2101    if (sampler_src >= 0) {
2102       struct ureg_src reladdr = ntt_get_src(c, instr->src[sampler_src].src);
2103       sampler = ureg_src_indirect(sampler, ntt_reladdr(c, reladdr));
2104    }
2105 
2106    switch (instr->op) {
2107    case nir_texop_tex:
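      /* nir_to_tgsi_lower_tex() packs any projector as an extra channel, so
       * more backend1 channels than the coord+shadow need implies TXP.
       */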
2108       if (nir_tex_instr_src_size(instr, nir_tex_instr_src_index(instr, nir_tex_src_backend1)) >
2109          MAX2(instr->coord_components, 2) + instr->is_shadow)
2110          tex_opcode = TGSI_OPCODE_TXP;
2111       else
2112          tex_opcode = TGSI_OPCODE_TEX;
2113       break;
2114    case nir_texop_txf:
2115    case nir_texop_txf_ms:
2116       tex_opcode = TGSI_OPCODE_TXF;
2117 
2118       if (c->has_txf_lz) {
2119          int lod_src = nir_tex_instr_src_index(instr, nir_tex_src_lod);
2120          if (lod_src >= 0 &&
2121              nir_src_is_const(instr->src[lod_src].src) &&
2122              ntt_src_as_uint(c, instr->src[lod_src].src) == 0) {
2123             tex_opcode = TGSI_OPCODE_TXF_LZ;
2124          }
2125       }
2126       break;
2127    case nir_texop_txl:
2128       tex_opcode = TGSI_OPCODE_TXL;
2129       break;
2130    case nir_texop_txb:
2131       tex_opcode = TGSI_OPCODE_TXB;
2132       break;
2133    case nir_texop_txd:
2134       tex_opcode = TGSI_OPCODE_TXD;
2135       break;
2136    case nir_texop_txs:
2137       tex_opcode = TGSI_OPCODE_TXQ;
2138       break;
2139    case nir_texop_tg4:
2140       tex_opcode = TGSI_OPCODE_TG4;
2141       break;
2142    case nir_texop_query_levels:
2143       tex_opcode = TGSI_OPCODE_TXQ;
2144       break;
2145    case nir_texop_lod:
2146       tex_opcode = TGSI_OPCODE_LODQ;
2147       break;
2148    case nir_texop_texture_samples:
2149       tex_opcode = TGSI_OPCODE_TXQS;
2150       break;
2151    default:
2152       unreachable("unsupported tex op");
2153    }
2154 
2155    struct ntt_tex_operand_state s = { .i = 0 };
2156    ntt_push_tex_arg(c, instr, nir_tex_src_backend1, &s);
2157    ntt_push_tex_arg(c, instr, nir_tex_src_backend2, &s);
2158 
2159    /* non-coord arg for TXQ */
2160    if (tex_opcode == TGSI_OPCODE_TXQ) {
2161       ntt_push_tex_arg(c, instr, nir_tex_src_lod, &s);
2162       /* virglrenderer mistakenly looks at .w instead of .x, so make sure it's
2163        * scalar
2164        */
2165       s.srcs[s.i - 1] = ureg_scalar(s.srcs[s.i - 1], 0);
2166    }
2167 
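   /* If the packed arguments spilled into a second vec4 (backend2), switch to
    * the two-source variants of the texture opcodes.
    */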
2168    if (s.i > 1) {
2169       if (tex_opcode == TGSI_OPCODE_TEX)
2170          tex_opcode = TGSI_OPCODE_TEX2;
2171       if (tex_opcode == TGSI_OPCODE_TXB)
2172          tex_opcode = TGSI_OPCODE_TXB2;
2173       if (tex_opcode == TGSI_OPCODE_TXL)
2174          tex_opcode = TGSI_OPCODE_TXL2;
2175    }
2176 
2177    if (instr->op == nir_texop_txd) {
2178       /* Derivs appear in their own src args */
2179       int ddx = nir_tex_instr_src_index(instr, nir_tex_src_ddx);
2180       int ddy = nir_tex_instr_src_index(instr, nir_tex_src_ddy);
2181       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddx].src);
2182       s.srcs[s.i++] = ntt_get_src(c, instr->src[ddy].src);
2183    }
2184 
2185    if (instr->op == nir_texop_tg4 && target != TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
2186       if (c->screen->get_param(c->screen,
2187                                PIPE_CAP_TGSI_TG4_COMPONENT_IN_SWIZZLE)) {
2188          sampler = ureg_scalar(sampler, instr->component);
2189          s.srcs[s.i++] = ureg_src_undef();
2190       } else {
2191          s.srcs[s.i++] = ureg_imm1u(c->ureg, instr->component);
2192       }
2193    }
2194 
2195    s.srcs[s.i++] = sampler;
2196 
2197    enum tgsi_return_type tex_type;
2198    switch (instr->dest_type) {
2199    case nir_type_float32:
2200       tex_type = TGSI_RETURN_TYPE_FLOAT;
2201       break;
2202    case nir_type_int32:
2203       tex_type = TGSI_RETURN_TYPE_SINT;
2204       break;
2205    case nir_type_uint32:
2206       tex_type = TGSI_RETURN_TYPE_UINT;
2207       break;
2208    default:
2209       unreachable("unknown texture type");
2210    }
2211 
2212    struct tgsi_texture_offset tex_offsets[4];
2213    unsigned num_tex_offsets = 0;
2214    int tex_offset_src = nir_tex_instr_src_index(instr, nir_tex_src_offset);
2215    if (tex_offset_src >= 0) {
2216       struct ureg_src offset = ntt_get_src(c, instr->src[tex_offset_src].src);
2217 
2218       tex_offsets[0].File = offset.File;
2219       tex_offsets[0].Index = offset.Index;
2220       tex_offsets[0].SwizzleX = offset.SwizzleX;
2221       tex_offsets[0].SwizzleY = offset.SwizzleY;
2222       tex_offsets[0].SwizzleZ = offset.SwizzleZ;
2223       tex_offsets[0].Padding = 0;
2224 
2225       num_tex_offsets = 1;
2226    }
2227 
2228    struct ureg_dst tex_dst;
2229    if (instr->op == nir_texop_query_levels)
2230       tex_dst = ureg_writemask(ureg_DECL_temporary(c->ureg), TGSI_WRITEMASK_W);
2231    else
2232       tex_dst = dst;
2233 
2234    ureg_tex_insn(c->ureg, tex_opcode,
2235                  &tex_dst, 1,
2236                  target,
2237                  tex_type,
2238                  tex_offsets, num_tex_offsets,
2239                  s.srcs, s.i);
2240 
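   /* For query_levels, TXQ returned the mip level count in .w of the temp;
    * copy it down to the real destination.
    */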
2241    if (instr->op == nir_texop_query_levels) {
2242       ureg_MOV(c->ureg, dst, ureg_scalar(ureg_src(tex_dst), 3));
2243       ureg_release_temporary(c->ureg, tex_dst);
2244    }
2245 }
2246 
2247 static void
2248 ntt_emit_jump(struct ntt_compile *c, nir_jump_instr *jump)
2249 {
2250    switch (jump->type) {
2251    case nir_jump_break:
2252       ureg_BRK(c->ureg);
2253       break;
2254 
2255    case nir_jump_continue:
2256       ureg_CONT(c->ureg);
2257       break;
2258 
2259    default:
2260       fprintf(stderr, "Unknown jump instruction: ");
2261       nir_print_instr(&jump->instr, stderr);
2262       fprintf(stderr, "\n");
2263       abort();
2264    }
2265 }
2266 
2267 static void
2268 ntt_emit_ssa_undef(struct ntt_compile *c, nir_ssa_undef_instr *instr)
2269 {
2270    /* Nothing to do but make sure that we have some storage to deref. */
2271    (void)ntt_get_ssa_def_decl(c, &instr->def);
2272 }
2273 
2274 static void
2275 ntt_emit_instr(struct ntt_compile *c, nir_instr *instr)
2276 {
2277    /* There is no addr reg in use before we start emitting an instr. */
2278    c->next_addr_reg = 0;
2279 
2280    switch (instr->type) {
2281    case nir_instr_type_deref:
2282       /* ignored, will be walked by nir_intrinsic_image_*_deref. */
2283       break;
2284 
2285    case nir_instr_type_alu:
2286       ntt_emit_alu(c, nir_instr_as_alu(instr));
2287       break;
2288 
2289    case nir_instr_type_intrinsic:
2290       ntt_emit_intrinsic(c, nir_instr_as_intrinsic(instr));
2291       break;
2292 
2293    case nir_instr_type_load_const:
2294       /* Nothing to do here, as load consts are done directly from
2295        * ntt_get_src() (since many constant NIR srcs will often get folded
2296        * directly into a register file index instead of as a TGSI src).
2297        */
2298       break;
2299 
2300    case nir_instr_type_tex:
2301       ntt_emit_texture(c, nir_instr_as_tex(instr));
2302       break;
2303 
2304    case nir_instr_type_jump:
2305       ntt_emit_jump(c, nir_instr_as_jump(instr));
2306       break;
2307 
2308    case nir_instr_type_ssa_undef:
2309       ntt_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr));
2310       break;
2311 
2312    default:
2313       fprintf(stderr, "Unknown NIR instr type: ");
2314       nir_print_instr(instr, stderr);
2315       fprintf(stderr, "\n");
2316       abort();
2317    }
2318 }
2319 
2320 static void
2321 ntt_emit_if(struct ntt_compile *c, nir_if *if_stmt)
2322 {
2323    unsigned label;
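   /* ureg_UIF records a placeholder jump target in "label"; it gets patched
    * to the instruction number of the matching ELSE/ENDIF by
    * ureg_fixup_label() below.
    */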
2324    ureg_UIF(c->ureg, c->if_cond, &label);
2325    ntt_emit_cf_list(c, &if_stmt->then_list);
2326 
2327    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
2328       ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2329       ureg_ELSE(c->ureg, &label);
2330       ntt_emit_cf_list(c, &if_stmt->else_list);
2331    }
2332 
2333    ureg_fixup_label(c->ureg, label, ureg_get_instruction_number(c->ureg));
2334    ureg_ENDIF(c->ureg);
2335 }
2336 
2337 static void
2338 ntt_emit_loop(struct ntt_compile *c, nir_loop *loop)
2339 {
2340    /* GLSL-to-TGSI never set the begin/end labels to anything, even though nvfx
2341     * does reference BGNLOOP's label.  Follow the former behavior unless
2342     * something comes up with a need for it.
2343     */
2344    unsigned begin_label;
2345    ureg_BGNLOOP(c->ureg, &begin_label);
2346    ntt_emit_cf_list(c, &loop->body);
2347 
2348    unsigned end_label;
2349    ureg_ENDLOOP(c->ureg, &end_label);
2350 }
2351 
2352 static void
2353 ntt_free_ssa_temp_by_index(struct ntt_compile *c, int index)
2354 {
2355    /* We do store CONST/IMM/INPUT/etc. in ssa_temp[] */
2356    if (c->ssa_temp[index].File != TGSI_FILE_TEMPORARY)
2357       return;
2358 
2359    ureg_release_temporary(c->ureg, ureg_dst(c->ssa_temp[index]));
2360    memset(&c->ssa_temp[index], 0, sizeof(c->ssa_temp[index]));
2361 }
2362 
2363 /* Releases any temporaries for SSA defs with a live interval ending at this
2364  * instruction.
2365  */
2366 static bool
2367 ntt_src_live_interval_end_cb(nir_src *src, void *state)
2368 {
2369    struct ntt_compile *c = state;
2370 
2371    if (src->is_ssa) {
2372       nir_ssa_def *def = src->ssa;
2373 
2374       if (c->liveness->defs[def->index].end == src->parent_instr->index)
2375          ntt_free_ssa_temp_by_index(c, def->index);
2376    }
2377 
2378    return true;
2379 }
2380 
2381 static void
2382 ntt_emit_block(struct ntt_compile *c, nir_block *block)
2383 {
2384    nir_foreach_instr(instr, block) {
2385       ntt_emit_instr(c, instr);
2386 
2387       nir_foreach_src(instr, ntt_src_live_interval_end_cb, c);
2388    }
2389 
2390    /* Set up the if condition for ntt_emit_if(), which we have to do before
2391     * freeing up the temps (the "if" is treated as inside the block for liveness
2392     * purposes, despite not being an instruction)
2393     *
2394     * Note that, while IF and UIF are supposed to look at only .x, virglrenderer
2395     * looks at all of .xyzw.  No harm in working around the bug.
2396     */
2397    nir_if *nif = nir_block_get_following_if(block);
2398    if (nif)
2399       c->if_cond = ureg_scalar(ntt_get_src(c, nif->condition), TGSI_SWIZZLE_X);
2400 
2401    /* Free up any SSA temps that are unused at the end of the block. */
2402    unsigned index;
2403    BITSET_FOREACH_SET(index, block->live_out, BITSET_WORDS(c->impl->ssa_alloc)) {
2404       unsigned def_end_ip = c->liveness->defs[index].end;
2405       if (def_end_ip == block->end_ip)
2406          ntt_free_ssa_temp_by_index(c, index);
2407    }
2408 }
2409 
2410 static void
2411 ntt_emit_cf_list(struct ntt_compile *c, struct exec_list *list)
2412 {
2413    /* There is no addr reg in use before we start emitting any part of a CF
2414     * node (such as an if condition)
2415     */
2416    c->next_addr_reg = 0;
2417 
2418    foreach_list_typed(nir_cf_node, node, node, list) {
2419       switch (node->type) {
2420       case nir_cf_node_block:
2421          ntt_emit_block(c, nir_cf_node_as_block(node));
2422          break;
2423 
2424       case nir_cf_node_if:
2425          ntt_emit_if(c, nir_cf_node_as_if(node));
2426          break;
2427 
2428       case nir_cf_node_loop:
2429          ntt_emit_loop(c, nir_cf_node_as_loop(node));
2430          break;
2431 
2432       default:
2433          unreachable("unknown CF type");
2434       }
2435    }
2436 }
2437 
2438 static void
2439 ntt_emit_impl(struct ntt_compile *c, nir_function_impl *impl)
2440 {
2441    c->impl = impl;
2442    c->liveness = nir_live_ssa_defs_per_instr(impl);
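   /* Per-instruction liveness lets us release TGSI temporaries as soon as
    * their SSA defs die (see ntt_src_live_interval_end_cb()).
    */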
2443 
2444    c->ssa_temp = rzalloc_array(c, struct ureg_src, impl->ssa_alloc);
2445    c->reg_temp = rzalloc_array(c, struct ureg_dst, impl->reg_alloc);
2446 
2447    ntt_setup_registers(c, &impl->registers);
2448    ntt_emit_cf_list(c, &impl->body);
2449 
2450    ralloc_free(c->liveness);
2451    c->liveness = NULL;
2452 }
2453 
2454 static int
2455 type_size(const struct glsl_type *type, bool bindless)
2456 {
2457    return glsl_count_attribute_slots(type, false);
2458 }
2459 
2460 /* Allow vectorizing of ALU instructions, but avoid vectorizing past what we
2461  * can handle for 64-bit values in TGSI.
2462  */
2463 static bool
2464 ntt_should_vectorize_instr(const nir_instr *instr, void *data)
2465 {
2466    if (instr->type != nir_instr_type_alu)
2467       return false;
2468 
2469    nir_alu_instr *alu = nir_instr_as_alu(instr);
2470 
2471    switch (alu->op) {
2472    case nir_op_ibitfield_extract:
2473    case nir_op_ubitfield_extract:
2474    case nir_op_bitfield_insert:
2475       /* virglrenderer only looks at the .x channel of the offset/bits operands
2476        * when translating to GLSL.  tgsi.rst doesn't seem to require scalar
2477        * offset/bits operands.
2478        *
2479        * https://gitlab.freedesktop.org/virgl/virglrenderer/-/issues/195
2480        */
2481       return false;
2482 
2483    default:
2484       break;
2485    }
2486 
2487    unsigned num_components = alu->dest.dest.ssa.num_components;
2488 
2489    int src_bit_size = nir_src_bit_size(alu->src[0].src);
2490    int dst_bit_size = nir_dest_bit_size(alu->dest.dest);
2491 
2492    if (src_bit_size == 64 || dst_bit_size == 64) {
2493       if (num_components > 1)
2494          return false;
2495    }
2496 
2497    return true;
2498 }
2499 
2500 static bool
2501 ntt_should_vectorize_io(unsigned align, unsigned bit_size,
2502                         unsigned num_components, unsigned high_offset,
2503                         nir_intrinsic_instr *low, nir_intrinsic_instr *high,
2504                         void *data)
2505 {
2506    if (bit_size != 32)
2507       return false;
2508 
2509    /* Our offset alignment should always be at least 4 bytes */
2510    if (align < 4)
2511       return false;
2512 
2513    /* No wrapping off the end of a TGSI reg.  We could do a bit better by
2514     * looking at low's actual offset.  XXX: With LOAD_CONSTBUF maybe we don't
2515     * need this restriction.
2516     */
2517    unsigned worst_start_component = align == 4 ? 3 : align / 4;
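   /* With only 4-byte alignment the combined access could start at any
    * channel of the vec4, so assume the worst case (component 3); larger
    * alignments bound the worst start channel to align / 4.
    */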
2518    if (worst_start_component + num_components > 4)
2519       return false;
2520 
2521    return true;
2522 }
2523 
2524 static nir_variable_mode
2525 ntt_no_indirects_mask(nir_shader *s, struct pipe_screen *screen)
2526 {
2527    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2528    unsigned indirect_mask = 0;
2529 
2530    if (!screen->get_shader_param(screen, pipe_stage,
2531                                  PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR)) {
2532       indirect_mask |= nir_var_shader_in;
2533    }
2534 
2535    if (!screen->get_shader_param(screen, pipe_stage,
2536                                  PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR)) {
2537       indirect_mask |= nir_var_shader_out;
2538    }
2539 
2540    if (!screen->get_shader_param(screen, pipe_stage,
2541                                  PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR)) {
2542       indirect_mask |= nir_var_function_temp;
2543    }
2544 
2545    return indirect_mask;
2546 }
2547 
2548 static void
2549 ntt_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
2550 {
2551    bool progress;
2552    unsigned pipe_stage = pipe_shader_type_from_mesa(s->info.stage);
2553    unsigned control_flow_depth =
2554       screen->get_shader_param(screen, pipe_stage,
2555                                PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH);
2556    do {
2557       progress = false;
2558 
2559       NIR_PASS_V(s, nir_lower_vars_to_ssa);
2560 
2561       NIR_PASS(progress, s, nir_copy_prop);
2562       NIR_PASS(progress, s, nir_opt_algebraic);
2563       NIR_PASS(progress, s, nir_opt_constant_folding);
2564       NIR_PASS(progress, s, nir_opt_remove_phis);
2565       NIR_PASS(progress, s, nir_opt_conditional_discard);
2566       NIR_PASS(progress, s, nir_opt_dce);
2567       NIR_PASS(progress, s, nir_opt_dead_cf);
2568       NIR_PASS(progress, s, nir_opt_cse);
2569       NIR_PASS(progress, s, nir_opt_find_array_copies);
2570       NIR_PASS(progress, s, nir_opt_if, true);
2571       NIR_PASS(progress, s, nir_opt_peephole_select,
2572                control_flow_depth == 0 ? ~0 : 8, true, true);
2573       NIR_PASS(progress, s, nir_opt_algebraic);
2574       NIR_PASS(progress, s, nir_opt_constant_folding);
2575       nir_load_store_vectorize_options vectorize_opts = {
2576          .modes = nir_var_mem_ubo,
2577          .callback = ntt_should_vectorize_io,
2578          .robust_modes = 0,
2579       };
2580       NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts);
2581       NIR_PASS(progress, s, nir_opt_shrink_vectors, true);
2582       NIR_PASS(progress, s, nir_opt_trivial_continues);
2583       NIR_PASS(progress, s, nir_opt_vectorize, ntt_should_vectorize_instr, NULL);
2584       NIR_PASS(progress, s, nir_opt_undef);
2585       NIR_PASS(progress, s, nir_opt_loop_unroll);
2586 
2587    } while (progress);
2588 }
2589 
2590 /* Scalarizes all 64-bit ALU ops.  Note that only vec3/vec4 results actually
2591  * need scalarizing; this could be relaxed.
2592  */
2593 static bool
2594 scalarize_64bit(const nir_instr *instr, const void *data)
2595 {
2596    const nir_alu_instr *alu = nir_instr_as_alu(instr);
2597 
2598    return (nir_dest_bit_size(alu->dest.dest) == 64 ||
2599            nir_src_bit_size(alu->src[0].src) == 64);
2600 }
2601 
2602 static bool
2603 nir_to_tgsi_lower_64bit_intrinsic(nir_builder *b, nir_intrinsic_instr *instr)
2604 {
2605    b->cursor = nir_after_instr(&instr->instr);
2606 
2607    switch (instr->intrinsic) {
2608    case nir_intrinsic_load_ubo:
2609    case nir_intrinsic_load_ubo_vec4:
2610    case nir_intrinsic_load_ssbo:
2611    case nir_intrinsic_load_input:
2612    case nir_intrinsic_load_interpolated_input:
2613    case nir_intrinsic_load_per_vertex_input:
2614    case nir_intrinsic_store_output:
2615    case nir_intrinsic_store_ssbo:
2616       break;
2617    default:
2618       return false;
2619    }
2620 
2621    if (instr->num_components <= 2)
2622       return false;
2623 
2624    bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
2625    if (has_dest) {
2626       if (nir_dest_bit_size(instr->dest) != 64)
2627          return false;
2628    } else {
2629       if (nir_src_bit_size(instr->src[0]) != 64)
2630          return false;
2631    }
2632 
2633    nir_intrinsic_instr *first =
2634       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
2635    nir_intrinsic_instr *second =
2636       nir_instr_as_intrinsic(nir_instr_clone(b->shader, &instr->instr));
2637 
2638    switch (instr->intrinsic) {
2639    case nir_intrinsic_load_ubo:
2640    case nir_intrinsic_load_ubo_vec4:
2641    case nir_intrinsic_load_ssbo:
2642    case nir_intrinsic_store_ssbo:
2643       break;
2644 
2645    default: {
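      /* For per-slot IO intrinsics, the second half of the split addresses
       * the following vec4 slot, so advance both the IO semantics location
       * and the driver_location base.
       */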
2646       nir_io_semantics semantics = nir_intrinsic_io_semantics(second);
2647       semantics.location++;
2648       semantics.num_slots--;
2649       nir_intrinsic_set_io_semantics(second, semantics);
2650 
2651       nir_intrinsic_set_base(second, nir_intrinsic_base(second) + 1);
2652       break;
2653    }
2654    }
2655 
2656    first->num_components = 2;
2657    second->num_components -= 2;
2658    if (has_dest) {
2659       first->dest.ssa.num_components = 2;
2660       second->dest.ssa.num_components -= 2;
2661    }
2662 
2663    nir_builder_instr_insert(b, &first->instr);
2664    nir_builder_instr_insert(b, &second->instr);
2665 
2666    if (has_dest) {
2667       /* Merge the two loads' results back into a vector. */
2668       nir_ssa_def *channels[4] = {
2669          nir_channel(b, &first->dest.ssa, 0),
2670          nir_channel(b, &first->dest.ssa, 1),
2671          nir_channel(b, &second->dest.ssa, 0),
2672          second->num_components > 1 ? nir_channel(b, &second->dest.ssa, 1) : NULL,
2673       };
2674       nir_ssa_def *new = nir_vec(b, channels, instr->num_components);
2675       nir_ssa_def_rewrite_uses(&instr->dest.ssa, new);
2676    } else {
2677       /* Split the src value across the two stores. */
2678       b->cursor = nir_before_instr(&instr->instr);
2679 
2680       nir_ssa_def *src0 = instr->src[0].ssa;
2681       nir_ssa_def *channels[4] = { 0 };
2682       for (int i = 0; i < instr->num_components; i++)
2683          channels[i] = nir_channel(b, src0, i);
2684 
2685       nir_intrinsic_set_write_mask(first, nir_intrinsic_write_mask(instr) & 3);
2686       nir_intrinsic_set_write_mask(second, nir_intrinsic_write_mask(instr) >> 2);
2687 
2688       nir_instr_rewrite_src(&first->instr, &first->src[0],
2689                             nir_src_for_ssa(nir_vec(b, channels, 2)));
2690       nir_instr_rewrite_src(&second->instr, &second->src[0],
2691                             nir_src_for_ssa(nir_vec(b, &channels[2],
2692                                                     second->num_components)));
2693    }
2694 
2695    int offset_src = -1;
2696    uint32_t offset_amount = 16;
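   /* The second intrinsic addresses the next vec4's worth of data: 16 for
    * byte-based UBO/SSBO offsets, or 1 slot for load_ubo_vec4 (adjusted in
    * the switch below).
    */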
2697 
2698    switch (instr->intrinsic) {
2699    case nir_intrinsic_load_ssbo:
2700    case nir_intrinsic_load_ubo:
2701       offset_src = 1;
2702       break;
2703    case nir_intrinsic_load_ubo_vec4:
2704       offset_src = 1;
2705       offset_amount = 1;
2706       break;
2707    case nir_intrinsic_store_ssbo:
2708       offset_src = 2;
2709       break;
2710    default:
2711       break;
2712    }
2713    if (offset_src != -1) {
2714       b->cursor = nir_before_instr(&second->instr);
2715       nir_ssa_def *second_offset =
2716          nir_iadd_imm(b, second->src[offset_src].ssa, offset_amount);
2717       nir_instr_rewrite_src(&second->instr, &second->src[offset_src],
2718                             nir_src_for_ssa(second_offset));
2719    }
2720 
2721    /* DCE stores we generated with no writemask (nothing else does this
2722     * currently).
2723     */
2724    if (!has_dest) {
2725       if (nir_intrinsic_write_mask(first) == 0)
2726          nir_instr_remove(&first->instr);
2727       if (nir_intrinsic_write_mask(second) == 0)
2728          nir_instr_remove(&second->instr);
2729    }
2730 
2731    nir_instr_remove(&instr->instr);
2732 
2733    return true;
2734 }
2735 
2736 static bool
2737 nir_to_tgsi_lower_64bit_load_const(nir_builder *b, nir_load_const_instr *instr)
2738 {
2739    int num_components = instr->def.num_components;
2740 
2741    if (instr->def.bit_size != 64 || num_components <= 2)
2742       return false;
2743 
2744    b->cursor = nir_before_instr(&instr->instr);
2745 
2746    nir_load_const_instr *first =
2747       nir_load_const_instr_create(b->shader, 2, 64);
2748    nir_load_const_instr *second =
2749       nir_load_const_instr_create(b->shader, num_components - 2, 64);
2750 
2751    first->value[0] = instr->value[0];
2752    first->value[1] = instr->value[1];
2753    second->value[0] = instr->value[2];
2754    if (num_components == 4)
2755       second->value[1] = instr->value[3];
2756 
2757    nir_builder_instr_insert(b, &first->instr);
2758    nir_builder_instr_insert(b, &second->instr);
2759 
2760    nir_ssa_def *channels[4] = {
2761       nir_channel(b, &first->def, 0),
2762       nir_channel(b, &first->def, 1),
2763       nir_channel(b, &second->def, 0),
2764       num_components == 4 ? nir_channel(b, &second->def, 1) : NULL,
2765    };
2766    nir_ssa_def *new = nir_vec(b, channels, num_components);
2767    nir_ssa_def_rewrite_uses(&instr->def, new);
2768    nir_instr_remove(&instr->instr);
2769 
2770    return true;
2771 }
2772 
2773 static bool
2774 nir_to_tgsi_lower_64bit_to_vec2_instr(nir_builder *b, nir_instr *instr,
2775                                       void *data)
2776 {
2777    switch (instr->type) {
2778    case nir_instr_type_load_const:
2779       return nir_to_tgsi_lower_64bit_load_const(b, nir_instr_as_load_const(instr));
2780 
2781    case nir_instr_type_intrinsic:
2782       return nir_to_tgsi_lower_64bit_intrinsic(b, nir_instr_as_intrinsic(instr));
2783    default:
2784       return false;
2785    }
2786 }
2787 
2788 static bool
2789 nir_to_tgsi_lower_64bit_to_vec2(nir_shader *s)
2790 {
2791    return nir_shader_instructions_pass(s,
2792                                        nir_to_tgsi_lower_64bit_to_vec2_instr,
2793                                        nir_metadata_block_index |
2794                                        nir_metadata_dominance,
2795                                        NULL);
2796 }
2797 
2798 struct ntt_lower_tex_state {
2799    nir_ssa_def *channels[8];
2800    unsigned i;
2801 };
2802 
2803 static void
2804 nir_to_tgsi_lower_tex_instr_arg(nir_builder *b,
2805                                 nir_tex_instr *instr,
2806                                 nir_tex_src_type tex_src_type,
2807                                 struct ntt_lower_tex_state *s)
2808 {
2809    int tex_src = nir_tex_instr_src_index(instr, tex_src_type);
2810    if (tex_src < 0)
2811       return;
2812 
2813    assert(instr->src[tex_src].src.is_ssa);
2814 
2815    nir_ssa_def *def = instr->src[tex_src].src.ssa;
2816    for (int i = 0; i < def->num_components; i++) {
2817       s->channels[s->i++] = nir_channel(b, def, i);
2818    }
2819 
2820    nir_tex_instr_remove_src(instr, tex_src);
2821 }
2822 
2823 /**
2824  * Merges together a vec4 of tex coordinate/compare/bias/lod into a backend tex
2825  * src.  This lets NIR handle the coalescing of the vec4 rather than trying to
2826  * manage it on our own, and may lead to more vectorization.
2827  */
2828 static bool
2829 nir_to_tgsi_lower_tex_instr(nir_builder *b, nir_instr *instr, void *data)
2830 {
2831    if (instr->type != nir_instr_type_tex)
2832       return false;
2833 
2834    nir_tex_instr *tex = nir_instr_as_tex(instr);
2835 
2836    if (nir_tex_instr_src_index(tex, nir_tex_src_coord) < 0)
2837       return false;
2838 
2839    /* NIR after lower_tex will have LOD set to 0 for tex ops that wanted
2840     * implicit lod in shader stages that don't have quad-based derivatives.
2841     * TGSI doesn't want that; it requires that the backend do implicit LOD 0 for
2842     * those stages.
2843     */
2844    if (!nir_shader_supports_implicit_lod(b->shader) && tex->op == nir_texop_txl) {
2845       int lod_index = nir_tex_instr_src_index(tex, nir_tex_src_lod);
2846       nir_src *lod_src = &tex->src[lod_index].src;
2847       if (nir_src_is_const(*lod_src) && nir_src_as_uint(*lod_src) == 0) {
2848          nir_tex_instr_remove_src(tex, lod_index);
2849          tex->op = nir_texop_tex;
2850       }
2851    }
2852 
2853    b->cursor = nir_before_instr(instr);
2854 
2855    struct ntt_lower_tex_state s = {0};
2856 
2857    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_coord, &s);
2858    /* We always have at least two slots for the coordinate, even on 1D. */
2859    s.i = MAX2(s.i, 2);
2860 
2861    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_comparator, &s);
2862    s.i = MAX2(s.i, 3);
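   /* Padding to three channels keeps a following bias/LOD argument in .w for
    * 1D/2D samples, matching TGSI's TXB/TXL coordinate layout.
    */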
2863 
2864    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_bias, &s);
2865 
2866    /* XXX: LZ */
2867    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_lod, &s);
2868    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_projector, &s);
2869    nir_to_tgsi_lower_tex_instr_arg(b, tex, nir_tex_src_ms_index, &s);
2870 
2871    /* No need to pack undefs in unused channels of the tex instr */
2872    while (!s.channels[s.i - 1])
2873       s.i--;
2874 
2875    /* Instead of putting undefs in the unused slots of the vecs, just put in
2876     * another used channel.  Otherwise, we'll get unnecessary moves into
2877     * registers.
2878     */
2879    assert(s.channels[0] != NULL);
2880    for (int i = 1; i < s.i; i++) {
2881       if (!s.channels[i])
2882          s.channels[i] = s.channels[0];
2883    }
2884 
2885    nir_tex_instr_add_src(tex, nir_tex_src_backend1, nir_src_for_ssa(nir_vec(b, s.channels, MIN2(s.i, 4))));
2886    if (s.i > 4)
2887       nir_tex_instr_add_src(tex, nir_tex_src_backend2, nir_src_for_ssa(nir_vec(b, &s.channels[4], s.i - 4)));
2888 
2889    return true;
2890 }
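
/* As an illustrative example of the packing above (assuming a 2D shadow
 * sample with a bias, written in NIR-ish pseudocode):
 *
 *    txb coord=(x, y), comparator=c, bias=b
 *
 * becomes a single packed backend source:
 *
 *    vec4 packed = (x, y, c, b)
 *    txb backend1=packed
 *
 * The coordinate always occupies at least channels 0-1 and the comparator
 * lands in channel 2, which is what the MAX2() padding above enforces.
 */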
2891 
2892 static bool
2893 nir_to_tgsi_lower_tex(nir_shader *s)
2894 {
2895    return nir_shader_instructions_pass(s,
2896                                        nir_to_tgsi_lower_tex_instr,
2897                                        nir_metadata_block_index |
2898                                        nir_metadata_dominance,
2899                                        NULL);
2900 }
2901 
2902 static void
2903 ntt_fix_nir_options(struct pipe_screen *screen, struct nir_shader *s)
2904 {
2905    const struct nir_shader_compiler_options *options = s->options;
2906    bool lower_fsqrt =
2907       !screen->get_shader_param(screen, pipe_shader_type_from_mesa(s->info.stage),
2908                                 PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED);
2909 
2910    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
2911 
2912    if (!options->lower_extract_byte ||
2913        !options->lower_extract_word ||
2914        !options->lower_insert_byte ||
2915        !options->lower_insert_word ||
2916        !options->lower_fdph ||
2917        !options->lower_flrp64 ||
2918        !options->lower_fmod ||
2919        !options->lower_rotate ||
2920        !options->lower_uniforms_to_ubo ||
2921        !options->lower_vector_cmp ||
2922        options->lower_fsqrt != lower_fsqrt ||
2923        options->force_indirect_unrolling != no_indirects_mask) {
2924       nir_shader_compiler_options *new_options = ralloc(s, nir_shader_compiler_options);
2925       *new_options = *s->options;
2926 
2927       new_options->lower_extract_byte = true;
2928       new_options->lower_extract_word = true;
2929       new_options->lower_insert_byte = true;
2930       new_options->lower_insert_word = true;
2931       new_options->lower_fdph = true;
2932       new_options->lower_flrp64 = true;
2933       new_options->lower_fmod = true;
2934       new_options->lower_rotate = true;
2935       new_options->lower_uniforms_to_ubo = true;
2936       new_options->lower_vector_cmp = true;
2937       new_options->lower_fsqrt = lower_fsqrt;
2938       new_options->force_indirect_unrolling = no_indirects_mask;
2939 
2940       s->options = new_options;
2941    }
2942 }
2943 
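/* NIR's atomic_counter_pre_dec returns the value *after* the decrement, while
 * atomic_counter_post_dec returns the value *before* it.  The pair of
 * functions below rewrites pre_dec as post_dec followed by an iadd of -1, so
 * (illustrative GLSL):
 *
 *    uint i = atomicCounterDecrement(c);
 *
 * ends up behaving roughly as:
 *
 *    uint i = atomic_counter_post_dec(c) - 1;
 */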
2944 static bool
2945 ntt_lower_atomic_pre_dec_filter(const nir_instr *instr, const void *_data)
2946 {
2947    return (instr->type == nir_instr_type_intrinsic &&
2948            nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_atomic_counter_pre_dec);
2949 }
2950 
2951 static nir_ssa_def *
2952 ntt_lower_atomic_pre_dec_lower(nir_builder *b, nir_instr *instr, void *_data)
2953 {
2954    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
2955 
2956    nir_ssa_def *old_result = &intr->dest.ssa;
2957    intr->intrinsic = nir_intrinsic_atomic_counter_post_dec;
2958 
2959    return nir_iadd_imm(b, old_result, -1);
2960 }
2961 
2962 static bool
2963 ntt_lower_atomic_pre_dec(nir_shader *s)
2964 {
2965    return nir_shader_lower_instructions(s,
2966                                         ntt_lower_atomic_pre_dec_filter,
2967                                         ntt_lower_atomic_pre_dec_lower, NULL);
2968 }
2969 
2970 /* Lowers texture projectors if we can't do them as TGSI_OPCODE_TXP. */
2971 static void
2972 nir_to_tgsi_lower_txp(nir_shader *s)
2973 {
2974    nir_lower_tex_options lower_tex_options = {
2975        .lower_txp = 0,
2976    };
2977 
2978    nir_foreach_block(block, nir_shader_get_entrypoint(s)) {
2979       nir_foreach_instr(instr, block) {
2980          if (instr->type != nir_instr_type_tex)
2981             continue;
2982          nir_tex_instr *tex = nir_instr_as_tex(instr);
2983 
2984          if (nir_tex_instr_src_index(tex, nir_tex_src_projector) < 0)
2985             continue;
2986 
2987          bool has_compare = nir_tex_instr_src_index(tex, nir_tex_src_comparator) >= 0;
2988          bool has_lod = nir_tex_instr_src_index(tex, nir_tex_src_lod) >= 0 || s->info.stage != MESA_SHADER_FRAGMENT;
2989          bool has_offset = nir_tex_instr_src_index(tex, nir_tex_src_offset) >= 0;
2990 
2991          /* We can do TXP for any tex (not txg) where we can fit all the
2992           * coordinates and comparator and projector in one vec4 without any
2993           * other modifiers to add on.
2994           *
2995           * nir_lower_tex() only handles the lowering on a sampler-dim basis, so
2996           * if we get any funny projectors then we just blow them all away.
2997           */
2998          if (tex->op != nir_texop_tex || has_lod || has_offset || (tex->coord_components >= 3 && has_compare))
2999             lower_tex_options.lower_txp |= 1 << tex->sampler_dim;
3000       }
3001    }
3002 
3003    /* nir_lower_tex must be run even if no options are set, because we need the
3004     * LOD to be set for query_levels and for non-fragment shaders.
3005     */
3006    NIR_PASS_V(s, nir_lower_tex, &lower_tex_options);
3007 }
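
/* For example (illustrative): a plain textureProj() on a sampler2D in a
 * fragment shader keeps its projector here and can later be emitted as TXP,
 * while textureProjLod(), textureProjOffset(), or a projected shadow sample
 * with a 3-component coordinate trips one of the checks above and has its
 * projector lowered away by nir_lower_tex() instead.
 */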
3008 
3009 static bool
3010 nir_lower_primid_sysval_to_input_filter(const nir_instr *instr, const void *_data)
3011 {
3012    return (instr->type == nir_instr_type_intrinsic &&
3013            nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_primitive_id);
3014 }
3015 
3016 static nir_ssa_def *
3017 nir_lower_primid_sysval_to_input_lower(nir_builder *b, nir_instr *instr, void *data)
3018 {
3019    nir_variable *var = *(nir_variable **)data;
3020    if (!var) {
3021       var = nir_variable_create(b->shader, nir_var_shader_in, glsl_uint_type(), "gl_PrimitiveID");
3022       var->data.location = VARYING_SLOT_PRIMITIVE_ID;
3023       b->shader->info.inputs_read |= VARYING_BIT_PRIMITIVE_ID;
3024       var->data.driver_location = b->shader->num_inputs++;
3025 
3026       *(nir_variable **)data = var;
3027    }
3028 
3029    nir_io_semantics semantics = {
3030       .location = var->data.location,
3031       .num_slots = 1
3032    };
3033    return nir_load_input(b, 1, 32, nir_imm_int(b, 0),
3034                          .base = var->data.driver_location,
3035                          .io_semantics = semantics);
3036 }
3037 
3038 static bool
3039 nir_lower_primid_sysval_to_input(nir_shader *s)
3040 {
3041    nir_variable *input = NULL;
3042 
3043    return nir_shader_lower_instructions(s,
3044                                         nir_lower_primid_sysval_to_input_filter,
3045                                         nir_lower_primid_sysval_to_input_lower, &input);
3046 }
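
/* Illustrative before/after for the pass above, in NIR-ish pseudocode:
 *
 *    ssa_1 = intrinsic load_primitive_id ()
 *
 * becomes:
 *
 *    ssa_1 = intrinsic load_input (0) (base=N, location=VARYING_SLOT_PRIMITIVE_ID)
 *
 * with a gl_PrimitiveID shader input variable created on demand for it.
 */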
3047 
3048 /**
3049  * Translates the NIR shader to TGSI.
3050  *
3051  * This requires some lowering of the NIR shader to prepare it for translation.
3052  * We take ownership of the NIR shader passed, returning a reference to the new
3053  * TGSI tokens instead.  If you need to keep the NIR, then pass us a clone.
3054  */
3055 const void *
3056 nir_to_tgsi(struct nir_shader *s,
3057             struct pipe_screen *screen)
3058 {
3059    struct ntt_compile *c;
3060    const void *tgsi_tokens;
3061    bool debug = env_var_as_boolean("NIR_TO_TGSI_DEBUG", false);
3062    nir_variable_mode no_indirects_mask = ntt_no_indirects_mask(s, screen);
3063    bool native_integers = screen->get_shader_param(screen,
3064                                                    pipe_shader_type_from_mesa(s->info.stage),
3065                                                    PIPE_SHADER_CAP_INTEGERS);
3066    const struct nir_shader_compiler_options *original_options = s->options;
3067 
3068    ntt_fix_nir_options(screen, s);
3069 
3070    NIR_PASS_V(s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
3071               type_size, (nir_lower_io_options)0);
3072    NIR_PASS_V(s, nir_lower_regs_to_ssa);
3073 
3074    nir_to_tgsi_lower_txp(s);
3075    NIR_PASS_V(s, nir_to_tgsi_lower_tex);
3076 
3077    /* While TGSI can represent PRIMID as either an input or a system value,
3078     * glsl-to-tgsi had the GS (not TCS or TES) primid as an input, and drivers
3079     * depend on that.
3080     */
3081    if (s->info.stage == MESA_SHADER_GEOMETRY)
3082       NIR_PASS_V(s, nir_lower_primid_sysval_to_input);
3083 
3084    if (s->info.num_abos)
3085       NIR_PASS_V(s, ntt_lower_atomic_pre_dec);
3086 
3087    if (!original_options->lower_uniforms_to_ubo) {
3088       NIR_PASS_V(s, nir_lower_uniforms_to_ubo,
3089                  screen->get_param(screen, PIPE_CAP_PACKED_UNIFORMS),
3090                  !native_integers);
3091    }
3092 
3093    /* Do lowering so we can directly translate f64/i64 NIR ALU ops to TGSI --
3094     * TGSI stores up to a vec2 in each slot, so to avoid a whole bunch of op
3095     * duplication logic we just make it so that we only see vec2s.
3096     */
3097    NIR_PASS_V(s, nir_lower_alu_to_scalar, scalarize_64bit, NULL);
3098    NIR_PASS_V(s, nir_to_tgsi_lower_64bit_to_vec2);
3099 
3100    if (!screen->get_param(screen, PIPE_CAP_LOAD_CONSTBUF))
3101       NIR_PASS_V(s, nir_lower_ubo_vec4);
3102 
3103    ntt_optimize_nir(s, screen);
3104 
3105    NIR_PASS_V(s, nir_lower_indirect_derefs, no_indirects_mask, UINT32_MAX);
3106 
3107    bool progress;
3108    do {
3109       progress = false;
3110       NIR_PASS(progress, s, nir_opt_algebraic_late);
3111       if (progress) {
3112          NIR_PASS_V(s, nir_copy_prop);
3113          NIR_PASS_V(s, nir_opt_dce);
3114          NIR_PASS_V(s, nir_opt_cse);
3115       }
3116    } while (progress);
3117 
3118    if (screen->get_shader_param(screen,
3119                                 pipe_shader_type_from_mesa(s->info.stage),
3120                                 PIPE_SHADER_CAP_INTEGERS)) {
3121       NIR_PASS_V(s, nir_lower_bool_to_int32);
3122    } else {
3123       NIR_PASS_V(s, nir_lower_int_to_float);
3124       NIR_PASS_V(s, nir_lower_bool_to_float);
3125       /* bool_to_float generates MOVs for b2f32 that we want to clean up. */
3126       NIR_PASS_V(s, nir_copy_prop);
3127       NIR_PASS_V(s, nir_opt_dce);
3128    }
3129 
3130    /* Only lower 32-bit floats.  The only other modifier type officially
3131     * supported by TGSI is 32-bit integer negates, but even those are broken on
3132     * virglrenderer, so skip lowering all integer and f64 float mods.
3133     */
3134    NIR_PASS_V(s, nir_lower_to_source_mods, nir_lower_float_source_mods);
3135    NIR_PASS_V(s, nir_convert_from_ssa, true);
3136    NIR_PASS_V(s, nir_lower_vec_to_movs, NULL, NULL);
3137 
3138    /* locals_to_regs will leave dead derefs that are good to clean up. */
3139    NIR_PASS_V(s, nir_lower_locals_to_regs);
3140    NIR_PASS_V(s, nir_opt_dce);
3141 
3142    if (debug) {
3143       fprintf(stderr, "NIR before translation to TGSI:\n");
3144       nir_print_shader(s, stderr);
3145    }
3146 
3147    c = rzalloc(NULL, struct ntt_compile);
3148    c->screen = screen;
3149 
3150    c->needs_texcoord_semantic =
3151       screen->get_param(screen, PIPE_CAP_TGSI_TEXCOORD);
3152    c->any_reg_as_address =
3153       screen->get_param(screen, PIPE_CAP_TGSI_ANY_REG_AS_ADDRESS);
3154    c->has_txf_lz =
3155       screen->get_param(screen, PIPE_CAP_TGSI_TEX_TXF_LZ);
3156 
3157    c->s = s;
3158    c->native_integers = native_integers;
3159    c->ureg = ureg_create(pipe_shader_type_from_mesa(s->info.stage));
3160    ureg_setup_shader_info(c->ureg, &s->info);
3161 
3162    ntt_setup_inputs(c);
3163    ntt_setup_outputs(c);
3164    ntt_setup_uniforms(c);
3165 
3166    if (s->info.stage == MESA_SHADER_FRAGMENT) {
3167       /* The draw module's polygon stipple layer doesn't respect the chosen
3168        * coordinate mode, so leave it as unspecified unless we're actually
3169        * reading the position in the shader already.  See
3170        * gl-2.1-polygon-stipple-fs on softpipe.
3171        */
3172       if ((s->info.inputs_read & VARYING_BIT_POS) ||
3173           BITSET_TEST(s->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
3174          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN,
3175                        s->info.fs.origin_upper_left ?
3176                        TGSI_FS_COORD_ORIGIN_UPPER_LEFT :
3177                        TGSI_FS_COORD_ORIGIN_LOWER_LEFT);
3178 
3179          ureg_property(c->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER,
3180                        s->info.fs.pixel_center_integer ?
3181                        TGSI_FS_COORD_PIXEL_CENTER_INTEGER :
3182                        TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER);
3183       }
3184    }
3185    /* Emit the main function */
3186    nir_function_impl *impl = nir_shader_get_entrypoint(c->s);
3187    ntt_emit_impl(c, impl);
3188    ureg_END(c->ureg);
3189 
3190    tgsi_tokens = ureg_get_tokens(c->ureg, NULL);
3191 
3192    if (debug) {
3193       fprintf(stderr, "TGSI after translation from NIR:\n");
3194       tgsi_dump(tgsi_tokens, 0);
3195    }
3196 
3197    ureg_destroy(c->ureg);
3198 
3199    ralloc_free(c);
3200    ralloc_free(s);
3201 
3202    return tgsi_tokens;
3203 }
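
/* Sketch of typical driver usage (illustrative; assumes the frontend handed
 * the driver a NIR shader in its create_*_state hook).  nir_to_tgsi() takes
 * ownership of the NIR, so clone it first if the driver still needs it:
 *
 *    const struct tgsi_token *tokens;
 *    if (cso->type == PIPE_SHADER_IR_NIR)
 *       tokens = nir_to_tgsi(cso->ir.nir, pscreen);
 *    else
 *       tokens = tgsi_dup_tokens(cso->tokens);
 */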
3204 
3205 static const nir_shader_compiler_options nir_to_tgsi_compiler_options = {
3206    .fuse_ffma32 = true,
3207    .fuse_ffma64 = true,
3208    .lower_extract_byte = true,
3209    .lower_extract_word = true,
3210    .lower_insert_byte = true,
3211    .lower_insert_word = true,
3212    .lower_fdph = true,
3213    .lower_flrp64 = true,
3214    .lower_fmod = true,
3215    .lower_rotate = true,
3216    .lower_uniforms_to_ubo = true,
3217    .lower_vector_cmp = true,
3218    .use_interpolated_input_intrinsics = true,
3219 };
3220 
3221 /* Returns the default compiler options for drivers with only nir-to-tgsi-based
3222  * NIR support.
3223  */
3224 const void *
3225 nir_to_tgsi_get_compiler_options(struct pipe_screen *pscreen,
3226                                  enum pipe_shader_ir ir,
3227                                  unsigned shader)
3228 {
3229    assert(ir == PIPE_SHADER_IR_NIR);
3230    return &nir_to_tgsi_compiler_options;
3231 }
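
/* A TGSI-only driver would typically (illustrative sketch) point its screen
 * hook at the function above so the frontend generates NIR with the options
 * nir_to_tgsi() expects:
 *
 *    pscreen->get_compiler_options = nir_to_tgsi_get_compiler_options;
 */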
3232