/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nir_builder.h"
#include "si_pipe.h"

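/* ALU scalarization filter: when fp16 is enabled, keep 16-bit vec2 ALU
 * results vectorized (they can later map to packed 16-bit operations);
 * scalarize everything else. */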
static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
{
   struct si_screen *sscreen = (struct si_screen *)data;

   if (sscreen->options.fp16 &&
       instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.is_ssa &&
          alu->dest.dest.ssa.bit_size == 16 &&
          alu->dest.dest.ssa.num_components == 2)
         return false;
   }

   return true;
}

void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_trivial_continues);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if, true);
      NIR_PASS(progress, nir, nir_opt_dead_cf);

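      /* Scalarize again if any of the passes above that feed these flags
       * reported progress. */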
      if (lower_alu_to_scalar)
         NIR_PASS_V(nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->options.fp16)
         NIR_PASS(progress, nir, nir_opt_vectorize, NULL, NULL);
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

void si_nir_late_opts(nir_shader *nir)
{
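   /* Repeat late algebraic optimization and cleanup until no more progress
    * is made. */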
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);
      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize and fix types of image_sample sources and destinations.
    *
    * The image_sample constraints are:
    *   nir_tex_src_coord:       has_a16 ? select 16 or 32 : 32
    *   nir_tex_src_comparator:  32
    *   nir_tex_src_offset:      32
    *   nir_tex_src_bias:        32
    *   nir_tex_src_lod:         match coord
    *   nir_tex_src_min_lod:     match coord
    *   nir_tex_src_ms_index:    match coord
    *   nir_tex_src_ddx:         has_g16 && coord == 32 ? select 16 or 32 : match coord
    *   nir_tex_src_ddy:         match ddx
    *
    * coord and ddx are selected optimally. The types of the rest are legalized
    * based on those two.
    */
   /* TODO: The constraints can't represent the ddx constraint. */
   /*bool has_g16 = sscreen->info.chip_class >= GFX10 && LLVM_VERSION_MAJOR >= 12;*/
   bool has_g16 = false;
   nir_tex_src_type_constraints tex_constraints = {
      [nir_tex_src_comparator]   = {true, 32},
      [nir_tex_src_offset]       = {true, 32},
      [nir_tex_src_bias]         = {true, 32},
      [nir_tex_src_lod]          = {true, 0, nir_tex_src_coord},
      [nir_tex_src_min_lod]      = {true, 0, nir_tex_src_coord},
      [nir_tex_src_ms_index]     = {true, 0, nir_tex_src_coord},
      [nir_tex_src_ddx]          = {!has_g16, 0, nir_tex_src_coord},
      [nir_tex_src_ddy]          = {true, 0, has_g16 ? nir_tex_src_ddx : nir_tex_src_coord},
   };
   bool changed = false;

   NIR_PASS(changed, nir, nir_fold_16bit_sampler_conversions,
            (1 << nir_tex_src_coord) |
            (has_g16 ? 1 << nir_tex_src_ddx : 0));
   NIR_PASS(changed, nir, nir_legalize_16bit_sampler_srcs, tex_constraints);

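   /* If any 16-bit folding or legalization happened, run another round of
    * optimizations to clean up. */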
   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

static nir_ssa_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   default:
      return NULL;
   }
}

static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */

   static const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txs_cube_array = true,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   static const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

   const nir_lower_subgroups_options subgroups_options = {
      .subgroup_size = 64,
      .ballot_bit_size = 64,
      .ballot_components = 1,
      .lower_to_scalar = true,
      .lower_subgroup_masks = true,
      .lower_vote_trivial = false,
      .lower_vote_eq = true,
   };
   NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);

   NIR_PASS_V(nir, nir_lower_discard_or_demote,
              (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) ||
               nir->info.is_arb_asm);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);
   NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);

   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
         /* If we are shuffling local_invocation_id for quad derivatives, we
          * need to derive local_invocation_index from local_invocation_id
          * first, so that the value corresponds to the shuffled
          * local_invocation_id.
          */
         nir_lower_compute_system_values_options options = {0};
         options.lower_local_invocation_index = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }

      nir_opt_cse(nir); /* CSE load_local_invocation_id */
      nir_lower_compute_system_values_options options = {0};
      options.shuffle_local_ids_for_quad_derivatives = true;
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
   }

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16)) {
      NIR_PASS_V(nir, nir_lower_mediump_io,
                 /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
                  * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
                  */
                 (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
                 BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
                 true);
   }

   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16))
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
}

char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   struct nir_shader *nir = (struct nir_shader *)nirptr;

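   /* Lower shader input/output variables to I/O intrinsics. */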
   nir_lower_io_passes(nir, NULL);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
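      /* Keep image and sampler variables; remove all other leftover uniforms. */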
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   return NULL;
}