/*
 * Copyright 2017 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "nir_builder.h"
#include "si_pipe.h"

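/* Filter for nir_lower_alu_to_scalar: when fp16 is enabled, keep 16-bit
 * vec2 ALU results vectorized so they can use packed (2x16-bit) math.
 * Returning false excludes the instruction from scalarization.
 */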
static bool si_alu_to_scalar_filter(const nir_instr *instr, const void *data)
{
   struct si_screen *sscreen = (struct si_screen *)data;

   if (sscreen->options.fp16 &&
       instr->type == nir_instr_type_alu) {
      nir_alu_instr *alu = nir_instr_as_alu(instr);

      if (alu->dest.dest.is_ssa &&
          alu->dest.dest.ssa.bit_size == 16 &&
          alu->dest.dest.ssa.num_components == 2)
         return false;
   }

   return true;
}

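/* Standard NIR optimization loop: repeat until no pass reports progress. */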
void si_nir_opts(struct si_screen *sscreen, struct nir_shader *nir, bool first)
{
   bool progress;

   do {
      progress = false;
      bool lower_alu_to_scalar = false;
      bool lower_phis_to_scalar = false;

      NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
      NIR_PASS(progress, nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
      NIR_PASS(progress, nir, nir_lower_phis_to_scalar, false);

      if (first) {
         NIR_PASS(progress, nir, nir_split_array_vars, nir_var_function_temp);
         NIR_PASS(lower_alu_to_scalar, nir, nir_shrink_vec_array_vars, nir_var_function_temp);
         NIR_PASS(progress, nir, nir_opt_find_array_copies);
      }
      NIR_PASS(progress, nir, nir_opt_copy_prop_vars);
      NIR_PASS(progress, nir, nir_opt_dead_write_vars);

      NIR_PASS(lower_alu_to_scalar, nir, nir_opt_trivial_continues);
      /* (Constant) copy propagation is needed for txf with offsets. */
      NIR_PASS(progress, nir, nir_copy_prop);
      NIR_PASS(progress, nir, nir_opt_remove_phis);
      NIR_PASS(progress, nir, nir_opt_dce);
      NIR_PASS(lower_phis_to_scalar, nir, nir_opt_if, true);
      NIR_PASS(progress, nir, nir_opt_dead_cf);

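      /* The passes that set these flags can introduce new vector ALU ops
       * and vector phis, so re-run the scalarizing passes if any of them
       * made progress.
       */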
      if (lower_alu_to_scalar)
         NIR_PASS_V(nir, nir_lower_alu_to_scalar, si_alu_to_scalar_filter, sscreen);
      if (lower_phis_to_scalar)
         NIR_PASS_V(nir, nir_lower_phis_to_scalar, false);
      progress |= lower_alu_to_scalar | lower_phis_to_scalar;

      NIR_PASS(progress, nir, nir_opt_cse);
      NIR_PASS(progress, nir, nir_opt_peephole_select, 8, true, true);

      /* Needed for algebraic lowering */
      NIR_PASS(progress, nir, nir_opt_algebraic);
      NIR_PASS(progress, nir, nir_opt_constant_folding);

      if (!nir->info.flrp_lowered) {
         unsigned lower_flrp = (nir->options->lower_flrp16 ? 16 : 0) |
                               (nir->options->lower_flrp32 ? 32 : 0) |
                               (nir->options->lower_flrp64 ? 64 : 0);
         assert(lower_flrp);
         bool lower_flrp_progress = false;

         NIR_PASS(lower_flrp_progress, nir, nir_lower_flrp, lower_flrp, false /* always_precise */);
         if (lower_flrp_progress) {
            NIR_PASS(progress, nir, nir_opt_constant_folding);
            progress = true;
         }

         /* Nothing should rematerialize any flrps, so we only
          * need to do this lowering once.
          */
         nir->info.flrp_lowered = true;
      }

      NIR_PASS(progress, nir, nir_opt_undef);
      NIR_PASS(progress, nir, nir_opt_conditional_discard);
      if (nir->options->max_unroll_iterations) {
         NIR_PASS(progress, nir, nir_opt_loop_unroll);
      }

      if (nir->info.stage == MESA_SHADER_FRAGMENT)
         NIR_PASS_V(nir, nir_opt_move_discards_to_top);

      if (sscreen->options.fp16)
         NIR_PASS(progress, nir, nir_opt_vectorize, NULL, NULL);
   } while (progress);

   NIR_PASS_V(nir, nir_lower_var_copies);
}

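/* Run late algebraic optimizations until they stop making progress. Each
 * nir_opt_algebraic_late round is followed by cleanup passes so that newly
 * exposed patterns can be folded in the next iteration.
 */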
void si_nir_late_opts(nir_shader *nir)
{
   bool more_late_algebraic = true;
   while (more_late_algebraic) {
      more_late_algebraic = false;
      NIR_PASS(more_late_algebraic, nir, nir_opt_algebraic_late);
      NIR_PASS_V(nir, nir_opt_constant_folding);
      NIR_PASS_V(nir, nir_copy_prop);
      NIR_PASS_V(nir, nir_opt_dce);
      NIR_PASS_V(nir, nir_opt_cse);
   }
}

static void si_late_optimize_16bit_samplers(struct si_screen *sscreen, nir_shader *nir)
{
   /* Optimize and fix types of image_sample sources and destinations.
    *
    * The image_sample constraints are:
    *   nir_tex_src_coord:      has_a16 ? select 16 or 32 : 32
    *   nir_tex_src_comparator: 32
    *   nir_tex_src_offset:     32
    *   nir_tex_src_bias:       32
    *   nir_tex_src_lod:        match coord
    *   nir_tex_src_min_lod:    match coord
    *   nir_tex_src_ms_index:   match coord
    *   nir_tex_src_ddx:        has_g16 && coord == 32 ? select 16 or 32 : match coord
    *   nir_tex_src_ddy:        match ddx
    *
    * coord and ddx are selected optimally. The types of the rest are legalized
    * based on those two.
    */
   /* TODO: The constraints can't represent the ddx constraint. */
   /*bool has_g16 = sscreen->info.chip_class >= GFX10 && LLVM_VERSION_MAJOR >= 12;*/
   bool has_g16 = false;
   nir_tex_src_type_constraints tex_constraints = {
      [nir_tex_src_comparator] = {true, 32},
      [nir_tex_src_offset] = {true, 32},
      [nir_tex_src_bias] = {true, 32},
      [nir_tex_src_lod] = {true, 0, nir_tex_src_coord},
      [nir_tex_src_min_lod] = {true, 0, nir_tex_src_coord},
      [nir_tex_src_ms_index] = {true, 0, nir_tex_src_coord},
      [nir_tex_src_ddx] = {!has_g16, 0, nir_tex_src_coord},
      [nir_tex_src_ddy] = {true, 0, has_g16 ? nir_tex_src_ddx : nir_tex_src_coord},
   };
   bool changed = false;

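   /* Fold 16-bit conversions into sampler sources and destinations (coord,
    * and ddx when G16 is available), then legalize the remaining source
    * types against the constraint table above.
    */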
   NIR_PASS(changed, nir, nir_fold_16bit_sampler_conversions,
            (1 << nir_tex_src_coord) |
            (has_g16 ? 1 << nir_tex_src_ddx : 0));
   NIR_PASS(changed, nir, nir_legalize_16bit_sampler_srcs, tex_constraints);

   if (changed) {
      si_nir_opts(sscreen, nir, false);
      si_nir_late_opts(nir);
   }
}

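/* Filter and lowering callbacks for nir_shader_lower_instructions: the
 * filter selects intrinsic instructions, and lower_intrinsic_instr below
 * rewrites the ones radeonsi wants to open-code.
 */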
static bool
lower_intrinsic_filter(const nir_instr *instr, const void *dummy)
{
   return instr->type == nir_instr_type_intrinsic;
}

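/* Returning NULL tells nir_shader_lower_instructions to leave the
 * instruction unchanged.
 */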
static nir_ssa_def *
lower_intrinsic_instr(nir_builder *b, nir_instr *instr, void *dummy)
{
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
   case nir_intrinsic_is_sparse_texels_resident:
      /* code==0 means sparse texels are resident */
      return nir_ieq_imm(b, intrin->src[0].ssa, 0);
   default:
      return NULL;
   }
}

static bool si_lower_intrinsics(nir_shader *nir)
{
   return nir_shader_lower_instructions(nir,
                                        lower_intrinsic_filter,
                                        lower_intrinsic_instr,
                                        NULL);
}

/**
 * Perform "lowering" operations on the NIR that are run once when the shader
 * selector is created.
 */
static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
{
   /* Perform lowerings (and optimizations) of code.
    *
    * Performance considerations aside, we must:
    * - lower certain ALU operations
    * - ensure constant offsets for texture instructions are folded
    *   and copy-propagated
    */

   static const struct nir_lower_tex_options lower_tex_options = {
      .lower_txp = ~0u,
      .lower_txs_cube_array = true,
   };
   NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options);

   static const struct nir_lower_image_options lower_image_options = {
      .lower_cube_size = true,
   };
   NIR_PASS_V(nir, nir_lower_image, &lower_image_options);

   NIR_PASS_V(nir, si_lower_intrinsics);

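   /* Lower subgroup operations for Wave64 execution: 64-invocation
    * subgroups with a single 64-bit ballot mask.
    */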
   const nir_lower_subgroups_options subgroups_options = {
      .subgroup_size = 64,
      .ballot_bit_size = 64,
      .ballot_components = 1,
      .lower_to_scalar = true,
      .lower_subgroup_masks = true,
      .lower_vote_trivial = false,
      .lower_vote_eq = true,
   };
   NIR_PASS_V(nir, nir_lower_subgroups, &subgroups_options);

   NIR_PASS_V(nir, nir_lower_discard_or_demote,
              (sscreen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL)) ||
              nir->info.is_arb_asm);

   /* Lower load constants to scalar and then clean up the mess */
   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
   NIR_PASS_V(nir, nir_lower_var_copies);
   NIR_PASS_V(nir, nir_opt_intrinsics);
   NIR_PASS_V(nir, nir_lower_system_values);
   NIR_PASS_V(nir, nir_lower_compute_system_values, NULL);

   if (nir->info.stage == MESA_SHADER_COMPUTE) {
      if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
         /* If we are shuffling local_invocation_id for quad derivatives, we
          * need to derive local_invocation_index from local_invocation_id
          * first, so that the value corresponds to the shuffled
          * local_invocation_id.
          */
         nir_lower_compute_system_values_options options = {0};
         options.lower_local_invocation_index = true;
         NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
      }

      nir_opt_cse(nir); /* CSE load_local_invocation_id */
      nir_lower_compute_system_values_options options = {0};
      options.shuffle_local_ids_for_quad_derivatives = true;
      NIR_PASS_V(nir, nir_lower_compute_system_values, &options);
   }

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16)) {
      NIR_PASS_V(nir, nir_lower_mediump_io,
                 /* TODO: LLVM fails to compile this test if VS inputs are 16-bit:
                  * dEQP-GLES31.functional.shaders.builtin_functions.integer.bitfieldinsert.uvec3_lowp_geometry
                  */
                 (nir->info.stage != MESA_SHADER_VERTEX ? nir_var_shader_in : 0) | nir_var_shader_out,
                 BITFIELD64_BIT(VARYING_SLOT_PNTC) | BITFIELD64_RANGE(VARYING_SLOT_VAR0, 32),
                 true);
   }

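   /* Main optimization loop; first=true enables the one-time array
    * splitting and array-copy passes in si_nir_opts.
    */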
   si_nir_opts(sscreen, nir, true);
   /* Run late optimizations to fuse ffma and eliminate 16-bit conversions. */
   si_nir_late_opts(nir);

   if (sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16))
      si_late_optimize_16bit_samplers(sscreen, nir);

   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_function_temp, NULL);
}

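/* Implements the pipe_screen::finalize_nir hook; returning NULL signals
 * success.
 */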
char *si_finalize_nir(struct pipe_screen *screen, void *nirptr)
{
   struct si_screen *sscreen = (struct si_screen *)screen;
   struct nir_shader *nir = (struct nir_shader *)nirptr;

   nir_lower_io_passes(nir, NULL);

   /* Remove dead derefs, so that we can remove uniforms. */
   NIR_PASS_V(nir, nir_opt_dce);

   /* Remove uniforms because those should have been lowered to UBOs already. */
   nir_foreach_variable_with_modes_safe(var, nir, nir_var_uniform) {
      if (!glsl_type_get_image_count(var->type) &&
          !glsl_type_get_sampler_count(var->type))
         exec_node_remove(&var->node);
   }

   si_lower_nir(sscreen, nir);
   nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));

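   /* When uniform inlining is enabled, record which uniform values would
    * enable further optimization if inlined at draw time.
    */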
   if (sscreen->options.inline_uniforms)
      nir_find_inlinable_uniforms(nir);

   NIR_PASS_V(nir, nir_convert_to_lcssa, true, true); /* required by divergence analysis */
   NIR_PASS_V(nir, nir_divergence_analysis); /* to find divergent loops */

   return NULL;
}