1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_nir.h"
25 #include "brw_nir_rt.h"
26 #include "brw_shader.h"
27 #include "dev/intel_debug.h"
28 #include "compiler/glsl_types.h"
29 #include "compiler/nir/nir_builder.h"
30 #include "util/u_math.h"
31
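/* Sketch of the patch URB header layout implied by the remapping below
 * (derived from the base/component arithmetic in this function; illustrative,
 * not a spec quote).  Bases are vec4 slots, components are DWords in a slot:
 *
 *   QUADS:     DW 7-4 = gl_TessLevelOuter[0..3] (reversed),
 *              DW 3-2 = gl_TessLevelInner[0..1] (reversed)
 *   TRIANGLES: DW 7-5 = gl_TessLevelOuter[0..2] (reversed),
 *              DW 4   = gl_TessLevelInner[0]
 *   ISOLINES:  DW 6-7 = gl_TessLevelOuter[0..1] (in order)
 *
 * Accesses outside the valid range for the domain are dropped: loads have
 * their uses rewritten to undef and the instruction is removed.
 */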
32 static bool
33 remap_tess_levels(nir_builder *b, nir_intrinsic_instr *intr,
34 enum tess_primitive_mode _primitive_mode)
35 {
36 const int location = nir_intrinsic_base(intr);
37 const unsigned component = nir_intrinsic_component(intr);
38 bool out_of_bounds;
39
40 if (location == VARYING_SLOT_TESS_LEVEL_INNER) {
41 switch (_primitive_mode) {
42 case TESS_PRIMITIVE_QUADS:
43 /* gl_TessLevelInner[0..1] lives at DWords 3-2 (reversed). */
44 nir_intrinsic_set_base(intr, 0);
45 nir_intrinsic_set_component(intr, 3 - component);
46 out_of_bounds = false;
47 break;
48 case TESS_PRIMITIVE_TRIANGLES:
49 /* gl_TessLevelInner[0] lives at DWord 4. */
50 nir_intrinsic_set_base(intr, 1);
51 out_of_bounds = component > 0;
52 break;
53 case TESS_PRIMITIVE_ISOLINES:
54 out_of_bounds = true;
55 break;
56 default:
57 unreachable("Bogus tessellation domain");
58 }
59 } else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) {
60 if (_primitive_mode == TESS_PRIMITIVE_ISOLINES) {
61 /* gl_TessLevelOuter[0..1] lives at DWords 6-7 (in order). */
62 nir_intrinsic_set_base(intr, 1);
63 nir_intrinsic_set_component(intr, 2 + nir_intrinsic_component(intr));
64 out_of_bounds = component > 1;
65 } else {
66 /* Triangles use DWords 7-5 (reversed); Quads use 7-4 (reversed) */
67 nir_intrinsic_set_base(intr, 1);
68 nir_intrinsic_set_component(intr, 3 - nir_intrinsic_component(intr));
69 out_of_bounds = component == 3 && _primitive_mode == TESS_PRIMITIVE_TRIANGLES;
70 }
71 } else {
72 return false;
73 }
74
75 if (out_of_bounds) {
76 if (nir_intrinsic_infos[intr->intrinsic].has_dest) {
77 b->cursor = nir_before_instr(&intr->instr);
78 nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
79 nir_ssa_def_rewrite_uses(&intr->dest.ssa, undef);
80 }
81 nir_instr_remove(&intr->instr);
82 }
83
84 return true;
85 }
86
87 static bool
88 is_input(nir_intrinsic_instr *intrin)
89 {
90 return intrin->intrinsic == nir_intrinsic_load_input ||
91 intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
92 intrin->intrinsic == nir_intrinsic_load_interpolated_input;
93 }
94
95 static bool
96 is_output(nir_intrinsic_instr *intrin)
97 {
98 return intrin->intrinsic == nir_intrinsic_load_output ||
99 intrin->intrinsic == nir_intrinsic_load_per_vertex_output ||
100 intrin->intrinsic == nir_intrinsic_store_output ||
101 intrin->intrinsic == nir_intrinsic_store_per_vertex_output;
102 }
103
104
105 static bool
106 remap_patch_urb_offsets(nir_block *block, nir_builder *b,
107 const struct brw_vue_map *vue_map,
108 enum tess_primitive_mode tes_primitive_mode)
109 {
110 const bool is_passthrough_tcs = b->shader->info.name &&
111 strcmp(b->shader->info.name, "passthrough TCS") == 0;
112
113 nir_foreach_instr_safe(instr, block) {
114 if (instr->type != nir_instr_type_intrinsic)
115 continue;
116
117 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
118
119 gl_shader_stage stage = b->shader->info.stage;
120
121 if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) ||
122 (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) {
123
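      /* Skipping remap_tess_levels for the passthrough TCS is presumably
       * because brw_nir_create_passthrough_tcs() already writes the tess
       * levels as whole patch-header vec4s, so only the generic slot remap
       * below is wanted (assumption based on the passthrough TCS code later
       * in this file).
       */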
124 if (!is_passthrough_tcs &&
125 remap_tess_levels(b, intrin, tes_primitive_mode))
126 continue;
127
128 int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
129 assert(vue_slot != -1);
130 intrin->const_index[0] = vue_slot;
131
132 nir_src *vertex = nir_get_io_arrayed_index_src(intrin);
133 if (vertex) {
134 if (nir_src_is_const(*vertex)) {
135 intrin->const_index[0] += nir_src_as_uint(*vertex) *
136 vue_map->num_per_vertex_slots;
137 } else {
138 b->cursor = nir_before_instr(&intrin->instr);
139
140 /* Multiply by the number of per-vertex slots. */
141 nir_ssa_def *vertex_offset =
142 nir_imul(b,
143 nir_ssa_for_src(b, *vertex, 1),
144 nir_imm_int(b,
145 vue_map->num_per_vertex_slots));
146
147 /* Add it to the existing offset */
148 nir_src *offset = nir_get_io_offset_src(intrin);
149 nir_ssa_def *total_offset =
150 nir_iadd(b, vertex_offset,
151 nir_ssa_for_src(b, *offset, 1));
152
153 nir_instr_rewrite_src(&intrin->instr, offset,
154 nir_src_for_ssa(total_offset));
155 }
156 }
157 }
158 }
159 return true;
160 }
161
162 void
163 brw_nir_lower_vs_inputs(nir_shader *nir,
164 bool edgeflag_is_last,
165 const uint8_t *vs_attrib_wa_flags)
166 {
167 /* Start with the location of the variable's base. */
168 nir_foreach_shader_in_variable(var, nir)
169 var->data.driver_location = var->data.location;
170
171 /* Now use nir_lower_io to walk dereference chains. Attribute arrays are
172 * loaded as one vec4 or dvec4 per element (or matrix column), depending on
173 * whether it is a double-precision type or not.
174 */
175 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
176 nir_lower_io_lower_64bit_to_32);
177
178 /* This pass needs actual constants */
179 nir_opt_constant_folding(nir);
180
181 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
182
183 brw_nir_apply_attribute_workarounds(nir, vs_attrib_wa_flags);
184
185 /* The last step is to remap VERT_ATTRIB_* to actual registers */
186
187 /* Whether or not we have any system generated values. gl_DrawID is not
188 * included here as it lives in its own vec4.
189 */
190 const bool has_sgvs =
191 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FIRST_VERTEX) ||
192 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_BASE_INSTANCE) ||
193 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
194 BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_INSTANCE_ID);
195
196 const unsigned num_inputs = util_bitcount64(nir->info.inputs_read);
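   /* Resulting attribute layout, as implied by the remapping below
    * (illustrative): the VERT_ATTRIB_* inputs occupy slots [0, num_inputs);
    * if any SGVs are read they share one extra vec4 at slot num_inputs
    * (FirstVertex in .x, BaseInstance in .y, zero-based VertexID in .z,
    * InstanceID in .w); gl_DrawID and IsIndexedDraw share the vec4 after
    * that, in .x and .y respectively.
    */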
197
198 nir_foreach_function(function, nir) {
199 if (!function->impl)
200 continue;
201
202 nir_builder b;
203 nir_builder_init(&b, function->impl);
204
205 nir_foreach_block(block, function->impl) {
206 nir_foreach_instr_safe(instr, block) {
207 if (instr->type != nir_instr_type_intrinsic)
208 continue;
209
210 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
211
212 switch (intrin->intrinsic) {
213 case nir_intrinsic_load_first_vertex:
214 case nir_intrinsic_load_base_instance:
215 case nir_intrinsic_load_vertex_id_zero_base:
216 case nir_intrinsic_load_instance_id:
217 case nir_intrinsic_load_is_indexed_draw:
218 case nir_intrinsic_load_draw_id: {
219 b.cursor = nir_after_instr(&intrin->instr);
220
221 /* gl_VertexID and friends are stored by the VF as the last
222 * vertex element. We convert them to load_input intrinsics at
223 * the right location.
224 */
225 nir_intrinsic_instr *load =
226 nir_intrinsic_instr_create(nir, nir_intrinsic_load_input);
227 load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0));
228
229 nir_intrinsic_set_base(load, num_inputs);
230 switch (intrin->intrinsic) {
231 case nir_intrinsic_load_first_vertex:
232 nir_intrinsic_set_component(load, 0);
233 break;
234 case nir_intrinsic_load_base_instance:
235 nir_intrinsic_set_component(load, 1);
236 break;
237 case nir_intrinsic_load_vertex_id_zero_base:
238 nir_intrinsic_set_component(load, 2);
239 break;
240 case nir_intrinsic_load_instance_id:
241 nir_intrinsic_set_component(load, 3);
242 break;
243 case nir_intrinsic_load_draw_id:
244 case nir_intrinsic_load_is_indexed_draw:
245 /* gl_DrawID and IsIndexedDraw are stored right after
246 * gl_VertexID and friends if any of them exist.
247 */
248 nir_intrinsic_set_base(load, num_inputs + has_sgvs);
249 if (intrin->intrinsic == nir_intrinsic_load_draw_id)
250 nir_intrinsic_set_component(load, 0);
251 else
252 nir_intrinsic_set_component(load, 1);
253 break;
254 default:
255 unreachable("Invalid system value intrinsic");
256 }
257
258 load->num_components = 1;
259 nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
260 nir_builder_instr_insert(&b, &load->instr);
261
262 nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
263 &load->dest.ssa);
264 nir_instr_remove(&intrin->instr);
265 break;
266 }
267
268 case nir_intrinsic_load_input: {
269 /* Attributes come in a contiguous block, ordered by their
270 * gl_vert_attrib value. That means we can compute the slot
271 * number for an attribute by masking out the enabled attributes
272 * before it and counting the bits.
273 */
274 int attr = nir_intrinsic_base(intrin);
275 uint64_t inputs_read = nir->info.inputs_read;
276 int slot = -1;
277 if (edgeflag_is_last) {
278 inputs_read &= ~BITFIELD64_BIT(VERT_ATTRIB_EDGEFLAG);
279 if (attr == VERT_ATTRIB_EDGEFLAG)
280 slot = num_inputs - 1;
281 }
282 if (slot == -1)
283 slot = util_bitcount64(inputs_read &
284 BITFIELD64_MASK(attr));
285 nir_intrinsic_set_base(intrin, slot);
286 break;
287 }
288
289 default:
290 break; /* Nothing to do */
291 }
292 }
293 }
294 }
295 }
296
297 void
298 brw_nir_lower_vue_inputs(nir_shader *nir,
299 const struct brw_vue_map *vue_map)
300 {
301 nir_foreach_shader_in_variable(var, nir)
302 var->data.driver_location = var->data.location;
303
304 /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
305 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
306 nir_lower_io_lower_64bit_to_32);
307
308 /* This pass needs actual constants */
309 nir_opt_constant_folding(nir);
310
311 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
312
313 nir_foreach_function(function, nir) {
314 if (!function->impl)
315 continue;
316
317 nir_foreach_block(block, function->impl) {
318 nir_foreach_instr(instr, block) {
319 if (instr->type != nir_instr_type_intrinsic)
320 continue;
321
322 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
323
324 if (intrin->intrinsic == nir_intrinsic_load_input ||
325 intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
326 /* Offset 0 is the VUE header, which contains
327 * VARYING_SLOT_LAYER [.y], VARYING_SLOT_VIEWPORT [.z], and
328 * VARYING_SLOT_PSIZ [.w].
329 */
330 int varying = nir_intrinsic_base(intrin);
331 int vue_slot;
332 switch (varying) {
333 case VARYING_SLOT_PSIZ:
334 nir_intrinsic_set_base(intrin, 0);
335 nir_intrinsic_set_component(intrin, 3);
336 break;
337
338 default:
339 vue_slot = vue_map->varying_to_slot[varying];
340 assert(vue_slot != -1);
341 nir_intrinsic_set_base(intrin, vue_slot);
342 break;
343 }
344 }
345 }
346 }
347 }
348 }
349
350 void
351 brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue_map)
352 {
353 nir_foreach_shader_in_variable(var, nir)
354 var->data.driver_location = var->data.location;
355
356 nir_lower_io(nir, nir_var_shader_in, type_size_vec4,
357 nir_lower_io_lower_64bit_to_32);
358
359 /* This pass needs actual constants */
360 nir_opt_constant_folding(nir);
361
362 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
363
364 nir_foreach_function(function, nir) {
365 if (function->impl) {
366 nir_builder b;
367 nir_builder_init(&b, function->impl);
368 nir_foreach_block(block, function->impl) {
369 remap_patch_urb_offsets(block, &b, vue_map,
370 nir->info.tess._primitive_mode);
371 }
372 }
373 }
374 }
375
376 /**
377 * Convert interpolateAtOffset() offsets from [-0.5, +0.5] floating point
378 * offsets to integer [-8, +7] offsets (in units of 1/16th of a pixel).
379 *
380 * We clamp to +7/16 on the upper end of the range, since +0.5 isn't
381  * representable in an S0.4 value; a naive conversion would give us -8/16,
382 * which is the opposite of what was intended.
383 *
384 * This is allowed by GL_ARB_gpu_shader5's quantization rules:
385 *
386 * "Not all values of <offset> may be supported; x and y offsets may
387 * be rounded to fixed-point values with the number of fraction bits
388 * given by the implementation-dependent constant
389 * FRAGMENT_INTERPOLATION_OFFSET_BITS."
390 */
391 static bool
392 lower_barycentric_at_offset(nir_builder *b, nir_instr *instr, void *data)
393 {
394 if (instr->type != nir_instr_type_intrinsic)
395 return false;
396
397 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
398
399 if (intrin->intrinsic != nir_intrinsic_load_barycentric_at_offset)
400 return false;
401
402 b->cursor = nir_before_instr(instr);
403
404 assert(intrin->src[0].ssa);
405 nir_ssa_def *offset =
406 nir_imin(b, nir_imm_int(b, 7),
407 nir_f2i32(b, nir_fmul(b, nir_imm_float(b, 16),
408 intrin->src[0].ssa)));
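   /* Worked example (illustrative): an offset of +0.5 becomes
    * f2i32(0.5 * 16) = 8, which the imin clamps to 7 (i.e. +7/16); an offset
    * of -0.5 becomes f2i32(-8.0) = -8, which already fits in S0.4, so no
    * clamp is applied on the lower end.
    */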
409
410 nir_instr_rewrite_src(instr, &intrin->src[0], nir_src_for_ssa(offset));
411
412 return true;
413 }
414
415 void
416 brw_nir_lower_fs_inputs(nir_shader *nir,
417 const struct intel_device_info *devinfo,
418 const struct brw_wm_prog_key *key)
419 {
420 nir_foreach_shader_in_variable(var, nir) {
421 var->data.driver_location = var->data.location;
422
423 /* Apply default interpolation mode.
424 *
425 * Everything defaults to smooth except for the legacy GL color
426 * built-in variables, which might be flat depending on API state.
427 */
428 if (var->data.interpolation == INTERP_MODE_NONE) {
429 const bool flat = key->flat_shade &&
430 (var->data.location == VARYING_SLOT_COL0 ||
431 var->data.location == VARYING_SLOT_COL1);
432
433 var->data.interpolation = flat ? INTERP_MODE_FLAT
434 : INTERP_MODE_SMOOTH;
435 }
436
437 /* On Ironlake and below, there is only one interpolation mode.
438 * Centroid interpolation doesn't mean anything on this hardware --
439 * there is no multisampling.
440 */
441 if (devinfo->ver < 6) {
442 var->data.centroid = false;
443 var->data.sample = false;
444 }
445 }
446
447 nir_lower_io_options lower_io_options = nir_lower_io_lower_64bit_to_32;
448 if (key->persample_interp)
449 lower_io_options |= nir_lower_io_force_sample_interpolation;
450
451 nir_lower_io(nir, nir_var_shader_in, type_size_vec4, lower_io_options);
452 if (devinfo->ver >= 11)
453 nir_lower_interpolation(nir, ~0);
454
455 nir_shader_instructions_pass(nir, lower_barycentric_at_offset,
456 nir_metadata_block_index |
457 nir_metadata_dominance,
458 NULL);
459
460 /* This pass needs actual constants */
461 nir_opt_constant_folding(nir);
462
463 nir_io_add_const_offset_to_base(nir, nir_var_shader_in);
464 }
465
466 void
467 brw_nir_lower_vue_outputs(nir_shader *nir)
468 {
469 nir_foreach_shader_out_variable(var, nir) {
470 var->data.driver_location = var->data.location;
471 }
472
473 nir_lower_io(nir, nir_var_shader_out, type_size_vec4,
474 nir_lower_io_lower_64bit_to_32);
475 }
476
477 void
478 brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue_map,
479 enum tess_primitive_mode tes_primitive_mode)
480 {
481 nir_foreach_shader_out_variable(var, nir) {
482 var->data.driver_location = var->data.location;
483 }
484
485 nir_lower_io(nir, nir_var_shader_out, type_size_vec4,
486 nir_lower_io_lower_64bit_to_32);
487
488 /* This pass needs actual constants */
489 nir_opt_constant_folding(nir);
490
491 nir_io_add_const_offset_to_base(nir, nir_var_shader_out);
492
493 nir_foreach_function(function, nir) {
494 if (function->impl) {
495 nir_builder b;
496 nir_builder_init(&b, function->impl);
497 nir_foreach_block(block, function->impl) {
498 remap_patch_urb_offsets(block, &b, vue_map, tes_primitive_mode);
499 }
500 }
501 }
502 }
503
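/* For fragment outputs, both the location and var->data.index (assumed here
 * to be the dual-source blend index) are packed into driver_location and
 * unpacked again by the FS backend.
 */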
504 void
505 brw_nir_lower_fs_outputs(nir_shader *nir)
506 {
507 nir_foreach_shader_out_variable(var, nir) {
508 var->data.driver_location =
509 SET_FIELD(var->data.index, BRW_NIR_FRAG_OUTPUT_INDEX) |
510 SET_FIELD(var->data.location, BRW_NIR_FRAG_OUTPUT_LOCATION);
511 }
512
513 nir_lower_io(nir, nir_var_shader_out, type_size_dvec4, 0);
514 }
515
516 #define OPT(pass, ...) ({ \
517 bool this_progress = false; \
518 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \
519 if (this_progress) \
520 progress = true; \
521 this_progress; \
522 })
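/* Illustrative usage of OPT() (it assumes a local 'progress' flag and a
 * 'nir' shader pointer in scope, as in the functions below):
 *
 *    bool progress = false;
 *    ...
 *    if (OPT(nir_opt_dce))
 *       OPT(nir_copy_prop);
 *
 * The macro runs the pass via NIR_PASS, ORs the result into 'progress', and
 * evaluates to whether this particular pass made progress.
 */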
523
524 void
525 brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
526 bool is_scalar, bool allow_copies)
527 {
528 bool progress;
529 unsigned lower_flrp =
530 (nir->options->lower_flrp16 ? 16 : 0) |
531 (nir->options->lower_flrp32 ? 32 : 0) |
532 (nir->options->lower_flrp64 ? 64 : 0);
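   /* lower_flrp is a bit mask of the flrp bit sizes that still need lowering;
    * it is cleared after the first nir_lower_flrp call below so the lowering
    * only runs once per brw_nir_optimize invocation.
    */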
533
534 do {
535 progress = false;
536 OPT(nir_split_array_vars, nir_var_function_temp);
537 OPT(nir_shrink_vec_array_vars, nir_var_function_temp);
538 OPT(nir_opt_deref);
539 if (OPT(nir_opt_memcpy))
540 OPT(nir_split_var_copies);
541 OPT(nir_lower_vars_to_ssa);
542 if (allow_copies) {
543 /* Only run this pass in the first call to brw_nir_optimize. Later
544 * calls assume that we've lowered away any copy_deref instructions
545 * and we don't want to introduce any more.
546 */
547 OPT(nir_opt_find_array_copies);
548 }
549 OPT(nir_opt_copy_prop_vars);
550 OPT(nir_opt_dead_write_vars);
551 OPT(nir_opt_combine_stores, nir_var_all);
552
553 OPT(nir_opt_ray_queries);
554
555 if (is_scalar) {
556 OPT(nir_lower_alu_to_scalar, NULL, NULL);
557 } else {
558 OPT(nir_opt_shrink_stores, true);
559 OPT(nir_opt_shrink_vectors);
560 }
561
562 OPT(nir_copy_prop);
563
564 if (is_scalar) {
565 OPT(nir_lower_phis_to_scalar, false);
566 }
567
568 OPT(nir_copy_prop);
569 OPT(nir_opt_dce);
570 OPT(nir_opt_cse);
571 OPT(nir_opt_combine_stores, nir_var_all);
572
573 /* Passing 0 to the peephole select pass causes it to convert
574 * if-statements that contain only move instructions in the branches
575 * regardless of the count.
576 *
577 * Passing 1 to the peephole select pass causes it to convert
578 * if-statements that contain at most a single ALU instruction (total)
579 * in both branches. Before Gfx6, some math instructions were
580  * prohibitively expensive and the results of compare operations needed an
581 * extra resolve step. For these reasons, this pass is more harmful
582 * than good on those platforms.
583 *
584 * For indirect loads of uniforms (push constants), we assume that array
585 * indices will nearly always be in bounds and the cost of the load is
586  * low. Therefore there shouldn't be a performance benefit to avoiding it.
587 * However, in vec4 tessellation shaders, these loads operate by
588 * actually pulling from memory.
589 */
590 const bool is_vec4_tessellation = !is_scalar &&
591 (nir->info.stage == MESA_SHADER_TESS_CTRL ||
592 nir->info.stage == MESA_SHADER_TESS_EVAL);
593 OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false);
594 OPT(nir_opt_peephole_select, 8, !is_vec4_tessellation,
595 compiler->devinfo->ver >= 6);
596
597 OPT(nir_opt_intrinsics);
598 OPT(nir_opt_idiv_const, 32);
599 OPT(nir_opt_algebraic);
600 OPT(nir_lower_constant_convert_alu_types);
601 OPT(nir_opt_constant_folding);
602
603 if (lower_flrp != 0) {
604 if (OPT(nir_lower_flrp,
605 lower_flrp,
606 false /* always_precise */)) {
607 OPT(nir_opt_constant_folding);
608 }
609
610 /* Nothing should rematerialize any flrps, so we only need to do this
611 * lowering once.
612 */
613 lower_flrp = 0;
614 }
615
616 OPT(nir_opt_dead_cf);
617 if (OPT(nir_opt_trivial_continues)) {
618 /* If nir_opt_trivial_continues makes progress, then we need to clean
619 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
620 * to make progress.
621 */
622 OPT(nir_copy_prop);
623 OPT(nir_opt_dce);
624 }
625 OPT(nir_opt_if, false);
626 OPT(nir_opt_conditional_discard);
627 if (nir->options->max_unroll_iterations != 0) {
628 OPT(nir_opt_loop_unroll);
629 }
630 OPT(nir_opt_remove_phis);
631 OPT(nir_opt_gcm, false);
632 OPT(nir_opt_undef);
633 OPT(nir_lower_pack);
634 } while (progress);
635
636 /* Work around a GfxBench unused local sampler variable which would
637  * otherwise trigger an assert in the opt_large_constants pass.
638 */
639 OPT(nir_remove_dead_variables, nir_var_function_temp, NULL);
640 }
641
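/* Callback for nir_lower_bit_size: returns the bit size an instruction
 * should be (temporarily) lowered to, or 0 to leave it alone.
 */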
642 static unsigned
643 lower_bit_size_callback(const nir_instr *instr, UNUSED void *data)
644 {
645 const struct brw_compiler *compiler = (const struct brw_compiler *) data;
646 const struct intel_device_info *devinfo = compiler->devinfo;
647
648 switch (instr->type) {
649 case nir_instr_type_alu: {
650 nir_alu_instr *alu = nir_instr_as_alu(instr);
651 assert(alu->dest.dest.is_ssa);
652 if (alu->dest.dest.ssa.bit_size >= 32)
653 return 0;
654
655 /* Note: nir_op_iabs and nir_op_ineg are not lowered here because the
656 * 8-bit ABS or NEG instruction should eventually get copy propagated
657 * into the MOV that does the type conversion. This results in far
658 * fewer MOV instructions.
659 */
660 switch (alu->op) {
661 case nir_op_idiv:
662 case nir_op_imod:
663 case nir_op_irem:
664 case nir_op_udiv:
665 case nir_op_umod:
666 case nir_op_fceil:
667 case nir_op_ffloor:
668 case nir_op_ffract:
669 case nir_op_fround_even:
670 case nir_op_ftrunc:
671 return 32;
672 case nir_op_frcp:
673 case nir_op_frsq:
674 case nir_op_fsqrt:
675 case nir_op_fpow:
676 case nir_op_fexp2:
677 case nir_op_flog2:
678 case nir_op_fsin:
679 case nir_op_fcos:
680 return devinfo->ver < 9 ? 32 : 0;
681 case nir_op_isign:
682 assert(!"Should have been lowered by nir_opt_algebraic.");
683 return 0;
684 default:
685 if (nir_op_infos[alu->op].num_inputs >= 2 &&
686 alu->dest.dest.ssa.bit_size == 8)
687 return 16;
688
689 if (nir_alu_instr_is_comparison(alu) &&
690 alu->src[0].src.ssa->bit_size == 8)
691 return 16;
692
693 return 0;
694 }
695 break;
696 }
697
698 case nir_instr_type_intrinsic: {
699 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
700 switch (intrin->intrinsic) {
701 case nir_intrinsic_read_invocation:
702 case nir_intrinsic_read_first_invocation:
703 case nir_intrinsic_vote_feq:
704 case nir_intrinsic_vote_ieq:
705 case nir_intrinsic_shuffle:
706 case nir_intrinsic_shuffle_xor:
707 case nir_intrinsic_shuffle_up:
708 case nir_intrinsic_shuffle_down:
709 case nir_intrinsic_quad_broadcast:
710 case nir_intrinsic_quad_swap_horizontal:
711 case nir_intrinsic_quad_swap_vertical:
712 case nir_intrinsic_quad_swap_diagonal:
713 if (intrin->src[0].ssa->bit_size == 8)
714 return 16;
715 return 0;
716
717 case nir_intrinsic_reduce:
718 case nir_intrinsic_inclusive_scan:
719 case nir_intrinsic_exclusive_scan:
720 /* There are a couple of register region issues that make things
721 * complicated for 8-bit types:
722 *
723 * 1. Only raw moves are allowed to write to a packed 8-bit
724 * destination.
725 * 2. If we use a strided destination, the efficient way to do
726 * scan operations ends up using strides that are too big to
727 * encode in an instruction.
728 *
729 * To get around these issues, we just do all 8-bit scan operations
730 * in 16 bits. It's actually fewer instructions than what we'd have
731 * to do if we were trying to do it in native 8-bit types and the
732 * results are the same once we truncate to 8 bits at the end.
733 */
734 if (intrin->dest.ssa.bit_size == 8)
735 return 16;
736 return 0;
737
738 default:
739 return 0;
740 }
741 break;
742 }
743
744 case nir_instr_type_phi: {
745 nir_phi_instr *phi = nir_instr_as_phi(instr);
746 if (phi->dest.ssa.bit_size == 8)
747 return 16;
748 return 0;
749 }
750
751 default:
752 return 0;
753 }
754 }
755
756 /* On gfx12.5+, if the offsets are not both constant and within the [-8, 7]
757  * range, we return true from this filter so that nir_lower_tex() lowers the
758  * offset source.
759 */
760 static bool
761 lower_xehp_tg4_offset_filter(const nir_instr *instr, UNUSED const void *data)
762 {
763 if (instr->type != nir_instr_type_tex)
764 return false;
765
766 nir_tex_instr *tex = nir_instr_as_tex(instr);
767
768 if (tex->op != nir_texop_tg4)
769 return false;
770
771 int offset_index = nir_tex_instr_src_index(tex, nir_tex_src_offset);
772 if (offset_index < 0)
773 return false;
774
775 if (!nir_src_is_const(tex->src[offset_index].src))
776 return true;
777
778 int64_t offset_x = nir_src_comp_as_int(tex->src[offset_index].src, 0);
779 int64_t offset_y = nir_src_comp_as_int(tex->src[offset_index].src, 1);
780
781 return offset_x < -8 || offset_x > 7 || offset_y < -8 || offset_y > 7;
782 }
783
784 /* Does some simple lowering and runs the standard suite of optimizations
785 *
786 * This is intended to be called more-or-less directly after you get the
787 * shader out of GLSL or some other source. While it is geared towards i965,
788 * it is not at all generator-specific except for the is_scalar flag. Even
789 * there, it is safe to call with is_scalar = false for a shader that is
790 * intended for the FS backend as long as nir_optimize is called again with
791 * is_scalar = true to scalarize everything prior to code gen.
792 */
793 void
794 brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
795 const nir_shader *softfp64)
796 {
797 const struct intel_device_info *devinfo = compiler->devinfo;
798 UNUSED bool progress; /* Written by OPT */
799
800 const bool is_scalar = compiler->scalar_stage[nir->info.stage];
801
802 nir_validate_ssa_dominance(nir, "before brw_preprocess_nir");
803
804 if (is_scalar) {
805 OPT(nir_lower_alu_to_scalar, NULL, NULL);
806 }
807
808 if (nir->info.stage == MESA_SHADER_GEOMETRY)
809 OPT(nir_lower_gs_intrinsics, 0);
810
811 /* See also brw_nir_trig_workarounds.py */
812 if (compiler->precise_trig &&
813 !(devinfo->ver >= 10 || devinfo->platform == INTEL_PLATFORM_KBL))
814 OPT(brw_nir_apply_trig_workarounds);
815
816 if (devinfo->ver >= 12)
817 OPT(brw_nir_clamp_image_1d_2d_array_sizes);
818
819 const nir_lower_tex_options tex_options = {
820 .lower_txp = ~0,
821 .lower_txf_offset = true,
822 .lower_rect_offset = true,
823 .lower_txd_cube_map = true,
824 .lower_txd_3d = devinfo->verx10 >= 125,
825 .lower_txb_shadow_clamp = true,
826 .lower_txd_shadow_clamp = true,
827 .lower_txd_offset_clamp = true,
828 .lower_tg4_offsets = true,
829 .lower_txs_lod = true, /* Wa_14012320009 */
830 .lower_offset_filter =
831 devinfo->verx10 >= 125 ? lower_xehp_tg4_offset_filter : NULL,
832 };
833
834 OPT(nir_lower_tex, &tex_options);
835 OPT(nir_normalize_cubemap_coords);
836
837 OPT(nir_lower_global_vars_to_local);
838
839 OPT(nir_split_var_copies);
840 OPT(nir_split_struct_vars, nir_var_function_temp);
841
842 brw_nir_optimize(nir, compiler, is_scalar, true);
843
844 OPT(nir_lower_doubles, softfp64, nir->options->lower_doubles_options);
845 OPT(nir_lower_int64);
846
847 OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
848
849 if (is_scalar) {
850 OPT(nir_lower_load_const_to_scalar);
851 }
852
853 /* Lower a bunch of stuff */
854 OPT(nir_lower_var_copies);
855
856 /* This needs to be run after the first optimization pass but before we
857 * lower indirect derefs away
858 */
859 if (compiler->supports_shader_constants) {
860 OPT(nir_opt_large_constants, NULL, 32);
861 }
862
863 OPT(nir_lower_system_values);
864 OPT(nir_lower_compute_system_values, NULL);
865
866 const nir_lower_subgroups_options subgroups_options = {
867 .ballot_bit_size = 32,
868 .ballot_components = 1,
869 .lower_to_scalar = true,
870 .lower_vote_trivial = !is_scalar,
871 .lower_relative_shuffle = true,
872 .lower_quad_broadcast_dynamic = true,
873 .lower_elect = true,
874 };
875 OPT(nir_lower_subgroups, &subgroups_options);
876
877 OPT(nir_lower_clip_cull_distance_arrays);
878
879 nir_variable_mode indirect_mask =
880 brw_nir_no_indirect_mask(compiler, nir->info.stage);
881 OPT(nir_lower_indirect_derefs, indirect_mask, UINT32_MAX);
882
883    /* Even in cases where we can handle indirect temporaries via scratch, it
884     * can still be expensive. Lower indirects on small arrays to
885 * conditional load/stores.
886 *
887 * The threshold of 16 was chosen semi-arbitrarily. The idea is that an
888 * indirect on an array of 16 elements is about 30 instructions at which
889 * point, you may be better off doing a send. With a SIMD8 program, 16
890 * floats is 1/8 of the entire register file. Any array larger than that
891 * is likely to cause pressure issues. Also, this value is sufficiently
892 * high that the benchmarks known to suffer from large temporary array
893 * issues are helped but nothing else in shader-db is hurt except for maybe
894     * that one Kerbal Space Program shader.
895 */
896 if (is_scalar && !(indirect_mask & nir_var_function_temp))
897 OPT(nir_lower_indirect_derefs, nir_var_function_temp, 16);
898
899 /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and
900 * SSBOs, our back-end is capable of loading an entire vec4 at a time and
901 * we would like to take advantage of that whenever possible regardless of
902 * whether or not the app gives us full loads. This should allow the
903 * optimizer to combine UBO and SSBO load operations and save us some send
904 * messages.
905 */
906 OPT(nir_lower_array_deref_of_vec,
907 nir_var_mem_ubo | nir_var_mem_ssbo,
908 nir_lower_direct_array_deref_of_vec_load);
909
910 /* Get rid of split copies */
911 brw_nir_optimize(nir, compiler, is_scalar, false);
912 }
913
914 void
915 brw_nir_link_shaders(const struct brw_compiler *compiler,
916 nir_shader *producer, nir_shader *consumer)
917 {
918 if (producer->info.stage == MESA_SHADER_MESH &&
919 consumer->info.stage == MESA_SHADER_FRAGMENT) {
920 /* gl_MeshPerPrimitiveNV[].gl_ViewportIndex, gl_PrimitiveID and gl_Layer
921        * are per-primitive, but the fragment shader does not have them marked as
922 * such. Add the annotation here.
923 */
924 nir_foreach_shader_in_variable(var, consumer) {
925 switch (var->data.location) {
926 case VARYING_SLOT_LAYER:
927 case VARYING_SLOT_PRIMITIVE_ID:
928 case VARYING_SLOT_VIEWPORT:
929 var->data.per_primitive = 1;
930 break;
931 default:
932 continue;
933 }
934 }
935 }
936
937 nir_lower_io_arrays_to_elements(producer, consumer);
938 nir_validate_shader(producer, "after nir_lower_io_arrays_to_elements");
939 nir_validate_shader(consumer, "after nir_lower_io_arrays_to_elements");
940
941 const bool p_is_scalar = compiler->scalar_stage[producer->info.stage];
942 const bool c_is_scalar = compiler->scalar_stage[consumer->info.stage];
943
944 if (p_is_scalar && c_is_scalar) {
945 NIR_PASS_V(producer, nir_lower_io_to_scalar_early, nir_var_shader_out);
946 NIR_PASS_V(consumer, nir_lower_io_to_scalar_early, nir_var_shader_in);
947 brw_nir_optimize(producer, compiler, p_is_scalar, false);
948 brw_nir_optimize(consumer, compiler, c_is_scalar, false);
949 }
950
951 if (nir_link_opt_varyings(producer, consumer))
952 brw_nir_optimize(consumer, compiler, c_is_scalar, false);
953
954 NIR_PASS_V(producer, nir_remove_dead_variables, nir_var_shader_out, NULL);
955 NIR_PASS_V(consumer, nir_remove_dead_variables, nir_var_shader_in, NULL);
956
957 if (nir_remove_unused_varyings(producer, consumer)) {
958 NIR_PASS_V(producer, nir_lower_global_vars_to_local);
959 NIR_PASS_V(consumer, nir_lower_global_vars_to_local);
960
961 /* The backend might not be able to handle indirects on
962 * temporaries so we need to lower indirects on any of the
963 * varyings we have demoted here.
964 */
965 NIR_PASS_V(producer, nir_lower_indirect_derefs,
966 brw_nir_no_indirect_mask(compiler, producer->info.stage),
967 UINT32_MAX);
968 NIR_PASS_V(consumer, nir_lower_indirect_derefs,
969 brw_nir_no_indirect_mask(compiler, consumer->info.stage),
970 UINT32_MAX);
971
972 brw_nir_optimize(producer, compiler, p_is_scalar, false);
973 brw_nir_optimize(consumer, compiler, c_is_scalar, false);
974 }
975
976 NIR_PASS_V(producer, nir_lower_io_to_vector, nir_var_shader_out);
977 NIR_PASS_V(producer, nir_opt_combine_stores, nir_var_shader_out);
978 NIR_PASS_V(consumer, nir_lower_io_to_vector, nir_var_shader_in);
979
980 if (producer->info.stage != MESA_SHADER_TESS_CTRL &&
981 producer->info.stage != MESA_SHADER_MESH &&
982 producer->info.stage != MESA_SHADER_TASK) {
983 /* Calling lower_io_to_vector creates output variable writes with
984 * write-masks. On non-TCS outputs, the back-end can't handle it and we
985 * need to call nir_lower_io_to_temporaries to get rid of them. This,
986 * in turn, creates temporary variables and extra copy_deref intrinsics
987 * that we need to clean up.
988 *
989        * Note that Mesh/Task don't support I/O as temporaries (I/O is shared
990        * by the whole workgroup, possibly using multiple HW threads). For
991        * those stages, output write-masks are handled by I/O lowering.
992 */
993 NIR_PASS_V(producer, nir_lower_io_to_temporaries,
994 nir_shader_get_entrypoint(producer), true, false);
995 NIR_PASS_V(producer, nir_lower_global_vars_to_local);
996 NIR_PASS_V(producer, nir_split_var_copies);
997 NIR_PASS_V(producer, nir_lower_var_copies);
998 }
999 }
1000
1001 static bool
1002 brw_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
1003 unsigned bit_size,
1004 unsigned num_components,
1005 nir_intrinsic_instr *low,
1006 nir_intrinsic_instr *high,
1007 void *data)
1008 {
1009 /* Don't combine things to generate 64-bit loads/stores. We have to split
1010 * those back into 32-bit ones anyway and UBO loads aren't split in NIR so
1011 * we don't want to make a mess for the back-end.
1012 */
1013 if (bit_size > 32)
1014 return false;
1015
1016 /* We can handle at most a vec4 right now. Anything bigger would get
1017 * immediately split by brw_nir_lower_mem_access_bit_sizes anyway.
1018 */
1019 if (num_components > 4)
1020 return false;
1021
1022
1023 uint32_t align;
1024 if (align_offset)
1025 align = 1 << (ffs(align_offset) - 1);
1026 else
1027 align = align_mul;
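   /* Example (illustrative): align_mul = 16 with align_offset = 4 yields
    * align = 1 << (ffs(4) - 1) = 4, i.e. the largest power of two known to
    * divide every possible offset.
    */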
1028
1029 if (align < bit_size / 8)
1030 return false;
1031
1032 return true;
1033 }
1034
1035 static
1036 bool combine_all_barriers(nir_intrinsic_instr *a,
1037 nir_intrinsic_instr *b,
1038 void *data)
1039 {
1040 /* Translation to backend IR will get rid of modes we don't care about, so
1041 * no harm in always combining them.
1042 *
1043 * TODO: While HW has only ACQUIRE|RELEASE fences, we could improve the
1044 * scheduling so that it can take advantage of the different semantics.
1045 */
1046 nir_intrinsic_set_memory_modes(a, nir_intrinsic_memory_modes(a) |
1047 nir_intrinsic_memory_modes(b));
1048 nir_intrinsic_set_memory_semantics(a, nir_intrinsic_memory_semantics(a) |
1049 nir_intrinsic_memory_semantics(b));
1050 nir_intrinsic_set_memory_scope(a, MAX2(nir_intrinsic_memory_scope(a),
1051 nir_intrinsic_memory_scope(b)));
1052 return true;
1053 }
1054
1055 static void
1056 brw_vectorize_lower_mem_access(nir_shader *nir,
1057 const struct brw_compiler *compiler,
1058 bool is_scalar,
1059 bool robust_buffer_access)
1060 {
1061 const struct intel_device_info *devinfo = compiler->devinfo;
1062 bool progress = false;
1063
1064 if (is_scalar) {
1065 nir_load_store_vectorize_options options = {
1066 .modes = nir_var_mem_ubo | nir_var_mem_ssbo |
1067 nir_var_mem_global | nir_var_mem_shared,
1068 .callback = brw_nir_should_vectorize_mem,
1069 .robust_modes = (nir_variable_mode)0,
1070 };
1071
1072 if (robust_buffer_access) {
1073 options.robust_modes = nir_var_mem_ubo | nir_var_mem_ssbo |
1074 nir_var_mem_global;
1075 }
1076
1077 OPT(nir_opt_load_store_vectorize, &options);
1078 }
1079
1080 OPT(brw_nir_lower_mem_access_bit_sizes, devinfo);
1081
1082 while (progress) {
1083 progress = false;
1084
1085 OPT(nir_lower_pack);
1086 OPT(nir_copy_prop);
1087 OPT(nir_opt_dce);
1088 OPT(nir_opt_cse);
1089 OPT(nir_opt_algebraic);
1090 OPT(nir_opt_constant_folding);
1091 }
1092 }
1093
1094 static bool
1095 nir_shader_has_local_variables(const nir_shader *nir)
1096 {
1097 nir_foreach_function(func, nir) {
1098 if (func->impl && !exec_list_is_empty(&func->impl->locals))
1099 return true;
1100 }
1101
1102 return false;
1103 }
1104
1105 /* Prepare the given shader for codegen
1106 *
1107 * This function is intended to be called right before going into the actual
1108 * backend and is highly backend-specific. Also, once this function has been
1109 * called on a shader, it will no longer be in SSA form so most optimizations
1110 * will not work.
1111 */
1112 void
1113 brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
1114 bool is_scalar, bool debug_enabled,
1115 bool robust_buffer_access)
1116 {
1117 const struct intel_device_info *devinfo = compiler->devinfo;
1118
1119 UNUSED bool progress; /* Written by OPT */
1120
1121 OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler);
1122
1123 OPT(brw_nir_lower_scoped_barriers);
1124 OPT(nir_opt_combine_memory_barriers, combine_all_barriers, NULL);
1125
1126 do {
1127 progress = false;
1128 OPT(nir_opt_algebraic_before_ffma);
1129 } while (progress);
1130
1131 if (devinfo->verx10 >= 125) {
1132 const nir_lower_idiv_options options = {
1133 .imprecise_32bit_lowering = false,
1134 .allow_fp16 = false
1135 };
1136 OPT(nir_lower_idiv, &options);
1137 }
1138
1139 if (gl_shader_stage_can_set_fragment_shading_rate(nir->info.stage))
1140 brw_nir_lower_shading_rate_output(nir);
1141
1142 brw_nir_optimize(nir, compiler, is_scalar, false);
1143
1144 if (is_scalar && nir_shader_has_local_variables(nir)) {
1145 OPT(nir_lower_vars_to_explicit_types, nir_var_function_temp,
1146 glsl_get_natural_size_align_bytes);
1147 OPT(nir_lower_explicit_io, nir_var_function_temp,
1148 nir_address_format_32bit_offset);
1149 brw_nir_optimize(nir, compiler, is_scalar, false);
1150 }
1151
1152 brw_vectorize_lower_mem_access(nir, compiler, is_scalar,
1153 robust_buffer_access);
1154
1155 if (OPT(nir_lower_int64))
1156 brw_nir_optimize(nir, compiler, is_scalar, false);
1157
1158 if (devinfo->ver >= 6) {
1159 /* Try and fuse multiply-adds */
1160 OPT(brw_nir_opt_peephole_ffma);
1161 }
1162
1163 if (OPT(nir_opt_comparison_pre)) {
1164 OPT(nir_copy_prop);
1165 OPT(nir_opt_dce);
1166 OPT(nir_opt_cse);
1167
1168       /* Do the select peephole again. nir_opt_comparison_pre (combined with
1169 * the other optimization passes) will have removed at least one
1170 * instruction from one of the branches of the if-statement, so now it
1171 * might be under the threshold of conversion to bcsel.
1172 *
1173 * See brw_nir_optimize for the explanation of is_vec4_tessellation.
1174 */
1175 const bool is_vec4_tessellation = !is_scalar &&
1176 (nir->info.stage == MESA_SHADER_TESS_CTRL ||
1177 nir->info.stage == MESA_SHADER_TESS_EVAL);
1178 OPT(nir_opt_peephole_select, 0, is_vec4_tessellation, false);
1179 OPT(nir_opt_peephole_select, 1, is_vec4_tessellation,
1180 compiler->devinfo->ver >= 6);
1181 }
1182
1183 do {
1184 progress = false;
1185 if (OPT(nir_opt_algebraic_late)) {
1186 /* At this late stage, anything that makes more constants will wreak
1187          * havoc on the vec4 backend. The handling of constants in the vec4
1188 * backend is not good.
1189 */
1190 if (is_scalar)
1191 OPT(nir_opt_constant_folding);
1192
1193 OPT(nir_copy_prop);
1194 OPT(nir_opt_dce);
1195 OPT(nir_opt_cse);
1196 }
1197 } while (progress);
1198
1199
1200 OPT(brw_nir_lower_conversions);
1201
1202 if (is_scalar)
1203 OPT(nir_lower_alu_to_scalar, NULL, NULL);
1204
1205 while (OPT(nir_opt_algebraic_distribute_src_mods)) {
1206 OPT(nir_copy_prop);
1207 OPT(nir_opt_dce);
1208 OPT(nir_opt_cse);
1209 }
1210
1211 OPT(nir_copy_prop);
1212 OPT(nir_opt_dce);
1213 OPT(nir_opt_move, nir_move_comparisons);
1214 OPT(nir_opt_dead_cf);
1215
1216 NIR_PASS_V(nir, nir_convert_to_lcssa, true, true);
1217 NIR_PASS_V(nir, nir_divergence_analysis);
1218
1219 /* TODO: Enable nir_opt_uniform_atomics on Gfx7.x too.
1220 * It currently fails Vulkan tests on Haswell for an unknown reason.
1221 */
1222 if (devinfo->ver >= 8 && OPT(nir_opt_uniform_atomics)) {
1223 const nir_lower_subgroups_options subgroups_options = {
1224 .ballot_bit_size = 32,
1225 .ballot_components = 1,
1226 .lower_elect = true,
1227 };
1228 OPT(nir_lower_subgroups, &subgroups_options);
1229
1230 if (OPT(nir_lower_int64))
1231 brw_nir_optimize(nir, compiler, is_scalar, false);
1232 }
1233
1234 /* Clean up LCSSA phis */
1235 OPT(nir_opt_remove_phis);
1236
1237 OPT(nir_lower_bool_to_int32);
1238 OPT(nir_copy_prop);
1239 OPT(nir_opt_dce);
1240
1241 OPT(nir_lower_locals_to_regs);
1242
1243 if (unlikely(debug_enabled)) {
1244 /* Re-index SSA defs so we print more sensible numbers. */
1245 nir_foreach_function(function, nir) {
1246 if (function->impl)
1247 nir_index_ssa_defs(function->impl);
1248 }
1249
1250 fprintf(stderr, "NIR (SSA form) for %s shader:\n",
1251 _mesa_shader_stage_to_string(nir->info.stage));
1252 nir_print_shader(nir, stderr);
1253 }
1254
1255 nir_validate_ssa_dominance(nir, "before nir_convert_from_ssa");
1256
1257 OPT(nir_convert_from_ssa, true);
1258
1259 if (!is_scalar) {
1260 OPT(nir_move_vec_src_uses_to_dest);
1261 OPT(nir_lower_vec_to_movs, NULL, NULL);
1262 }
1263
1264 OPT(nir_opt_dce);
1265
1266 if (OPT(nir_opt_rematerialize_compares))
1267 OPT(nir_opt_dce);
1268
1269 /* This is the last pass we run before we start emitting stuff. It
1270 * determines when we need to insert boolean resolves on Gen <= 5. We
1271 * run it last because it stashes data in instr->pass_flags and we don't
1272 * want that to be squashed by other NIR passes.
1273 */
1274 if (devinfo->ver <= 5)
1275 brw_nir_analyze_boolean_resolves(nir);
1276
1277 nir_sweep(nir);
1278
1279 if (unlikely(debug_enabled)) {
1280 fprintf(stderr, "NIR (final form) for %s shader:\n",
1281 _mesa_shader_stage_to_string(nir->info.stage));
1282 nir_print_shader(nir, stderr);
1283 }
1284 }
1285
1286 static bool
1287 brw_nir_apply_sampler_key(nir_shader *nir,
1288 const struct brw_compiler *compiler,
1289 const struct brw_sampler_prog_key_data *key_tex)
1290 {
1291 const struct intel_device_info *devinfo = compiler->devinfo;
1292 nir_lower_tex_options tex_options = {
1293 .lower_txd_clamp_bindless_sampler = true,
1294 .lower_txd_clamp_if_sampler_index_not_lt_16 = true,
1295 };
1296
1297 /* Iron Lake and prior require lowering of all rectangle textures */
1298 if (devinfo->ver < 6)
1299 tex_options.lower_rect = true;
1300
1301 /* Prior to Broadwell, our hardware can't actually do GL_CLAMP */
1302 if (devinfo->ver < 8) {
1303 tex_options.saturate_s = key_tex->gl_clamp_mask[0];
1304 tex_options.saturate_t = key_tex->gl_clamp_mask[1];
1305 tex_options.saturate_r = key_tex->gl_clamp_mask[2];
1306 }
1307
1308 /* Prior to Haswell, we have to lower gradients on shadow samplers */
1309 tex_options.lower_txd_shadow = devinfo->verx10 <= 70;
1310
1311 tex_options.lower_y_uv_external = key_tex->y_uv_image_mask;
1312 tex_options.lower_y_u_v_external = key_tex->y_u_v_image_mask;
1313 tex_options.lower_yx_xuxv_external = key_tex->yx_xuxv_image_mask;
1314 tex_options.lower_xy_uxvx_external = key_tex->xy_uxvx_image_mask;
1315 tex_options.lower_ayuv_external = key_tex->ayuv_image_mask;
1316 tex_options.lower_xyuv_external = key_tex->xyuv_image_mask;
1317 tex_options.bt709_external = key_tex->bt709_mask;
1318 tex_options.bt2020_external = key_tex->bt2020_mask;
1319
1320 /* Setup array of scaling factors for each texture. */
1321 memcpy(&tex_options.scale_factors, &key_tex->scale_factors,
1322 sizeof(tex_options.scale_factors));
1323
1324 return nir_lower_tex(nir, &tex_options);
1325 }
1326
1327 static unsigned
1328 get_subgroup_size(gl_shader_stage stage,
1329 const struct brw_base_prog_key *key,
1330 unsigned max_subgroup_size)
1331 {
1332 switch (key->subgroup_size_type) {
1333 case BRW_SUBGROUP_SIZE_API_CONSTANT:
1334 /* We have to use the global constant size. */
1335 return BRW_SUBGROUP_SIZE;
1336
1337 case BRW_SUBGROUP_SIZE_UNIFORM:
1338 /* It has to be uniform across all invocations but can vary per stage
1339 * if we want. This gives us a bit more freedom.
1340 *
1341 * For compute, brw_nir_apply_key is called per-dispatch-width so this
1342 * is the actual subgroup size and not a maximum. However, we only
1343 * invoke one size of any given compute shader so it's still guaranteed
1344 * to be uniform across invocations.
1345 */
1346 return max_subgroup_size;
1347
1348 case BRW_SUBGROUP_SIZE_VARYING:
1349 /* The subgroup size is allowed to be fully varying. For geometry
1350 * stages, we know it's always 8 which is max_subgroup_size so we can
1351 * return that. For compute, brw_nir_apply_key is called once per
1352 * dispatch-width so max_subgroup_size is the real subgroup size.
1353 *
1354 * For fragment, we return 0 and let it fall through to the back-end
1355 * compiler. This means we can't optimize based on subgroup size but
1356 * that's a risk the client took when it asked for a varying subgroup
1357 * size.
1358 */
1359 return stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
1360
1361 case BRW_SUBGROUP_SIZE_REQUIRE_8:
1362 case BRW_SUBGROUP_SIZE_REQUIRE_16:
1363 case BRW_SUBGROUP_SIZE_REQUIRE_32:
1364 assert(gl_shader_stage_uses_workgroup(stage));
1365 /* These enum values are expressly chosen to be equal to the subgroup
1366 * size that they require.
1367 */
1368 return key->subgroup_size_type;
1369 }
1370
1371 unreachable("Invalid subgroup size type");
1372 }
1373
1374 void
1375 brw_nir_apply_key(nir_shader *nir,
1376 const struct brw_compiler *compiler,
1377 const struct brw_base_prog_key *key,
1378 unsigned max_subgroup_size,
1379 bool is_scalar)
1380 {
1381 bool progress = false;
1382
1383 OPT(brw_nir_apply_sampler_key, compiler, &key->tex);
1384
1385 const nir_lower_subgroups_options subgroups_options = {
1386 .subgroup_size = get_subgroup_size(nir->info.stage, key,
1387 max_subgroup_size),
1388 .ballot_bit_size = 32,
1389 .ballot_components = 1,
1390 .lower_subgroup_masks = true,
1391 };
1392 OPT(nir_lower_subgroups, &subgroups_options);
1393
1394 if (progress)
1395 brw_nir_optimize(nir, compiler, is_scalar, false);
1396 }
1397
1398 enum brw_conditional_mod
1399 brw_cmod_for_nir_comparison(nir_op op)
1400 {
1401 switch (op) {
1402 case nir_op_flt:
1403 case nir_op_flt32:
1404 case nir_op_ilt:
1405 case nir_op_ilt32:
1406 case nir_op_ult:
1407 case nir_op_ult32:
1408 return BRW_CONDITIONAL_L;
1409
1410 case nir_op_fge:
1411 case nir_op_fge32:
1412 case nir_op_ige:
1413 case nir_op_ige32:
1414 case nir_op_uge:
1415 case nir_op_uge32:
1416 return BRW_CONDITIONAL_GE;
1417
1418 case nir_op_feq:
1419 case nir_op_feq32:
1420 case nir_op_ieq:
1421 case nir_op_ieq32:
1422 case nir_op_b32all_fequal2:
1423 case nir_op_b32all_iequal2:
1424 case nir_op_b32all_fequal3:
1425 case nir_op_b32all_iequal3:
1426 case nir_op_b32all_fequal4:
1427 case nir_op_b32all_iequal4:
1428 return BRW_CONDITIONAL_Z;
1429
1430 case nir_op_fneu:
1431 case nir_op_fneu32:
1432 case nir_op_ine:
1433 case nir_op_ine32:
1434 case nir_op_b32any_fnequal2:
1435 case nir_op_b32any_inequal2:
1436 case nir_op_b32any_fnequal3:
1437 case nir_op_b32any_inequal3:
1438 case nir_op_b32any_fnequal4:
1439 case nir_op_b32any_inequal4:
1440 return BRW_CONDITIONAL_NZ;
1441
1442 default:
1443 unreachable("Unsupported NIR comparison op");
1444 }
1445 }
1446
1447 uint32_t
1448 brw_aop_for_nir_intrinsic(const nir_intrinsic_instr *atomic)
1449 {
1450 switch (atomic->intrinsic) {
1451 #define AOP_CASE(atom) \
1452 case nir_intrinsic_image_atomic_##atom: \
1453 case nir_intrinsic_bindless_image_atomic_##atom: \
1454 case nir_intrinsic_ssbo_atomic_##atom: \
1455 case nir_intrinsic_shared_atomic_##atom: \
1456 case nir_intrinsic_global_atomic_##atom
1457
1458 AOP_CASE(add): {
1459 unsigned src_idx;
1460 switch (atomic->intrinsic) {
1461 case nir_intrinsic_image_atomic_add:
1462 case nir_intrinsic_bindless_image_atomic_add:
1463 src_idx = 3;
1464 break;
1465 case nir_intrinsic_ssbo_atomic_add:
1466 src_idx = 2;
1467 break;
1468 case nir_intrinsic_shared_atomic_add:
1469 case nir_intrinsic_global_atomic_add:
1470 src_idx = 1;
1471 break;
1472 default:
1473 unreachable("Invalid add atomic opcode");
1474 }
1475
1476 if (nir_src_is_const(atomic->src[src_idx])) {
1477 int64_t add_val = nir_src_as_int(atomic->src[src_idx]);
1478 if (add_val == 1)
1479 return BRW_AOP_INC;
1480 else if (add_val == -1)
1481 return BRW_AOP_DEC;
1482 }
1483 return BRW_AOP_ADD;
1484 }
1485
1486 AOP_CASE(imin): return BRW_AOP_IMIN;
1487 AOP_CASE(umin): return BRW_AOP_UMIN;
1488 AOP_CASE(imax): return BRW_AOP_IMAX;
1489 AOP_CASE(umax): return BRW_AOP_UMAX;
1490 AOP_CASE(and): return BRW_AOP_AND;
1491 AOP_CASE(or): return BRW_AOP_OR;
1492 AOP_CASE(xor): return BRW_AOP_XOR;
1493 AOP_CASE(exchange): return BRW_AOP_MOV;
1494 AOP_CASE(comp_swap): return BRW_AOP_CMPWR;
1495
1496 #undef AOP_CASE
1497 #define AOP_CASE(atom) \
1498 case nir_intrinsic_ssbo_atomic_##atom: \
1499 case nir_intrinsic_shared_atomic_##atom: \
1500 case nir_intrinsic_global_atomic_##atom
1501
1502 AOP_CASE(fmin): return BRW_AOP_FMIN;
1503 AOP_CASE(fmax): return BRW_AOP_FMAX;
1504 AOP_CASE(fcomp_swap): return BRW_AOP_FCMPWR;
1505 AOP_CASE(fadd): return BRW_AOP_FADD;
1506
1507 #undef AOP_CASE
1508
1509 default:
1510 unreachable("Unsupported NIR atomic intrinsic");
1511 }
1512 }
1513
1514 enum brw_reg_type
1515 brw_type_for_nir_type(const struct intel_device_info *devinfo,
1516 nir_alu_type type)
1517 {
1518 switch (type) {
1519 case nir_type_uint:
1520 case nir_type_uint32:
1521 return BRW_REGISTER_TYPE_UD;
1522 case nir_type_bool:
1523 case nir_type_int:
1524 case nir_type_bool32:
1525 case nir_type_int32:
1526 return BRW_REGISTER_TYPE_D;
1527 case nir_type_float:
1528 case nir_type_float32:
1529 return BRW_REGISTER_TYPE_F;
1530 case nir_type_float16:
1531 return BRW_REGISTER_TYPE_HF;
1532 case nir_type_float64:
1533 return BRW_REGISTER_TYPE_DF;
1534 case nir_type_int64:
1535 return devinfo->ver < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_Q;
1536 case nir_type_uint64:
1537 return devinfo->ver < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_UQ;
1538 case nir_type_int16:
1539 return BRW_REGISTER_TYPE_W;
1540 case nir_type_uint16:
1541 return BRW_REGISTER_TYPE_UW;
1542 case nir_type_int8:
1543 return BRW_REGISTER_TYPE_B;
1544 case nir_type_uint8:
1545 return BRW_REGISTER_TYPE_UB;
1546 default:
1547 unreachable("unknown type");
1548 }
1549
1550 return BRW_REGISTER_TYPE_F;
1551 }
1552
1553 nir_shader *
1554 brw_nir_create_passthrough_tcs(void *mem_ctx, const struct brw_compiler *compiler,
1555 const nir_shader_compiler_options *options,
1556 const struct brw_tcs_prog_key *key)
1557 {
1558 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_TESS_CTRL,
1559 options, "passthrough TCS");
1560 ralloc_steal(mem_ctx, b.shader);
1561 nir_shader *nir = b.shader;
1562 nir_variable *var;
1563 nir_ssa_def *load;
1564 nir_ssa_def *zero = nir_imm_int(&b, 0);
1565 nir_ssa_def *invoc_id = nir_load_invocation_id(&b);
1566
1567 nir->info.inputs_read = key->outputs_written &
1568 ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER);
1569 nir->info.outputs_written = key->outputs_written;
1570 nir->info.tess.tcs_vertices_out = key->input_vertices;
1571 nir->num_uniforms = 8 * sizeof(uint32_t);
1572
1573 var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_0");
1574 var->data.location = 0;
1575 var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_1");
1576 var->data.location = 1;
1577
1578 /* Write the patch URB header. */
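   /* The eight dwords of "uniforms" declared above hold the two patch-header
    * vec4s; VARYING_SLOT_TESS_LEVEL_INNER - i selects INNER for i == 0 and
    * OUTER for i == 1 (assuming OUTER immediately precedes INNER in
    * gl_varying_slot).
    */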
1579 for (int i = 0; i <= 1; i++) {
1580 load = nir_load_uniform(&b, 4, 32, zero, .base = i * 4 * sizeof(uint32_t));
1581
1582 nir_store_output(&b, load, zero,
1583 .base = VARYING_SLOT_TESS_LEVEL_INNER - i,
1584 .write_mask = WRITEMASK_XYZW);
1585 }
1586
1587 /* Copy inputs to outputs. */
1588 uint64_t varyings = nir->info.inputs_read;
1589
1590 while (varyings != 0) {
1591 const int varying = ffsll(varyings) - 1;
1592
1593 load = nir_load_per_vertex_input(&b, 4, 32, invoc_id, zero, .base = varying);
1594
1595 nir_store_per_vertex_output(&b, load, invoc_id, zero,
1596 .base = varying,
1597 .write_mask = WRITEMASK_XYZW);
1598
1599 varyings &= ~BITFIELD64_BIT(varying);
1600 }
1601
1602 nir_validate_shader(nir, "in brw_nir_create_passthrough_tcs");
1603
1604 brw_preprocess_nir(compiler, nir, NULL);
1605
1606 return nir;
1607 }
1608
1609 nir_ssa_def *
1610 brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load_uniform,
1611 nir_ssa_def *base_addr, unsigned off)
1612 {
1613 assert(load_uniform->intrinsic == nir_intrinsic_load_uniform);
1614 assert(load_uniform->dest.is_ssa);
1615 assert(load_uniform->src[0].is_ssa);
1616
1617 unsigned bit_size = load_uniform->dest.ssa.bit_size;
1618 assert(bit_size >= 8 && bit_size % 8 == 0);
1619 unsigned byte_size = bit_size / 8;
1620 nir_ssa_def *sysval;
1621
1622 if (nir_src_is_const(load_uniform->src[0])) {
1623 uint64_t offset = off +
1624 nir_intrinsic_base(load_uniform) +
1625 nir_src_as_uint(load_uniform->src[0]);
1626
1627 /* Things should be component-aligned. */
1628 assert(offset % byte_size == 0);
1629
1630 unsigned suboffset = offset % 64;
1631 uint64_t aligned_offset = offset - suboffset;
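      /* Worked example (illustrative): offset = 100 gives suboffset = 36 and
       * aligned_offset = 64, so we load the 64-byte blocks at +64 and +128
       * and then extract the requested value starting at bit 36 * 8 = 288.
       */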
1632
1633 /* Load two just in case we go over a 64B boundary */
1634 nir_ssa_def *data[2];
1635 for (unsigned i = 0; i < 2; i++) {
1636 nir_ssa_def *addr = nir_iadd_imm(b, base_addr, aligned_offset + i * 64);
1637 data[i] = nir_load_global_const_block_intel(b, 16, addr,
1638 nir_imm_true(b));
1639 }
1640
1641 sysval = nir_extract_bits(b, data, 2, suboffset * 8,
1642 load_uniform->num_components, bit_size);
1643 } else {
1644 nir_ssa_def *offset32 =
1645 nir_iadd_imm(b, load_uniform->src[0].ssa,
1646 off + nir_intrinsic_base(load_uniform));
1647 nir_ssa_def *addr = nir_iadd(b, base_addr, nir_u2u64(b, offset32));
1648 sysval = nir_load_global_constant(b, addr, byte_size,
1649 load_uniform->num_components, bit_size);
1650 }
1651
1652 return sysval;
1653 }
1654