/*
 * Copyright (c) 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_nir_rt.h"
#include "brw_nir_rt_builder.h"

static nir_ssa_def *
build_leaf_is_procedural(nir_builder *b, struct brw_nir_rt_mem_hit_defs *hit)
{
   switch (b->shader->info.stage) {
   case MESA_SHADER_ANY_HIT:
      /* Any-hit shaders are always compiled into intersection shaders for
       * procedural geometry.  If we got here in an any-hit shader, it's for
       * triangles.
       */
      return nir_imm_false(b);

   case MESA_SHADER_INTERSECTION:
      return nir_imm_true(b);

   default:
      return nir_ieq(b, hit->leaf_type,
                        nir_imm_int(b, BRW_RT_BVH_NODE_TYPE_PROCEDURAL));
   }
}

static void
lower_rt_intrinsics_impl(nir_function_impl *impl,
                         const struct intel_device_info *devinfo)
{
   nir_builder build;
   nir_builder_init(&build, impl);
   nir_builder *b = &build;

   b->cursor = nir_before_block(nir_start_block(b->impl));

   struct brw_nir_rt_globals_defs globals;
   brw_nir_rt_load_globals(b, &globals);

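   /* The SW hotzone is a 16B per-lane record: dword 0 holds the current
    * byte offset into this lane's SW stack and dwords 1-3 hold the ray
    * launch ID.
    */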
   nir_ssa_def *hotzone_addr = brw_nir_rt_sw_hotzone_addr(b, devinfo);
   nir_ssa_def *hotzone = nir_load_global(b, hotzone_addr, 16, 4, 32);

   gl_shader_stage stage = b->shader->info.stage;
   struct brw_nir_rt_mem_ray_defs world_ray_in = {};
   struct brw_nir_rt_mem_ray_defs object_ray_in = {};
   struct brw_nir_rt_mem_hit_defs hit_in = {};
   switch (stage) {
   case MESA_SHADER_ANY_HIT:
   case MESA_SHADER_CLOSEST_HIT:
   case MESA_SHADER_INTERSECTION:
      brw_nir_rt_load_mem_hit(b, &hit_in,
                              stage == MESA_SHADER_CLOSEST_HIT);
      brw_nir_rt_load_mem_ray(b, &object_ray_in,
                              BRW_RT_BVH_LEVEL_OBJECT);
      FALLTHROUGH;

   case MESA_SHADER_MISS:
      brw_nir_rt_load_mem_ray(b, &world_ray_in,
                              BRW_RT_BVH_LEVEL_WORLD);
      break;

   default:
      break;
   }

   nir_ssa_def *thread_stack_base_addr = brw_nir_rt_sw_stack_addr(b, devinfo);
   nir_ssa_def *stack_base_offset = nir_channel(b, hotzone, 0);
   nir_ssa_def *stack_base_addr =
      nir_iadd(b, thread_stack_base_addr, nir_u2u64(b, stack_base_offset));
   ASSERTED bool seen_scratch_base_ptr_load = false;
   ASSERTED bool found_resume = false;

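   /* Walk every instruction, replacing each RT intrinsic with a value
    * loaded from the RT data structures set up above.
    */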
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_intrinsic)
            continue;

         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

         b->cursor = nir_after_instr(&intrin->instr);

         nir_ssa_def *sysval = NULL;
         switch (intrin->intrinsic) {
         case nir_intrinsic_load_scratch_base_ptr:
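            /* This is the call-stack variant of the scratch base pointer
             * (base == 1); resolve it to the per-lane SW stack address
             * computed above.
             */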
            assert(nir_intrinsic_base(intrin) == 1);
            seen_scratch_base_ptr_load = true;
            sysval = stack_base_addr;
            break;

         case nir_intrinsic_btd_stack_push_intel: {
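            /* Advance the stack offset stored in the hotzone so that any
             * BTD-dispatched child shader starts its SW stack above ours.
             */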
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               nir_ssa_def *child_stack_offset =
                  nir_iadd_imm(b, stack_base_offset, stack_size);
               nir_store_global(b, hotzone_addr, 16, child_stack_offset, 0x1);
            }
            nir_instr_remove(instr);
            break;
         }

         case nir_intrinsic_rt_resume: {
            /* This is the first "interesting" instruction */
            assert(block == nir_start_block(impl));
            assert(!seen_scratch_base_ptr_load);
            found_resume = true;

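            /* Pop the frame pushed by btd_stack_push_intel before the
             * shader call: wind the hotzone's stack offset back down and
             * recompute the stack base address.
             */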
            int32_t stack_size = nir_intrinsic_stack_size(intrin);
            if (stack_size > 0) {
               stack_base_offset =
                  nir_iadd_imm(b, stack_base_offset, -stack_size);
               nir_store_global(b, hotzone_addr, 16, stack_base_offset, 0x1);
               stack_base_addr = nir_iadd(b, thread_stack_base_addr,
                                          nir_u2u64(b, stack_base_offset));
            }
            nir_instr_remove(instr);
            break;
         }

         case nir_intrinsic_load_uniform: {
            /* We don't want to lower this in the launch trampoline. */
            if (stage == MESA_SHADER_COMPUTE)
               break;

            assert(intrin->dest.is_ssa);
            assert(intrin->src[0].is_ssa);

            unsigned bit_size = intrin->dest.ssa.bit_size;
            assert(bit_size >= 8 && bit_size % 8 == 0);
            unsigned byte_size = bit_size / 8;

            if (nir_src_is_const(intrin->src[0])) {
               uint64_t offset = BRW_RT_PUSH_CONST_OFFSET +
                                 nir_intrinsic_base(intrin) +
                                 nir_src_as_uint(intrin->src[0]);

               /* Things should be component-aligned. */
               assert(offset % byte_size == 0);

               unsigned suboffset = offset % 64;
               uint64_t aligned_offset = offset - suboffset;

               /* Load two just in case we go over a 64B boundary */
               nir_ssa_def *data[2];
               for (unsigned i = 0; i < 2; i++) {
                  nir_ssa_def *addr =
                     nir_iadd_imm(b, nir_load_btd_global_arg_addr_intel(b),
                                     aligned_offset + i * 64);
                  data[i] = nir_load_global_const_block_intel(b, 16, addr,
                                                              nir_imm_true(b));
               }

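               /* Stitch the requested components back out of the two 64B
                * blocks at the (possibly unaligned) sub-offset.
                */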
               sysval = nir_extract_bits(b, data, 2, suboffset * 8,
                                         intrin->num_components, bit_size);
            } else {
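               /* Non-constant offset: fall back to a regular A64 constant
                * load relative to the global argument pointer.
                */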
               nir_ssa_def *offset32 =
                  nir_iadd_imm(b, intrin->src[0].ssa,
                                  BRW_RT_PUSH_CONST_OFFSET +
                                  nir_intrinsic_base(intrin));
               nir_ssa_def *addr =
                  nir_iadd(b, nir_load_btd_global_arg_addr_intel(b),
                              nir_u2u64(b, offset32));
               sysval = nir_load_global_constant(b, addr, byte_size,
                                                 intrin->num_components, bit_size);
            }
            break;
         }

         case nir_intrinsic_load_ray_launch_id:
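            /* The launch ID was stashed in dwords 1-3 of the hotzone
             * (channel mask 0xe).
             */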
            sysval = nir_channels(b, hotzone, 0xe);
            break;

         case nir_intrinsic_load_ray_launch_size:
            sysval = globals.launch_size;
            break;

         case nir_intrinsic_load_ray_world_origin:
            sysval = world_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_world_direction:
            sysval = world_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_object_origin:
            sysval = object_ray_in.orig;
            break;

         case nir_intrinsic_load_ray_object_direction:
            sysval = object_ray_in.dir;
            break;

         case nir_intrinsic_load_ray_t_min:
            /* It shouldn't matter which ray we pull this from */
            sysval = world_ray_in.t_near;
            break;

         case nir_intrinsic_load_ray_t_max:
            if (stage == MESA_SHADER_MISS)
               sysval = world_ray_in.t_far;
            else
               sysval = hit_in.t;
            break;

         case nir_intrinsic_load_primitive_id: {
            /* It's in dw[3] for procedural and dw[2] for quad
             *
             * TODO: We really need some helpers here.
             */
            nir_ssa_def *offset =
               nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                            nir_iadd_imm(b, hit_in.prim_leaf_index, 12),
                            nir_imm_int(b, 8));
            sysval = nir_load_global(b, nir_iadd(b, hit_in.prim_leaf_ptr,
                                                    nir_u2u64(b, offset)),
                                     4, /* align */ 1, 32);
            break;
         }

         case nir_intrinsic_load_instance_id: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_index;
            break;
         }

         case nir_intrinsic_load_ray_object_to_world: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.object_to_world[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_world_to_object: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.world_to_object[nir_intrinsic_column(intrin)];
            break;
         }

         case nir_intrinsic_load_ray_hit_kind: {
            nir_ssa_def *tri_hit_kind =
               nir_bcsel(b, hit_in.front_face,
                            nir_imm_int(b, BRW_RT_HIT_KIND_FRONT_FACE),
                            nir_imm_int(b, BRW_RT_HIT_KIND_BACK_FACE));
            sysval = nir_bcsel(b, build_leaf_is_procedural(b, &hit_in),
                                  hit_in.aabb_hit_kind, tri_hit_kind);
            break;
         }

         case nir_intrinsic_load_ray_flags:
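            /* The MemRay struct stores the ray flags narrower than 32 bits,
             * so widen them to the 32-bit value the API expects.
             */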
            sysval = nir_u2u32(b, world_ray_in.ray_flags);
            break;

         case nir_intrinsic_load_ray_geometry_index: {
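            /* The second dword of the primitive leaf header holds the
             * geometry index in its low 29 bits; the high bits are flags
             * (including the opaque bit checked below).
             */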
            nir_ssa_def *geometry_index_dw =
               nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                               1, 32);
            sysval = nir_iand_imm(b, geometry_index_dw, BITFIELD_MASK(29));
            break;
         }

         case nir_intrinsic_load_ray_instance_custom_index: {
            struct brw_nir_rt_bvh_instance_leaf_defs leaf;
            brw_nir_rt_load_bvh_instance_leaf(b, &leaf, hit_in.inst_leaf_ptr);
            sysval = leaf.instance_id;
            break;
         }

         case nir_intrinsic_load_shader_record_ptr:
            /* We can't handle this intrinsic in resume shaders because the
             * handle we get there won't be from the original SBT.  The shader
             * call lowering/splitting pass should have ensured that this
             * value was spilled from the initial shader and unspilled in any
             * resume shaders that need it.
             */
            assert(!found_resume);
            sysval = nir_load_btd_local_arg_addr_intel(b);
            break;

         case nir_intrinsic_load_ray_base_mem_addr_intel:
            sysval = globals.base_mem_addr;
            break;

         case nir_intrinsic_load_ray_hw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.hw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_sw_stack_size_intel:
            sysval = nir_imul_imm(b, globals.sw_stack_size, 64);
            break;

         case nir_intrinsic_load_ray_num_dss_rt_stacks_intel:
            sysval = globals.num_dss_rt_stacks;
            break;

         case nir_intrinsic_load_ray_hit_sbt_addr_intel:
            sysval = globals.hit_sbt_addr;
            break;

         case nir_intrinsic_load_ray_hit_sbt_stride_intel:
            sysval = globals.hit_sbt_stride;
            break;

         case nir_intrinsic_load_ray_miss_sbt_addr_intel:
            sysval = globals.miss_sbt_addr;
            break;

         case nir_intrinsic_load_ray_miss_sbt_stride_intel:
            sysval = globals.miss_sbt_stride;
            break;

         case nir_intrinsic_load_callable_sbt_addr_intel:
            sysval = globals.call_sbt_addr;
            break;

         case nir_intrinsic_load_callable_sbt_stride_intel:
            sysval = globals.call_sbt_stride;
            break;

         case nir_intrinsic_load_btd_resume_sbt_addr_intel:
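            /* The resume SBT address is only known at upload time, so build
             * it from two relocated 32-bit constants.
             */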
            sysval = nir_pack_64_2x32_split(b,
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW),
               nir_load_reloc_const_intel(b, BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH));
            break;

         case nir_intrinsic_load_leaf_procedural_intel:
            sysval = build_leaf_is_procedural(b, &hit_in);
            break;

         case nir_intrinsic_load_leaf_opaque_intel: {
            if (stage == MESA_SHADER_INTERSECTION) {
               /* In intersection shaders, the opaque bit is passed to us in
                * the front_face bit.
                */
               sysval = hit_in.front_face;
            } else {
               nir_ssa_def *flags_dw =
                  nir_load_global(b, nir_iadd_imm(b, hit_in.prim_leaf_ptr, 4), 4,
                                  1, 32);
               sysval = nir_i2b(b, nir_iand_imm(b, flags_dw, 1u << 30));
            }
            break;
         }

         default:
            continue;
         }

         if (sysval) {
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                     sysval);
            nir_instr_remove(&intrin->instr);
         }
      }
   }

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);
}

/** Lower ray-tracing system values and intrinsics
 *
 * In most 3D shader stages, intrinsics are a fairly thin wrapper around
 * hardware functionality and system values represent magic bits that come
 * into the shader from fixed-function hardware.  Ray-tracing, however,
 * looks a bit more like the OpenGL 1.0 world where the underlying hardware
 * is simple and most of the API implementation is software.
 *
 * In particular, most things that are treated as system values (or built-ins
 * in SPIR-V) don't get magically dropped into registers for us.  Instead, we
 * have to fetch them from the relevant data structures shared with the
 * ray-tracing hardware.  Most come from either the RT_DISPATCH_GLOBALS or
 * from one of the MemHit data structures.  Some, such as primitive_id,
 * require us to fetch the leaf address from the MemHit struct and then
 * manually read the data out of the BVH.  Instead of trying to emit all this
 * code deep in the back-end where we can't effectively optimize it, we lower
 * it all to global memory access in NIR.
 *
 * Once this pass is complete, the only real system values left are the two
 * argument pointer system values for BTD dispatch: btd_local_arg_addr and
 * btd_global_arg_addr.
 */
void
brw_nir_lower_rt_intrinsics(nir_shader *nir,
                            const struct intel_device_info *devinfo)
{
   nir_foreach_function(function, nir) {
      if (function->impl)
         lower_rt_intrinsics_impl(function->impl, devinfo);
   }
}