/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_NIR_RT_BUILDER_H
#define BRW_NIR_RT_BUILDER_H

/* This file provides helpers to access the memory-based data structures
 * that the RT hardware reads/writes, and to compute their locations.
 *
 * See also "Memory Based Data Structures for Ray Tracing" (BSpec 47547) and
 * "Ray Tracing Address Computation for Memory Resident Structures" (BSpec
 * 47550).
 */

#include "brw_rt.h"
#include "nir_builder.h"

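/* Memory accesses in fragment shaders are tagged with
 * ACCESS_INCLUDE_HELPERS so that helper invocations also perform them; ray
 * queries can run in helper lanes, and their stack memory still has to be
 * read and written there.
 */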
#define is_access_for_builder(b) \
   ((b)->shader->info.stage == MESA_SHADER_FRAGMENT ? \
    ACCESS_INCLUDE_HELPERS : 0)

static inline nir_ssa_def *
brw_nir_rt_load(nir_builder *b, nir_ssa_def *addr, unsigned align,
                unsigned components, unsigned bit_size)
{
   return nir_build_load_global(b, components, bit_size, addr,
                                .align_mul = align,
                                .access = is_access_for_builder(b));
}

static inline void
brw_nir_rt_store(nir_builder *b, nir_ssa_def *addr, unsigned align,
                 nir_ssa_def *value, unsigned write_mask)
{
   nir_build_store_global(b, value, addr,
                          .align_mul = align,
                          .write_mask = (write_mask) &
                                        BITFIELD_MASK(value->num_components),
                          .access = is_access_for_builder(b));
}

static inline nir_ssa_def *
brw_nir_rt_load_const(nir_builder *b, unsigned components,
                      nir_ssa_def *addr, nir_ssa_def *pred)
{
   return nir_build_load_global_const_block_intel(b, components, addr, pred);
}

static inline nir_ssa_def *
brw_load_btd_dss_id(nir_builder *b)
{
   return nir_build_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_DSS);
}

static inline nir_ssa_def *
brw_nir_rt_load_num_simd_lanes_per_dss(nir_builder *b,
                                       const struct intel_device_info *devinfo)
{
   return nir_imm_int(b, devinfo->num_thread_per_eu *
                         devinfo->max_eus_per_subslice *
                         16 /* The RT computation is based off SIMD16 */);
}

static inline nir_ssa_def *
brw_load_eu_thread_simd(nir_builder *b)
{
   return nir_build_load_topology_id_intel(b, .base = BRW_TOPOLOGY_ID_EU_THREAD_SIMD);
}

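/* AsyncStackID = DSSID * RTDispatchGlobals.numDSSRTStacks + BTDStackID
 * (see BSpec 47550); only valid in bindless shader stages, hence the
 * assert below.
 */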
static inline nir_ssa_def *
brw_nir_rt_async_stack_id(nir_builder *b)
{
   assert(gl_shader_stage_is_callable(b->shader->info.stage) ||
          b->shader->info.stage == MESA_SHADER_RAYGEN);
   return nir_iadd(b, nir_umul_32x16(b, nir_load_ray_num_dss_rt_stacks_intel(b),
                                     brw_load_btd_dss_id(b)),
                   nir_load_btd_stack_id_intel(b));
}

static inline nir_ssa_def *
brw_nir_rt_sync_stack_id(nir_builder *b)
{
   return brw_load_eu_thread_simd(b);
}

/* We have our own load/store scratch helpers because they emit a global
 * memory read or write based on the scratch_base_ptr system value rather
 * than a load/store_scratch intrinsic.
 */
static inline nir_ssa_def *
brw_nir_rt_load_scratch(nir_builder *b, uint32_t offset, unsigned align,
                        unsigned num_components, unsigned bit_size)
{
   nir_ssa_def *addr =
      nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), offset);
   return brw_nir_rt_load(b, addr, MIN2(align, BRW_BTD_STACK_ALIGN),
                          num_components, bit_size);
}

static inline void
brw_nir_rt_store_scratch(nir_builder *b, uint32_t offset, unsigned align,
                         nir_ssa_def *value, nir_component_mask_t write_mask)
{
   nir_ssa_def *addr =
      nir_iadd_imm(b, nir_load_scratch_base_ptr(b, 1, 64, 1), offset);
   brw_nir_rt_store(b, addr, MIN2(align, BRW_BTD_STACK_ALIGN),
                    value, write_mask);
}

static inline void
brw_nir_btd_spawn(nir_builder *b, nir_ssa_def *record_addr)
{
   nir_btd_spawn_intel(b, nir_load_btd_global_arg_addr_intel(b), record_addr);
}

static inline void
brw_nir_btd_retire(nir_builder *b)
{
   nir_btd_retire_intel(b);
}

/** This is a pseudo-op which does a bindless return
 *
 * It loads the return address from the stack and calls btd_spawn to spawn
 * the resume shader.
 */
static inline void
brw_nir_btd_return(struct nir_builder *b)
{
   assert(b->shader->scratch_size == BRW_BTD_STACK_CALLEE_DATA_SIZE);
   nir_ssa_def *resume_addr =
      brw_nir_rt_load_scratch(b, BRW_BTD_STACK_RESUME_BSR_ADDR_OFFSET,
                              8 /* align */, 1, 64);
   brw_nir_btd_spawn(b, resume_addr);
}

static inline void
assert_def_size(nir_ssa_def *def, unsigned num_components, unsigned bit_size)
{
   assert(def->num_components == num_components);
   assert(def->bit_size == bit_size);
}

static inline nir_ssa_def *
brw_nir_num_rt_stacks(nir_builder *b,
                      const struct intel_device_info *devinfo)
{
   return nir_imul_imm(b, nir_load_ray_num_dss_rt_stacks_intel(b),
                       intel_device_info_num_dual_subslices(devinfo));
}

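/* The SW hotzones occupy the num_stacks * BRW_RT_SIZEOF_HOTZONE bytes
 * immediately below rtMemBasePtr, indexed by the async stack ID, so the
 * 32-bit offset computed below is negative.
 */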
static inline nir_ssa_def *
brw_nir_rt_sw_hotzone_addr(nir_builder *b,
                           const struct intel_device_info *devinfo)
{
   nir_ssa_def *offset32 =
      nir_imul_imm(b, brw_nir_rt_async_stack_id(b),
                   BRW_RT_SIZEOF_HOTZONE);

   offset32 = nir_iadd(b, offset32, nir_ineg(b,
      nir_imul_imm(b, brw_nir_num_rt_stacks(b, devinfo),
                   BRW_RT_SIZEOF_HOTZONE)));

   return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
                   nir_i2i64(b, offset32));
}

static inline nir_ssa_def *
brw_nir_rt_sync_stack_addr(nir_builder *b,
                           nir_ssa_def *base_mem_addr,
                           const struct intel_device_info *devinfo)
{
   /* For ray queries (synchronous ray tracing), the formula is similar but
    * goes down from rtMemBasePtr:
    *
    *    syncBase = RTDispatchGlobals.rtMemBasePtr
    *             - (DSSID * NUM_SIMD_LANES_PER_DSS + SyncStackID + 1)
    *             * syncStackSize
    *
    * We assume that we can calculate a 32-bit offset first and then add it
    * to the 64-bit base address at the end.
    */
   nir_ssa_def *offset32 =
      nir_imul(b,
               nir_iadd(b,
                        nir_imul(b, brw_load_btd_dss_id(b),
                                 brw_nir_rt_load_num_simd_lanes_per_dss(b, devinfo)),
                        nir_iadd_imm(b, brw_nir_rt_sync_stack_id(b), 1)),
               nir_imm_int(b, BRW_RT_SIZEOF_RAY_QUERY));
   return nir_isub(b, base_mem_addr, nir_u2u64(b, offset32));
}

static inline nir_ssa_def *
brw_nir_rt_stack_addr(nir_builder *b)
{
   /* From the BSpec "Address Computation for Memory Based Data Structures:
    * Ray and TraversalStack (Async Ray Tracing)":
    *
    *    stackBase = RTDispatchGlobals.rtMemBasePtr
    *              + (DSSID * RTDispatchGlobals.numDSSRTStacks + stackID)
    *              * RTDispatchGlobals.stackSizePerRay // 64B aligned
    *
    * We assume that we can calculate a 32-bit offset first and then add it
    * to the 64-bit base address at the end.
    */
   nir_ssa_def *offset32 =
      nir_imul(b, brw_nir_rt_async_stack_id(b),
               nir_load_ray_hw_stack_size_intel(b));
   return nir_iadd(b, nir_load_ray_base_mem_addr_intel(b),
                   nir_u2u64(b, offset32));
}

static inline nir_ssa_def *
brw_nir_rt_mem_hit_addr_from_addr(nir_builder *b,
                                  nir_ssa_def *stack_addr,
                                  bool committed)
{
   return nir_iadd_imm(b, stack_addr, committed ? 0 : BRW_RT_SIZEOF_HIT_INFO);
}

static inline nir_ssa_def *
brw_nir_rt_mem_hit_addr(nir_builder *b, bool committed)
{
   return nir_iadd_imm(b, brw_nir_rt_stack_addr(b),
                       committed ? 0 : BRW_RT_SIZEOF_HIT_INFO);
}

static inline nir_ssa_def *
brw_nir_rt_hit_attrib_data_addr(nir_builder *b)
{
   return nir_iadd_imm(b, brw_nir_rt_stack_addr(b),
                       BRW_RT_OFFSETOF_HIT_ATTRIB_DATA);
}

static inline nir_ssa_def *
brw_nir_rt_mem_ray_addr(nir_builder *b,
                        nir_ssa_def *stack_addr,
                        enum brw_rt_bvh_level bvh_level)
{
   /* From the BSpec "Address Computation for Memory Based Data Structures:
    * Ray and TraversalStack (Async Ray Tracing)":
    *
    *    rayBase = stackBase + sizeof(HitInfo) * 2 // 64B aligned
    *    rayPtr  = rayBase + bvhLevel * sizeof(Ray); // 64B aligned
    *
    * In Vulkan, we always have exactly two levels of BVH: World and Object.
    */
   uint32_t offset = BRW_RT_SIZEOF_HIT_INFO * 2 +
                     bvh_level * BRW_RT_SIZEOF_RAY;
   return nir_iadd_imm(b, stack_addr, offset);
}

static inline nir_ssa_def *
brw_nir_rt_sw_stack_addr(nir_builder *b,
                         const struct intel_device_info *devinfo)
{
   nir_ssa_def *addr = nir_load_ray_base_mem_addr_intel(b);

   nir_ssa_def *offset32 = nir_imul(b, brw_nir_num_rt_stacks(b, devinfo),
                                    nir_load_ray_hw_stack_size_intel(b));
   addr = nir_iadd(b, addr, nir_u2u64(b, offset32));

   nir_ssa_def *offset_in_stack =
      nir_imul(b, nir_u2u64(b, brw_nir_rt_async_stack_id(b)),
               nir_u2u64(b, nir_load_ray_sw_stack_size_intel(b)));

   return nir_iadd(b, addr, offset_in_stack);
}

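/* Returns the third 16-bit word (bits 47:32) of a 64-bit value. */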
static inline nir_ssa_def *
nir_unpack_64_4x16_split_z(nir_builder *b, nir_ssa_def *val)
{
   return nir_unpack_32_2x16_split_x(b, nir_unpack_64_2x32_split_y(b, val));
}

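/* Fields of the RTDispatchGlobals structure, as decoded from memory by
 * brw_nir_rt_load_globals_addr() below (layout per BSpec 47547).
 */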
struct brw_nir_rt_globals_defs {
   nir_ssa_def *base_mem_addr;
   nir_ssa_def *call_stack_handler_addr;
   nir_ssa_def *hw_stack_size;
   nir_ssa_def *num_dss_rt_stacks;
   nir_ssa_def *hit_sbt_addr;
   nir_ssa_def *hit_sbt_stride;
   nir_ssa_def *miss_sbt_addr;
   nir_ssa_def *miss_sbt_stride;
   nir_ssa_def *sw_stack_size;
   nir_ssa_def *launch_size;
   nir_ssa_def *call_sbt_addr;
   nir_ssa_def *call_sbt_stride;
   nir_ssa_def *resume_sbt_addr;
};

static inline void
brw_nir_rt_load_globals_addr(nir_builder *b,
                             struct brw_nir_rt_globals_defs *defs,
                             nir_ssa_def *addr)
{
   nir_ssa_def *data;
   data = brw_nir_rt_load_const(b, 16, addr, nir_imm_true(b));
   defs->base_mem_addr = nir_pack_64_2x32(b, nir_channels(b, data, 0x3));

   defs->call_stack_handler_addr =
      nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));

   defs->hw_stack_size = nir_channel(b, data, 4);
   defs->num_dss_rt_stacks = nir_iand_imm(b, nir_channel(b, data, 5), 0xffff);
   defs->hit_sbt_addr =
      nir_pack_64_2x32_split(b, nir_channel(b, data, 8),
                             nir_extract_i16(b, nir_channel(b, data, 9),
                                             nir_imm_int(b, 0)));
   defs->hit_sbt_stride =
      nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 9));
   defs->miss_sbt_addr =
      nir_pack_64_2x32_split(b, nir_channel(b, data, 10),
                             nir_extract_i16(b, nir_channel(b, data, 11),
                                             nir_imm_int(b, 0)));
   defs->miss_sbt_stride =
      nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 11));
   defs->sw_stack_size = nir_channel(b, data, 12);
   defs->launch_size = nir_channels(b, data, 0x7u << 13);

   data = brw_nir_rt_load_const(b, 8, nir_iadd_imm(b, addr, 64), nir_imm_true(b));
   defs->call_sbt_addr =
      nir_pack_64_2x32_split(b, nir_channel(b, data, 0),
                             nir_extract_i16(b, nir_channel(b, data, 1),
                                             nir_imm_int(b, 0)));
   defs->call_sbt_stride =
      nir_unpack_32_2x16_split_y(b, nir_channel(b, data, 1));

   defs->resume_sbt_addr =
      nir_pack_64_2x32(b, nir_channels(b, data, 0x3 << 2));
}

static inline void
brw_nir_rt_load_globals(nir_builder *b,
                        struct brw_nir_rt_globals_defs *defs)
{
   brw_nir_rt_load_globals_addr(b, defs, nir_load_btd_global_arg_addr_intel(b));
}

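/* Note that nir_extract_i16() below sign-extends bit 47 through the top 16
 * bits, which we assume is what produces a canonical 48-bit pointer for the
 * hardware (as opposed to zero-extending).
 */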
static inline nir_ssa_def *
brw_nir_rt_unpack_leaf_ptr(nir_builder *b, nir_ssa_def *vec2)
{
   /* Hit record leaf pointers are 42-bit and assumed to be in 64B chunks.
    * This leaves 22 bits at the top for other stuff.
    */
   nir_ssa_def *ptr64 = nir_imul_imm(b, nir_pack_64_2x32(b, vec2), 64);

   /* The top 16 bits (remember, we shifted by 6 already) contain garbage
    * that we need to get rid of.
    */
   nir_ssa_def *ptr_lo = nir_unpack_64_2x32_split_x(b, ptr64);
   nir_ssa_def *ptr_hi = nir_unpack_64_2x32_split_y(b, ptr64);
   ptr_hi = nir_extract_i16(b, ptr_hi, nir_imm_int(b, 0));
   return nir_pack_64_2x32_split(b, ptr_lo, ptr_hi);
}

struct brw_nir_rt_mem_hit_defs {
   nir_ssa_def *t;
   nir_ssa_def *tri_bary; /**< Only valid for triangle geometry */
   nir_ssa_def *aabb_hit_kind; /**< Only valid for AABB geometry */
   nir_ssa_def *valid;
   nir_ssa_def *leaf_type;
   nir_ssa_def *prim_leaf_index;
   nir_ssa_def *bvh_level;
   nir_ssa_def *front_face;
   nir_ssa_def *done; /**< Only for ray queries */
   nir_ssa_def *prim_leaf_ptr;
   nir_ssa_def *inst_leaf_ptr;
};

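/* MemHit DW3 bitfield, as decoded below:
 *   [15:0] primIndexDelta, [16] valid, [19:17] leafType,
 *   [23:20] primLeafIndex, [26:24] bvhLevel, [27] frontFace,
 *   [28] done (ray queries only).
 */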
static inline void
brw_nir_rt_load_mem_hit_from_addr(nir_builder *b,
                                  struct brw_nir_rt_mem_hit_defs *defs,
                                  nir_ssa_def *stack_addr,
                                  bool committed)
{
   nir_ssa_def *hit_addr =
      brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, committed);

   nir_ssa_def *data = brw_nir_rt_load(b, hit_addr, 16, 4, 32);
   defs->t = nir_channel(b, data, 0);
   defs->aabb_hit_kind = nir_channel(b, data, 1);
   defs->tri_bary = nir_channels(b, data, 0x6);
   nir_ssa_def *bitfield = nir_channel(b, data, 3);
   defs->valid = nir_i2b(b, nir_iand_imm(b, bitfield, 1u << 16));
   defs->leaf_type =
      nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 17), nir_imm_int(b, 3));
   defs->prim_leaf_index =
      nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 20), nir_imm_int(b, 4));
   defs->bvh_level =
      nir_ubitfield_extract(b, bitfield, nir_imm_int(b, 24), nir_imm_int(b, 3));
   defs->front_face = nir_i2b(b, nir_iand_imm(b, bitfield, 1 << 27));
   defs->done = nir_i2b(b, nir_iand_imm(b, bitfield, 1 << 28));

   data = brw_nir_rt_load(b, nir_iadd_imm(b, hit_addr, 16), 16, 4, 32);
   defs->prim_leaf_ptr =
      brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 0));
   defs->inst_leaf_ptr =
      brw_nir_rt_unpack_leaf_ptr(b, nir_channels(b, data, 0x3 << 2));
}

static inline void
brw_nir_rt_init_mem_hit_at_addr(nir_builder *b,
                                nir_ssa_def *stack_addr,
                                bool committed,
                                nir_ssa_def *t_max)
{
   nir_ssa_def *mem_hit_addr =
      brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, committed);

   /* Set the t_max value from the ray initialization */
   nir_ssa_def *hit_t_addr = mem_hit_addr;
   brw_nir_rt_store(b, hit_t_addr, 4, t_max, 0x1);

   /* Clear all the flags packed behind primIndexDelta */
   nir_ssa_def *state_addr = nir_iadd_imm(b, mem_hit_addr, 12);
   brw_nir_rt_store(b, state_addr, 4, nir_imm_int(b, 0), 0x1);
}

static inline void
brw_nir_rt_load_mem_hit(nir_builder *b,
                        struct brw_nir_rt_mem_hit_defs *defs,
                        bool committed)
{
   brw_nir_rt_load_mem_hit_from_addr(b, defs, brw_nir_rt_stack_addr(b),
                                     committed);
}

static inline void
brw_nir_memcpy_global(nir_builder *b,
                      nir_ssa_def *dst_addr, uint32_t dst_align,
                      nir_ssa_def *src_addr, uint32_t src_align,
                      uint32_t size)
{
   /* We're going to copy in 16B chunks */
   assert(size % 16 == 0);
   dst_align = MIN2(dst_align, 16);
   src_align = MIN2(src_align, 16);

   for (unsigned offset = 0; offset < size; offset += 16) {
      nir_ssa_def *data =
         brw_nir_rt_load(b, nir_iadd_imm(b, src_addr, offset), src_align,
                         4, 32);
      brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), dst_align,
                       data, 0xf /* write_mask */);
   }
}

static inline void
brw_nir_memclear_global(nir_builder *b,
                        nir_ssa_def *dst_addr, uint32_t dst_align,
                        uint32_t size)
{
   /* We're going to clear in 16B chunks */
   assert(size % 16 == 0);
   dst_align = MIN2(dst_align, 16);

   nir_ssa_def *zero = nir_imm_ivec4(b, 0, 0, 0, 0);
   for (unsigned offset = 0; offset < size; offset += 16) {
      brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), dst_align,
                       zero, 0xf /* write_mask */);
   }
}

static inline nir_ssa_def *
brw_nir_rt_query_done(nir_builder *b, nir_ssa_def *stack_addr)
{
   struct brw_nir_rt_mem_hit_defs hit_in = {};
   brw_nir_rt_load_mem_hit_from_addr(b, &hit_in, stack_addr,
                                     false /* committed */);

   return hit_in.done;
}

static inline void
brw_nir_rt_set_dword_bit_at(nir_builder *b,
                            nir_ssa_def *addr,
                            uint32_t addr_offset,
                            uint32_t bit)
{
   nir_ssa_def *dword_addr = nir_iadd_imm(b, addr, addr_offset);
   nir_ssa_def *dword = brw_nir_rt_load(b, dword_addr, 4, 1, 32);
   brw_nir_rt_store(b, dword_addr, 4, nir_ior_imm(b, dword, 1u << bit), 0x1);
}

static inline void
brw_nir_rt_query_mark_done(nir_builder *b, nir_ssa_def *stack_addr)
{
   brw_nir_rt_set_dword_bit_at(b,
                               brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr,
                                                                 false /* committed */),
                               4 * 3 /* dword offset */, 28 /* bit */);
}

/* This helper clears the 3rd dword (which holds the valid bit) of both the
 * committed and potential MemHit structures.
 */
static inline void
brw_nir_rt_query_mark_init(nir_builder *b, nir_ssa_def *stack_addr)
{
   nir_ssa_def *dword_addr;

   for (uint32_t i = 0; i < 2; i++) {
      dword_addr =
         nir_iadd_imm(b,
                      brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr,
                                                        i == 0 /* committed */),
                      4 * 3 /* dword offset */);
      brw_nir_rt_store(b, dword_addr, 4, nir_imm_int(b, 0), 0x1);
   }
}

/* This helper is essentially a memcpy of the uncommitted (potential) hit
 * structure into the committed one, additionally setting the valid bit.
 */
static inline void
brw_nir_rt_commit_hit_addr(nir_builder *b, nir_ssa_def *stack_addr)
{
   nir_ssa_def *dst_addr =
      brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true /* committed */);
   nir_ssa_def *src_addr =
      brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false /* committed */);

   for (unsigned offset = 0; offset < BRW_RT_SIZEOF_HIT_INFO; offset += 16) {
      nir_ssa_def *data =
         brw_nir_rt_load(b, nir_iadd_imm(b, src_addr, offset), 16, 4, 32);

      if (offset == 0) {
         data = nir_vec4(b,
                         nir_channel(b, data, 0),
                         nir_channel(b, data, 1),
                         nir_channel(b, data, 2),
                         nir_ior_imm(b,
                                     nir_channel(b, data, 3),
                                     0x1 << 16 /* valid */));

         /* Also write the potential hit back, since we changed it. */
         brw_nir_rt_store(b, nir_iadd_imm(b, src_addr, offset), 16,
                          data, 0xf /* write_mask */);
      }

      brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, offset), 16,
                       data, 0xf /* write_mask */);
   }
}

static inline void
brw_nir_rt_commit_hit(nir_builder *b)
{
   nir_ssa_def *stack_addr = brw_nir_rt_stack_addr(b);
   brw_nir_rt_commit_hit_addr(b, stack_addr);
}

static inline void
brw_nir_rt_generate_hit_addr(nir_builder *b, nir_ssa_def *stack_addr, nir_ssa_def *t_val)
{
   nir_ssa_def *dst_addr =
      brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, true /* committed */);
   nir_ssa_def *src_addr =
      brw_nir_rt_mem_hit_addr_from_addr(b, stack_addr, false /* committed */);

   /* Load the two vec4s making up the potential MemHit */
   nir_ssa_def *potential_data[2] = {
      brw_nir_rt_load(b, src_addr, 16, 4, 32),
      brw_nir_rt_load(b, nir_iadd_imm(b, src_addr, 16), 16, 4, 32),
   };

   /* Update the potential hit distance */
   brw_nir_rt_store(b, src_addr, 4, t_val, 0x1);
   /* Also mark the potential hit as valid */
   brw_nir_rt_store(b, nir_iadd_imm(b, src_addr, 12), 4,
                    nir_ior_imm(b, nir_channel(b, potential_data[0], 3),
                                (0x1 << 16) /* valid */), 0x1);

   /* Now write the committed hit. */
   nir_ssa_def *committed_data[2] = {
      nir_vec4(b,
               t_val,
               nir_imm_float(b, 0.0f), /* barycentric */
               nir_imm_float(b, 0.0f), /* barycentric */
               nir_ior_imm(b,
                           /* Just keep leaf_type */
                           nir_iand_imm(b, nir_channel(b, potential_data[0], 3), 0x0000e000),
                           (0x1 << 16) /* valid */ |
                           (BRW_RT_BVH_LEVEL_OBJECT << 5))),
      potential_data[1],
   };

   brw_nir_rt_store(b, dst_addr, 16, committed_data[0], 0xf /* write_mask */);
   brw_nir_rt_store(b, nir_iadd_imm(b, dst_addr, 16), 16,
                    committed_data[1], 0xf /* write_mask */);
}

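/* HW MemRay layout, as written by the helpers below:
 *   DW 0-3:   org.xyz, dir.x
 *   DW 4-7:   dir.yz, tnear, tfar
 *   DW 8-11:  rootNodePtr (48b) + rayFlags (16b),
 *             hitGroupSRBasePtr (48b) + hitGroupSRStride (16b)
 *   DW 12-15: missSRPtr (48b) + shaderIndexMultiplier (16b, shifted by 8),
 *             instLeafPtr (48b) + rayMask
 */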
struct brw_nir_rt_mem_ray_defs {
   nir_ssa_def *orig;
   nir_ssa_def *dir;
   nir_ssa_def *t_near;
   nir_ssa_def *t_far;
   nir_ssa_def *root_node_ptr;
   nir_ssa_def *ray_flags;
   nir_ssa_def *hit_group_sr_base_ptr;
   nir_ssa_def *hit_group_sr_stride;
   nir_ssa_def *miss_sr_ptr;
   nir_ssa_def *shader_index_multiplier;
   nir_ssa_def *inst_leaf_ptr;
   nir_ssa_def *ray_mask;
};

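/* Ray queries are traversed synchronously and never dispatch shaders, so
 * only the geometric part of the MemRay (plus the root node pointer,
 * instance leaf pointer and ray mask) is written here; the SBT-related
 * fields are left untouched.
 */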
static inline void
brw_nir_rt_store_mem_ray_query_at_addr(nir_builder *b,
                                       nir_ssa_def *ray_addr,
                                       const struct brw_nir_rt_mem_ray_defs *defs)
{
   assert_def_size(defs->orig, 3, 32);
   assert_def_size(defs->dir, 3, 32);
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 0), 16,
                    nir_vec4(b, nir_channel(b, defs->orig, 0),
                             nir_channel(b, defs->orig, 1),
                             nir_channel(b, defs->orig, 2),
                             nir_channel(b, defs->dir, 0)),
                    ~0 /* write mask */);

   assert_def_size(defs->t_near, 1, 32);
   assert_def_size(defs->t_far, 1, 32);
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 16), 16,
                    nir_vec4(b, nir_channel(b, defs->dir, 1),
                             nir_channel(b, defs->dir, 2),
                             defs->t_near,
                             defs->t_far),
                    ~0 /* write mask */);

   assert_def_size(defs->root_node_ptr, 1, 64);
   assert_def_size(defs->ray_flags, 1, 16);
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 32), 16,
                    nir_vec2(b, nir_unpack_64_2x32_split_x(b, defs->root_node_ptr),
                             nir_pack_32_2x16_split(b,
                                nir_unpack_64_4x16_split_z(b, defs->root_node_ptr),
                                defs->ray_flags)),
                    0x3 /* write mask */);

   /* leaf_ptr is optional */
   nir_ssa_def *inst_leaf_ptr;
   if (defs->inst_leaf_ptr) {
      inst_leaf_ptr = defs->inst_leaf_ptr;
   } else {
      inst_leaf_ptr = nir_imm_int64(b, 0);
   }

   assert_def_size(inst_leaf_ptr, 1, 64);
   assert_def_size(defs->ray_mask, 1, 32);
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 56), 8,
                    nir_vec2(b, nir_unpack_64_2x32_split_x(b, inst_leaf_ptr),
                             nir_pack_32_2x16_split(b,
                                nir_unpack_64_4x16_split_z(b, inst_leaf_ptr),
                                nir_unpack_32_2x16_split_x(b, defs->ray_mask))),
                    ~0 /* write mask */);
}

static inline void
brw_nir_rt_store_mem_ray(nir_builder *b,
                         const struct brw_nir_rt_mem_ray_defs *defs,
                         enum brw_rt_bvh_level bvh_level)
{
   nir_ssa_def *ray_addr =
      brw_nir_rt_mem_ray_addr(b, brw_nir_rt_stack_addr(b), bvh_level);

   assert_def_size(defs->orig, 3, 32);
   assert_def_size(defs->dir, 3, 32);
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 0), 16,
                    nir_vec4(b, nir_channel(b, defs->orig, 0),
                             nir_channel(b, defs->orig, 1),
                             nir_channel(b, defs->orig, 2),
                             nir_channel(b, defs->dir, 0)),
                    ~0 /* write mask */);

   assert_def_size(defs->t_near, 1, 32);
   assert_def_size(defs->t_far, 1, 32);
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 16), 16,
                    nir_vec4(b, nir_channel(b, defs->dir, 1),
                             nir_channel(b, defs->dir, 2),
                             defs->t_near,
                             defs->t_far),
                    ~0 /* write mask */);

   assert_def_size(defs->root_node_ptr, 1, 64);
   assert_def_size(defs->ray_flags, 1, 16);
   assert_def_size(defs->hit_group_sr_base_ptr, 1, 64);
   assert_def_size(defs->hit_group_sr_stride, 1, 16);
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 32), 16,
                    nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->root_node_ptr),
                             nir_pack_32_2x16_split(b,
                                nir_unpack_64_4x16_split_z(b, defs->root_node_ptr),
                                defs->ray_flags),
                             nir_unpack_64_2x32_split_x(b, defs->hit_group_sr_base_ptr),
                             nir_pack_32_2x16_split(b,
                                nir_unpack_64_4x16_split_z(b, defs->hit_group_sr_base_ptr),
                                defs->hit_group_sr_stride)),
                    ~0 /* write mask */);

   /* leaf_ptr is optional */
   nir_ssa_def *inst_leaf_ptr;
   if (defs->inst_leaf_ptr) {
      inst_leaf_ptr = defs->inst_leaf_ptr;
   } else {
      inst_leaf_ptr = nir_imm_int64(b, 0);
   }

   assert_def_size(defs->miss_sr_ptr, 1, 64);
   assert_def_size(defs->shader_index_multiplier, 1, 32);
   assert_def_size(inst_leaf_ptr, 1, 64);
   assert_def_size(defs->ray_mask, 1, 32);
   brw_nir_rt_store(b, nir_iadd_imm(b, ray_addr, 48), 16,
                    nir_vec4(b, nir_unpack_64_2x32_split_x(b, defs->miss_sr_ptr),
                             nir_pack_32_2x16_split(b,
                                nir_unpack_64_4x16_split_z(b, defs->miss_sr_ptr),
                                nir_unpack_32_2x16_split_x(b,
                                   nir_ishl(b, defs->shader_index_multiplier,
                                            nir_imm_int(b, 8)))),
                             nir_unpack_64_2x32_split_x(b, inst_leaf_ptr),
                             nir_pack_32_2x16_split(b,
                                nir_unpack_64_4x16_split_z(b, inst_leaf_ptr),
                                nir_unpack_32_2x16_split_x(b, defs->ray_mask))),
                    ~0 /* write mask */);
}

static inline void
brw_nir_rt_load_mem_ray_from_addr(nir_builder *b,
                                  struct brw_nir_rt_mem_ray_defs *defs,
                                  nir_ssa_def *ray_base_addr,
                                  enum brw_rt_bvh_level bvh_level)
{
   nir_ssa_def *ray_addr = brw_nir_rt_mem_ray_addr(b,
                                                   ray_base_addr,
                                                   bvh_level);

   nir_ssa_def *data[4] = {
      brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr,  0), 16, 4, 32),
      brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 16), 16, 4, 32),
      brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 32), 16, 4, 32),
      brw_nir_rt_load(b, nir_iadd_imm(b, ray_addr, 48), 16, 4, 32),
   };

   defs->orig = nir_channels(b, data[0], 0x7);
   defs->dir = nir_vec3(b, nir_channel(b, data[0], 3),
                        nir_channel(b, data[1], 0),
                        nir_channel(b, data[1], 1));
   defs->t_near = nir_channel(b, data[1], 2);
   defs->t_far = nir_channel(b, data[1], 3);
   defs->root_node_ptr =
      nir_pack_64_2x32_split(b, nir_channel(b, data[2], 0),
                             nir_extract_i16(b, nir_channel(b, data[2], 1),
                                             nir_imm_int(b, 0)));
   defs->ray_flags =
      nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 1));
   defs->hit_group_sr_base_ptr =
      nir_pack_64_2x32_split(b, nir_channel(b, data[2], 2),
                             nir_extract_i16(b, nir_channel(b, data[2], 3),
                                             nir_imm_int(b, 0)));
   defs->hit_group_sr_stride =
      nir_unpack_32_2x16_split_y(b, nir_channel(b, data[2], 3));
   defs->miss_sr_ptr =
      nir_pack_64_2x32_split(b, nir_channel(b, data[3], 0),
                             nir_extract_i16(b, nir_channel(b, data[3], 1),
                                             nir_imm_int(b, 0)));
   defs->shader_index_multiplier =
      nir_ushr(b, nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 1)),
               nir_imm_int(b, 8));
   defs->inst_leaf_ptr =
      nir_pack_64_2x32_split(b, nir_channel(b, data[3], 2),
                             nir_extract_i16(b, nir_channel(b, data[3], 3),
                                             nir_imm_int(b, 0)));
   defs->ray_mask =
      nir_unpack_32_2x16_split_y(b, nir_channel(b, data[3], 3));
}

static inline void
brw_nir_rt_load_mem_ray(nir_builder *b,
                        struct brw_nir_rt_mem_ray_defs *defs,
                        enum brw_rt_bvh_level bvh_level)
{
   brw_nir_rt_load_mem_ray_from_addr(b, defs, brw_nir_rt_stack_addr(b),
                                     bvh_level);
}

struct brw_nir_rt_bvh_instance_leaf_defs {
   nir_ssa_def *shader_index;
   nir_ssa_def *contribution_to_hit_group_index;
   nir_ssa_def *world_to_object[4];
   nir_ssa_def *instance_id;
   nir_ssa_def *instance_index;
   nir_ssa_def *object_to_world[4];
};

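/* BVH instance leaf layout, as read below: the first half holds the shader
 * index, the hit-group contribution and the world-to-object transform; the
 * second half (at offset 64) holds the instance ID/index and the
 * object-to-world transform.  The translation columns of the two matrices
 * are stored in the opposite halves (see the comment in the loader).
 */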
static inline void
brw_nir_rt_load_bvh_instance_leaf(nir_builder *b,
                                  struct brw_nir_rt_bvh_instance_leaf_defs *defs,
                                  nir_ssa_def *leaf_addr)
{
   defs->shader_index =
      nir_iand_imm(b, brw_nir_rt_load(b, leaf_addr, 4, 1, 32), (1 << 24) - 1);
   defs->contribution_to_hit_group_index =
      nir_iand_imm(b,
                   brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 4), 4, 1, 32),
                   (1 << 24) - 1);

   defs->world_to_object[0] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 16), 4, 3, 32);
   defs->world_to_object[1] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 28), 4, 3, 32);
   defs->world_to_object[2] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 40), 4, 3, 32);
   /* The last column of each matrix is stored with the *other* matrix,
    * presumably because that makes it easier/faster for the hardware.
    */
   defs->object_to_world[3] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 52), 4, 3, 32);

   nir_ssa_def *data =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 64), 4, 4, 32);
   defs->instance_id = nir_channel(b, data, 2);
   defs->instance_index = nir_channel(b, data, 3);

   defs->object_to_world[0] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 80), 4, 3, 32);
   defs->object_to_world[1] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 92), 4, 3, 32);
   defs->object_to_world[2] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 104), 4, 3, 32);
   defs->world_to_object[3] =
      brw_nir_rt_load(b, nir_iadd_imm(b, leaf_addr, 116), 4, 3, 32);
}

struct brw_nir_rt_bvh_primitive_leaf_defs {
   nir_ssa_def *shader_index;
   nir_ssa_def *geom_mask;
   nir_ssa_def *geom_index;
   nir_ssa_def *type;
   nir_ssa_def *geom_flags;
};

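/* PrimLeafDesc layout: DW0 holds shaderIndex in bits [23:0] and geomMask in
 * bits [31:24]; DW1 holds geomIndex in bits [28:0], the leaf type in bit
 * [29] and geomFlags in bits [31:30].
 */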
static inline void
brw_nir_rt_load_bvh_primitive_leaf(nir_builder *b,
                                   struct brw_nir_rt_bvh_primitive_leaf_defs *defs,
                                   nir_ssa_def *leaf_addr)
{
   nir_ssa_def *desc = brw_nir_rt_load(b, leaf_addr, 4, 2, 32);

   defs->shader_index =
      nir_ubitfield_extract(b, nir_channel(b, desc, 0),
                            nir_imm_int(b, 23), nir_imm_int(b, 0));
   defs->geom_mask =
      nir_ubitfield_extract(b, nir_channel(b, desc, 0),
                            nir_imm_int(b, 31), nir_imm_int(b, 24));

   defs->geom_index =
      nir_ubitfield_extract(b, nir_channel(b, desc, 1),
                            nir_imm_int(b, 28), nir_imm_int(b, 0));
   defs->type =
      nir_ubitfield_extract(b, nir_channel(b, desc, 1),
                            nir_imm_int(b, 29), nir_imm_int(b, 29));
   defs->geom_flags =
      nir_ubitfield_extract(b, nir_channel(b, desc, 1),
                            nir_imm_int(b, 31), nir_imm_int(b, 30));
}

static inline nir_ssa_def *
brw_nir_rt_load_primitive_id_from_hit(nir_builder *b,
                                      nir_ssa_def *is_procedural,
                                      const struct brw_nir_rt_mem_hit_defs *defs)
{
   if (!is_procedural) {
      is_procedural =
         nir_ieq(b, defs->leaf_type,
                 nir_imm_int(b, BRW_RT_BVH_NODE_TYPE_PROCEDURAL));
   }

   /* The IDs are located in the leaf, indexed by the hit's prim_leaf_index:
    * in dw[3 + prim_leaf_index] for procedural leaves and dw[2] for quads.
    */
   nir_ssa_def *offset =
      nir_bcsel(b, is_procedural,
                nir_iadd_imm(b, nir_ishl_imm(b, defs->prim_leaf_index, 2), 12),
                nir_imm_int(b, 8));
   return nir_load_global(b, nir_iadd(b, defs->prim_leaf_ptr,
                                      nir_u2u64(b, offset)),
                          4, /* align */ 1, 32);
}

static inline nir_ssa_def *
brw_nir_rt_acceleration_structure_to_root_node(nir_builder *b,
                                               nir_ssa_def *as_addr)
{
   /* The HW memory structure in which we specify what acceleration structure
    * to traverse takes the address of the root node in the acceleration
    * structure, not of the acceleration structure itself. To find that, we
    * have to read the root node offset from the acceleration structure,
    * which is the first QWord.
    *
    * But if the acceleration structure pointer is NULL, then we should
    * return NULL as the root node pointer.
    */
   nir_ssa_def *root_node_ptr, *null_node_ptr;
   nir_push_if(b, nir_ieq(b, as_addr, nir_imm_int64(b, 0)));
   {
      null_node_ptr = nir_imm_int64(b, 0);
   }
   nir_push_else(b, NULL);
   {
      root_node_ptr =
         nir_iadd(b, as_addr, brw_nir_rt_load(b, as_addr, 256, 1, 64));
   }
   nir_pop_if(b, NULL);

   return nir_if_phi(b, null_node_ptr, root_node_ptr);
}

#endif /* BRW_NIR_RT_BUILDER_H */