; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s
; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s

; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr)
; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr)

; The four overloads under test: {i32, i64} node pointer x {v4f32, v4f16} ray
; direction / inverse direction.
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>)
declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>)
; Used by the *_nsa_reassign kernels in this file to index their pointer arguments per work-item.
declare i32 @llvm.amdgcn.workitem.id.x()

; 32-bit node pointer, f32 ray direction, descriptor passed inreg.
; GFX1030 is expected to pass the address operands as a bracketed list of
; individual VGPRs; GFX1013 is expected to first copy them into a contiguous
; range and use v[0:15]. The ERR line is checked by the gfx1012 RUN line.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh_intersect_ray:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    v_mov_b32_e32 v5, v6
; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
; GFX1013-NEXT:    v_mov_b32_e32 v8, v10
; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    ; return to shader part epilog
; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; Same intrinsic as @image_bvh_intersect_ray, but every vector component arrives
; as a separate scalar argument and the vectors are built with insertelement
; (lane 3 is left undef). Both targets are expected to use v[0:15] directly
; with no register copies.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray_flat:
; GCN:       ; %bb.0:
; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; 32-bit node pointer with half-precision ray direction / inverse direction.
; The checks cover the shift/mask sequence that repacks the half components
; into v5..v7 and the a16 modifier on the image instruction (v[0:7]).
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh_intersect_ray_a16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mov_b32 s4, 0xffff
; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
; GCN-NEXT:    v_and_b32_e32 v10, s4, v8
; GCN-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
; GCN-NEXT:    v_and_b32_e32 v9, s4, v9
; GCN-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
; GCN-NEXT:    v_and_or_b32 v5, v6, s4, v5
; GCN-NEXT:    v_and_or_b32 v6, v7, s4, v10
; GCN-NEXT:    v_lshl_or_b32 v7, v9, 16, v8
; GCN-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; 64-bit node pointer, f32 ray direction, descriptor passed inreg.
; GFX1030 is expected to use the bracketed per-register address list (12
; VGPRs); GFX1013 is expected to compact the operands and use v[0:15].
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh64_intersect_ray:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
; GFX1013-NEXT:    v_mov_b32_e32 v8, v9
; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
; GFX1013-NEXT:    v_mov_b32_e32 v11, v13
; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    ; return to shader part epilog
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; 64-bit variant of the "flat" test: the node pointer arrives as <2 x i32> and
; is bitcast to i64, and each vector component is a separate scalar argument
; (lane 3 of every vector is left undef). Both targets are expected to use
; v[0:15] directly with no register copies.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray_flat:
; GCN:       ; %bb.0:
; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64
  %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2
  %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0
  %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1
  %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2
  %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0
  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1
  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; 64-bit node pointer with half-precision ray direction / inverse direction.
; The checks cover the repacking of the half components into v6..v8 and the
; a16 modifier; the address operand is expected to be v[0:15].
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) {
; GCN-LABEL: image_bvh64_intersect_ray_a16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_mov_b32 s4, 0xffff
; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
; GCN-NEXT:    v_and_b32_e32 v11, s4, v9
; GCN-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
; GCN-NEXT:    v_and_b32_e32 v10, s4, v10
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; GCN-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
; GCN-NEXT:    v_and_or_b32 v6, v7, s4, v6
; GCN-NEXT:    v_and_or_b32 v7, v8, s4, v11
; GCN-NEXT:    v_lshl_or_b32 v8, v10, 16, v9
; GCN-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; Same as @image_bvh_intersect_ray, except %tdescr is NOT inreg, so the
; descriptor arrives in VGPRs (v14..v17). The checks expect a loop that copies
; the descriptor to s[4:7] with v_readfirstlane, compares it against the
; per-lane value, issues the image instruction, and repeats until exec is
; empty; exec is saved in s1 and restored after the loop.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    v_mov_b32_e32 v5, v0
; GFX1030-NEXT:    v_mov_b32_e32 v9, v1
; GFX1030-NEXT:    v_mov_b32_e32 v13, v2
; GFX1030-NEXT:    v_mov_b32_e32 v18, v3
; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
; GFX1030-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT:    v_readfirstlane_b32 s4, v14
; GFX1030-NEXT:    v_readfirstlane_b32 s5, v15
; GFX1030-NEXT:    v_readfirstlane_b32 s6, v16
; GFX1030-NEXT:    v_readfirstlane_b32 s7, v17
; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], [v5, v9, v13, v18, v4, v6, v7, v8, v10, v11, v12], s[4:7]
; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT:    s_cbranch_execnz BB6_1
; GFX1030-NEXT:  ; %bb.2:
; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    v_mov_b32_e32 v5, v6
; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
; GFX1013-NEXT:    v_mov_b32_e32 v8, v10
; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
; GFX1013-NEXT:  BB6_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT:    v_readfirstlane_b32 s4, v14
; GFX1013-NEXT:    v_readfirstlane_b32 s5, v15
; GFX1013-NEXT:    v_readfirstlane_b32 s6, v16
; GFX1013-NEXT:    v_readfirstlane_b32 s7, v17
; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15]
; GFX1013-NEXT:    image_bvh_intersect_ray v[18:21], v[0:15], s[4:7]
; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[16:17]
; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT:    s_cbranch_execnz BB6_1
; GFX1013-NEXT:  ; %bb.2:
; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    v_mov_b32_e32 v0, v18
; GFX1013-NEXT:    v_mov_b32_e32 v1, v19
; GFX1013-NEXT:    v_mov_b32_e32 v2, v20
; GFX1013-NEXT:    v_mov_b32_e32 v3, v21
; GFX1013-NEXT:    ; return to shader part epilog
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; a16 variant with the descriptor in VGPRs (v10..v13, %tdescr is not inreg).
; The checks combine the half-repacking sequence with the v_readfirstlane /
; exec-mask loop around the image instruction.
define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
; GFX1030-NEXT:    v_mov_b32_e32 v5, v0
; GFX1030-NEXT:    v_mov_b32_e32 v14, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v8
; GFX1030-NEXT:    v_mov_b32_e32 v15, v2
; GFX1030-NEXT:    v_mov_b32_e32 v16, v3
; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX1030-NEXT:    v_and_b32_e32 v3, s0, v9
; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
; GFX1030-NEXT:    v_and_or_b32 v6, v6, s0, v0
; GFX1030-NEXT:    v_and_or_b32 v7, v7, s0, v1
; GFX1030-NEXT:    v_lshl_or_b32 v8, v3, 16, v2
; GFX1030-NEXT:  BB7_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT:    v_readfirstlane_b32 s4, v10
; GFX1030-NEXT:    v_readfirstlane_b32 s5, v11
; GFX1030-NEXT:    v_readfirstlane_b32 s6, v12
; GFX1030-NEXT:    v_readfirstlane_b32 s7, v13
; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], [v5, v14, v15, v16, v4, v6, v7, v8], s[4:7] a16
; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT:    s_cbranch_execnz BB7_1
; GFX1030-NEXT:  ; %bb.2:
; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
; GFX1013-NEXT:    v_lshrrev_b32_e32 v5, 16, v6
; GFX1013-NEXT:    v_and_b32_e32 v14, s0, v8
; GFX1013-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
; GFX1013-NEXT:    v_and_b32_e32 v9, s0, v9
; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
; GFX1013-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
; GFX1013-NEXT:    v_and_or_b32 v5, v6, s0, v5
; GFX1013-NEXT:    v_and_or_b32 v6, v7, s0, v14
; GFX1013-NEXT:    v_lshl_or_b32 v7, v9, 16, v8
; GFX1013-NEXT:  BB7_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT:    v_readfirstlane_b32 s4, v10
; GFX1013-NEXT:    v_readfirstlane_b32 s5, v11
; GFX1013-NEXT:    v_readfirstlane_b32 s6, v12
; GFX1013-NEXT:    v_readfirstlane_b32 s7, v13
; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11]
; GFX1013-NEXT:    image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16
; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[12:13]
; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT:    s_cbranch_execnz BB7_1
; GFX1013-NEXT:  ; %bb.2:
; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    v_mov_b32_e32 v0, v14
; GFX1013-NEXT:    v_mov_b32_e32 v1, v15
; GFX1013-NEXT:    v_mov_b32_e32 v2, v16
; GFX1013-NEXT:    v_mov_b32_e32 v3, v17
; GFX1013-NEXT:    ; return to shader part epilog
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; 64-bit node pointer with the descriptor in VGPRs (v15..v18, %tdescr is not
; inreg). The checks expect the v_readfirstlane / exec-mask loop around the
; image instruction, with exec saved in s1 and restored afterwards.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    v_mov_b32_e32 v6, v0
; GFX1030-NEXT:    v_mov_b32_e32 v10, v1
; GFX1030-NEXT:    v_mov_b32_e32 v14, v2
; GFX1030-NEXT:    v_mov_b32_e32 v19, v3
; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
; GFX1030-NEXT:  BB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT:    v_readfirstlane_b32 s4, v15
; GFX1030-NEXT:    v_readfirstlane_b32 s5, v16
; GFX1030-NEXT:    v_readfirstlane_b32 s6, v17
; GFX1030-NEXT:    v_readfirstlane_b32 s7, v18
; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], [v6, v10, v14, v19, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7]
; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT:    s_cbranch_execnz BB8_1
; GFX1030-NEXT:  ; %bb.2:
; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    v_mov_b32_e32 v6, v7
; GFX1013-NEXT:    v_mov_b32_e32 v7, v8
; GFX1013-NEXT:    v_mov_b32_e32 v8, v9
; GFX1013-NEXT:    v_mov_b32_e32 v9, v11
; GFX1013-NEXT:    v_mov_b32_e32 v10, v12
; GFX1013-NEXT:    v_mov_b32_e32 v11, v13
; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
; GFX1013-NEXT:  BB8_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT:    v_readfirstlane_b32 s4, v15
; GFX1013-NEXT:    v_readfirstlane_b32 s5, v16
; GFX1013-NEXT:    v_readfirstlane_b32 s6, v17
; GFX1013-NEXT:    v_readfirstlane_b32 s7, v18
; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16]
; GFX1013-NEXT:    image_bvh64_intersect_ray v[19:22], v[0:15], s[4:7]
; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[17:18]
; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT:    s_cbranch_execnz BB8_1
; GFX1013-NEXT:  ; %bb.2:
; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    v_mov_b32_e32 v0, v19
; GFX1013-NEXT:    v_mov_b32_e32 v1, v20
; GFX1013-NEXT:    v_mov_b32_e32 v2, v21
; GFX1013-NEXT:    v_mov_b32_e32 v3, v22
; GFX1013-NEXT:    ; return to shader part epilog
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; 64-bit node pointer, a16 ray direction, descriptor in VGPRs (v11..v14).
; Combines the half-repacking sequence with the v_readfirstlane / exec-mask
; loop. NOTE(review): the GFX1013 checks contain an extra s_waitcnt vmcnt(0)
; inside the loop; the result range v[15:18] overlaps the address range
; v[0:15] there, which is presumably why -- confirm before editing by hand.
define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_mov_b32 s0, 0xffff
; GFX1030-NEXT:    v_mov_b32_e32 v6, v0
; GFX1030-NEXT:    v_mov_b32_e32 v15, v1
; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
; GFX1030-NEXT:    v_and_b32_e32 v1, s0, v9
; GFX1030-NEXT:    v_mov_b32_e32 v16, v2
; GFX1030-NEXT:    v_mov_b32_e32 v17, v3
; GFX1030-NEXT:    v_lshrrev_b32_e32 v2, 16, v9
; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX1030-NEXT:    v_and_b32_e32 v3, s0, v10
; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
; GFX1030-NEXT:    v_and_or_b32 v7, v7, s0, v0
; GFX1030-NEXT:    v_and_or_b32 v8, v8, s0, v1
; GFX1030-NEXT:    v_lshl_or_b32 v9, v3, 16, v2
; GFX1030-NEXT:  BB9_1: ; =>This Inner Loop Header: Depth=1
; GFX1030-NEXT:    v_readfirstlane_b32 s4, v11
; GFX1030-NEXT:    v_readfirstlane_b32 s5, v12
; GFX1030-NEXT:    v_readfirstlane_b32 s6, v13
; GFX1030-NEXT:    v_readfirstlane_b32 s7, v14
; GFX1030-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], [v6, v15, v16, v17, v4, v5, v7, v8, v9], s[4:7] a16
; GFX1030-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1030-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1030-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1030-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1030-NEXT:    s_cbranch_execnz BB9_1
; GFX1030-NEXT:  ; %bb.2:
; GFX1030-NEXT:    s_mov_b32 exec_lo, s1
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    ; return to shader part epilog
;
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    s_mov_b32 s0, 0xffff
; GFX1013-NEXT:    v_lshrrev_b32_e32 v6, 16, v7
; GFX1013-NEXT:    v_and_b32_e32 v15, s0, v9
; GFX1013-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
; GFX1013-NEXT:    v_and_b32_e32 v10, s0, v10
; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
; GFX1013-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
; GFX1013-NEXT:    v_and_or_b32 v6, v7, s0, v6
; GFX1013-NEXT:    v_and_or_b32 v7, v8, s0, v15
; GFX1013-NEXT:    v_lshl_or_b32 v8, v10, 16, v9
; GFX1013-NEXT:  BB9_1: ; =>This Inner Loop Header: Depth=1
; GFX1013-NEXT:    v_readfirstlane_b32 s4, v11
; GFX1013-NEXT:    v_readfirstlane_b32 s5, v12
; GFX1013-NEXT:    v_readfirstlane_b32 s6, v13
; GFX1013-NEXT:    v_readfirstlane_b32 s7, v14
; GFX1013-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12]
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    image_bvh64_intersect_ray v[15:18], v[0:15], s[4:7] a16
; GFX1013-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[13:14]
; GFX1013-NEXT:    s_and_b32 s0, s0, vcc_lo
; GFX1013-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1013-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1013-NEXT:    s_cbranch_execnz BB9_1
; GFX1013-NEXT:  ; %bb.2:
; GFX1013-NEXT:    s_mov_b32 exec_lo, s1
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    v_mov_b32_e32 v0, v15
; GFX1013-NEXT:    v_mov_b32_e32 v1, v16
; GFX1013-NEXT:    v_mov_b32_e32 v2, v17
; GFX1013-NEXT:    v_mov_b32_e32 v3, v18
; GFX1013-NEXT:    ; return to shader part epilog
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  %r = bitcast <4 x i32> %v to <4 x float>
  ret <4 x float> %r
}

; Kernel variant: node_ptr and ray_extent are loaded from memory at an index
; given by the work-item id, while origin / dir / inv_dir are the constants
; 0.0 .. 8.0 (lane 3 of each vector is undef). Both targets are expected to
; end up with a contiguous v[0:15] address operand. The result is stored to an
; undef pointer only to keep the call alive.
define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_clause 0x1
; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX1030-NEXT:    v_mov_b32_e32 v5, 0x40400000
; GFX1030-NEXT:    v_mov_b32_e32 v6, 4.0
; GFX1030-NEXT:    v_mov_b32_e32 v7, 0x40a00000
; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40e00000
; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x41000000
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
; GFX1030-NEXT:    v_mov_b32_e32 v3, s7
; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    s_clause 0x1
; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
; GFX1013-NEXT:    v_mov_b32_e32 v7, 0x40a00000
; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40c00000
; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40e00000
; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x41000000
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
; GFX1013-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1013-NEXT:    v_mov_b32_e32 v6, 4.0
; GFX1013-NEXT:    flat_load_dword v0, v[4:5]
; GFX1013-NEXT:    flat_load_dword v1, v[2:3]
; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
; GFX1013-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 0x40400000
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:15], s[8:11]
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; a16 kernel variant: node_ptr and ray_extent are loaded per work-item, origin
; is the f32 constants 0.0 .. 2.0, and dir / inv_dir are the half constants
; 3.0 .. 8.0 (0x4200 .. 0x4800 in the checks), which are packed pairwise in
; SGPRs and then copied to v5..v7. Both targets are expected to use v[0:7]
; with the a16 modifier.
define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_clause 0x1
; GFX1030-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; GFX1030-NEXT:    s_movk_i32 s9, 0x4600
; GFX1030-NEXT:    s_movk_i32 s8, 0x4700
; GFX1030-NEXT:    s_bfe_u32 s8, s8, 0x100000
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
; GFX1030-NEXT:    v_mov_b32_e32 v2, s6
; GFX1030-NEXT:    v_mov_b32_e32 v3, s7
; GFX1030-NEXT:    s_movk_i32 s5, 0x4400
; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1030-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1030-NEXT:    s_movk_i32 s6, 0x4200
; GFX1030-NEXT:    flat_load_dword v0, v[0:1]
; GFX1030-NEXT:    flat_load_dword v1, v[2:3]
; GFX1030-NEXT:    s_bfe_u32 s5, s5, 0x100000
; GFX1030-NEXT:    s_movk_i32 s7, 0x4800
; GFX1030-NEXT:    s_bfe_u32 s6, s6, 0x100000
; GFX1030-NEXT:    s_lshl_b32 s5, s5, 16
; GFX1030-NEXT:    s_movk_i32 s4, 0x4500
; GFX1030-NEXT:    s_or_b32 s5, s6, s5
; GFX1030-NEXT:    s_bfe_u32 s6, s9, 0x100000
; GFX1030-NEXT:    s_bfe_u32 s7, s7, 0x100000
; GFX1030-NEXT:    s_bfe_u32 s4, s4, 0x100000
; GFX1030-NEXT:    s_lshl_b32 s6, s6, 16
; GFX1030-NEXT:    s_lshl_b32 s7, s7, 16
; GFX1030-NEXT:    s_or_b32 s4, s4, s6
; GFX1030-NEXT:    s_or_b32 s6, s8, s7
; GFX1030-NEXT:    v_mov_b32_e32 v2, 0
; GFX1030-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1030-NEXT:    v_mov_b32_e32 v5, s5
; GFX1030-NEXT:    v_mov_b32_e32 v6, s4
; GFX1030-NEXT:    v_mov_b32_e32 v7, s6
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    s_clause 0x1
; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1013-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
; GFX1013-NEXT:    v_lshlrev_b32_e32 v6, 2, v0
; GFX1013-NEXT:    s_movk_i32 s1, 0x4400
; GFX1013-NEXT:    s_movk_i32 s2, 0x4200
; GFX1013-NEXT:    s_bfe_u32 s1, s1, 0x100000
; GFX1013-NEXT:    s_movk_i32 s3, 0x4800
; GFX1013-NEXT:    s_bfe_u32 s2, s2, 0x100000
; GFX1013-NEXT:    s_lshl_b32 s1, s1, 16
; GFX1013-NEXT:    s_movk_i32 s0, 0x4500
; GFX1013-NEXT:    s_or_b32 s1, s2, s1
; GFX1013-NEXT:    s_bfe_u32 s3, s3, 0x100000
; GFX1013-NEXT:    s_bfe_u32 s0, s0, 0x100000
; GFX1013-NEXT:    s_lshl_b32 s3, s3, 16
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_mov_b32_e32 v0, s4
; GFX1013-NEXT:    v_mov_b32_e32 v1, s5
; GFX1013-NEXT:    v_mov_b32_e32 v2, s6
; GFX1013-NEXT:    v_mov_b32_e32 v3, s7
; GFX1013-NEXT:    s_movk_i32 s5, 0x4600
; GFX1013-NEXT:    v_add_co_u32 v4, vcc_lo, v0, v6
; GFX1013-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v6
; GFX1013-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; GFX1013-NEXT:    s_movk_i32 s4, 0x4700
; GFX1013-NEXT:    flat_load_dword v0, v[4:5]
; GFX1013-NEXT:    flat_load_dword v1, v[2:3]
; GFX1013-NEXT:    s_bfe_u32 s2, s5, 0x100000
; GFX1013-NEXT:    s_bfe_u32 s4, s4, 0x100000
; GFX1013-NEXT:    s_lshl_b32 s2, s2, 16
; GFX1013-NEXT:    v_mov_b32_e32 v2, 0
; GFX1013-NEXT:    s_or_b32 s0, s0, s2
; GFX1013-NEXT:    s_or_b32 s2, s4, s3
; GFX1013-NEXT:    v_mov_b32_e32 v3, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, s1
; GFX1013-NEXT:    v_mov_b32_e32 v6, s0
; GFX1013-NEXT:    v_mov_b32_e32 v7, s2
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid
  %node_ptr = load i32, i32* %gep_node_ptr, align 4
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; Kernel test for the i64 node pointer / v4f32 form of
; llvm.amdgcn.image.bvh.intersect.ray. Only ray_extent is loaded at run time
; (from %p_ray, indexed by the workitem id); the node pointer and lanes 0-2 of
; the three ray vectors are constants. The expected output for both checked
; targets passes the address operands as one contiguous range v[0:15].
; Presumably the "nsa_reassign" suffix refers to the non-sequential-address
; operand form being reassigned to contiguous registers -- TODO confirm.
608define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
609; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
610; GFX1030:       ; %bb.0:
611; GFX1030-NEXT:    s_clause 0x1
612; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
613; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
614; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
615; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
616; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
617; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
618; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x40400000
619; GFX1030-NEXT:    v_mov_b32_e32 v7, 4.0
620; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x40a00000
621; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40c00000
622; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x40e00000
623; GFX1030-NEXT:    v_mov_b32_e32 v11, 0x41000000
624; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
626; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
627; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
628; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
629; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
630; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
631; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
632; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
633; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
634; GFX1030-NEXT:    s_waitcnt vmcnt(0)
635; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
636; GFX1030-NEXT:    s_endpgm
637;
638; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
639; GFX1013:       ; %bb.0:
640; GFX1013-NEXT:    s_clause 0x1
641; GFX1013-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
642; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
643; GFX1013-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
644; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
645; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
646; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
647; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x40400000
648; GFX1013-NEXT:    v_mov_b32_e32 v7, 4.0
649; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40a00000
650; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40c00000
651; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x40e00000
652; GFX1013-NEXT:    v_mov_b32_e32 v11, 0x41000000
653; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
654; GFX1013-NEXT:    v_mov_b32_e32 v0, s2
655; GFX1013-NEXT:    v_mov_b32_e32 v1, s3
656; GFX1013-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
657; GFX1013-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
658; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
659; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
660; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
661; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
662; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7]
663; GFX1013-NEXT:    s_waitcnt vmcnt(0)
664; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
665; GFX1013-NEXT:    s_endpgm
  ; The workitem id selects which element of %p_ray supplies ray_extent.
666  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
667  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
668  %ray_extent = load float, float* %gep_ray, align 4
  ; Constant ray vectors: lanes 0-2 are set (origin 0/1/2, dir 3/4/5,
  ; inv_dir 6/7/8); lane 3 of each vector is left undef.
669  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
670  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
671  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
672  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
673  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
674  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
675  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
676  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
677  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
  ; The constant node pointer 1111111111111 is 0x102B36211C7; the checks above
  ; expect its low and high dwords (0xb36211c7, 0x102) in v0 and v1.
678  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  ; The result is written through an undef pointer; the checks expect a
  ; flat_store_dwordx4 of v[0:3] for it.
679  store <4 x i32> %v, <4 x i32>* undef
680  ret void
681}
682
; Kernel test for the i64 node pointer / v4f16 form of
; llvm.amdgcn.image.bvh.intersect.ray (the expected instruction carries the
; a16 modifier). Only ray_extent is loaded at run time (from %p_ray, indexed by
; the workitem id); the node pointer and lanes 0-2 of the three ray vectors are
; constants. The expected output for both checked targets passes the address
; operands as one contiguous range v[0:15].
; Presumably the "nsa_reassign" suffix refers to the non-sequential-address
; operand form being reassigned to contiguous registers -- TODO confirm.
683define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
684; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
685; GFX1030:       ; %bb.0:
686; GFX1030-NEXT:    s_clause 0x1
687; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
688; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
689; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
690; GFX1030-NEXT:    s_movk_i32 s6, 0x4200
691; GFX1030-NEXT:    s_movk_i32 s7, 0x4800
692; GFX1030-NEXT:    s_bfe_u32 s6, s6, 0x100000
693; GFX1030-NEXT:    s_movk_i32 s9, 0x4600
694; GFX1030-NEXT:    s_movk_i32 s8, 0x4700
695; GFX1030-NEXT:    s_bfe_u32 s7, s7, 0x100000
696; GFX1030-NEXT:    s_bfe_u32 s8, s8, 0x100000
697; GFX1030-NEXT:    s_lshl_b32 s7, s7, 16
698; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
699; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
700; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
701; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
702; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
703; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
704; GFX1030-NEXT:    s_movk_i32 s5, 0x4400
705; GFX1030-NEXT:    s_movk_i32 s4, 0x4500
706; GFX1030-NEXT:    s_bfe_u32 s5, s5, 0x100000
707; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
708; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
709; GFX1030-NEXT:    s_lshl_b32 s5, s5, 16
710; GFX1030-NEXT:    s_bfe_u32 s4, s4, 0x100000
711; GFX1030-NEXT:    s_or_b32 s5, s6, s5
712; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
713; GFX1030-NEXT:    s_bfe_u32 s6, s9, 0x100000
714; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
715; GFX1030-NEXT:    s_lshl_b32 s6, s6, 16
716; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
717; GFX1030-NEXT:    s_or_b32 s4, s4, s6
718; GFX1030-NEXT:    s_or_b32 s6, s8, s7
719; GFX1030-NEXT:    v_mov_b32_e32 v6, s5
720; GFX1030-NEXT:    v_mov_b32_e32 v7, s4
721; GFX1030-NEXT:    v_mov_b32_e32 v8, s6
722; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
723; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
724; GFX1030-NEXT:    s_waitcnt vmcnt(0)
725; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
726; GFX1030-NEXT:    s_endpgm
727;
728; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
729; GFX1013:       ; %bb.0:
730; GFX1013-NEXT:    s_clause 0x1
731; GFX1013-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
732; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
733; GFX1013-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
734; GFX1013-NEXT:    s_movk_i32 s1, 0x4400
735; GFX1013-NEXT:    s_movk_i32 s9, 0x4600
736; GFX1013-NEXT:    s_bfe_u32 s1, s1, 0x100000
737; GFX1013-NEXT:    s_movk_i32 s0, 0x4500
738; GFX1013-NEXT:    s_lshl_b32 s1, s1, 16
739; GFX1013-NEXT:    s_movk_i32 s8, 0x4700
740; GFX1013-NEXT:    s_bfe_u32 s0, s0, 0x100000
741; GFX1013-NEXT:    s_bfe_u32 s8, s8, 0x100000
742; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
743; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
744; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
745; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
746; GFX1013-NEXT:    v_mov_b32_e32 v0, s2
747; GFX1013-NEXT:    v_mov_b32_e32 v1, s3
748; GFX1013-NEXT:    s_movk_i32 s2, 0x4200
749; GFX1013-NEXT:    s_movk_i32 s3, 0x4800
750; GFX1013-NEXT:    s_bfe_u32 s2, s2, 0x100000
751; GFX1013-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
752; GFX1013-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
753; GFX1013-NEXT:    s_or_b32 s1, s2, s1
754; GFX1013-NEXT:    s_bfe_u32 s2, s9, 0x100000
755; GFX1013-NEXT:    s_bfe_u32 s3, s3, 0x100000
756; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
757; GFX1013-NEXT:    s_lshl_b32 s2, s2, 16
758; GFX1013-NEXT:    s_lshl_b32 s3, s3, 16
759; GFX1013-NEXT:    s_or_b32 s0, s0, s2
760; GFX1013-NEXT:    s_or_b32 s2, s8, s3
761; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
762; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
763; GFX1013-NEXT:    v_mov_b32_e32 v6, s1
764; GFX1013-NEXT:    v_mov_b32_e32 v7, s0
765; GFX1013-NEXT:    v_mov_b32_e32 v8, s2
766; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
767; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7] a16
768; GFX1013-NEXT:    s_waitcnt vmcnt(0)
769; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
770; GFX1013-NEXT:    s_endpgm
  ; The workitem id selects which element of %p_ray supplies ray_extent.
771  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
772  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
773  %ray_extent = load float, float* %gep_ray, align 4
  ; ray_origin stays <4 x float> (0/1/2 in lanes 0-2, lane 3 undef).
774  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
775  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
776  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
  ; ray_dir and ray_inv_dir are <4 x half> here (3/4/5 and 6/7/8 in lanes 0-2,
  ; lane 3 undef). The checks above build them from the 16-bit immediates
  ; 0x4200..0x4800 and pack them two per dword into v6, v7 and v8 as
  ; (3,4), (5,6), (7,8).
777  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
778  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
779  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
780  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
781  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
782  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
  ; The constant node pointer 1111111111110 is 0x102B36211C6; the checks above
  ; expect its low and high dwords (0xb36211c6, 0x102) in v0 and v1.
783  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  ; The result is written through an undef pointer; the checks expect a
  ; flat_store_dwordx4 of v[0:3] for it.
784  store <4 x i32> %v, <4 x i32>* undef
785  ret void
786}
787