1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1030 %s 3; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1013 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1013 %s 4; RUN: not --crash llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s -o /dev/null 2>&1 | FileCheck -check-prefix=ERR %s 5 6; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(uint node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) 7; uint4 llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(uint node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) 8; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(ulong node_ptr, float ray_extent, float4 ray_origin, float4 ray_dir, float4 ray_inv_dir, uint4 texture_descr) 9; uint4 llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(ulong node_ptr, float ray_extent, float4 ray_origin, half4 ray_dir, half4 ray_inv_dir, uint4 texture_descr) 10 11declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) 12declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) 13declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64, float, <4 x float>, <4 x float>, <4 x float>, <4 x i32>) 14declare <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64, float, <4 x float>, <4 x half>, <4 x half>, <4 x i32>) 15declare i32 @llvm.amdgcn.workitem.id.x() 16 17define amdgpu_ps <4 x float> @image_bvh_intersect_ray(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { 18; GFX1030-LABEL: image_bvh_intersect_ray: 19; GFX1030: ; %bb.0: 20; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v0, v1, 
v2, v3, v4, v6, v7, v8, v10, v11, v12], s[0:3] 21; GFX1030-NEXT: s_waitcnt vmcnt(0) 22; GFX1030-NEXT: ; return to shader part epilog 23; 24; GFX1013-LABEL: image_bvh_intersect_ray: 25; GFX1013: ; %bb.0: 26; GFX1013-NEXT: v_mov_b32_e32 v5, v6 27; GFX1013-NEXT: v_mov_b32_e32 v6, v7 28; GFX1013-NEXT: v_mov_b32_e32 v7, v8 29; GFX1013-NEXT: v_mov_b32_e32 v8, v10 30; GFX1013-NEXT: v_mov_b32_e32 v9, v11 31; GFX1013-NEXT: v_mov_b32_e32 v10, v12 32; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] 33; GFX1013-NEXT: s_waitcnt vmcnt(0) 34; GFX1013-NEXT: ; return to shader part epilog 35; ERR: in function image_bvh_intersect_ray{{.*}}intrinsic not supported on subtarget 36 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) 37 %r = bitcast <4 x i32> %v to <4 x float> 38 ret <4 x float> %r 39} 40 41define amdgpu_ps <4 x float> @image_bvh_intersect_ray_flat(i32 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { 42; GCN-LABEL: image_bvh_intersect_ray_flat: 43; GCN: ; %bb.0: 44; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] 45; GCN-NEXT: s_waitcnt vmcnt(0) 46; GCN-NEXT: ; return to shader part epilog 47 %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0 48 %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1 49 %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2 50 %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0 51 %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1 52 %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2 53 %ray_inv_dir0 = insertelement <4 x float> undef, float 
%ray_inv_dir_x, i32 0 54 %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 55 %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 56 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) 57 %r = bitcast <4 x i32> %v to <4 x float> 58 ret <4 x float> %r 59} 60 61define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { 62; GCN-LABEL: image_bvh_intersect_ray_a16: 63; GCN: ; %bb.0: 64; GCN-NEXT: s_mov_b32 s4, 0xffff 65; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v6 66; GCN-NEXT: v_and_b32_e32 v10, s4, v8 67; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8 68; GCN-NEXT: v_and_b32_e32 v9, s4, v9 69; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5 70; GCN-NEXT: v_lshlrev_b32_e32 v10, 16, v10 71; GCN-NEXT: v_and_or_b32 v5, v6, s4, v5 72; GCN-NEXT: v_and_or_b32 v6, v7, s4, v10 73; GCN-NEXT: v_lshl_or_b32 v7, v9, 16, v8 74; GCN-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 75; GCN-NEXT: s_waitcnt vmcnt(0) 76; GCN-NEXT: ; return to shader part epilog 77 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) 78 %r = bitcast <4 x i32> %v to <4 x float> 79 ret <4 x float> %r 80} 81 82define amdgpu_ps <4 x float> @image_bvh64_intersect_ray(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> inreg %tdescr) { 83; GFX1030-LABEL: image_bvh64_intersect_ray: 84; GFX1030: ; %bb.0: 85; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v0, v1, v2, v3, v4, v5, v7, v8, v9, v11, v12, v13], s[0:3] 86; GFX1030-NEXT: s_waitcnt vmcnt(0) 87; GFX1030-NEXT: ; return to shader part 
epilog 88; 89; GFX1013-LABEL: image_bvh64_intersect_ray: 90; GFX1013: ; %bb.0: 91; GFX1013-NEXT: v_mov_b32_e32 v6, v7 92; GFX1013-NEXT: v_mov_b32_e32 v7, v8 93; GFX1013-NEXT: v_mov_b32_e32 v8, v9 94; GFX1013-NEXT: v_mov_b32_e32 v9, v11 95; GFX1013-NEXT: v_mov_b32_e32 v10, v12 96; GFX1013-NEXT: v_mov_b32_e32 v11, v13 97; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] 98; GFX1013-NEXT: s_waitcnt vmcnt(0) 99; GFX1013-NEXT: ; return to shader part epilog 100 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) 101 %r = bitcast <4 x i32> %v to <4 x float> 102 ret <4 x float> %r 103} 104 105define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_flat(<2 x i32> %node_ptr_vec, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, float %ray_inv_dir_x, float %ray_inv_dir_y, float %ray_inv_dir_z, <4 x i32> inreg %tdescr) { 106; GCN-LABEL: image_bvh64_intersect_ray_flat: 107; GCN: ; %bb.0: 108; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] 109; GCN-NEXT: s_waitcnt vmcnt(0) 110; GCN-NEXT: ; return to shader part epilog 111 %node_ptr = bitcast <2 x i32> %node_ptr_vec to i64 112 %ray_origin0 = insertelement <4 x float> undef, float %ray_origin_x, i32 0 113 %ray_origin1 = insertelement <4 x float> %ray_origin0, float %ray_origin_y, i32 1 114 %ray_origin = insertelement <4 x float> %ray_origin1, float %ray_origin_z, i32 2 115 %ray_dir0 = insertelement <4 x float> undef, float %ray_dir_x, i32 0 116 %ray_dir1 = insertelement <4 x float> %ray_dir0, float %ray_dir_y, i32 1 117 %ray_dir = insertelement <4 x float> %ray_dir1, float %ray_dir_z, i32 2 118 %ray_inv_dir0 = insertelement <4 x float> undef, float %ray_inv_dir_x, i32 0 119 %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float %ray_inv_dir_y, i32 1 120 %ray_inv_dir = 
insertelement <4 x float> %ray_inv_dir1, float %ray_inv_dir_z, i32 2 121 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) 122 %r = bitcast <4 x i32> %v to <4 x float> 123 ret <4 x float> %r 124} 125 126define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> inreg %tdescr) { 127; GCN-LABEL: image_bvh64_intersect_ray_a16: 128; GCN: ; %bb.0: 129; GCN-NEXT: s_mov_b32 s4, 0xffff 130; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v7 131; GCN-NEXT: v_and_b32_e32 v11, s4, v9 132; GCN-NEXT: v_lshrrev_b32_e32 v9, 16, v9 133; GCN-NEXT: v_and_b32_e32 v10, s4, v10 134; GCN-NEXT: v_lshlrev_b32_e32 v6, 16, v6 135; GCN-NEXT: v_lshlrev_b32_e32 v11, 16, v11 136; GCN-NEXT: v_and_or_b32 v6, v7, s4, v6 137; GCN-NEXT: v_and_or_b32 v7, v8, s4, v11 138; GCN-NEXT: v_lshl_or_b32 v8, v10, 16, v9 139; GCN-NEXT: image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16 140; GCN-NEXT: s_waitcnt vmcnt(0) 141; GCN-NEXT: ; return to shader part epilog 142 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) 143 %r = bitcast <4 x i32> %v to <4 x float> 144 ret <4 x float> %r 145} 146 147define amdgpu_ps <4 x float> @image_bvh_intersect_ray_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { 148; GFX1030-LABEL: image_bvh_intersect_ray_vgpr_descr: 149; GFX1030: ; %bb.0: 150; GFX1030-NEXT: v_mov_b32_e32 v5, v0 151; GFX1030-NEXT: v_mov_b32_e32 v9, v1 152; GFX1030-NEXT: v_mov_b32_e32 v13, v2 153; GFX1030-NEXT: v_mov_b32_e32 v18, v3 154; GFX1030-NEXT: s_mov_b32 s1, exec_lo 155; GFX1030-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 156; 
GFX1030-NEXT: v_readfirstlane_b32 s4, v14 157; GFX1030-NEXT: v_readfirstlane_b32 s5, v15 158; GFX1030-NEXT: v_readfirstlane_b32 s6, v16 159; GFX1030-NEXT: v_readfirstlane_b32 s7, v17 160; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] 161; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v5, v9, v13, v18, v4, v6, v7, v8, v10, v11, v12], s[4:7] 162; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] 163; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo 164; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 165; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 166; GFX1030-NEXT: s_cbranch_execnz BB6_1 167; GFX1030-NEXT: ; %bb.2: 168; GFX1030-NEXT: s_mov_b32 exec_lo, s1 169; GFX1030-NEXT: s_waitcnt vmcnt(0) 170; GFX1030-NEXT: ; return to shader part epilog 171; 172; GFX1013-LABEL: image_bvh_intersect_ray_vgpr_descr: 173; GFX1013: ; %bb.0: 174; GFX1013-NEXT: v_mov_b32_e32 v5, v6 175; GFX1013-NEXT: v_mov_b32_e32 v6, v7 176; GFX1013-NEXT: v_mov_b32_e32 v7, v8 177; GFX1013-NEXT: v_mov_b32_e32 v8, v10 178; GFX1013-NEXT: v_mov_b32_e32 v9, v11 179; GFX1013-NEXT: v_mov_b32_e32 v10, v12 180; GFX1013-NEXT: s_mov_b32 s1, exec_lo 181; GFX1013-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 182; GFX1013-NEXT: v_readfirstlane_b32 s4, v14 183; GFX1013-NEXT: v_readfirstlane_b32 s5, v15 184; GFX1013-NEXT: v_readfirstlane_b32 s6, v16 185; GFX1013-NEXT: v_readfirstlane_b32 s7, v17 186; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[14:15] 187; GFX1013-NEXT: image_bvh_intersect_ray v[18:21], v[0:15], s[4:7] 188; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[16:17] 189; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo 190; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 191; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 192; GFX1013-NEXT: s_cbranch_execnz BB6_1 193; GFX1013-NEXT: ; %bb.2: 194; GFX1013-NEXT: s_mov_b32 exec_lo, s1 195; GFX1013-NEXT: s_waitcnt vmcnt(0) 196; GFX1013-NEXT: v_mov_b32_e32 v0, v18 197; GFX1013-NEXT: v_mov_b32_e32 v1, v19 198; GFX1013-NEXT: v_mov_b32_e32 v2, v20 199; GFX1013-NEXT: v_mov_b32_e32 
v3, v21 200; GFX1013-NEXT: ; return to shader part epilog 201 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) 202 %r = bitcast <4 x i32> %v to <4 x float> 203 ret <4 x float> %r 204} 205 206define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { 207; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: 208; GFX1030: ; %bb.0: 209; GFX1030-NEXT: s_mov_b32 s0, 0xffff 210; GFX1030-NEXT: v_mov_b32_e32 v5, v0 211; GFX1030-NEXT: v_mov_b32_e32 v14, v1 212; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v6 213; GFX1030-NEXT: v_and_b32_e32 v1, s0, v8 214; GFX1030-NEXT: v_mov_b32_e32 v15, v2 215; GFX1030-NEXT: v_mov_b32_e32 v16, v3 216; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v8 217; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 218; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 219; GFX1030-NEXT: v_and_b32_e32 v3, s0, v9 220; GFX1030-NEXT: s_mov_b32 s1, exec_lo 221; GFX1030-NEXT: v_and_or_b32 v6, v6, s0, v0 222; GFX1030-NEXT: v_and_or_b32 v7, v7, s0, v1 223; GFX1030-NEXT: v_lshl_or_b32 v8, v3, 16, v2 224; GFX1030-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 225; GFX1030-NEXT: v_readfirstlane_b32 s4, v10 226; GFX1030-NEXT: v_readfirstlane_b32 s5, v11 227; GFX1030-NEXT: v_readfirstlane_b32 s6, v12 228; GFX1030-NEXT: v_readfirstlane_b32 s7, v13 229; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] 230; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], [v5, v14, v15, v16, v4, v6, v7, v8], s[4:7] a16 231; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] 232; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo 233; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 234; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 235; GFX1030-NEXT: s_cbranch_execnz BB7_1 236; GFX1030-NEXT: ; %bb.2: 237; GFX1030-NEXT: s_mov_b32 exec_lo, s1 
238; GFX1030-NEXT: s_waitcnt vmcnt(0) 239; GFX1030-NEXT: ; return to shader part epilog 240; 241; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: 242; GFX1013: ; %bb.0: 243; GFX1013-NEXT: s_mov_b32 s0, 0xffff 244; GFX1013-NEXT: v_lshrrev_b32_e32 v5, 16, v6 245; GFX1013-NEXT: v_and_b32_e32 v14, s0, v8 246; GFX1013-NEXT: v_lshrrev_b32_e32 v8, 16, v8 247; GFX1013-NEXT: v_and_b32_e32 v9, s0, v9 248; GFX1013-NEXT: s_mov_b32 s1, exec_lo 249; GFX1013-NEXT: v_lshlrev_b32_e32 v5, 16, v5 250; GFX1013-NEXT: v_lshlrev_b32_e32 v14, 16, v14 251; GFX1013-NEXT: v_and_or_b32 v5, v6, s0, v5 252; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v14 253; GFX1013-NEXT: v_lshl_or_b32 v7, v9, 16, v8 254; GFX1013-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 255; GFX1013-NEXT: v_readfirstlane_b32 s4, v10 256; GFX1013-NEXT: v_readfirstlane_b32 s5, v11 257; GFX1013-NEXT: v_readfirstlane_b32 s6, v12 258; GFX1013-NEXT: v_readfirstlane_b32 s7, v13 259; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[10:11] 260; GFX1013-NEXT: image_bvh_intersect_ray v[14:17], v[0:7], s[4:7] a16 261; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[12:13] 262; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo 263; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 264; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 265; GFX1013-NEXT: s_cbranch_execnz BB7_1 266; GFX1013-NEXT: ; %bb.2: 267; GFX1013-NEXT: s_mov_b32 exec_lo, s1 268; GFX1013-NEXT: s_waitcnt vmcnt(0) 269; GFX1013-NEXT: v_mov_b32_e32 v0, v14 270; GFX1013-NEXT: v_mov_b32_e32 v1, v15 271; GFX1013-NEXT: v_mov_b32_e32 v2, v16 272; GFX1013-NEXT: v_mov_b32_e32 v3, v17 273; GFX1013-NEXT: ; return to shader part epilog 274 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) 275 %r = bitcast <4 x i32> %v to <4 x float> 276 ret <4 x float> %r 277} 278 279define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_vgpr_descr(i64 %node_ptr, float 
%ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) { 280; GFX1030-LABEL: image_bvh64_intersect_ray_vgpr_descr: 281; GFX1030: ; %bb.0: 282; GFX1030-NEXT: v_mov_b32_e32 v6, v0 283; GFX1030-NEXT: v_mov_b32_e32 v10, v1 284; GFX1030-NEXT: v_mov_b32_e32 v14, v2 285; GFX1030-NEXT: v_mov_b32_e32 v19, v3 286; GFX1030-NEXT: s_mov_b32 s1, exec_lo 287; GFX1030-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1 288; GFX1030-NEXT: v_readfirstlane_b32 s4, v15 289; GFX1030-NEXT: v_readfirstlane_b32 s5, v16 290; GFX1030-NEXT: v_readfirstlane_b32 s6, v17 291; GFX1030-NEXT: v_readfirstlane_b32 s7, v18 292; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] 293; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v6, v10, v14, v19, v4, v5, v7, v8, v9, v11, v12, v13], s[4:7] 294; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] 295; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo 296; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 297; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 298; GFX1030-NEXT: s_cbranch_execnz BB8_1 299; GFX1030-NEXT: ; %bb.2: 300; GFX1030-NEXT: s_mov_b32 exec_lo, s1 301; GFX1030-NEXT: s_waitcnt vmcnt(0) 302; GFX1030-NEXT: ; return to shader part epilog 303; 304; GFX1013-LABEL: image_bvh64_intersect_ray_vgpr_descr: 305; GFX1013: ; %bb.0: 306; GFX1013-NEXT: v_mov_b32_e32 v6, v7 307; GFX1013-NEXT: v_mov_b32_e32 v7, v8 308; GFX1013-NEXT: v_mov_b32_e32 v8, v9 309; GFX1013-NEXT: v_mov_b32_e32 v9, v11 310; GFX1013-NEXT: v_mov_b32_e32 v10, v12 311; GFX1013-NEXT: v_mov_b32_e32 v11, v13 312; GFX1013-NEXT: s_mov_b32 s1, exec_lo 313; GFX1013-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1 314; GFX1013-NEXT: v_readfirstlane_b32 s4, v15 315; GFX1013-NEXT: v_readfirstlane_b32 s5, v16 316; GFX1013-NEXT: v_readfirstlane_b32 s6, v17 317; GFX1013-NEXT: v_readfirstlane_b32 s7, v18 318; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[15:16] 319; GFX1013-NEXT: image_bvh64_intersect_ray v[19:22], v[0:15], s[4:7] 320; GFX1013-NEXT: 
v_cmp_eq_u64_e64 s0, s[6:7], v[17:18] 321; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo 322; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 323; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 324; GFX1013-NEXT: s_cbranch_execnz BB8_1 325; GFX1013-NEXT: ; %bb.2: 326; GFX1013-NEXT: s_mov_b32 exec_lo, s1 327; GFX1013-NEXT: s_waitcnt vmcnt(0) 328; GFX1013-NEXT: v_mov_b32_e32 v0, v19 329; GFX1013-NEXT: v_mov_b32_e32 v1, v20 330; GFX1013-NEXT: v_mov_b32_e32 v2, v21 331; GFX1013-NEXT: v_mov_b32_e32 v3, v22 332; GFX1013-NEXT: ; return to shader part epilog 333 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) 334 %r = bitcast <4 x i32> %v to <4 x float> 335 ret <4 x float> %r 336} 337 338define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) { 339; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: 340; GFX1030: ; %bb.0: 341; GFX1030-NEXT: s_mov_b32 s0, 0xffff 342; GFX1030-NEXT: v_mov_b32_e32 v6, v0 343; GFX1030-NEXT: v_mov_b32_e32 v15, v1 344; GFX1030-NEXT: v_lshrrev_b32_e32 v0, 16, v7 345; GFX1030-NEXT: v_and_b32_e32 v1, s0, v9 346; GFX1030-NEXT: v_mov_b32_e32 v16, v2 347; GFX1030-NEXT: v_mov_b32_e32 v17, v3 348; GFX1030-NEXT: v_lshrrev_b32_e32 v2, 16, v9 349; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 16, v0 350; GFX1030-NEXT: v_lshlrev_b32_e32 v1, 16, v1 351; GFX1030-NEXT: v_and_b32_e32 v3, s0, v10 352; GFX1030-NEXT: s_mov_b32 s1, exec_lo 353; GFX1030-NEXT: v_and_or_b32 v7, v7, s0, v0 354; GFX1030-NEXT: v_and_or_b32 v8, v8, s0, v1 355; GFX1030-NEXT: v_lshl_or_b32 v9, v3, 16, v2 356; GFX1030-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1 357; GFX1030-NEXT: v_readfirstlane_b32 s4, v11 358; GFX1030-NEXT: v_readfirstlane_b32 s5, v12 359; GFX1030-NEXT: v_readfirstlane_b32 s6, v13 360; GFX1030-NEXT: 
v_readfirstlane_b32 s7, v14 361; GFX1030-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] 362; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], [v6, v15, v16, v17, v4, v5, v7, v8, v9], s[4:7] a16 363; GFX1030-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] 364; GFX1030-NEXT: s_and_b32 s0, s0, vcc_lo 365; GFX1030-NEXT: s_and_saveexec_b32 s0, s0 366; GFX1030-NEXT: s_xor_b32 exec_lo, exec_lo, s0 367; GFX1030-NEXT: s_cbranch_execnz BB9_1 368; GFX1030-NEXT: ; %bb.2: 369; GFX1030-NEXT: s_mov_b32 exec_lo, s1 370; GFX1030-NEXT: s_waitcnt vmcnt(0) 371; GFX1030-NEXT: ; return to shader part epilog 372; 373; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: 374; GFX1013: ; %bb.0: 375; GFX1013-NEXT: s_mov_b32 s0, 0xffff 376; GFX1013-NEXT: v_lshrrev_b32_e32 v6, 16, v7 377; GFX1013-NEXT: v_and_b32_e32 v15, s0, v9 378; GFX1013-NEXT: v_lshrrev_b32_e32 v9, 16, v9 379; GFX1013-NEXT: v_and_b32_e32 v10, s0, v10 380; GFX1013-NEXT: s_mov_b32 s1, exec_lo 381; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 16, v6 382; GFX1013-NEXT: v_lshlrev_b32_e32 v15, 16, v15 383; GFX1013-NEXT: v_and_or_b32 v6, v7, s0, v6 384; GFX1013-NEXT: v_and_or_b32 v7, v8, s0, v15 385; GFX1013-NEXT: v_lshl_or_b32 v8, v10, 16, v9 386; GFX1013-NEXT: BB9_1: ; =>This Inner Loop Header: Depth=1 387; GFX1013-NEXT: v_readfirstlane_b32 s4, v11 388; GFX1013-NEXT: v_readfirstlane_b32 s5, v12 389; GFX1013-NEXT: v_readfirstlane_b32 s6, v13 390; GFX1013-NEXT: v_readfirstlane_b32 s7, v14 391; GFX1013-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[11:12] 392; GFX1013-NEXT: s_waitcnt vmcnt(0) 393; GFX1013-NEXT: image_bvh64_intersect_ray v[15:18], v[0:15], s[4:7] a16 394; GFX1013-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[13:14] 395; GFX1013-NEXT: s_and_b32 s0, s0, vcc_lo 396; GFX1013-NEXT: s_and_saveexec_b32 s0, s0 397; GFX1013-NEXT: s_xor_b32 exec_lo, exec_lo, s0 398; GFX1013-NEXT: s_cbranch_execnz BB9_1 399; GFX1013-NEXT: ; %bb.2: 400; GFX1013-NEXT: s_mov_b32 exec_lo, s1 401; GFX1013-NEXT: s_waitcnt vmcnt(0) 402; GFX1013-NEXT: v_mov_b32_e32 v0, v15 
403; GFX1013-NEXT: v_mov_b32_e32 v1, v16 404; GFX1013-NEXT: v_mov_b32_e32 v2, v17 405; GFX1013-NEXT: v_mov_b32_e32 v3, v18 406; GFX1013-NEXT: ; return to shader part epilog 407 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr) 408 %r = bitcast <4 x i32> %v to <4 x float> 409 ret <4 x float> %r 410} 411 412define amdgpu_kernel void @image_bvh_intersect_ray_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) { 413; GFX1030-LABEL: image_bvh_intersect_ray_nsa_reassign: 414; GFX1030: ; %bb.0: 415; GFX1030-NEXT: s_clause 0x1 416; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 417; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 418; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 419; GFX1030-NEXT: v_mov_b32_e32 v5, 0x40400000 420; GFX1030-NEXT: v_mov_b32_e32 v6, 4.0 421; GFX1030-NEXT: v_mov_b32_e32 v7, 0x40a00000 422; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40c00000 423; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40e00000 424; GFX1030-NEXT: v_mov_b32_e32 v10, 0x41000000 425; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 426; GFX1030-NEXT: v_mov_b32_e32 v0, s4 427; GFX1030-NEXT: v_mov_b32_e32 v1, s5 428; GFX1030-NEXT: v_mov_b32_e32 v2, s6 429; GFX1030-NEXT: v_mov_b32_e32 v3, s7 430; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 431; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 432; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 433; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 434; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 435; GFX1030-NEXT: flat_load_dword v0, v[0:1] 436; GFX1030-NEXT: flat_load_dword v1, v[2:3] 437; GFX1030-NEXT: v_mov_b32_e32 v2, 0 438; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 439; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 440; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[0:3] 441; GFX1030-NEXT: s_waitcnt vmcnt(0) 442; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 443; GFX1030-NEXT: 
s_endpgm 444; 445; GFX1013-LABEL: image_bvh_intersect_ray_nsa_reassign: 446; GFX1013: ; %bb.0: 447; GFX1013-NEXT: s_clause 0x1 448; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 449; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 450; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 451; GFX1013-NEXT: v_mov_b32_e32 v7, 0x40a00000 452; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40c00000 453; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40e00000 454; GFX1013-NEXT: v_mov_b32_e32 v10, 0x41000000 455; GFX1013-NEXT: s_waitcnt lgkmcnt(0) 456; GFX1013-NEXT: v_mov_b32_e32 v0, s4 457; GFX1013-NEXT: v_mov_b32_e32 v1, s5 458; GFX1013-NEXT: v_mov_b32_e32 v2, s6 459; GFX1013-NEXT: v_mov_b32_e32 v3, s7 460; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 461; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 462; GFX1013-NEXT: v_add_co_u32 v2, vcc_lo, v2, v6 463; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 464; GFX1013-NEXT: v_mov_b32_e32 v6, 4.0 465; GFX1013-NEXT: flat_load_dword v0, v[4:5] 466; GFX1013-NEXT: flat_load_dword v1, v[2:3] 467; GFX1013-NEXT: v_mov_b32_e32 v2, 0 468; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 469; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 470; GFX1013-NEXT: v_mov_b32_e32 v5, 0x40400000 471; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 472; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:15], s[8:11] 473; GFX1013-NEXT: s_waitcnt vmcnt(0) 474; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 475; GFX1013-NEXT: s_endpgm 476 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 477 %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid 478 %node_ptr = load i32, i32* %gep_node_ptr, align 4 479 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid 480 %ray_extent = load float, float* %gep_ray, align 4 481 %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 482 %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 483 %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 484 %ray_dir0 = 
insertelement <4 x float> undef, float 3.0, i32 0 485 %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1 486 %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2 487 %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0 488 %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1 489 %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2 490 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f32(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr) 491 store <4 x i32> %v, <4 x i32>* undef 492 ret void 493} 494 495define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(i32* %p_node_ptr, float* %p_ray, <4 x i32> inreg %tdescr) { 496; GFX1030-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: 497; GFX1030: ; %bb.0: 498; GFX1030-NEXT: s_clause 0x1 499; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 500; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 501; GFX1030-NEXT: v_lshlrev_b32_e32 v4, 2, v0 502; GFX1030-NEXT: s_movk_i32 s9, 0x4600 503; GFX1030-NEXT: s_movk_i32 s8, 0x4700 504; GFX1030-NEXT: s_bfe_u32 s8, s8, 0x100000 505; GFX1030-NEXT: s_waitcnt lgkmcnt(0) 506; GFX1030-NEXT: v_mov_b32_e32 v0, s4 507; GFX1030-NEXT: v_mov_b32_e32 v1, s5 508; GFX1030-NEXT: v_mov_b32_e32 v2, s6 509; GFX1030-NEXT: v_mov_b32_e32 v3, s7 510; GFX1030-NEXT: s_movk_i32 s5, 0x4400 511; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 512; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo 513; GFX1030-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 514; GFX1030-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 515; GFX1030-NEXT: s_movk_i32 s6, 0x4200 516; GFX1030-NEXT: flat_load_dword v0, v[0:1] 517; GFX1030-NEXT: flat_load_dword v1, v[2:3] 518; GFX1030-NEXT: s_bfe_u32 s5, s5, 0x100000 519; GFX1030-NEXT: s_movk_i32 s7, 0x4800 520; GFX1030-NEXT: s_bfe_u32 s6, s6, 0x100000 521; GFX1030-NEXT: s_lshl_b32 s5, s5, 16 522; 
GFX1030-NEXT: s_movk_i32 s4, 0x4500 523; GFX1030-NEXT: s_or_b32 s5, s6, s5 524; GFX1030-NEXT: s_bfe_u32 s6, s9, 0x100000 525; GFX1030-NEXT: s_bfe_u32 s7, s7, 0x100000 526; GFX1030-NEXT: s_bfe_u32 s4, s4, 0x100000 527; GFX1030-NEXT: s_lshl_b32 s6, s6, 16 528; GFX1030-NEXT: s_lshl_b32 s7, s7, 16 529; GFX1030-NEXT: s_or_b32 s4, s4, s6 530; GFX1030-NEXT: s_or_b32 s6, s8, s7 531; GFX1030-NEXT: v_mov_b32_e32 v2, 0 532; GFX1030-NEXT: v_mov_b32_e32 v3, 1.0 533; GFX1030-NEXT: v_mov_b32_e32 v4, 2.0 534; GFX1030-NEXT: v_mov_b32_e32 v5, s5 535; GFX1030-NEXT: v_mov_b32_e32 v6, s4 536; GFX1030-NEXT: v_mov_b32_e32 v7, s6 537; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 538; GFX1030-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16 539; GFX1030-NEXT: s_waitcnt vmcnt(0) 540; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 541; GFX1030-NEXT: s_endpgm 542; 543; GFX1013-LABEL: image_bvh_intersect_ray_a16_nsa_reassign: 544; GFX1013: ; %bb.0: 545; GFX1013-NEXT: s_clause 0x1 546; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 547; GFX1013-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 548; GFX1013-NEXT: v_lshlrev_b32_e32 v6, 2, v0 549; GFX1013-NEXT: s_movk_i32 s1, 0x4400 550; GFX1013-NEXT: s_movk_i32 s2, 0x4200 551; GFX1013-NEXT: s_bfe_u32 s1, s1, 0x100000 552; GFX1013-NEXT: s_movk_i32 s3, 0x4800 553; GFX1013-NEXT: s_bfe_u32 s2, s2, 0x100000 554; GFX1013-NEXT: s_lshl_b32 s1, s1, 16 555; GFX1013-NEXT: s_movk_i32 s0, 0x4500 556; GFX1013-NEXT: s_or_b32 s1, s2, s1 557; GFX1013-NEXT: s_bfe_u32 s3, s3, 0x100000 558; GFX1013-NEXT: s_bfe_u32 s0, s0, 0x100000 559; GFX1013-NEXT: s_lshl_b32 s3, s3, 16 560; GFX1013-NEXT: s_waitcnt lgkmcnt(0) 561; GFX1013-NEXT: v_mov_b32_e32 v0, s4 562; GFX1013-NEXT: v_mov_b32_e32 v1, s5 563; GFX1013-NEXT: v_mov_b32_e32 v2, s6 564; GFX1013-NEXT: v_mov_b32_e32 v3, s7 565; GFX1013-NEXT: s_movk_i32 s5, 0x4600 566; GFX1013-NEXT: v_add_co_u32 v4, vcc_lo, v0, v6 567; GFX1013-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo 568; GFX1013-NEXT: v_add_co_u32 v2, 
vcc_lo, v2, v6 569; GFX1013-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo 570; GFX1013-NEXT: s_movk_i32 s4, 0x4700 571; GFX1013-NEXT: flat_load_dword v0, v[4:5] 572; GFX1013-NEXT: flat_load_dword v1, v[2:3] 573; GFX1013-NEXT: s_bfe_u32 s2, s5, 0x100000 574; GFX1013-NEXT: s_bfe_u32 s4, s4, 0x100000 575; GFX1013-NEXT: s_lshl_b32 s2, s2, 16 576; GFX1013-NEXT: v_mov_b32_e32 v2, 0 577; GFX1013-NEXT: s_or_b32 s0, s0, s2 578; GFX1013-NEXT: s_or_b32 s2, s4, s3 579; GFX1013-NEXT: v_mov_b32_e32 v3, 1.0 580; GFX1013-NEXT: v_mov_b32_e32 v4, 2.0 581; GFX1013-NEXT: v_mov_b32_e32 v5, s1 582; GFX1013-NEXT: v_mov_b32_e32 v6, s0 583; GFX1013-NEXT: v_mov_b32_e32 v7, s2 584; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 585; GFX1013-NEXT: image_bvh_intersect_ray v[0:3], v[0:7], s[8:11] a16 586; GFX1013-NEXT: s_waitcnt vmcnt(0) 587; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3] 588; GFX1013-NEXT: s_endpgm 589 %lid = tail call i32 @llvm.amdgcn.workitem.id.x() 590 %gep_node_ptr = getelementptr inbounds i32, i32* %p_node_ptr, i32 %lid 591 %node_ptr = load i32, i32* %gep_node_ptr, align 4 592 %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid 593 %ray_extent = load float, float* %gep_ray, align 4 594 %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0 595 %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1 596 %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2 597 %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0 598 %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1 599 %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2 600 %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0 601 %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1 602 %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2 603 %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v4f16(i32 %node_ptr, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, 
<4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; Kernel test for the i64 node pointer / v4f32 direction form of the
; intersect_ray intrinsic.
;   * ray_extent is loaded from %p_ray at index workitem.id.x.
;   * node_ptr is the constant 1111111111111 (0x102B36211C7); the expected
;     output materializes it as v0 = 0xb36211c7 (low 32 bits) and
;     v1 = 0x102 (high 32 bits).
;   * ray_origin, ray_dir and ray_inv_dir are the constants (0,1,2), (3,4,5)
;     and (6,7,8); lane 3 of each vector is left undef.
;   * The result is stored through an undef pointer (the expected output uses
;     v[0:1] as the store address).
; For both gfx1030 and gfx1013 the expected address operand is the single
; contiguous range v[0:15].
; NOTE(review): the "nsa_reassign" suffix suggests this guards the rewrite of
; an NSA-encoded instruction to contiguous registers - confirm against the
; pass that performs it.
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_clause 0x1
; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT:    v_mov_b32_e32 v6, 0x40400000
; GFX1030-NEXT:    v_mov_b32_e32 v7, 4.0
; GFX1030-NEXT:    v_mov_b32_e32 v8, 0x40a00000
; GFX1030-NEXT:    v_mov_b32_e32 v9, 0x40c00000
; GFX1030-NEXT:    v_mov_b32_e32 v10, 0x40e00000
; GFX1030-NEXT:    v_mov_b32_e32 v11, 0x41000000
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3]
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    s_clause 0x1
; GFX1013-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1013-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1013-NEXT:    v_mov_b32_e32 v6, 0x40400000
; GFX1013-NEXT:    v_mov_b32_e32 v7, 4.0
; GFX1013-NEXT:    v_mov_b32_e32 v8, 0x40a00000
; GFX1013-NEXT:    v_mov_b32_e32 v9, 0x40c00000
; GFX1013-NEXT:    v_mov_b32_e32 v10, 0x40e00000
; GFX1013-NEXT:    v_mov_b32_e32 v11, 0x41000000
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_mov_b32_e32 v0, s2
; GFX1013-NEXT:    v_mov_b32_e32 v1, s3
; GFX1013-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1013-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7]
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <4 x float> undef, float 3.0, i32 0
  %ray_dir1 = insertelement <4 x float> %ray_dir0, float 4.0, i32 1
  %ray_dir = insertelement <4 x float> %ray_dir1, float 5.0, i32 2
  %ray_inv_dir0 = insertelement <4 x float> undef, float 6.0, i32 0
  %ray_inv_dir1 = insertelement <4 x float> %ray_inv_dir0, float 7.0, i32 1
  %ray_inv_dir = insertelement <4 x float> %ray_inv_dir1, float 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f32(i64 1111111111111, float %ray_extent, <4 x float> %ray_origin, <4 x float> %ray_dir, <4 x float> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}

; Kernel test for the i64 node pointer / v4f16 direction (a16) form of the
; intersect_ray intrinsic.
;   * ray_extent is loaded from %p_ray at index workitem.id.x.
;   * node_ptr is the constant 1111111111110 (0x102B36211C6); the expected
;     output materializes it as v0 = 0xb36211c6 (low 32 bits) and
;     v1 = 0x102 (high 32 bits).
;   * ray_origin is the f32 constant (0,1,2); ray_dir and ray_inv_dir are the
;     half constants (3,4,5) and (6,7,8); lane 3 of each vector is left undef.
;   * In the expected output the six half values are combined pairwise with
;     s_lshl_b32 by 16 and s_or_b32, then copied into v6..v8:
;     0x4200|0x4400<<16, 0x4500|0x4600<<16, 0x4700|0x4800<<16.
;   * The result is stored through an undef pointer (the expected output uses
;     v[0:1] as the store address).
; For both gfx1030 and gfx1013 the expected instruction carries the a16
; modifier and takes the single contiguous address range v[0:15].
; NOTE(review): the "nsa_reassign" suffix suggests this guards the rewrite of
; an NSA-encoded instruction to contiguous registers - confirm against the
; pass that performs it.
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(float* %p_ray, <4 x i32> inreg %tdescr) {
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1030:       ; %bb.0:
; GFX1030-NEXT:    s_clause 0x1
; GFX1030-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX1030-NEXT:    s_movk_i32 s6, 0x4200
; GFX1030-NEXT:    s_movk_i32 s7, 0x4800
; GFX1030-NEXT:    s_bfe_u32 s6, s6, 0x100000
; GFX1030-NEXT:    s_movk_i32 s9, 0x4600
; GFX1030-NEXT:    s_movk_i32 s8, 0x4700
; GFX1030-NEXT:    s_bfe_u32 s7, s7, 0x100000
; GFX1030-NEXT:    s_bfe_u32 s8, s8, 0x100000
; GFX1030-NEXT:    s_lshl_b32 s7, s7, 16
; GFX1030-NEXT:    v_mov_b32_e32 v3, 0
; GFX1030-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1030-NEXT:    v_mov_b32_e32 v0, s4
; GFX1030-NEXT:    v_mov_b32_e32 v1, s5
; GFX1030-NEXT:    s_movk_i32 s5, 0x4400
; GFX1030-NEXT:    s_movk_i32 s4, 0x4500
; GFX1030-NEXT:    s_bfe_u32 s5, s5, 0x100000
; GFX1030-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1030-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1030-NEXT:    s_lshl_b32 s5, s5, 16
; GFX1030-NEXT:    s_bfe_u32 s4, s4, 0x100000
; GFX1030-NEXT:    s_or_b32 s5, s6, s5
; GFX1030-NEXT:    flat_load_dword v2, v[0:1]
; GFX1030-NEXT:    s_bfe_u32 s6, s9, 0x100000
; GFX1030-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
; GFX1030-NEXT:    s_lshl_b32 s6, s6, 16
; GFX1030-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT:    s_or_b32 s4, s4, s6
; GFX1030-NEXT:    s_or_b32 s6, s8, s7
; GFX1030-NEXT:    v_mov_b32_e32 v6, s5
; GFX1030-NEXT:    v_mov_b32_e32 v7, s4
; GFX1030-NEXT:    v_mov_b32_e32 v8, s6
; GFX1030-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1030-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[0:3] a16
; GFX1030-NEXT:    s_waitcnt vmcnt(0)
; GFX1030-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT:    s_endpgm
;
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013:       ; %bb.0:
; GFX1013-NEXT:    s_clause 0x1
; GFX1013-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1013-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1013-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GFX1013-NEXT:    s_movk_i32 s1, 0x4400
; GFX1013-NEXT:    s_movk_i32 s9, 0x4600
; GFX1013-NEXT:    s_bfe_u32 s1, s1, 0x100000
; GFX1013-NEXT:    s_movk_i32 s0, 0x4500
; GFX1013-NEXT:    s_lshl_b32 s1, s1, 16
; GFX1013-NEXT:    s_movk_i32 s8, 0x4700
; GFX1013-NEXT:    s_bfe_u32 s0, s0, 0x100000
; GFX1013-NEXT:    s_bfe_u32 s8, s8, 0x100000
; GFX1013-NEXT:    v_mov_b32_e32 v3, 0
; GFX1013-NEXT:    v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT:    v_mov_b32_e32 v5, 2.0
; GFX1013-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1013-NEXT:    v_mov_b32_e32 v0, s2
; GFX1013-NEXT:    v_mov_b32_e32 v1, s3
; GFX1013-NEXT:    s_movk_i32 s2, 0x4200
; GFX1013-NEXT:    s_movk_i32 s3, 0x4800
; GFX1013-NEXT:    s_bfe_u32 s2, s2, 0x100000
; GFX1013-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v2
; GFX1013-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX1013-NEXT:    s_or_b32 s1, s2, s1
; GFX1013-NEXT:    s_bfe_u32 s2, s9, 0x100000
; GFX1013-NEXT:    s_bfe_u32 s3, s3, 0x100000
; GFX1013-NEXT:    flat_load_dword v2, v[0:1]
; GFX1013-NEXT:    s_lshl_b32 s2, s2, 16
; GFX1013-NEXT:    s_lshl_b32 s3, s3, 16
; GFX1013-NEXT:    s_or_b32 s0, s0, s2
; GFX1013-NEXT:    s_or_b32 s2, s8, s3
; GFX1013-NEXT:    v_mov_b32_e32 v0, 0xb36211c6
; GFX1013-NEXT:    v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT:    v_mov_b32_e32 v6, s1
; GFX1013-NEXT:    v_mov_b32_e32 v7, s0
; GFX1013-NEXT:    v_mov_b32_e32 v8, s2
; GFX1013-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1013-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:15], s[4:7] a16
; GFX1013-NEXT:    s_waitcnt vmcnt(0)
; GFX1013-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT:    s_endpgm
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep_ray = getelementptr inbounds float, float* %p_ray, i32 %lid
  %ray_extent = load float, float* %gep_ray, align 4
  %ray_origin0 = insertelement <4 x float> undef, float 0.0, i32 0
  %ray_origin1 = insertelement <4 x float> %ray_origin0, float 1.0, i32 1
  %ray_origin = insertelement <4 x float> %ray_origin1, float 2.0, i32 2
  %ray_dir0 = insertelement <4 x half> undef, half 3.0, i32 0
  %ray_dir1 = insertelement <4 x half> %ray_dir0, half 4.0, i32 1
  %ray_dir = insertelement <4 x half> %ray_dir1, half 5.0, i32 2
  %ray_inv_dir0 = insertelement <4 x half> undef, half 6.0, i32 0
  %ray_inv_dir1 = insertelement <4 x half> %ray_inv_dir0, half 7.0, i32 1
  %ray_inv_dir = insertelement <4 x half> %ray_inv_dir1, half 8.0, i32 2
  %v = call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i64.v4f16(i64 1111111111110, float %ray_extent, <4 x float> %ray_origin, <4 x half> %ray_dir, <4 x half> %ray_inv_dir, <4 x i32> %tdescr)
  store <4 x i32> %v, <4 x i32>* undef
  ret void
}