; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; A full (all-ones) mask is lowered to a plain immediate move into exec.
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @full_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 -1)
  ret float %s
}

; A partial constant mask (123456 == 0x1e240) is also a single immediate move.
; GCN-LABEL: {{^}}partial_mask:
; GCN: s_mov_b64 exec, 0x1e240
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @partial_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 123456)
  ret float %s
}

; A thread-count taken from bits [8..14] of the 4th SGPR input (s3):
; bitfield-extract, build the mask with s_bfm, and special-case count == 64.
; GCN-LABEL: {{^}}input_s3off8:
; GCN: s_bfe_u32 s0, s3, 0x70008
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  ret float %s
}

; Same as above but reading bits [19..25] of the first SGPR input (s0).
; GCN-LABEL: {{^}}input_s0off19:
; GCN: s_bfe_u32 s0, s0, 0x70013
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %s
}

; The input SGPR (%count) is live after the intrinsic, so the extracted
; count must go to a scratch SGPR (s1) instead of clobbering s0.
; GCN-LABEL: {{^}}reuse_input:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
main_body:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  ret float %f
}
; As in reuse_input, but the use of %count appears before the intrinsic call
; in IR; the exec-init sequence must still be hoisted to the function start.
; GCN-LABEL: {{^}}reuse_input2:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
main_body:
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %f
}

; GCN-LABEL: {{^}}init_unreachable:
;
; This used to crash.
define amdgpu_ps void @init_unreachable() {
main_body:
  call void @llvm.amdgcn.init.exec(i64 -1)
  unreachable
}

; The exec initialization must be emitted before any frame (scratch)
; materialization code, i.e. before the first VALU instruction.
; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_mov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec(i64 -1)

  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; Same ordering constraint for the from.input form: the whole bfe/bfm/cmov
; sequence must precede frame materialization.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
; GCN-NOT: {{^}}v_
; GCN: s_bfe_u32 s2, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s2, 0
; GCN-NEXT: s_cmp_eq_u32 s2, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; The intrinsic call sits in a non-entry block (%endif); the exec-init
; sequence is emitted there, still ahead of any VALU instructions.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
; GCN-NOT: {{^}}v_
; GCN: %endif
; GCN: s_bfe_u32 s3, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s3, 0
; GCN-NEXT: s_cmp_eq_u32 s3, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  ; ideally these alloca would be in %endif, but this causes problems on Windows GlobalISel
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)

  %cc = icmp uge i32 %count, 32
  br i1 %cc, label %endif, label %if

if:
  call void asm sideeffect "", ""()
  br label %endif

endif:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v6 = add i32 %v5, %count
  %v = bitcast i32 %v6 to float
  ret float %v
}

declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1

attributes #1 = { convergent }