; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s

; Checks when -amdgpu-promote-alloca turns a kernel's private alloca into an
; addrspace(3) global named @<kernel>.stack, depending on the work-group-size
; and waves-per-eu attributes attached to the kernel.
;
; The first run passes no -mcpu and uses check prefixes SI and ALL; the second
; run passes -mcpu=tonga and uses check prefixes CI and ALL.
;
; Every kernel has the same shape: two stores into the stack array at indices
; loaded from %in, then elements 0 and 1 of the array are copied to %out.

; Work-group size 63: the promoted global is expected only in the tonga run.
; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Flat work-group size 256,256: the promoted global is expected in both runs,
; with one [5 x i32] slot per work item (256 slots).
; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Flat work-group size 1600,1600: the promoted global is expected in both runs,
; with 1600 slots.
; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4

define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Only a waves-per-eu range (1,10) and no work-group size: the alloca is
; expected to survive in the first run and to be gone in the tonga run.
; ALL-LABEL: @occupancy_0(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; Same expectations as @occupancy_0, using attribute group #4.
; ALL-LABEL: @occupancy_max(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; waves-per-eu 1,6 with a 64-item work group and a 42-byte array: the alloca is
; expected to survive in the first run and to be gone in the tonga run.
; ALL-LABEL: @occupancy_6(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
  %stack = alloca [42 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; Same attributes as @occupancy_6 but one byte larger (43 bytes): the alloca is
; expected to survive in both runs.
; ALL-LABEL: @occupancy_6_over(
; ALL: alloca [43 x i8]
define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
entry:
  %stack = alloca [43 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; waves-per-eu 1,8 with a 64-item work group and a 32-byte array: the alloca is
; expected to survive in the first run and to be gone in the tonga run.
; ALL-LABEL: @occupancy_8(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
  %stack = alloca [32 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; Same attributes as @occupancy_8 but one byte larger (33 bytes): the alloca is
; expected to survive in both runs.
; ALL-LABEL: @occupancy_8_over(
; ALL: alloca [33 x i8]
define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
entry:
  %stack = alloca [33 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; waves-per-eu 1,9 with a 64-item work group and a 28-byte array: the alloca is
; expected to survive in the first run and to be gone in the tonga run.
; ALL-LABEL: @occupancy_9(
; SI: alloca
; CI-NOT: alloca
define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
  %stack = alloca [28 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; Same attributes as @occupancy_9 but one byte larger (29 bytes): the alloca is
; expected to survive in both runs.
; ALL-LABEL: @occupancy_9_over(
; ALL: alloca [29 x i8]
define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
entry:
  %stack = alloca [29 x i8], align 4
  %tmp = load i8, i8 addrspace(1)* %in, align 1
  %tmp4 = sext i8 %tmp to i64
  %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp4
  store i8 4, i8* %arrayidx1, align 1
  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
  %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 %tmp5
  store i8 5, i8* %arrayidx3, align 1
  %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 0
  %tmp2 = load i8, i8* %arrayidx10, align 1
  store i8 %tmp2, i8 addrspace(1)* %out, align 1
  %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8]* %stack, i64 0, i64 1
  %tmp3 = load i8, i8* %arrayidx12, align 1
  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
  ret void
}

; Attribute groups referenced by the kernels above.
; NOTE(review): #0 is the only group that uses "amdgpu-max-work-group-size";
; every other sized group uses "amdgpu-flat-work-group-size" - confirm this is
; intentional.
; NOTE(review): #3 and #4 are identical - confirm @occupancy_0 and
; @occupancy_max are meant to share the same waves-per-eu range.
attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" }
attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }