1; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa --amdhsa-code-object-version=2 < %s | FileCheck -check-prefix=HSA %s 2 3@lds.align16.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 4@lds.align16.1 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 16 5 6@lds.align8.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 8 7@lds.align32.0 = internal unnamed_addr addrspace(3) global [38 x i8] undef, align 32 8 9@lds.missing.align.0 = internal unnamed_addr addrspace(3) global [39 x i32] undef 10@lds.missing.align.1 = internal unnamed_addr addrspace(3) global [7 x i64] undef 11 12declare void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i1) #0 13declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i1) #0 14 15 16; HSA-LABEL: {{^}}test_no_round_size_1: 17; HSA: workgroup_group_segment_byte_size = 38 18define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 19 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 20 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false) 21 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false) 22 ret void 23} 24 25; There are two objects, so one requires padding to be correctly 26; aligned after the other. 27 28; (38 -> 48) + 38 = 92 29 30; I don't think it is necessary to add padding after since if there 31; were to be a dynamically sized LDS kernel arg, the runtime should 32; add the alignment padding if necessary alignment padding if needed. 33 34; HSA-LABEL: {{^}}test_round_size_2: 35; HSA: workgroup_group_segment_byte_size = 86 36; HSA: group_segment_alignment = 4 37define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 38 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 39 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false) 40 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false) 41 42 %lds.align16.1.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.1 to i8 addrspace(3)* 43 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.1.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false) 44 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.1.bc, i32 38, i1 false) 45 46 ret void 47} 48 49; 38 + (10 pad) + 38 (= 86) 50; HSA-LABEL: {{^}}test_round_size_2_align_8: 51; HSA: workgroup_group_segment_byte_size = 86 52; HSA: group_segment_alignment = 4 53define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 54 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 55 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 56 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) 57 58 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 59 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 60 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) 61 62 ret void 63} 64 65; HSA-LABEL: {{^}}test_round_local_lds_and_arg: 66; HSA: workgroup_group_segment_byte_size = 38 67; HSA: group_segment_alignment = 4 68define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { 69 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 70 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.align16.0.bc, i8 addrspace(1)* align 4 %in, i32 38, i1 false) 71 72 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.align16.0.bc, i32 38, i1 false) 73 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false) 74 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false) 75 ret void 76} 77 78; HSA-LABEL: {{^}}test_round_lds_arg: 79; HSA: workgroup_group_segment_byte_size = 0 80; HSA: group_segment_alignment = 4 81define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { 82 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.arg, i8 addrspace(1)* align 4 %in, i32 38, i1 false) 83 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.arg, i32 38, i1 false) 84 ret void 85} 86 87; FIXME: Parameter alignment not considered 88; HSA-LABEL: {{^}}test_high_align_lds_arg: 89; HSA: workgroup_group_segment_byte_size = 0 90; HSA: group_segment_alignment = 4 91define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { 92 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 64 %lds.arg, i8 addrspace(1)* align 64 %in, i32 38, i1 false) 93 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 64 %out, i8 addrspace(3)* align 64 %lds.arg, i32 38, i1 false) 94 ret void 95} 96 97; (39 * 4) + (4 pad) + (7 * 8) = 216 98; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: 99; HSA: workgroup_group_segment_byte_size = 216 100; HSA: group_segment_alignment = 4 101define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 102 %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* 103 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false) 104 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false) 105 106 %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* 107 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false) 108 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false) 109 110 ret void 111} 112 113; (39 * 4) + (4 pad) + (7 * 8) = 216 114; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1: 115; HSA: workgroup_group_segment_byte_size = 216 116; HSA: group_segment_alignment = 4 117define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 118 %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* 119 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i8 addrspace(1)* align 8 %in, i32 56, i1 false) 120 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.missing.align.1.bc, i32 56, i1 false) 121 122 %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* 123 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i8 addrspace(1)* align 4 %in, i32 160, i1 false) 124 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out, i8 addrspace(3)* align 4 %lds.missing.align.0.bc, i32 160, i1 false) 125 126 ret void 127} 128 129; align 32, 16, 16 130; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134) 131; HSA-LABEL: {{^}}test_round_size_3_order0: 132; HSA: workgroup_group_segment_byte_size = 134 133; HSA: group_segment_alignment = 4 134define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 135 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 136 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 137 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) 138 139 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 140 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 141 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) 142 143 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 144 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 145 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) 146 147 ret void 148} 149 150; align 32, 16, 16 151; 38 (+ 10 pad) + 38 + (10 pad) + 38 ( = 134) 152; HSA-LABEL: {{^}}test_round_size_3_order1: 153; HSA: workgroup_group_segment_byte_size = 134 154; HSA: group_segment_alignment = 4 155define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 156 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 157 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 158 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) 159 160 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 161 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 162 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) 163 164 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 165 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 166 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) 167 168 ret void 169} 170 171; align 32, 16, 16 172; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 126) 173; HSA-LABEL: {{^}}test_round_size_3_order2: 174; HSA: workgroup_group_segment_byte_size = 134 175; HSA: group_segment_alignment = 4 176define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 177 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 178 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 179 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) 180 181 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 182 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 183 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) 184 185 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 186 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 187 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) 188 189 ret void 190} 191 192; align 32, 16, 16 193; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134) 194; HSA-LABEL: {{^}}test_round_size_3_order3: 195; HSA: workgroup_group_segment_byte_size = 134 196; HSA: group_segment_alignment = 4 197define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 198 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 199 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 200 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) 201 202 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 203 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 204 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) 205 206 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 207 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 208 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) 209 210 ret void 211} 212 213; align 32, 16, 16 214; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134) 215; HSA-LABEL: {{^}}test_round_size_3_order4: 216; HSA: workgroup_group_segment_byte_size = 134 217; HSA: group_segment_alignment = 4 218define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 219 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 220 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 221 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) 222 223 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 224 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 225 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) 226 227 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 228 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 229 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) 230 231 ret void 232} 233 234; align 32, 16, 16 235; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134) 236; HSA-LABEL: {{^}}test_round_size_3_order5: 237; HSA: workgroup_group_segment_byte_size = 134 238; HSA: group_segment_alignment = 4 239define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { 240 %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* 241 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align8.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 242 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align8.0.bc, i32 38, i1 false) 243 244 %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* 245 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align16.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 246 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align16.0.bc, i32 38, i1 false) 247 248 %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* 249 call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 8 %lds.align32.0.bc, i8 addrspace(1)* align 8 %in, i32 38, i1 false) 250 call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 8 %out, i8 addrspace(3)* align 8 %lds.align32.0.bc, i32 38, i1 false) 251 252 ret void 253} 254 255attributes #0 = { argmemonly nounwind } 256attributes #1 = { nounwind } 257attributes #2 = { convergent nounwind } 258