1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; FIXME: Manually added checks for metadata nodes at bottom
3; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=HSA %s
4; RUN: opt -mtriple=amdgcn-- -S -o - -amdgpu-lower-kernel-arguments %s | FileCheck -check-prefix=MESA %s
5
6target datalayout = "A5"
7
8define amdgpu_kernel void @kern_noargs() {
9; HSA-LABEL: @kern_noargs(
10; HSA-NEXT:    ret void
11;
12; MESA-LABEL: @kern_noargs(
13; MESA-NEXT:    ret void
14;
15  ret void
16}
17
18define amdgpu_kernel void @kern_i8(i8 %arg) #0 {
19; HSA-LABEL: @kern_i8(
20; HSA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
21; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 0
22; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
23; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
24; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
25; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
26; HSA-NEXT:    ret void
27;
28; MESA-LABEL: @kern_i8(
29; MESA-NEXT:    [[KERN_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
30; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_KERNARG_SEGMENT]], i64 36
31; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
32; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
33; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
34; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
35; MESA-NEXT:    ret void
36;
37  store i8 %arg, i8 addrspace(1)* undef, align 1
38  ret void
39}
40
41define amdgpu_kernel void @kern_i16(i16 %arg) #0 {
42; HSA-LABEL: @kern_i16(
43; HSA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
44; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 0
45; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
46; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
47; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
48; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
49; HSA-NEXT:    ret void
50;
51; MESA-LABEL: @kern_i16(
52; MESA-NEXT:    [[KERN_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
53; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I16_KERNARG_SEGMENT]], i64 36
54; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
55; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
56; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
57; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
58; MESA-NEXT:    ret void
59;
60  store i16 %arg, i16 addrspace(1)* undef, align 1
61  ret void
62}
63
64define amdgpu_kernel void @kern_f16(half %arg) #0 {
65; HSA-LABEL: @kern_f16(
66; HSA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
67; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 0
68; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
69; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
70; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
71; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
72; HSA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
73; HSA-NEXT:    ret void
74;
75; MESA-LABEL: @kern_f16(
76; MESA-NEXT:    [[KERN_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
77; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F16_KERNARG_SEGMENT]], i64 36
78; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
79; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
80; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
81; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
82; MESA-NEXT:    store half [[ARG_LOAD]], half addrspace(1)* undef, align 1
83; MESA-NEXT:    ret void
84;
85  store half %arg, half addrspace(1)* undef, align 1
86  ret void
87}
88
89define amdgpu_kernel void @kern_zeroext_i8(i8 zeroext %arg) #0 {
90; HSA-LABEL: @kern_zeroext_i8(
91; HSA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
92; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 0
93; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
94; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
95; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
96; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
97; HSA-NEXT:    ret void
98;
99; MESA-LABEL: @kern_zeroext_i8(
100; MESA-NEXT:    [[KERN_ZEROEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
101; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I8_KERNARG_SEGMENT]], i64 36
102; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
103; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
104; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
105; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
106; MESA-NEXT:    ret void
107;
108  store i8 %arg, i8 addrspace(1)* undef, align 1
109  ret void
110}
111
112define amdgpu_kernel void @kern_zeroext_i16(i16 zeroext %arg) #0 {
113; HSA-LABEL: @kern_zeroext_i16(
114; HSA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
115; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 0
116; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
117; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
118; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
119; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
120; HSA-NEXT:    ret void
121;
122; MESA-LABEL: @kern_zeroext_i16(
123; MESA-NEXT:    [[KERN_ZEROEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
124; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ZEROEXT_I16_KERNARG_SEGMENT]], i64 36
125; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
126; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
127; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
128; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
129; MESA-NEXT:    ret void
130;
131  store i16 %arg, i16 addrspace(1)* undef, align 1
132  ret void
133}
134
135define amdgpu_kernel void @kern_signext_i8(i8 signext %arg) #0 {
136; HSA-LABEL: @kern_signext_i8(
137; HSA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
138; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 0
139; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
140; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
141; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
142; HSA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
143; HSA-NEXT:    ret void
144;
145; MESA-LABEL: @kern_signext_i8(
146; MESA-NEXT:    [[KERN_SIGNEXT_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
147; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I8_KERNARG_SEGMENT]], i64 36
148; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
149; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
150; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
151; MESA-NEXT:    store i8 [[TMP2]], i8 addrspace(1)* undef, align 1
152; MESA-NEXT:    ret void
153;
154  store i8 %arg, i8 addrspace(1)* undef, align 1
155  ret void
156}
157
158define amdgpu_kernel void @kern_signext_i16(i16 signext %arg) #0 {
159; HSA-LABEL: @kern_signext_i16(
160; HSA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
161; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 0
162; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
163; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
164; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
165; HSA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
166; HSA-NEXT:    ret void
167;
168; MESA-LABEL: @kern_signext_i16(
169; MESA-NEXT:    [[KERN_SIGNEXT_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
170; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_SIGNEXT_I16_KERNARG_SEGMENT]], i64 36
171; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
172; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
173; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
174; MESA-NEXT:    store i16 [[TMP2]], i16 addrspace(1)* undef, align 1
175; MESA-NEXT:    ret void
176;
177  store i16 %arg, i16 addrspace(1)* undef, align 1
178  ret void
179}
180
181define amdgpu_kernel void @kern_i8_i8(i8 %arg0, i8 %arg1) {
182; HSA-LABEL: @kern_i8_i8(
183; HSA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
184; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
185; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
186; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
187; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
188; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 0
189; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
190; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
191; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
192; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
193; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
194; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
195; HSA-NEXT:    ret void
196;
197; MESA-LABEL: @kern_i8_i8(
198; MESA-NEXT:    [[KERN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
199; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
200; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
201; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
202; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
203; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I8_I8_KERNARG_SEGMENT]], i64 36
204; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
205; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
206; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
207; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
208; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
209; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
210; MESA-NEXT:    ret void
211;
212  store volatile i8 %arg0, i8 addrspace(1)* undef, align 1
213  store volatile i8 %arg1, i8 addrspace(1)* undef, align 1
214  ret void
215}
216
217define amdgpu_kernel void @kern_v3i8(<3 x i8> %arg) {
218; HSA-LABEL: @kern_v3i8(
219; HSA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
220; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 0
221; HSA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
222; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
223; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
224; HSA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
225; HSA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
226; HSA-NEXT:    ret void
227;
228; MESA-LABEL: @kern_v3i8(
229; MESA-NEXT:    [[KERN_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
230; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I8_KERNARG_SEGMENT]], i64 36
231; MESA-NEXT:    [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
232; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
233; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
234; MESA-NEXT:    [[ARG_LOAD:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
235; MESA-NEXT:    store <3 x i8> [[ARG_LOAD]], <3 x i8> addrspace(1)* undef, align 4
236; MESA-NEXT:    ret void
237;
238  store <3 x i8> %arg, <3 x i8> addrspace(1)* undef, align 4
239  ret void
240}
241
242define amdgpu_kernel void @kern_i24(i24 %arg0) {
243; HSA-LABEL: @kern_i24(
244; HSA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
245; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 0
246; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
247; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
248; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
249; HSA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef, align 4
250; HSA-NEXT:    ret void
251;
252; MESA-LABEL: @kern_i24(
253; MESA-NEXT:    [[KERN_I24_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
254; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I24_KERNARG_SEGMENT]], i64 36
255; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
256; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
257; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i24
258; MESA-NEXT:    store i24 [[TMP2]], i24 addrspace(1)* undef, align 4
259; MESA-NEXT:    ret void
260;
261  store i24 %arg0, i24 addrspace(1)* undef
262  ret void
263}
264
265define amdgpu_kernel void @kern_i32(i32 %arg0) {
266; HSA-LABEL: @kern_i32(
267; HSA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
268; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 0
269; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
270; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
271; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
272; HSA-NEXT:    ret void
273;
274; MESA-LABEL: @kern_i32(
275; MESA-NEXT:    [[KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
276; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_KERNARG_SEGMENT]], i64 36
277; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
278; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
279; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
280; MESA-NEXT:    ret void
281;
282  store i32 %arg0, i32 addrspace(1)* undef
283  ret void
284}
285
286define amdgpu_kernel void @kern_f32(float %arg0) {
287; HSA-LABEL: @kern_f32(
288; HSA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
289; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 0
290; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
291; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
292; HSA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef, align 4
293; HSA-NEXT:    ret void
294;
295; MESA-LABEL: @kern_f32(
296; MESA-NEXT:    [[KERN_F32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
297; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_F32_KERNARG_SEGMENT]], i64 36
298; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to float addrspace(4)*
299; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load float, float addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
300; MESA-NEXT:    store float [[ARG0_LOAD]], float addrspace(1)* undef, align 4
301; MESA-NEXT:    ret void
302;
303  store float %arg0, float addrspace(1)* undef
304  ret void
305}
306
307define amdgpu_kernel void @kern_v3i32(<3 x i32> %arg0) {
308; HSA-LABEL: @kern_v3i32(
309; HSA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
310; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 0
311; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
312; HSA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
313; HSA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
314; HSA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
315; HSA-NEXT:    ret void
316;
317; MESA-LABEL: @kern_v3i32(
318; MESA-NEXT:    [[KERN_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
319; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V3I32_KERNARG_SEGMENT]], i64 36
320; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
321; MESA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
322; MESA-NEXT:    [[ARG0_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
323; MESA-NEXT:    store <3 x i32> [[ARG0_LOAD]], <3 x i32> addrspace(1)* undef, align 4
324; MESA-NEXT:    ret void
325;
326  store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef, align 4
327  ret void
328}
329
330define amdgpu_kernel void @kern_v8i32(<8 x i32> %arg) #0 {
331; HSA-LABEL: @kern_v8i32(
332; HSA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
333; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 0
334; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
335; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
336; HSA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef, align 32
337; HSA-NEXT:    ret void
338;
339; MESA-LABEL: @kern_v8i32(
340; MESA-NEXT:    [[KERN_V8I32_KERNARG_SEGMENT:%.*]] = call nonnull align 32 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
341; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I32_KERNARG_SEGMENT]], i64 36
342; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i32> addrspace(4)*
343; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
344; MESA-NEXT:    store <8 x i32> [[ARG_LOAD]], <8 x i32> addrspace(1)* undef, align 32
345; MESA-NEXT:    ret void
346;
347  store <8 x i32> %arg, <8 x i32> addrspace(1)* undef
348  ret void
349}
350
351define amdgpu_kernel void @kern_v8i64(<8 x i64> %arg) #0 {
352; HSA-LABEL: @kern_v8i64(
353; HSA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
354; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 0
355; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
356; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
357; HSA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef, align 64
358; HSA-NEXT:    ret void
359;
360; MESA-LABEL: @kern_v8i64(
361; MESA-NEXT:    [[KERN_V8I64_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(100) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
362; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V8I64_KERNARG_SEGMENT]], i64 36
363; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <8 x i64> addrspace(4)*
364; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <8 x i64>, <8 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
365; MESA-NEXT:    store <8 x i64> [[ARG_LOAD]], <8 x i64> addrspace(1)* undef, align 64
366; MESA-NEXT:    ret void
367;
368  store <8 x i64> %arg, <8 x i64> addrspace(1)* undef
369  ret void
370}
371
372define amdgpu_kernel void @kern_v16i64(<16 x i64> %arg) #0 {
373; HSA-LABEL: @kern_v16i64(
374; HSA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(128) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
375; HSA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 0
376; HSA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
377; HSA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
378; HSA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef, align 128
379; HSA-NEXT:    ret void
380;
381; MESA-LABEL: @kern_v16i64(
382; MESA-NEXT:    [[KERN_V16I64_KERNARG_SEGMENT:%.*]] = call nonnull align 128 dereferenceable(164) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
383; MESA-NEXT:    [[ARG_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_V16I64_KERNARG_SEGMENT]], i64 36
384; MESA-NEXT:    [[ARG_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG_KERNARG_OFFSET]] to <16 x i64> addrspace(4)*
385; MESA-NEXT:    [[ARG_LOAD:%.*]] = load <16 x i64>, <16 x i64> addrspace(4)* [[ARG_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
386; MESA-NEXT:    store <16 x i64> [[ARG_LOAD]], <16 x i64> addrspace(1)* undef, align 128
387; MESA-NEXT:    ret void
388;
389  store <16 x i64> %arg, <16 x i64> addrspace(1)* undef
390  ret void
391}
392
393define amdgpu_kernel void @kern_i32_v3i32(i32 %arg0, <3 x i32> %arg1) {
394; HSA-LABEL: @kern_i32_v3i32(
395; HSA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
396; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 0
397; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
398; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
399; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 16
400; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
401; HSA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
402; HSA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
403; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
404; HSA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
405; HSA-NEXT:    ret void
406;
407; MESA-LABEL: @kern_i32_v3i32(
408; MESA-NEXT:    [[KERN_I32_V3I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
409; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 36
410; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
411; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
412; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_I32_V3I32_KERNARG_SEGMENT]], i64 52
413; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
414; MESA-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
415; MESA-NEXT:    [[ARG1_LOAD:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
416; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
417; MESA-NEXT:    store <3 x i32> [[ARG1_LOAD]], <3 x i32> addrspace(1)* undef, align 4
418; MESA-NEXT:    ret void
419;
420  store i32 %arg0, i32 addrspace(1)* undef
421  store <3 x i32> %arg1, <3 x i32> addrspace(1)* undef, align 4
422  ret void
423}
424
425%struct.a = type { i32, i8, [4 x i8] }
426%struct.b.packed = type { i8, i32, [3 x i16], <2 x double> }
427
428define amdgpu_kernel void @kern_struct_a(%struct.a %arg0) {
429; HSA-LABEL: @kern_struct_a(
430; HSA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
431; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 0
432; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
433; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
434; HSA-NEXT:    store [[STRUCT_A]] [[ARG0_LOAD]], [[STRUCT_A]] addrspace(1)* undef, align 4
435; HSA-NEXT:    ret void
436;
437; MESA-LABEL: @kern_struct_a(
438; MESA-NEXT:    [[KERN_STRUCT_A_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
439; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_A_KERNARG_SEGMENT]], i64 36
440; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_A:%.*]] addrspace(4)*
441; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_A]], [[STRUCT_A]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
442; MESA-NEXT:    store [[STRUCT_A]] [[ARG0_LOAD]], [[STRUCT_A]] addrspace(1)* undef, align 4
443; MESA-NEXT:    ret void
444;
445  store %struct.a %arg0, %struct.a addrspace(1)* undef
446  ret void
447}
448
449define amdgpu_kernel void @kern_struct_b_packed(%struct.b.packed %arg0) #0 {
450; HSA-LABEL: @kern_struct_b_packed(
451; HSA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(32) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
452; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 0
453; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
454; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
455; HSA-NEXT:    store [[STRUCT_B_PACKED]] [[ARG0_LOAD]], [[STRUCT_B_PACKED]] addrspace(1)* undef, align 16
456; HSA-NEXT:    ret void
457;
458; MESA-LABEL: @kern_struct_b_packed(
459; MESA-NEXT:    [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(68) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
460; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_STRUCT_B_PACKED_KERNARG_SEGMENT]], i64 36
461; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to [[STRUCT_B_PACKED:%.*]] addrspace(4)*
462; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load [[STRUCT_B_PACKED]], [[STRUCT_B_PACKED]] addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
463; MESA-NEXT:    store [[STRUCT_B_PACKED]] [[ARG0_LOAD]], [[STRUCT_B_PACKED]] addrspace(1)* undef, align 16
464; MESA-NEXT:    ret void
465;
466  store %struct.b.packed %arg0, %struct.b.packed addrspace(1)* undef
467  ret void
468}
469
470define amdgpu_kernel void @kern_implicit_arg_num_bytes(i32 %arg0) #1 {
471; HSA-LABEL: @kern_implicit_arg_num_bytes(
472; HSA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
473; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 0
474; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
475; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
476; HSA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
477; HSA-NEXT:    ret void
478;
479; MESA-LABEL: @kern_implicit_arg_num_bytes(
480; MESA-NEXT:    [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
481; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_IMPLICIT_ARG_NUM_BYTES_KERNARG_SEGMENT]], i64 36
482; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
483; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
484; MESA-NEXT:    store i32 [[ARG0_LOAD]], i32 addrspace(1)* undef, align 4
485; MESA-NEXT:    ret void
486;
487  store i32 %arg0, i32 addrspace(1)* undef
488  ret void
489}
490
491define amdgpu_kernel void @kernel_implicitarg_no_struct_align(<16 x i32>, i32 %arg1) #1 {
492; HSA-LABEL: @kernel_implicitarg_no_struct_align(
493; HSA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(112) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
494; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 64
495; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
496; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
497; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
498; HSA-NEXT:    ret void
499;
500; MESA-LABEL: @kernel_implicitarg_no_struct_align(
501; MESA-NEXT:    [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(108) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
502; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERNEL_IMPLICITARG_NO_STRUCT_ALIGN_KERNARG_SEGMENT]], i64 100
503; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
504; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
505; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
506; MESA-NEXT:    ret void
507;
508  store i32 %arg1, i32 addrspace(1)* undef
509  ret void
510}
511
512define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
513; HSA-LABEL: @kern_lds_ptr(
514; HSA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
515; HSA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 0
516; HSA-NEXT:    [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
517; HSA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
518; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
519; HSA-NEXT:    ret void
520;
521; MESA-LABEL: @kern_lds_ptr(
522; MESA-NEXT:    [[KERN_LDS_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
523; MESA-NEXT:    [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_KERNARG_SEGMENT]], i64 36
524; MESA-NEXT:    [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
525; MESA-NEXT:    [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
526; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
527; MESA-NEXT:    ret void
528;
529  store i32 0, i32 addrspace(3)* %lds, align 4
530  ret void
531}
532
533define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 {
534; HSA-LABEL: @kern_lds_ptr_si(
535; HSA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
536; HSA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
537; HSA-NEXT:    ret void
538;
539; MESA-LABEL: @kern_lds_ptr_si(
540; MESA-NEXT:    [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
541; MESA-NEXT:    store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
542; MESA-NEXT:    ret void
543;
544  store i32 0, i32 addrspace(3)* %lds, align 4
545  ret void
546}
547
548define amdgpu_kernel void @kern_realign_i8_i8(i8 %arg0, i8 %arg1) #0 {
549; HSA-LABEL: @kern_realign_i8_i8(
550; HSA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
551; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
552; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
553; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
554; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
555; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 0
556; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
557; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
558; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
559; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
560; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
561; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
562; HSA-NEXT:    ret void
563;
564; MESA-LABEL: @kern_realign_i8_i8(
565; MESA-NEXT:    [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
566; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
567; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
568; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
569; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
570; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_KERNARG_SEGMENT]], i64 36
571; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
572; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
573; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
574; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
575; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
576; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
577; MESA-NEXT:    ret void
578;
579  store volatile i8 %arg0, i8 addrspace(1)* undef
580  store volatile i8 %arg1, i8 addrspace(1)* undef
581  ret void
582}
583
584define amdgpu_kernel void @kern_realign_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2) #0 {
585; HSA-LABEL: @kern_realign_i8_i8_i8(
586; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
587; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
588; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
589; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
590; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
591; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
592; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
593; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
594; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
595; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
596; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 0
597; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
598; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
599; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
600; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
601; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
602; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
603; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
604; HSA-NEXT:    ret void
605;
606; MESA-LABEL: @kern_realign_i8_i8_i8(
607; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
608; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
609; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
610; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
611; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
612; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
613; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
614; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
615; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
616; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
617; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_KERNARG_SEGMENT]], i64 36
618; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
619; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
620; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
621; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
622; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
623; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
624; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
625; MESA-NEXT:    ret void
626;
627  store volatile i8 %arg0, i8 addrspace(1)* undef
628  store volatile i8 %arg1, i8 addrspace(1)* undef
629  store volatile i8 %arg2, i8 addrspace(1)* undef
630  ret void
631}
632
633define amdgpu_kernel void @kern_realign_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3) #0 {
634; HSA-LABEL: @kern_realign_i8_i8_i8_i8(
635; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
636; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
637; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
638; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
639; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
640; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
641; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
642; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
643; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
644; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
645; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
646; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
647; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
648; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
649; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
650; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
651; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
652; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
653; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
654; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
655; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
656; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
657; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
658; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
659; HSA-NEXT:    ret void
660;
661; MESA-LABEL: @kern_realign_i8_i8_i8_i8(
662; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
663; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
664; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
665; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
666; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
667; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
668; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
669; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
670; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
671; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
672; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
673; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
674; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
675; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
676; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
677; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
678; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
679; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
680; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
681; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
682; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
683; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
684; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
685; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
686; MESA-NEXT:    ret void
687;
688  store volatile i8 %arg0, i8 addrspace(1)* undef
689  store volatile i8 %arg1, i8 addrspace(1)* undef
690  store volatile i8 %arg2, i8 addrspace(1)* undef
691  store volatile i8 %arg3, i8 addrspace(1)* undef
692  ret void
693}
694
695define amdgpu_kernel void @kern_realign_i8_v3i8(i8 %arg0, <3 x i8> %arg1) #0 {
696; HSA-LABEL: @kern_realign_i8_v3i8(
697; HSA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
698; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 0
699; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
700; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
701; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
702; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 4
703; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
704; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
705; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
706; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
707; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
708; HSA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef, align 4
709; HSA-NEXT:    ret void
710;
711; MESA-LABEL: @kern_realign_i8_v3i8(
712; MESA-NEXT:    [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
713; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 36
714; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
715; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
716; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
717; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_V3I8_KERNARG_SEGMENT]], i64 40
718; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
719; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
720; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i24
721; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i24 [[TMP4]] to <3 x i8>
722; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
723; MESA-NEXT:    store volatile <3 x i8> [[ARG1_LOAD]], <3 x i8> addrspace(1)* undef, align 4
724; MESA-NEXT:    ret void
725;
726  store volatile i8 %arg0, i8 addrspace(1)* undef
727  store volatile <3 x i8> %arg1, <3 x i8> addrspace(1)* undef
728  ret void
729}
730
731define amdgpu_kernel void @kern_realign_i8_i16(i8 %arg0, i16 %arg1) #0 {
732; HSA-LABEL: @kern_realign_i8_i16(
733; HSA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
734; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
735; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
736; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
737; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
738; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 0
739; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
740; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
741; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
742; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
743; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
744; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
745; HSA-NEXT:    ret void
746;
747; MESA-LABEL: @kern_realign_i8_i16(
748; MESA-NEXT:    [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
749; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
750; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
751; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
752; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
753; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I16_KERNARG_SEGMENT]], i64 36
754; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
755; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
756; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
757; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
758; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
759; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
760; MESA-NEXT:    ret void
761;
762  store volatile i8 %arg0, i8 addrspace(1)* undef
763  store volatile i16 %arg1, i16 addrspace(1)* undef
764  ret void
765}
766
767define amdgpu_kernel void @kern_realign_i1_i1(i1 %arg0, i1 %arg1) #0 {
768; HSA-LABEL: @kern_realign_i1_i1(
769; HSA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
770; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
771; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
772; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
773; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
774; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 0
775; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
776; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
777; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
778; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
779; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
780; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
781; HSA-NEXT:    ret void
782;
783; MESA-LABEL: @kern_realign_i1_i1(
784; MESA-NEXT:    [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
785; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
786; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
787; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
788; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
789; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_KERNARG_SEGMENT]], i64 36
790; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
791; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
792; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
793; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
794; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
795; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
796; MESA-NEXT:    ret void
797;
798  store volatile i1 %arg0, i1 addrspace(1)* undef
799  store volatile i1 %arg1, i1 addrspace(1)* undef
800  ret void
801}
802
803define amdgpu_kernel void @kern_realign_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2) #0 {
804; HSA-LABEL: @kern_realign_i1_i1_i1(
805; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
806; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
807; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
808; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
809; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
810; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
811; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
812; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
813; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
814; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
815; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 0
816; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
817; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
818; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
819; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
820; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
821; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
822; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
823; HSA-NEXT:    ret void
824;
825; MESA-LABEL: @kern_realign_i1_i1_i1(
826; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
827; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
828; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
829; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
830; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
831; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
832; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
833; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
834; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
835; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
836; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_KERNARG_SEGMENT]], i64 36
837; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
838; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
839; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
840; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
841; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
842; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
843; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
844; MESA-NEXT:    ret void
845;
846  store volatile i1 %arg0, i1 addrspace(1)* undef
847  store volatile i1 %arg1, i1 addrspace(1)* undef
848  store volatile i1 %arg2, i1 addrspace(1)* undef
849  ret void
850}
851
852define amdgpu_kernel void @kern_realign_i1_i1_i1_i1(i1 %arg0, i1 %arg1, i1 %arg2, i1 %arg3) #0 {
853; HSA-LABEL: @kern_realign_i1_i1_i1_i1(
854; HSA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
855; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
856; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
857; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
858; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
859; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
860; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
861; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
862; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
863; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
864; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
865; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
866; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
867; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
868; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
869; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 0
870; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
871; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
872; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
873; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
874; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
875; HSA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
876; HSA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
877; HSA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef, align 1
878; HSA-NEXT:    ret void
879;
880; MESA-LABEL: @kern_realign_i1_i1_i1_i1(
881; MESA-NEXT:    [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
882; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
883; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
884; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
885; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
886; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
887; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
888; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
889; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
890; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i1
891; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
892; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
893; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
894; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
895; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i1
896; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I1_I1_I1_KERNARG_SEGMENT]], i64 36
897; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
898; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
899; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
900; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i1
901; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
902; MESA-NEXT:    store volatile i1 [[TMP5]], i1 addrspace(1)* undef, align 1
903; MESA-NEXT:    store volatile i1 [[TMP8]], i1 addrspace(1)* undef, align 1
904; MESA-NEXT:    store volatile i1 [[TMP11]], i1 addrspace(1)* undef, align 1
905; MESA-NEXT:    ret void
906;
907  store volatile i1 %arg0, i1 addrspace(1)* undef
908  store volatile i1 %arg1, i1 addrspace(1)* undef
909  store volatile i1 %arg2, i1 addrspace(1)* undef
910  store volatile i1 %arg3, i1 addrspace(1)* undef
911  ret void
912}
913
914define amdgpu_kernel void @kern_realign_i1_v3i1(i1 %arg0, <3 x i1> %arg1) #0 {
915; HSA-LABEL: @kern_realign_i1_v3i1(
916; HSA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
917; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 0
918; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
919; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
920; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
921; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 4
922; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
923; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
924; HSA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
925; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
926; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
927; HSA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 4
928; HSA-NEXT:    ret void
929;
930; MESA-LABEL: @kern_realign_i1_v3i1(
931; MESA-NEXT:    [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
932; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 36
933; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
934; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
935; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
936; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_V3I1_KERNARG_SEGMENT]], i64 40
937; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
938; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
939; MESA-NEXT:    [[TMP4:%.*]] = trunc i32 [[TMP3]] to i3
940; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i3 [[TMP4]] to <3 x i1>
941; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
942; MESA-NEXT:    store volatile <3 x i1> [[ARG1_LOAD]], <3 x i1> addrspace(1)* undef, align 4
943; MESA-NEXT:    ret void
944;
945  store volatile i1 %arg0, i1 addrspace(1)* undef
946  store volatile <3 x i1> %arg1, <3 x i1> addrspace(1)* undef
947  ret void
948}
949
950define amdgpu_kernel void @kern_realign_i1_i16(i1 %arg0, i16 %arg1) #0 {
951; HSA-LABEL: @kern_realign_i1_i16(
952; HSA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
953; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
954; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
955; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
956; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
957; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 0
958; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
959; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
960; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
961; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
962; HSA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
963; HSA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
964; HSA-NEXT:    ret void
965;
966; MESA-LABEL: @kern_realign_i1_i16(
967; MESA-NEXT:    [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
968; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
969; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
970; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
971; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i1
972; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I1_I16_KERNARG_SEGMENT]], i64 36
973; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
974; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
975; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
976; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
977; MESA-NEXT:    store volatile i1 [[TMP2]], i1 addrspace(1)* undef, align 1
978; MESA-NEXT:    store volatile i16 [[TMP5]], i16 addrspace(1)* undef, align 2
979; MESA-NEXT:    ret void
980;
981  store volatile i1 %arg0, i1 addrspace(1)* undef
982  store volatile i16 %arg1, i16 addrspace(1)* undef
983  ret void
984}
985
986define amdgpu_kernel void @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(i8 %arg0, i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, i8 %arg5, i8 %arg6, i8 %arg7) #0 {
987; HSA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
988; HSA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
989; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
990; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
991; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
992; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
993; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
994; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
995; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
996; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
997; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
998; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
999; HSA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1000; HSA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1001; HSA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
1002; HSA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
1003; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 0
1004; HSA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1005; HSA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1006; HSA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
1007; HSA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
1008; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1009; HSA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1010; HSA-NEXT:    [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1011; HSA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
1012; HSA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
1013; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1014; HSA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1015; HSA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1016; HSA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
1017; HSA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
1018; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 4
1019; HSA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1020; HSA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1021; HSA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
1022; HSA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
1023; HSA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
1024; HSA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
1025; HSA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
1026; HSA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
1027; HSA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef, align 1
1028; HSA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef, align 1
1029; HSA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef, align 1
1030; HSA-NEXT:    ret void
1031;
1032; MESA-LABEL: @kern_realign_i8_i8_i8_i8_i8_i8_i8_i8(
1033; MESA-NEXT:    [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1034; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1035; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1036; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1037; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
1038; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1039; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1040; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1041; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 8
1042; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i8
1043; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1044; MESA-NEXT:    [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1045; MESA-NEXT:    [[TMP6:%.*]] = load i32, i32 addrspace(4)* [[ARG2_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1046; MESA-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
1047; MESA-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
1048; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 36
1049; MESA-NEXT:    [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1050; MESA-NEXT:    [[TMP9:%.*]] = load i32, i32 addrspace(4)* [[ARG3_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1051; MESA-NEXT:    [[TMP10:%.*]] = lshr i32 [[TMP9]], 24
1052; MESA-NEXT:    [[TMP11:%.*]] = trunc i32 [[TMP10]] to i8
1053; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
1054; MESA-NEXT:    [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1055; MESA-NEXT:    [[TMP12:%.*]] = load i32, i32 addrspace(4)* [[ARG5_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
1056; MESA-NEXT:    [[TMP13:%.*]] = lshr i32 [[TMP12]], 8
1057; MESA-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
1058; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
1059; MESA-NEXT:    [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1060; MESA-NEXT:    [[TMP15:%.*]] = load i32, i32 addrspace(4)* [[ARG6_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
1061; MESA-NEXT:    [[TMP16:%.*]] = lshr i32 [[TMP15]], 16
1062; MESA-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
1063; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_I8_I8_I8_I8_I8_I8_I8_I8_KERNARG_SEGMENT]], i64 40
1064; MESA-NEXT:    [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1065; MESA-NEXT:    [[TMP18:%.*]] = load i32, i32 addrspace(4)* [[ARG7_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 8, !invariant.load !0
1066; MESA-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP18]], 24
1067; MESA-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i8
1068; MESA-NEXT:    store volatile i8 [[TMP2]], i8 addrspace(1)* undef, align 1
1069; MESA-NEXT:    store volatile i8 [[TMP5]], i8 addrspace(1)* undef, align 1
1070; MESA-NEXT:    store volatile i8 [[TMP8]], i8 addrspace(1)* undef, align 1
1071; MESA-NEXT:    store volatile i8 [[TMP11]], i8 addrspace(1)* undef, align 1
1072; MESA-NEXT:    store volatile i8 [[TMP14]], i8 addrspace(1)* undef, align 1
1073; MESA-NEXT:    store volatile i8 [[TMP17]], i8 addrspace(1)* undef, align 1
1074; MESA-NEXT:    store volatile i8 [[TMP20]], i8 addrspace(1)* undef, align 1
1075; MESA-NEXT:    ret void
1076;
1077  store volatile i8 %arg0, i8 addrspace(1)* undef
1078  store volatile i8 %arg1, i8 addrspace(1)* undef
1079  store volatile i8 %arg2, i8 addrspace(1)* undef
1080  store volatile i8 %arg3, i8 addrspace(1)* undef
1081  store volatile i8 %arg5, i8 addrspace(1)* undef
1082  store volatile i8 %arg6, i8 addrspace(1)* undef
1083  store volatile i8 %arg7, i8 addrspace(1)* undef
1084  ret void
1085}
1086
1087define amdgpu_kernel void @kern_realign_f16_f16(half %arg0, half %arg1) #0 {
1088; HSA-LABEL: @kern_realign_f16_f16(
1089; HSA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1090; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
1091; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1092; HSA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1093; HSA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
1094; HSA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
1095; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 0
1096; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1097; HSA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 16, !invariant.load !0
1098; HSA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
1099; HSA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
1100; HSA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
1101; HSA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef, align 2
1102; HSA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef, align 2
1103; HSA-NEXT:    ret void
1104;
1105; MESA-LABEL: @kern_realign_f16_f16(
1106; MESA-NEXT:    [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1107; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
1108; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1109; MESA-NEXT:    [[TMP1:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1110; MESA-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
1111; MESA-NEXT:    [[ARG0_LOAD:%.*]] = bitcast i16 [[TMP2]] to half
1112; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_REALIGN_F16_F16_KERNARG_SEGMENT]], i64 36
1113; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN]] to i32 addrspace(4)*
1114; MESA-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_ALIGN_DOWN_CAST]], align 4, !invariant.load !0
1115; MESA-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP3]], 16
1116; MESA-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
1117; MESA-NEXT:    [[ARG1_LOAD:%.*]] = bitcast i16 [[TMP5]] to half
1118; MESA-NEXT:    store volatile half [[ARG0_LOAD]], half addrspace(1)* undef, align 2
1119; MESA-NEXT:    store volatile half [[ARG1_LOAD]], half addrspace(1)* undef, align 2
1120; MESA-NEXT:    ret void
1121;
1122  store volatile half %arg0, half addrspace(1)* undef
1123  store volatile half %arg1, half addrspace(1)* undef
1124  ret void
1125}
1126
1127define amdgpu_kernel void @kern_global_ptr(i8 addrspace(1)* %ptr) #0 {
1128; HSA-LABEL: @kern_global_ptr(
1129; HSA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1130; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1131; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1132; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1133; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1134; HSA-NEXT:    ret void
1135;
1136; MESA-LABEL: @kern_global_ptr(
1137; MESA-NEXT:    [[KERN_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1138; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1139; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1140; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1141; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1142; MESA-NEXT:    ret void
1143;
1144  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1145  ret void
1146}
1147
1148define amdgpu_kernel void @kern_global_ptr_dereferencable(i8 addrspace(1)* dereferenceable(42) %ptr) #0 {
1149; HSA-LABEL: @kern_global_ptr_dereferencable(
1150; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1151; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 0
1152; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1153; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable !1
1154; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1155; HSA-NEXT:    ret void
1156;
1157; MESA-LABEL: @kern_global_ptr_dereferencable(
1158; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1159; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_KERNARG_SEGMENT]], i64 36
1160; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1161; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable !1
1162; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1163; MESA-NEXT:    ret void
1164;
1165  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1166  ret void
1167}
1168
1169define amdgpu_kernel void @kern_global_ptr_dereferencable_or_null(i8 addrspace(1)* dereferenceable_or_null(128) %ptr) #0 {
1170; HSA-LABEL: @kern_global_ptr_dereferencable_or_null(
1171; HSA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1172; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 0
1173; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1174; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !dereferenceable_or_null !2
1175; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1176; HSA-NEXT:    ret void
1177;
1178; MESA-LABEL: @kern_global_ptr_dereferencable_or_null(
1179; MESA-NEXT:    [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1180; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_GLOBAL_PTR_DEREFERENCABLE_OR_NULL_KERNARG_SEGMENT]], i64 36
1181; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1182; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !dereferenceable_or_null !2
1183; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1184; MESA-NEXT:    ret void
1185;
1186  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1187  ret void
1188}
1189
1190define amdgpu_kernel void @kern_nonnull_global_ptr(i8 addrspace(1)* nonnull %ptr) #0 {
1191; HSA-LABEL: @kern_nonnull_global_ptr(
1192; HSA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1193; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1194; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1195; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !nonnull !0
1196; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1197; HSA-NEXT:    ret void
1198;
1199; MESA-LABEL: @kern_nonnull_global_ptr(
1200; MESA-NEXT:    [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1201; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_NONNULL_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1202; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1203; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !nonnull !0
1204; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1205; MESA-NEXT:    ret void
1206;
1207  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1208  ret void
1209}
1210
1211define amdgpu_kernel void @kern_align32_global_ptr(i8 addrspace(1)* align 1024 %ptr) #0 {
1212; HSA-LABEL: @kern_align32_global_ptr(
1213; HSA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1214; HSA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 0
1215; HSA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1216; HSA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0, !align !3
1217; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1218; HSA-NEXT:    ret void
1219;
1220; MESA-LABEL: @kern_align32_global_ptr(
1221; MESA-NEXT:    [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1222; MESA-NEXT:    [[PTR_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_ALIGN32_GLOBAL_PTR_KERNARG_SEGMENT]], i64 36
1223; MESA-NEXT:    [[PTR_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[PTR_KERNARG_OFFSET]] to i8 addrspace(1)* addrspace(4)*
1224; MESA-NEXT:    [[PTR_LOAD:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* [[PTR_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0, !align !3
1225; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR_LOAD]], i8 addrspace(1)* addrspace(1)* undef, align 8
1226; MESA-NEXT:    ret void
1227;
1228  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1229  ret void
1230}
1231
1232define amdgpu_kernel void @kern_noalias_global_ptr(i8 addrspace(1)* noalias %ptr) #0 {
1233; HSA-LABEL: @kern_noalias_global_ptr(
1234; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1235; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1236; HSA-NEXT:    ret void
1237;
1238; MESA-LABEL: @kern_noalias_global_ptr(
1239; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1240; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1241; MESA-NEXT:    ret void
1242;
1243  store volatile i8 addrspace(1)* %ptr, i8 addrspace(1)* addrspace(1)* undef
1244  ret void
1245}
1246
1247define amdgpu_kernel void @kern_noalias_global_ptr_x2(i8 addrspace(1)* noalias %ptr0, i8 addrspace(1)* noalias %ptr1) #0 {
1248; HSA-LABEL: @kern_noalias_global_ptr_x2(
1249; HSA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1250; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1251; HSA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1252; HSA-NEXT:    ret void
1253;
1254; MESA-LABEL: @kern_noalias_global_ptr_x2(
1255; MESA-NEXT:    [[KERN_NOALIAS_GLOBAL_PTR_X2_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1256; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR0:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1257; MESA-NEXT:    store volatile i8 addrspace(1)* [[PTR1:%.*]], i8 addrspace(1)* addrspace(1)* undef, align 8
1258; MESA-NEXT:    ret void
1259;
1260  store volatile i8 addrspace(1)* %ptr0, i8 addrspace(1)* addrspace(1)* undef
1261  store volatile i8 addrspace(1)* %ptr1, i8 addrspace(1)* addrspace(1)* undef
1262  ret void
1263}
1264
1265define amdgpu_kernel void @struct_i8_i8_arg({i8, i8} %in) #0 {
1266; HSA-LABEL: @struct_i8_i8_arg(
1267; HSA-NEXT:  entry:
1268; HSA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1269; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 0
1270; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
1271; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1272; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
1273; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
1274; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1275; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1276; HSA-NEXT:    ret void
1277;
1278; MESA-LABEL: @struct_i8_i8_arg(
1279; MESA-NEXT:  entry:
1280; MESA-NEXT:    [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1281; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I8_ARG_KERNARG_SEGMENT]], i64 36
1282; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i8 } addrspace(4)*
1283; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i8 }, { i8, i8 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1284; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 0
1285; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i8 } [[IN_LOAD]], 1
1286; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1287; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1288; MESA-NEXT:    ret void
1289;
1290entry:
1291  %elt0 = extractvalue {i8, i8} %in, 0
1292  %elt1 = extractvalue {i8, i8} %in, 1
1293  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
1294  store volatile i8 %elt1, i8 addrspace(1)* null, align 4
1295  ret void
1296}
1297
1298define amdgpu_kernel void @struct_i8_i16_arg({i8, i16} %in) #0 {
1299; HSA-LABEL: @struct_i8_i16_arg(
1300; HSA-NEXT:  entry:
1301; HSA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1302; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 0
1303; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
1304; HSA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1305; HSA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
1306; HSA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
1307; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1308; HSA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
1309; HSA-NEXT:    ret void
1310;
1311; MESA-LABEL: @struct_i8_i16_arg(
1312; MESA-NEXT:  entry:
1313; MESA-NEXT:    [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1314; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STRUCT_I8_I16_ARG_KERNARG_SEGMENT]], i64 36
1315; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to { i8, i16 } addrspace(4)*
1316; MESA-NEXT:    [[IN_LOAD:%.*]] = load { i8, i16 }, { i8, i16 } addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1317; MESA-NEXT:    [[ELT0:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 0
1318; MESA-NEXT:    [[ELT1:%.*]] = extractvalue { i8, i16 } [[IN_LOAD]], 1
1319; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1320; MESA-NEXT:    store volatile i16 [[ELT1]], i16 addrspace(1)* null, align 4
1321; MESA-NEXT:    ret void
1322;
1323entry:
1324  %elt0 = extractvalue {i8, i16} %in, 0
1325  %elt1 = extractvalue {i8, i16} %in, 1
1326  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
1327  store volatile i16 %elt1, i16 addrspace(1)* null, align 4
1328  ret void
1329}
1330
1331define amdgpu_kernel void @array_2xi8_arg([2 x i8] %in) #0 {
1332; HSA-LABEL: @array_2xi8_arg(
1333; HSA-NEXT:  entry:
1334; HSA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1335; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 0
1336; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
1337; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1338; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
1339; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
1340; HSA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1341; HSA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1342; HSA-NEXT:    ret void
1343;
1344; MESA-LABEL: @array_2xi8_arg(
1345; MESA-NEXT:  entry:
1346; MESA-NEXT:    [[ARRAY_2XI8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1347; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI8_ARG_KERNARG_SEGMENT]], i64 36
1348; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i8] addrspace(4)*
1349; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i8], [2 x i8] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1350; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 0
1351; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i8] [[IN_LOAD]], 1
1352; MESA-NEXT:    store volatile i8 [[ELT0]], i8 addrspace(1)* null, align 4
1353; MESA-NEXT:    store volatile i8 [[ELT1]], i8 addrspace(1)* null, align 4
1354; MESA-NEXT:    ret void
1355;
1356entry:
1357  %elt0 = extractvalue [2 x i8] %in, 0
1358  %elt1 = extractvalue [2 x i8] %in, 1
1359  store volatile i8 %elt0, i8 addrspace(1)* null, align 4
1360  store volatile i8 %elt1, i8 addrspace(1)* null, align 4
1361  ret void
1362}
1363
1364define amdgpu_kernel void @array_2xi1_arg([2 x i1] %in) #0 {
1365; HSA-LABEL: @array_2xi1_arg(
1366; HSA-NEXT:  entry:
1367; HSA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1368; HSA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 0
1369; HSA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
1370; HSA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1371; HSA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
1372; HSA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
1373; HSA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
1374; HSA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
1375; HSA-NEXT:    ret void
1376;
1377; MESA-LABEL: @array_2xi1_arg(
1378; MESA-NEXT:  entry:
1379; MESA-NEXT:    [[ARRAY_2XI1_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1380; MESA-NEXT:    [[IN_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[ARRAY_2XI1_ARG_KERNARG_SEGMENT]], i64 36
1381; MESA-NEXT:    [[IN_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[IN_KERNARG_OFFSET]] to [2 x i1] addrspace(4)*
1382; MESA-NEXT:    [[IN_LOAD:%.*]] = load [2 x i1], [2 x i1] addrspace(4)* [[IN_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1383; MESA-NEXT:    [[ELT0:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 0
1384; MESA-NEXT:    [[ELT1:%.*]] = extractvalue [2 x i1] [[IN_LOAD]], 1
1385; MESA-NEXT:    store volatile i1 [[ELT0]], i1 addrspace(1)* null, align 4
1386; MESA-NEXT:    store volatile i1 [[ELT1]], i1 addrspace(1)* null, align 4
1387; MESA-NEXT:    ret void
1388;
1389entry:
1390  %elt0 = extractvalue [2 x i1] %in, 0
1391  %elt1 = extractvalue [2 x i1] %in, 1
1392  store volatile i1 %elt0, i1 addrspace(1)* null, align 4
1393  store volatile i1 %elt1, i1 addrspace(1)* null, align 4
1394  ret void
1395}
1396
1397define amdgpu_kernel void @only_empty_struct({} %empty) #0 {
1398; HSA-LABEL: @only_empty_struct(
1399; HSA-NEXT:    ret void
1400;
1401; MESA-LABEL: @only_empty_struct(
1402; MESA-NEXT:    [[ONLY_EMPTY_STRUCT_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(36) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1403; MESA-NEXT:    ret void
1404;
1405  ret void
1406}
1407
1408define amdgpu_kernel void @empty_struct_with_other({} %empty, i32 %arg1) #0 {
1409; HSA-LABEL: @empty_struct_with_other(
1410; HSA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1411; HSA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 0
1412; HSA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
1413; HSA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1414; HSA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
1415; HSA-NEXT:    ret void
1416;
1417; MESA-LABEL: @empty_struct_with_other(
1418; MESA-NEXT:    [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1419; MESA-NEXT:    [[ARG1_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[EMPTY_STRUCT_WITH_OTHER_KERNARG_SEGMENT]], i64 36
1420; MESA-NEXT:    [[ARG1_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG1_KERNARG_OFFSET]] to i32 addrspace(4)*
1421; MESA-NEXT:    [[ARG1_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG1_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1422; MESA-NEXT:    store i32 [[ARG1_LOAD]], i32 addrspace(1)* undef, align 4
1423; MESA-NEXT:    ret void
1424;
1425  store i32 %arg1, i32 addrspace(1)* undef
1426  ret void
1427}
1428
1429; Should insert code after the allocas
1430define amdgpu_kernel void @static_alloca_kern_i32(i32 %arg0) {
1431; HSA-LABEL: @static_alloca_kern_i32(
1432; HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
1433; HSA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1434; HSA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 0
1435; HSA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
1436; HSA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1437; HSA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]], align 4
1438; HSA-NEXT:    ret void
1439;
1440; MESA-LABEL: @static_alloca_kern_i32(
1441; MESA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
1442; MESA-NEXT:    [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1443; MESA-NEXT:    [[ARG0_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[STATIC_ALLOCA_KERN_I32_KERNARG_SEGMENT]], i64 36
1444; MESA-NEXT:    [[ARG0_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[ARG0_KERNARG_OFFSET]] to i32 addrspace(4)*
1445; MESA-NEXT:    [[ARG0_LOAD:%.*]] = load i32, i32 addrspace(4)* [[ARG0_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1446; MESA-NEXT:    store volatile i32 [[ARG0_LOAD]], i32 addrspace(5)* [[ALLOCA]], align 4
1447; MESA-NEXT:    ret void
1448;
1449  %alloca = alloca i32, addrspace(5)
1450  store volatile i32 %arg0, i32 addrspace(5)* %alloca
1451  ret void
1452}
1453
1454; Make sure we don't break the IR if an alloca depends on the
1455; kernargs.
1456define amdgpu_kernel void @dyn_alloca_kernarg_i32(i32 %n) {
1457; HSA-LABEL: @dyn_alloca_kernarg_i32(
1458; HSA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
1459; HSA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1460; HSA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 0
1461; HSA-NEXT:    [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
1462; HSA-NEXT:    [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1463; HSA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
1464; HSA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]], align 4
1465; HSA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]], align 4
1466; HSA-NEXT:    ret void
1467;
1468; MESA-LABEL: @dyn_alloca_kernarg_i32(
1469; MESA-NEXT:    [[ALLOCA0:%.*]] = alloca i32, align 4, addrspace(5)
1470; MESA-NEXT:    [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1471; MESA-NEXT:    [[N_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[DYN_ALLOCA_KERNARG_I32_KERNARG_SEGMENT]], i64 36
1472; MESA-NEXT:    [[N_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[N_KERNARG_OFFSET]] to i32 addrspace(4)*
1473; MESA-NEXT:    [[N_LOAD:%.*]] = load i32, i32 addrspace(4)* [[N_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1474; MESA-NEXT:    [[ALLOCA1:%.*]] = alloca i32, i32 [[N_LOAD]], align 4, addrspace(5)
1475; MESA-NEXT:    store volatile i32 0, i32 addrspace(5)* [[ALLOCA0]], align 4
1476; MESA-NEXT:    store volatile i32 1, i32 addrspace(5)* [[ALLOCA1]], align 4
1477; MESA-NEXT:    ret void
1478;
1479  %alloca0 = alloca i32, addrspace(5)
1480  %alloca1 = alloca i32, i32 %n, addrspace(5)
1481  store volatile i32 0, i32 addrspace(5)* %alloca0
1482  store volatile i32 1, i32 addrspace(5)* %alloca1
1483  ret void
1484}
1485
1486; Byref pointers should only be treated as offsets from kernarg
1487define amdgpu_kernel void @byref_constant_i8_arg(i32 addrspace(1)* nocapture %out, i8 addrspace(4)* byref(i8) %in.byref) {
1488; HSA-LABEL: @byref_constant_i8_arg(
1489; HSA-NEXT:    [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1490; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 0
1491; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1492; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1493; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 8
1494; HSA-NEXT:    [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
1495; HSA-NEXT:    [[EXT:%.*]] = zext i8 [[IN]] to i32
1496; HSA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1497; HSA-NEXT:    ret void
1498;
1499; MESA-LABEL: @byref_constant_i8_arg(
1500; MESA-NEXT:    [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1501; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 36
1502; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1503; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1504; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I8_ARG_KERNARG_SEGMENT]], i64 44
1505; MESA-NEXT:    [[IN:%.*]] = load i8, i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]], align 1
1506; MESA-NEXT:    [[EXT:%.*]] = zext i8 [[IN]] to i32
1507; MESA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1508; MESA-NEXT:    ret void
1509;
1510  %in = load i8, i8 addrspace(4)* %in.byref
1511  %ext = zext i8 %in to i32
1512  store i32 %ext, i32 addrspace(1)* %out, align 4
1513  ret void
1514}
1515
1516define amdgpu_kernel void @byref_constant_i16_arg(i32 addrspace(1)* nocapture %out, i16 addrspace(4)* byref(i16) %in.byref) {
1517; HSA-LABEL: @byref_constant_i16_arg(
1518; HSA-NEXT:    [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1519; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 0
1520; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1521; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1522; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 8
1523; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)*
1524; HSA-NEXT:    [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2
1525; HSA-NEXT:    [[EXT:%.*]] = zext i16 [[IN]] to i32
1526; HSA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1527; HSA-NEXT:    ret void
1528;
1529; MESA-LABEL: @byref_constant_i16_arg(
1530; MESA-NEXT:    [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1531; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 36
1532; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1533; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1534; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I16_ARG_KERNARG_SEGMENT]], i64 44
1535; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i16 addrspace(4)*
1536; MESA-NEXT:    [[IN:%.*]] = load i16, i16 addrspace(4)* [[TMP1]], align 2
1537; MESA-NEXT:    [[EXT:%.*]] = zext i16 [[IN]] to i32
1538; MESA-NEXT:    store i32 [[EXT]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1539; MESA-NEXT:    ret void
1540;
1541  %in = load i16, i16 addrspace(4)* %in.byref
1542  %ext = zext i16 %in to i32
1543  store i32 %ext, i32 addrspace(1)* %out, align 4
1544  ret void
1545}
1546
1547define amdgpu_kernel void @byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in.byref, i32 %after.offset) {
1548; HSA-LABEL: @byref_constant_i32_arg(
1549; HSA-NEXT:    [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(16) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1550; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1551; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1552; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1553; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
1554; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1555; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
1556; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1557; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1558; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1559; HSA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1560; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1561; HSA-NEXT:    ret void
1562;
1563; MESA-LABEL: @byref_constant_i32_arg(
1564; MESA-NEXT:    [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(52) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1565; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1566; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1567; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1568; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
1569; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1570; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
1571; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1572; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1573; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1574; MESA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1575; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1576; MESA-NEXT:    ret void
1577;
1578  %in = load i32, i32 addrspace(4)* %in.byref
1579  store volatile i32 %in, i32 addrspace(1)* %out, align 4
1580  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1581  ret void
1582}
1583
1584define amdgpu_kernel void @byref_constant_v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> addrspace(4)* byref(<4 x i32>) %in.byref, i32 %after.offset) {
1585; HSA-LABEL: @byref_constant_v4i32_arg(
1586; HSA-NEXT:    [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(36) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1587; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 0
1588; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)*
1589; HSA-NEXT:    [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1590; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 16
1591; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
1592; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 32
1593; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1594; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1595; HSA-NEXT:    [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16
1596; HSA-NEXT:    store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4
1597; HSA-NEXT:    [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)*
1598; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4
1599; HSA-NEXT:    ret void
1600;
1601; MESA-LABEL: @byref_constant_v4i32_arg(
1602; MESA-NEXT:    [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(72) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1603; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 36
1604; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to <4 x i32> addrspace(1)* addrspace(4)*
1605; MESA-NEXT:    [[OUT_LOAD:%.*]] = load <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1606; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 52
1607; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <4 x i32> addrspace(4)*
1608; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_V4I32_ARG_KERNARG_SEGMENT]], i64 68
1609; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1610; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1611; MESA-NEXT:    [[IN:%.*]] = load <4 x i32>, <4 x i32> addrspace(4)* [[TMP1]], align 16
1612; MESA-NEXT:    store volatile <4 x i32> [[IN]], <4 x i32> addrspace(1)* [[OUT_LOAD]], align 4
1613; MESA-NEXT:    [[OUT_CAST:%.*]] = bitcast <4 x i32> addrspace(1)* [[OUT_LOAD]] to i32 addrspace(1)*
1614; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_CAST]], align 4
1615; MESA-NEXT:    ret void
1616;
1617  %in = load <4 x i32>, <4 x i32> addrspace(4)* %in.byref
1618  store volatile <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
1619  %out.cast = bitcast <4 x i32> addrspace(1)* %out to i32 addrspace(1)*
1620  store volatile i32 %after.offset, i32 addrspace(1)* %out.cast, align 4
1621  ret void
1622}
1623
1624define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
1625; HSA-LABEL: @byref_align_constant_i32_arg(
1626; HSA-NEXT:    [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(264) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1627; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1628; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1629; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1630; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 256
1631; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1632; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 260
1633; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1634; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1635; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1636; HSA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1637; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1638; HSA-NEXT:    ret void
1639;
1640; MESA-LABEL: @byref_align_constant_i32_arg(
1641; MESA-NEXT:    [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 256 dereferenceable(300) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1642; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1643; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1644; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1645; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 292
1646; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1647; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_ALIGN_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 296
1648; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1649; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 8, !invariant.load !0
1650; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1651; MESA-NEXT:    store volatile i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1652; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1653; MESA-NEXT:    ret void
1654;
1655  %in = load i32, i32 addrspace(4)* %in.byref
1656  store volatile i32 %in, i32 addrspace(1)* %out, align 4
1657  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1658  ret void
1659}
1660
1661define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
1662; HSA-LABEL: @byref_natural_align_constant_v16i32_arg(
1663; HSA-NEXT:    [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(132) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1664; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 0
1665; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1666; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1667; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 64
1668; HSA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)*
1669; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 128
1670; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1671; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1672; HSA-NEXT:    [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64
1673; HSA-NEXT:    [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)*
1674; HSA-NEXT:    store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4
1675; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1676; HSA-NEXT:    ret void
1677;
1678; MESA-LABEL: @byref_natural_align_constant_v16i32_arg(
1679; MESA-NEXT:    [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 64 dereferenceable(168) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1680; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 36
1681; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1682; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1683; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 100
1684; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to <16 x i32> addrspace(4)*
1685; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_NATURAL_ALIGN_CONSTANT_V16I32_ARG_KERNARG_SEGMENT]], i64 164
1686; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1687; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1688; MESA-NEXT:    [[IN:%.*]] = load <16 x i32>, <16 x i32> addrspace(4)* [[TMP2]], align 64
1689; MESA-NEXT:    [[CAST_OUT:%.*]] = bitcast i32 addrspace(1)* [[OUT_LOAD]] to <16 x i32> addrspace(1)*
1690; MESA-NEXT:    store volatile <16 x i32> [[IN]], <16 x i32> addrspace(1)* [[CAST_OUT]], align 4
1691; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1692; MESA-NEXT:    ret void
1693;
1694  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
1695  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
1696  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
1697  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1698  ret void
1699}
1700
1701; Also accept byref kernel arguments with other global address spaces.
1702define amdgpu_kernel void @byref_global_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* byref(i32) %in.byref) {
1703; HSA-LABEL: @byref_global_i32_arg(
1704; HSA-NEXT:    [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1705; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 0
1706; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1707; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1708; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 8
1709; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)*
1710; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
1711; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1712; HSA-NEXT:    ret void
1713;
1714; MESA-LABEL: @byref_global_i32_arg(
1715; MESA-NEXT:    [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1716; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 36
1717; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1718; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1719; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_GLOBAL_I32_ARG_KERNARG_SEGMENT]], i64 44
1720; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(1)*
1721; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(1)* [[TMP1]], align 4
1722; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1723; MESA-NEXT:    ret void
1724;
1725  %in = load i32, i32 addrspace(1)* %in.byref
1726  store i32 %in, i32 addrspace(1)* %out, align 4
1727  ret void
1728}
1729
1730define amdgpu_kernel void @byref_flat_i32_arg(i32 addrspace(1)* nocapture %out, i32* byref(i32) %in.byref) {
1731; HSA-LABEL: @byref_flat_i32_arg(
1732; HSA-NEXT:    [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1733; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 0
1734; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1735; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1736; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 8
1737; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32*
1738; HSA-NEXT:    [[IN:%.*]] = load i32, i32* [[TMP1]], align 4
1739; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1740; HSA-NEXT:    ret void
1741;
1742; MESA-LABEL: @byref_flat_i32_arg(
1743; MESA-NEXT:    [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1744; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 36
1745; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1746; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1747; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_FLAT_I32_ARG_KERNARG_SEGMENT]], i64 44
1748; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32*
1749; MESA-NEXT:    [[IN:%.*]] = load i32, i32* [[TMP1]], align 4
1750; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1751; MESA-NEXT:    ret void
1752;
1753  %in = load i32, i32* %in.byref
1754  store i32 %in, i32 addrspace(1)* %out, align 4
1755  ret void
1756}
1757
1758define amdgpu_kernel void @byref_constant_32bit_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(6)* byref(i32) %in.byref) {
1759; HSA-LABEL: @byref_constant_32bit_i32_arg(
1760; HSA-NEXT:    [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1761; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 0
1762; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1763; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1764; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 8
1765; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)*
1766; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4
1767; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1768; HSA-NEXT:    ret void
1769;
1770; MESA-LABEL: @byref_constant_32bit_i32_arg(
1771; MESA-NEXT:    [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1772; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 36
1773; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1774; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1775; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_32BIT_I32_ARG_KERNARG_SEGMENT]], i64 44
1776; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(6)*
1777; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(6)* [[TMP1]], align 4
1778; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1779; MESA-NEXT:    ret void
1780;
1781  %in = load i32, i32 addrspace(6)* %in.byref
1782  store i32 %in, i32 addrspace(1)* %out, align 4
1783  ret void
1784}
1785
1786define amdgpu_kernel void @byref_unknown_as_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(999)* byref(i32) %in.byref) {
1787; HSA-LABEL: @byref_unknown_as_i32_arg(
1788; HSA-NEXT:    [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1789; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 0
1790; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1791; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1792; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 8
1793; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)*
1794; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4
1795; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1796; HSA-NEXT:    ret void
1797;
1798; MESA-LABEL: @byref_unknown_as_i32_arg(
1799; MESA-NEXT:    [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1800; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 36
1801; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1802; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1803; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_UNKNOWN_AS_I32_ARG_KERNARG_SEGMENT]], i64 44
1804; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(999)*
1805; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(999)* [[TMP1]], align 4
1806; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1807; MESA-NEXT:    ret void
1808;
1809  %in = load i32, i32 addrspace(999)* %in.byref
1810  store i32 %in, i32 addrspace(1)* %out, align 4
1811  ret void
1812}
1813
1814; Invalid, but should not crash.
1815define amdgpu_kernel void @byref_local_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(3)* byref(i32) %in.byref) {
1816; HSA-LABEL: @byref_local_i32_arg(
1817; HSA-NEXT:    [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1818; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 0
1819; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1820; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1821; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 8
1822; HSA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)*
1823; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4
1824; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1825; HSA-NEXT:    ret void
1826;
1827; MESA-LABEL: @byref_local_i32_arg(
1828; MESA-NEXT:    [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(48) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1829; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 36
1830; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1831; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1832; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_LOCAL_I32_ARG_KERNARG_SEGMENT]], i64 44
1833; MESA-NEXT:    [[TMP1:%.*]] = addrspacecast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(3)*
1834; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(3)* [[TMP1]], align 4
1835; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1836; MESA-NEXT:    ret void
1837;
1838  %in = load i32, i32 addrspace(3)* %in.byref
1839  store i32 %in, i32 addrspace(1)* %out, align 4
1840  ret void
1841}
1842
1843define amdgpu_kernel void @multi_byref_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) %in0.byref, i32 addrspace(4)* byref(i32) %in1.byref, i32 %after.offset) {
1844; HSA-LABEL: @multi_byref_constant_i32_arg(
1845; HSA-NEXT:    [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(20) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1846; HSA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 0
1847; HSA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1848; HSA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1849; HSA-NEXT:    [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 8
1850; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1851; HSA-NEXT:    [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 12
1852; HSA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1853; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 16
1854; HSA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1855; HSA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
1856; HSA-NEXT:    [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1857; HSA-NEXT:    [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4
1858; HSA-NEXT:    store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1859; HSA-NEXT:    store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1860; HSA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1861; HSA-NEXT:    ret void
1862;
1863; MESA-LABEL: @multi_byref_constant_i32_arg(
1864; MESA-NEXT:    [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(56) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1865; MESA-NEXT:    [[OUT_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 36
1866; MESA-NEXT:    [[OUT_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[OUT_KERNARG_OFFSET]] to i32 addrspace(1)* addrspace(4)*
1867; MESA-NEXT:    [[OUT_LOAD:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* [[OUT_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1868; MESA-NEXT:    [[IN0_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 44
1869; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN0_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1870; MESA-NEXT:    [[IN1_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 48
1871; MESA-NEXT:    [[TMP2:%.*]] = bitcast i8 addrspace(4)* [[IN1_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1872; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[MULTI_BYREF_CONSTANT_I32_ARG_KERNARG_SEGMENT]], i64 52
1873; MESA-NEXT:    [[AFTER_OFFSET_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET]] to i32 addrspace(4)*
1874; MESA-NEXT:    [[AFTER_OFFSET_LOAD:%.*]] = load i32, i32 addrspace(4)* [[AFTER_OFFSET_KERNARG_OFFSET_CAST]], align 4, !invariant.load !0
1875; MESA-NEXT:    [[IN0:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1876; MESA-NEXT:    [[IN1:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4
1877; MESA-NEXT:    store volatile i32 [[IN0]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1878; MESA-NEXT:    store volatile i32 [[IN1]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1879; MESA-NEXT:    store volatile i32 [[AFTER_OFFSET_LOAD]], i32 addrspace(1)* [[OUT_LOAD]], align 4
1880; MESA-NEXT:    ret void
1881;
1882  %in0 = load i32, i32 addrspace(4)* %in0.byref
1883  %in1 = load i32, i32 addrspace(4)* %in1.byref
1884  store volatile i32 %in0, i32 addrspace(1)* %out, align 4
1885  store volatile i32 %in1, i32 addrspace(1)* %out, align 4
1886  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
1887  ret void
1888}
1889
1890define amdgpu_kernel void @byref_constant_i32_arg_offset0(i32 addrspace(4)* byref(i32) %in.byref) {
1891; HSA-LABEL: @byref_constant_i32_arg_offset0(
1892; HSA-NEXT:    [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(4) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1893; HSA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 0
1894; HSA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1895; HSA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1896; HSA-NEXT:    store i32 [[IN]], i32 addrspace(1)* undef, align 4
1897; HSA-NEXT:    ret void
1898;
1899; MESA-LABEL: @byref_constant_i32_arg_offset0(
1900; MESA-NEXT:    [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(40) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
1901; MESA-NEXT:    [[IN_BYREF_BYVAL_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[BYREF_CONSTANT_I32_ARG_OFFSET0_KERNARG_SEGMENT]], i64 36
1902; MESA-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[IN_BYREF_BYVAL_KERNARG_OFFSET]] to i32 addrspace(4)*
1903; MESA-NEXT:    [[IN:%.*]] = load i32, i32 addrspace(4)* [[TMP1]], align 4
1904; MESA-NEXT:    store i32 [[IN]], i32 addrspace(1)* undef, align 4
1905; MESA-NEXT:    ret void
1906;
1907  %in = load i32, i32 addrspace(4)* %in.byref
1908  store i32 %in, i32 addrspace(1)* undef, align 4
1909  ret void
1910}
1911
1912attributes #0 = { nounwind "target-cpu"="kaveri" }
1913attributes #1 = { nounwind "target-cpu"="kaveri" "amdgpu-implicitarg-num-bytes"="40" }
1914attributes #2 = { nounwind "target-cpu"="tahiti" }
1915
1916; GCN: 0 = !{}
1917; GCN: !1 = !{i64 42}
1918; GCN: !2 = !{i64 128}
1919; GCN: !3 = !{i64 1024}
1920