1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4
5; Test using saddr addressing mode of global_*load_* flat instructions.
6
7; --------------------------------------------------------------------------------
8; No vgpr offset, constants
9; --------------------------------------------------------------------------------
10
11; SGPR base only
; Loads one i8 directly from %sbase, zero-extends it to i32 and returns the
; bits as a float. Both targets are expected to materialize a zero VGPR offset
; and load through the scalar base s[2:3] with no immediate offset.
12define amdgpu_ps float @global_load_saddr_i8_offset_0(i8 addrspace(1)* inreg %sbase) {
13; GCN-LABEL: global_load_saddr_i8_offset_0:
14; GCN:       ; %bb.0:
15; GCN-NEXT:    v_mov_b32_e32 v0, 0
16; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
17; GCN-NEXT:    s_waitcnt vmcnt(0)
18; GCN-NEXT:    ; return to shader part epilog
19  %load = load i8, i8 addrspace(1)* %sbase
20  %zext = zext i8 %load to i32
21  %to.vgpr = bitcast i32 %zext to float
22  ret float %to.vgpr
23}
24
25; SGPR base with maximum gfx9 immediate offset
; Loads one i8 at %sbase + 4095. The gfx9 checks expect all of 4095 in the
; instruction's immediate offset; the gfx10 checks expect it split into 0x800
; in the VGPR offset plus an immediate offset of 2047.
26define amdgpu_ps float @global_load_saddr_i8_offset_4095(i8 addrspace(1)* inreg %sbase) {
27; GFX9-LABEL: global_load_saddr_i8_offset_4095:
28; GFX9:       ; %bb.0:
29; GFX9-NEXT:    v_mov_b32_e32 v0, 0
30; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
31; GFX9-NEXT:    s_waitcnt vmcnt(0)
32; GFX9-NEXT:    ; return to shader part epilog
33;
34; GFX10-LABEL: global_load_saddr_i8_offset_4095:
35; GFX10:       ; %bb.0:
36; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
37; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
38; GFX10-NEXT:    s_waitcnt vmcnt(0)
39; GFX10-NEXT:    ; return to shader part epilog
40  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
41  %load = load i8, i8 addrspace(1)* %gep0
42  %zext = zext i8 %load to i32
43  %to.vgpr = bitcast i32 %zext to float
44  ret float %to.vgpr
45}
46
47; SGPR base with maximum gfx9 immediate offset + 1
; Loads one i8 at %sbase + 4096. Both targets are expected to place the whole
; constant (0x1000) in the VGPR offset and use no immediate offset.
48define amdgpu_ps float @global_load_saddr_i8_offset_4096(i8 addrspace(1)* inreg %sbase) {
49; GCN-LABEL: global_load_saddr_i8_offset_4096:
50; GCN:       ; %bb.0:
51; GCN-NEXT:    v_mov_b32_e32 v0, 0x1000
52; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
53; GCN-NEXT:    s_waitcnt vmcnt(0)
54; GCN-NEXT:    ; return to shader part epilog
55  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4096
56  %load = load i8, i8 addrspace(1)* %gep0
57  %zext = zext i8 %load to i32
58  %to.vgpr = bitcast i32 %zext to float
59  ret float %to.vgpr
60}
61
62; SGPR base with maximum gfx9 immediate offset + 2
; Loads one i8 at %sbase + 4097. Both targets are expected to place 0x1000 in
; the VGPR offset and carry the remaining 1 in the immediate offset.
63define amdgpu_ps float @global_load_saddr_i8_offset_4097(i8 addrspace(1)* inreg %sbase) {
64; GCN-LABEL: global_load_saddr_i8_offset_4097:
65; GCN:       ; %bb.0:
66; GCN-NEXT:    v_mov_b32_e32 v0, 0x1000
67; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:1
68; GCN-NEXT:    s_waitcnt vmcnt(0)
69; GCN-NEXT:    ; return to shader part epilog
70  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4097
71  %load = load i8, i8 addrspace(1)* %gep0
72  %zext = zext i8 %load to i32
73  %to.vgpr = bitcast i32 %zext to float
74  ret float %to.vgpr
75}
76
77; SGPR base with maximum negative gfx9 immediate offset
; Loads one i8 at %sbase - 4096. The gfx9 checks expect -4096 as the immediate
; offset with a zero VGPR offset. The gfx10 checks expect the full 64-bit
; address to be computed into v[0:1] with vector adds and loaded with "off"
; (no scalar base) and no immediate offset.
78define amdgpu_ps float @global_load_saddr_i8_offset_neg4096(i8 addrspace(1)* inreg %sbase) {
79; GFX9-LABEL: global_load_saddr_i8_offset_neg4096:
80; GFX9:       ; %bb.0:
81; GFX9-NEXT:    v_mov_b32_e32 v0, 0
82; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-4096
83; GFX9-NEXT:    s_waitcnt vmcnt(0)
84; GFX9-NEXT:    ; return to shader part epilog
85;
86; GFX10-LABEL: global_load_saddr_i8_offset_neg4096:
87; GFX10:       ; %bb.0:
88; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff000, s2
89; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
90; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
91; GFX10-NEXT:    s_waitcnt vmcnt(0)
92; GFX10-NEXT:    ; return to shader part epilog
93  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4096
94  %load = load i8, i8 addrspace(1)* %gep0
95  %zext = zext i8 %load to i32
96  %to.vgpr = bitcast i32 %zext to float
97  ret float %to.vgpr
98}
99
100; SGPR base with maximum negative gfx9 immediate offset -1
; Loads one i8 at %sbase - 4097. The gfx9 checks expect the constant to be
; added to the base with scalar add/addc into s[0:1], followed by a load with
; a zero VGPR offset and no immediate. The gfx10 checks expect 0xfffff000
; (high half -1) to be added with vector adds into v[0:1], with the remaining
; -1 in the immediate offset.
101define amdgpu_ps float @global_load_saddr_i8_offset_neg4097(i8 addrspace(1)* inreg %sbase) {
102; GFX9-LABEL: global_load_saddr_i8_offset_neg4097:
103; GFX9:       ; %bb.0:
104; GFX9-NEXT:    s_add_u32 s0, s2, 0xffffefff
105; GFX9-NEXT:    s_addc_u32 s1, s3, -1
106; GFX9-NEXT:    v_mov_b32_e32 v0, 0
107; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
108; GFX9-NEXT:    s_waitcnt vmcnt(0)
109; GFX9-NEXT:    ; return to shader part epilog
110;
111; GFX10-LABEL: global_load_saddr_i8_offset_neg4097:
112; GFX10:       ; %bb.0:
113; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff000, s2
114; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
115; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
116; GFX10-NEXT:    s_waitcnt vmcnt(0)
117; GFX10-NEXT:    ; return to shader part epilog
118  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4097
119  %load = load i8, i8 addrspace(1)* %gep0
120  %zext = zext i8 %load to i32
121  %to.vgpr = bitcast i32 %zext to float
122  ret float %to.vgpr
123}
124
125; SGPR base with maximum negative gfx9 immediate offset -2
; Loads one i8 at %sbase - 4098. The gfx9 checks expect the constant to be
; added to the base with scalar add/addc into s[0:1], followed by a load with
; a zero VGPR offset and no immediate. The gfx10 checks expect 0xfffff000
; (high half -1) to be added with vector adds into v[0:1], with the remaining
; -2 in the immediate offset.
126define amdgpu_ps float @global_load_saddr_i8_offset_neg4098(i8 addrspace(1)* inreg %sbase) {
127; GFX9-LABEL: global_load_saddr_i8_offset_neg4098:
128; GFX9:       ; %bb.0:
129; GFX9-NEXT:    s_add_u32 s0, s2, 0xffffeffe
130; GFX9-NEXT:    s_addc_u32 s1, s3, -1
131; GFX9-NEXT:    v_mov_b32_e32 v0, 0
132; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
133; GFX9-NEXT:    s_waitcnt vmcnt(0)
134; GFX9-NEXT:    ; return to shader part epilog
135;
136; GFX10-LABEL: global_load_saddr_i8_offset_neg4098:
137; GFX10:       ; %bb.0:
138; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff000, s2
139; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
140; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2
141; GFX10-NEXT:    s_waitcnt vmcnt(0)
142; GFX10-NEXT:    ; return to shader part epilog
143  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4098
144  %load = load i8, i8 addrspace(1)* %gep0
145  %zext = zext i8 %load to i32
146  %to.vgpr = bitcast i32 %zext to float
147  ret float %to.vgpr
148}
149
150; SGPR base with maximum gfx10 immediate offset + 1
; Loads one i8 at %sbase + 2048. The gfx9 checks expect 2048 as the immediate
; offset with a zero VGPR offset; the gfx10 checks expect 0x800 in the VGPR
; offset and no immediate offset.
151define amdgpu_ps float @global_load_saddr_i8_offset_2048(i8 addrspace(1)* inreg %sbase) {
152; GFX9-LABEL: global_load_saddr_i8_offset_2048:
153; GFX9:       ; %bb.0:
154; GFX9-NEXT:    v_mov_b32_e32 v0, 0
155; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2048
156; GFX9-NEXT:    s_waitcnt vmcnt(0)
157; GFX9-NEXT:    ; return to shader part epilog
158;
159; GFX10-LABEL: global_load_saddr_i8_offset_2048:
160; GFX10:       ; %bb.0:
161; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
162; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
163; GFX10-NEXT:    s_waitcnt vmcnt(0)
164; GFX10-NEXT:    ; return to shader part epilog
165  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2048
166  %load = load i8, i8 addrspace(1)* %gep0
167  %zext = zext i8 %load to i32
168  %to.vgpr = bitcast i32 %zext to float
169  ret float %to.vgpr
170}
171
172; SGPR base with maximum gfx10 immediate offset + 2
; Loads one i8 at %sbase + 2049. The gfx9 checks expect 2049 as the immediate
; offset with a zero VGPR offset; the gfx10 checks expect 0x800 in the VGPR
; offset plus an immediate offset of 1.
173define amdgpu_ps float @global_load_saddr_i8_offset_2049(i8 addrspace(1)* inreg %sbase) {
174; GFX9-LABEL: global_load_saddr_i8_offset_2049:
175; GFX9:       ; %bb.0:
176; GFX9-NEXT:    v_mov_b32_e32 v0, 0
177; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2049
178; GFX9-NEXT:    s_waitcnt vmcnt(0)
179; GFX9-NEXT:    ; return to shader part epilog
180;
181; GFX10-LABEL: global_load_saddr_i8_offset_2049:
182; GFX10:       ; %bb.0:
183; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
184; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:1
185; GFX10-NEXT:    s_waitcnt vmcnt(0)
186; GFX10-NEXT:    ; return to shader part epilog
187  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2049
188  %load = load i8, i8 addrspace(1)* %gep0
189  %zext = zext i8 %load to i32
190  %to.vgpr = bitcast i32 %zext to float
191  ret float %to.vgpr
192}
193
194; SGPR base with maximum gfx10 immediate offset + 3
; Loads one i8 at %sbase + 2050. The gfx9 checks expect 2050 as the immediate
; offset with a zero VGPR offset; the gfx10 checks expect 0x800 in the VGPR
; offset plus an immediate offset of 2.
195define amdgpu_ps float @global_load_saddr_i8_offset_2050(i8 addrspace(1)* inreg %sbase) {
196; GFX9-LABEL: global_load_saddr_i8_offset_2050:
197; GFX9:       ; %bb.0:
198; GFX9-NEXT:    v_mov_b32_e32 v0, 0
199; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2050
200; GFX9-NEXT:    s_waitcnt vmcnt(0)
201; GFX9-NEXT:    ; return to shader part epilog
202;
203; GFX10-LABEL: global_load_saddr_i8_offset_2050:
204; GFX10:       ; %bb.0:
205; GFX10-NEXT:    v_mov_b32_e32 v0, 0x800
206; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2
207; GFX10-NEXT:    s_waitcnt vmcnt(0)
208; GFX10-NEXT:    ; return to shader part epilog
209  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 2050
210  %load = load i8, i8 addrspace(1)* %gep0
211  %zext = zext i8 %load to i32
212  %to.vgpr = bitcast i32 %zext to float
213  ret float %to.vgpr
214}
215
216; SGPR base with maximum negative gfx10 immediate offset
; Loads one i8 at %sbase - 2048. Both targets are expected to use -2048 as the
; immediate offset with a zero VGPR offset and s[2:3] as the scalar base.
217define amdgpu_ps float @global_load_saddr_i8_offset_neg2048(i8 addrspace(1)* inreg %sbase) {
218; GCN-LABEL: global_load_saddr_i8_offset_neg2048:
219; GCN:       ; %bb.0:
220; GCN-NEXT:    v_mov_b32_e32 v0, 0
221; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2048
222; GCN-NEXT:    s_waitcnt vmcnt(0)
223; GCN-NEXT:    ; return to shader part epilog
224  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2048
225  %load = load i8, i8 addrspace(1)* %gep0
226  %zext = zext i8 %load to i32
227  %to.vgpr = bitcast i32 %zext to float
228  ret float %to.vgpr
229}
230
231; SGPR base with maximum negative gfx10 immediate offset - 1
; Loads one i8 at %sbase - 2049. The gfx9 checks expect -2049 as the immediate
; offset with a zero VGPR offset. The gfx10 checks expect 0xfffff800 (high
; half -1) to be added with vector adds into v[0:1], with the remaining -1 in
; the immediate offset and no scalar base ("off").
232define amdgpu_ps float @global_load_saddr_i8_offset_neg2049(i8 addrspace(1)* inreg %sbase) {
233; GFX9-LABEL: global_load_saddr_i8_offset_neg2049:
234; GFX9:       ; %bb.0:
235; GFX9-NEXT:    v_mov_b32_e32 v0, 0
236; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2049
237; GFX9-NEXT:    s_waitcnt vmcnt(0)
238; GFX9-NEXT:    ; return to shader part epilog
239;
240; GFX10-LABEL: global_load_saddr_i8_offset_neg2049:
241; GFX10:       ; %bb.0:
242; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff800, s2
243; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
244; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
245; GFX10-NEXT:    s_waitcnt vmcnt(0)
246; GFX10-NEXT:    ; return to shader part epilog
247  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2049
248  %load = load i8, i8 addrspace(1)* %gep0
249  %zext = zext i8 %load to i32
250  %to.vgpr = bitcast i32 %zext to float
251  ret float %to.vgpr
252}
253
254; SGPR base with maximum negative gfx10 immediate offset - 2
; Loads one i8 at %sbase - 2050. The gfx9 checks expect -2050 as the immediate
; offset with a zero VGPR offset. The gfx10 checks expect 0xfffff800 (high
; half -1) to be added with vector adds into v[0:1], with the remaining -2 in
; the immediate offset and no scalar base ("off").
255define amdgpu_ps float @global_load_saddr_i8_offset_neg2050(i8 addrspace(1)* inreg %sbase) {
256; GFX9-LABEL: global_load_saddr_i8_offset_neg2050:
257; GFX9:       ; %bb.0:
258; GFX9-NEXT:    v_mov_b32_e32 v0, 0
259; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2050
260; GFX9-NEXT:    s_waitcnt vmcnt(0)
261; GFX9-NEXT:    ; return to shader part epilog
262;
263; GFX10-LABEL: global_load_saddr_i8_offset_neg2050:
264; GFX10:       ; %bb.0:
265; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0xfffff800, s2
266; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
267; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2
268; GFX10-NEXT:    s_waitcnt vmcnt(0)
269; GFX10-NEXT:    ; return to shader part epilog
270  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -2050
271  %load = load i8, i8 addrspace(1)* %gep0
272  %zext = zext i8 %load to i32
273  %to.vgpr = bitcast i32 %zext to float
274  ret float %to.vgpr
275}
276
; Loads one i8 at %sbase + 4294967295 (0xffffffff, the largest constant that
; fits in 32 unsigned bits). The gfx9 checks expect 0xfffff000 in the VGPR
; offset plus an immediate offset of 4095; the gfx10 checks expect 0xfffff800
; plus an immediate offset of 2047.
277define amdgpu_ps float @global_load_saddr_i8_offset_4294967295(i8 addrspace(1)* inreg %sbase) {
278; GFX9-LABEL: global_load_saddr_i8_offset_4294967295:
279; GFX9:       ; %bb.0:
280; GFX9-NEXT:    v_mov_b32_e32 v0, 0xfffff000
281; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
282; GFX9-NEXT:    s_waitcnt vmcnt(0)
283; GFX9-NEXT:    ; return to shader part epilog
284;
285; GFX10-LABEL: global_load_saddr_i8_offset_4294967295:
286; GFX10:       ; %bb.0:
287; GFX10-NEXT:    v_mov_b32_e32 v0, 0xfffff800
288; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
289; GFX10-NEXT:    s_waitcnt vmcnt(0)
290; GFX10-NEXT:    ; return to shader part epilog
291  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967295
292  %load = load i8, i8 addrspace(1)* %gep0
293  %zext = zext i8 %load to i32
294  %to.vgpr = bitcast i32 %zext to float
295  ret float %to.vgpr
296}
297
; Loads one i8 at %sbase + 4294967296 (2^32), a constant that no longer fits
; in 32 bits. Both targets are expected to compute the address into v[0:1]
; with vector adds (low half + 0, high half + 1) and load with "off" and no
; immediate offset.
298define amdgpu_ps float @global_load_saddr_i8_offset_4294967296(i8 addrspace(1)* inreg %sbase) {
299; GFX9-LABEL: global_load_saddr_i8_offset_4294967296:
300; GFX9:       ; %bb.0:
301; GFX9-NEXT:    v_mov_b32_e32 v1, s3
302; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
303; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
304; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
305; GFX9-NEXT:    s_waitcnt vmcnt(0)
306; GFX9-NEXT:    ; return to shader part epilog
307;
308; GFX10-LABEL: global_load_saddr_i8_offset_4294967296:
309; GFX10:       ; %bb.0:
310; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
311; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
312; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
313; GFX10-NEXT:    s_waitcnt vmcnt(0)
314; GFX10-NEXT:    ; return to shader part epilog
315  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967296
316  %load = load i8, i8 addrspace(1)* %gep0
317  %zext = zext i8 %load to i32
318  %to.vgpr = bitcast i32 %zext to float
319  ret float %to.vgpr
320}
321
; Loads one i8 at %sbase + 4294967297 (2^32 + 1). Both targets are expected to
; compute the address into v[0:1] with vector adds (low half + 0, high half
; + 1) and load with "off", carrying the remaining 1 in the immediate offset.
322define amdgpu_ps float @global_load_saddr_i8_offset_4294967297(i8 addrspace(1)* inreg %sbase) {
323; GFX9-LABEL: global_load_saddr_i8_offset_4294967297:
324; GFX9:       ; %bb.0:
325; GFX9-NEXT:    v_mov_b32_e32 v1, s3
326; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
327; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 1, v1, vcc
328; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
329; GFX9-NEXT:    s_waitcnt vmcnt(0)
330; GFX9-NEXT:    ; return to shader part epilog
331;
332; GFX10-LABEL: global_load_saddr_i8_offset_4294967297:
333; GFX10:       ; %bb.0:
334; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
335; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
336; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:1
337; GFX10-NEXT:    s_waitcnt vmcnt(0)
338; GFX10-NEXT:    ; return to shader part epilog
339  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294967297
340  %load = load i8, i8 addrspace(1)* %gep0
341  %zext = zext i8 %load to i32
342  %to.vgpr = bitcast i32 %zext to float
343  ret float %to.vgpr
344}
345
; Loads one i8 at %sbase + 4294971391 (2^32 + 4095). The gfx9 checks expect
; the whole constant to be added to the base with scalar add/addc into s[0:1],
; followed by a load with a zero VGPR offset and no immediate. The gfx10
; checks expect vector adds (low half + 0x800, high half + 1) into v[0:1] and
; a load with "off" and an immediate offset of 2047.
346define amdgpu_ps float @global_load_saddr_i8_offset_4294971391(i8 addrspace(1)* inreg %sbase) {
347; GFX9-LABEL: global_load_saddr_i8_offset_4294971391:
348; GFX9:       ; %bb.0:
349; GFX9-NEXT:    s_add_u32 s0, s2, 0xfff
350; GFX9-NEXT:    s_addc_u32 s1, s3, 1
351; GFX9-NEXT:    v_mov_b32_e32 v0, 0
352; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
353; GFX9-NEXT:    s_waitcnt vmcnt(0)
354; GFX9-NEXT:    ; return to shader part epilog
355;
356; GFX10-LABEL: global_load_saddr_i8_offset_4294971391:
357; GFX10:       ; %bb.0:
358; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0x800, s2
359; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
360; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
361; GFX10-NEXT:    s_waitcnt vmcnt(0)
362; GFX10-NEXT:    ; return to shader part epilog
363  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971391
364  %load = load i8, i8 addrspace(1)* %gep0
365  %zext = zext i8 %load to i32
366  %to.vgpr = bitcast i32 %zext to float
367  ret float %to.vgpr
368}
369
; Loads one i8 at %sbase + 4294971392 (2^32 + 4096). The gfx9 checks expect
; the whole constant to be added to the base with scalar add/addc into s[0:1],
; followed by a load with a zero VGPR offset and no immediate. The gfx10
; checks expect vector adds (low half + 0x1000, high half + 1) into v[0:1] and
; a load with "off" and no immediate offset.
370define amdgpu_ps float @global_load_saddr_i8_offset_4294971392(i8 addrspace(1)* inreg %sbase) {
371; GFX9-LABEL: global_load_saddr_i8_offset_4294971392:
372; GFX9:       ; %bb.0:
373; GFX9-NEXT:    s_add_u32 s0, s2, 0x1000
374; GFX9-NEXT:    s_addc_u32 s1, s3, 1
375; GFX9-NEXT:    v_mov_b32_e32 v0, 0
376; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
377; GFX9-NEXT:    s_waitcnt vmcnt(0)
378; GFX9-NEXT:    ; return to shader part epilog
379;
380; GFX10-LABEL: global_load_saddr_i8_offset_4294971392:
381; GFX10:       ; %bb.0:
382; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0x1000, s2
383; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], 1, s3, s[0:1]
384; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
385; GFX10-NEXT:    s_waitcnt vmcnt(0)
386; GFX10-NEXT:    ; return to shader part epilog
387  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4294971392
388  %load = load i8, i8 addrspace(1)* %gep0
389  %zext = zext i8 %load to i32
390  %to.vgpr = bitcast i32 %zext to float
391  ret float %to.vgpr
392}
393
; Loads one i8 at %sbase - 4294967295 (-(2^32 - 1)). Both targets are expected
; to compute the address into v[0:1] with vector adds and load with "off".
; The gfx9 checks expect low half + 0x1000, high half - 1 and an immediate
; offset of -4095; the gfx10 checks expect low half + 0x800, high half - 1 and
; an immediate offset of -2047.
394define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967295(i8 addrspace(1)* inreg %sbase) {
395; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967295:
396; GFX9:       ; %bb.0:
397; GFX9-NEXT:    v_mov_b32_e32 v0, s2
398; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
399; GFX9-NEXT:    v_mov_b32_e32 v1, s3
400; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
401; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-4095
402; GFX9-NEXT:    s_waitcnt vmcnt(0)
403; GFX9-NEXT:    ; return to shader part epilog
404;
405; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967295:
406; GFX10:       ; %bb.0:
407; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0x800, s2
408; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
409; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-2047
410; GFX10-NEXT:    s_waitcnt vmcnt(0)
411; GFX10-NEXT:    ; return to shader part epilog
412  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967295
413  %load = load i8, i8 addrspace(1)* %gep0
414  %zext = zext i8 %load to i32
415  %to.vgpr = bitcast i32 %zext to float
416  ret float %to.vgpr
417}
418
; Loads one i8 at %sbase - 4294967296 (-(2^32)). Both targets are expected to
; compute the address into v[0:1] with vector adds (low half + 0, high half
; - 1) and load with "off" and no immediate offset.
419define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967296(i8 addrspace(1)* inreg %sbase) {
420; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967296:
421; GFX9:       ; %bb.0:
422; GFX9-NEXT:    v_mov_b32_e32 v1, s3
423; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
424; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
425; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
426; GFX9-NEXT:    s_waitcnt vmcnt(0)
427; GFX9-NEXT:    ; return to shader part epilog
428;
429; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967296:
430; GFX10:       ; %bb.0:
431; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
432; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
433; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
434; GFX10-NEXT:    s_waitcnt vmcnt(0)
435; GFX10-NEXT:    ; return to shader part epilog
436  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967296
437  %load = load i8, i8 addrspace(1)* %gep0
438  %zext = zext i8 %load to i32
439  %to.vgpr = bitcast i32 %zext to float
440  ret float %to.vgpr
441}
442
; Loads one i8 at %sbase - 4294967297 (-(2^32 + 1)). Both targets are expected
; to compute the address into v[0:1] with vector adds (low half + 0, high half
; - 1) and load with "off", carrying the remaining -1 in the immediate offset.
443define amdgpu_ps float @global_load_saddr_i8_offset_neg4294967297(i8 addrspace(1)* inreg %sbase) {
444; GFX9-LABEL: global_load_saddr_i8_offset_neg4294967297:
445; GFX9:       ; %bb.0:
446; GFX9-NEXT:    v_mov_b32_e32 v1, s3
447; GFX9-NEXT:    v_add_co_u32_e64 v0, vcc, 0, s2
448; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
449; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
450; GFX9-NEXT:    s_waitcnt vmcnt(0)
451; GFX9-NEXT:    ; return to shader part epilog
452;
453; GFX10-LABEL: global_load_saddr_i8_offset_neg4294967297:
454; GFX10:       ; %bb.0:
455; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], 0, s2
456; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], -1, s3, s[0:1]
457; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
458; GFX10-NEXT:    s_waitcnt vmcnt(0)
459; GFX10-NEXT:    ; return to shader part epilog
460  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 -4294967297
461  %load = load i8, i8 addrspace(1)* %gep0
462  %zext = zext i8 %load to i32
463  %to.vgpr = bitcast i32 %zext to float
464  ret float %to.vgpr
465}
466
467; --------------------------------------------------------------------------------
468; Basic addressing patterns
469; --------------------------------------------------------------------------------
470
471; Basic pattern, no immediate offset.
; Loads one i8 at %sbase + zext(%voffset), where %sbase is an inreg argument
; and %voffset is a plain i32 argument. Both targets are expected to emit a
; single load using v0 as the offset and s[2:3] as the scalar base, with no
; address arithmetic and no immediate offset.
472define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
473; GCN-LABEL: global_load_saddr_i8_zext_vgpr:
474; GCN:       ; %bb.0:
475; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
476; GCN-NEXT:    s_waitcnt vmcnt(0)
477; GCN-NEXT:    ; return to shader part epilog
478  %zext.offset = zext i32 %voffset to i64
479  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
480  %load = load i8, i8 addrspace(1)* %gep0
481  %zext = zext i8 %load to i32
482  %to.vgpr = bitcast i32 %zext to float
483  ret float %to.vgpr
484}
485
486; Maximum positive offset on gfx9
; Loads one i8 at %sbase + zext(%voffset) + 4095, with the variable GEP applied
; before the constant GEP. The gfx9 checks expect a single load with v0 as the
; offset, s[2:3] as the base and 4095 as the immediate. The gfx10 checks expect
; the base and v0 to be added into v[0:1], then 0x800 added on top, and a load
; with "off" and an immediate offset of 2047.
487define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
488; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
489; GFX9:       ; %bb.0:
490; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
491; GFX9-NEXT:    s_waitcnt vmcnt(0)
492; GFX9-NEXT:    ; return to shader part epilog
493;
494; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
495; GFX10:       ; %bb.0:
496; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
497; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
498; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x800, v0
499; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
500; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
501; GFX10-NEXT:    s_waitcnt vmcnt(0)
502; GFX10-NEXT:    ; return to shader part epilog
503  %zext.offset = zext i32 %voffset to i64
504  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
505  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
506  %load = load i8, i8 addrspace(1)* %gep1
507  %zext = zext i8 %load to i32
508  %to.vgpr = bitcast i32 %zext to float
509  ret float %to.vgpr
510}
511
512; Maximum positive offset on gfx9 + 1
; Loads one i8 at %sbase + zext(%voffset) + 4096. Neither target is expected
; to use the scalar-base form here: both check sequences add the base and v0
; into v[0:1], add 0x1000 on top, and load with "off" and no immediate offset.
513define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
514; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
515; GFX9:       ; %bb.0:
516; GFX9-NEXT:    v_mov_b32_e32 v1, s3
517; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
518; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
519; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
520; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
521; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
522; GFX9-NEXT:    s_waitcnt vmcnt(0)
523; GFX9-NEXT:    ; return to shader part epilog
524;
525; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
526; GFX10:       ; %bb.0:
527; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
528; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
529; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x1000, v0
530; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
531; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
532; GFX10-NEXT:    s_waitcnt vmcnt(0)
533; GFX10-NEXT:    ; return to shader part epilog
534  %zext.offset = zext i32 %voffset to i64
535  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
536  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096
537  %load = load i8, i8 addrspace(1)* %gep1
538  %zext = zext i8 %load to i32
539  %to.vgpr = bitcast i32 %zext to float
540  ret float %to.vgpr
541}
542
543; Maximum negative offset on gfx9
; Loads one i8 at %sbase + zext(%voffset) - 4096. The gfx9 checks expect a
; single load with v0 as the offset, s[2:3] as the base and -4096 as the
; immediate. The gfx10 checks expect the base and v0 to be added into v[0:1],
; then 0xfffff000 (high half -1) added on top, and a load with "off" and no
; immediate offset.
544define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
545; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
546; GFX9:       ; %bb.0:
547; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-4096
548; GFX9-NEXT:    s_waitcnt vmcnt(0)
549; GFX9-NEXT:    ; return to shader part epilog
550;
551; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
552; GFX10:       ; %bb.0:
553; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
554; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
555; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0xfffff000, v0
556; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
557; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
558; GFX10-NEXT:    s_waitcnt vmcnt(0)
559; GFX10-NEXT:    ; return to shader part epilog
560  %zext.offset = zext i32 %voffset to i64
561  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
562  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096
563  %load = load i8, i8 addrspace(1)* %gep1
564  %zext = zext i8 %load to i32
565  %to.vgpr = bitcast i32 %zext to float
566  ret float %to.vgpr
567}
568
569; Maximum negative offset on gfx9 - 1
; Loads one i8 at %sbase + zext(%voffset) - 4097. Neither target is expected
; to use the scalar-base form here: both check sequences add the base and v0
; into v[0:1], add 0xfffff000 (high half -1) on top, and load with "off" and
; the remaining -1 in the immediate offset.
570define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
571; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
572; GFX9:       ; %bb.0:
573; GFX9-NEXT:    v_mov_b32_e32 v1, s3
574; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
575; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
576; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
577; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
578; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
579; GFX9-NEXT:    s_waitcnt vmcnt(0)
580; GFX9-NEXT:    ; return to shader part epilog
581;
582; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
583; GFX10:       ; %bb.0:
584; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
585; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
586; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0xfffff000, v0
587; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
588; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
589; GFX10-NEXT:    s_waitcnt vmcnt(0)
590; GFX10-NEXT:    ; return to shader part epilog
591  %zext.offset = zext i32 %voffset to i64
592  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
593  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097
594  %load = load i8, i8 addrspace(1)* %gep1
595  %zext = zext i8 %load to i32
596  %to.vgpr = bitcast i32 %zext to float
597  ret float %to.vgpr
598}
599
600; Maximum positive offset on gfx10
; Loads one i8 at %sbase + zext(%voffset) + 2047. Both targets are expected to
; emit a single load with v0 as the offset, s[2:3] as the base and 2047 as the
; immediate offset.
601define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
602; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
603; GCN:       ; %bb.0:
604; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
605; GCN-NEXT:    s_waitcnt vmcnt(0)
606; GCN-NEXT:    ; return to shader part epilog
607  %zext.offset = zext i32 %voffset to i64
608  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
609  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
610  %load = load i8, i8 addrspace(1)* %gep1
611  %zext = zext i8 %load to i32
612  %to.vgpr = bitcast i32 %zext to float
613  ret float %to.vgpr
614}
615
616; Maximum positive offset on gfx10 + 1
; Loads one i8 at %sbase + zext(%voffset) + 2048. The gfx9 checks expect a
; single load with v0 as the offset, s[2:3] as the base and 2048 as the
; immediate. The gfx10 checks expect the base and v0 to be added into v[0:1],
; then 0x800 added on top, and a load with "off" and no immediate offset.
617define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
618; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
619; GFX9:       ; %bb.0:
620; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2048
621; GFX9-NEXT:    s_waitcnt vmcnt(0)
622; GFX9-NEXT:    ; return to shader part epilog
623;
624; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
625; GFX10:       ; %bb.0:
626; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
627; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
628; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x800, v0
629; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
630; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
631; GFX10-NEXT:    s_waitcnt vmcnt(0)
632; GFX10-NEXT:    ; return to shader part epilog
633  %zext.offset = zext i32 %voffset to i64
634  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
635  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
636  %load = load i8, i8 addrspace(1)* %gep1
637  %zext = zext i8 %load to i32
638  %to.vgpr = bitcast i32 %zext to float
639  ret float %to.vgpr
640}
641
642; Maximum negative offset on gfx10
; Loads one i8 at %sbase + zext(%voffset) - 2048. Both targets are expected to
; emit a single load with v0 as the offset, s[2:3] as the base and -2048 as
; the immediate offset.
643define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
644; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
645; GCN:       ; %bb.0:
646; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2048
647; GCN-NEXT:    s_waitcnt vmcnt(0)
648; GCN-NEXT:    ; return to shader part epilog
649  %zext.offset = zext i32 %voffset to i64
650  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
651  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
652  %load = load i8, i8 addrspace(1)* %gep1
653  %zext = zext i8 %load to i32
654  %to.vgpr = bitcast i32 %zext to float
655  ret float %to.vgpr
656}
657
658; Maximum negative offset on gfx10 - 1
; Loads one i8 at %sbase + zext(%voffset) - 2049. The gfx9 checks expect a
; single load with v0 as the offset, s[2:3] as the base and -2049 as the
; immediate. The gfx10 checks expect the base and v0 to be added into v[0:1],
; then 0xfffff800 (high half -1) added on top, and a load with "off" and the
; remaining -1 in the immediate offset.
659define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
660; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
661; GFX9:       ; %bb.0:
662; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2049
663; GFX9-NEXT:    s_waitcnt vmcnt(0)
664; GFX9-NEXT:    ; return to shader part epilog
665;
666; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
667; GFX10:       ; %bb.0:
668; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
669; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
670; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0xfffff800, v0
671; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
672; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
673; GFX10-NEXT:    s_waitcnt vmcnt(0)
674; GFX10-NEXT:    ; return to shader part epilog
675  %zext.offset = zext i32 %voffset to i64
676  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
677  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2049
678  %load = load i8, i8 addrspace(1)* %gep1
679  %zext = zext i8 %load to i32
680  %to.vgpr = bitcast i32 %zext to float
681  ret float %to.vgpr
682}
683
684; Maximum positive offset on gfx9, and immediate needs to be moved lower.
; Loads one i8 at (%sbase + 4095) + zext(%voffset): the constant GEP is applied
; to the base first and the variable GEP second. The gfx9 checks still expect
; a single load with v0 as the offset, s[2:3] as the base and 4095 as the
; immediate. The gfx10 checks expect the base and v0 to be added into v[0:1],
; then 0x800 added on top, and a load with "off" and an immediate of 2047.
685define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
686; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
687; GFX9:       ; %bb.0:
688; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
689; GFX9-NEXT:    s_waitcnt vmcnt(0)
690; GFX9-NEXT:    ; return to shader part epilog
691;
692; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
693; GFX10:       ; %bb.0:
694; GFX10-NEXT:    v_add_co_u32 v0, s[0:1], s2, v0
695; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
696; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x800, v0
697; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
698; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
699; GFX10-NEXT:    s_waitcnt vmcnt(0)
700; GFX10-NEXT:    ; return to shader part epilog
701  %zext.offset = zext i32 %voffset to i64
702  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
703  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 %zext.offset
704  %load = load i8, i8 addrspace(1)* %gep1
705  %zext = zext i8 %load to i32
706  %to.vgpr = bitcast i32 %zext to float
707  ret float %to.vgpr
708}
709
710; pointer addressing done in integers
; Forms the address with ptrtoint / i64 add / inttoptr instead of a GEP, with
; the base as the first add operand and zext(%voffset) as the second. Both
; targets are still expected to emit a single load with v0 as the offset and
; s[2:3] as the scalar base.
711define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
712; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
713; GCN:       ; %bb.0:
714; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
715; GCN-NEXT:    s_waitcnt vmcnt(0)
716; GCN-NEXT:    ; return to shader part epilog
717  %zext.offset = zext i32 %voffset to i64
718  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
719  %add = add i64 %sbase.as.int, %zext.offset
720  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
721  %load = load i8, i8 addrspace(1)* %dirty.gep
722  %zext = zext i8 %load to i32
723  %to.vgpr = bitcast i32 %zext to float
724  ret float %to.vgpr
725}
726
727; zext forced to LHS of addressing expression
; Same integer address computation as the plain ptrtoint test, but with the add
; operands swapped so the zext is the left operand. The shared checks expect the
; same saddr-form global_load_ubyte on both targets.
728define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
729; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
730; GCN:       ; %bb.0:
731; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
732; GCN-NEXT:    s_waitcnt vmcnt(0)
733; GCN-NEXT:    ; return to shader part epilog
734  %zext.offset = zext i32 %voffset to i64
735  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  ; Zero-extended offset is the left operand here.
736  %add = add i64 %zext.offset, %sbase.as.int
737  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
738  %load = load i8, i8 addrspace(1)* %dirty.gep
739  %zext = zext i8 %load to i32
740  %to.vgpr = bitcast i32 %zext to float
741  ret float %to.vgpr
742}
743
744; zext forced to LHS of addressing expression, with immediate offset
; Integer address computation (zext(%voffset) + %sbase) + 128, with the constant
; added last. The shared checks expect the saddr form with the constant in the
; instruction's offset field (offset:128) on both targets.
745define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
746; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
747; GCN:       ; %bb.0:
748; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
749; GCN-NEXT:    s_waitcnt vmcnt(0)
750; GCN-NEXT:    ; return to shader part epilog
751  %zext.offset = zext i32 %voffset to i64
752  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
753  %add = add i64 %zext.offset, %sbase.as.int
  ; Constant is added to the already-combined base + offset.
754  %add.immoffset = add i64 %add, 128
755  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
756  %load = load i8, i8 addrspace(1)* %dirty.gep
757  %zext = zext i8 %load to i32
758  %to.vgpr = bitcast i32 %zext to float
759  ret float %to.vgpr
760}
761
762; zext forced to LHS of addressing expression, with immediate offset in non-canonical position
; Integer address computation zext(%voffset) + (%sbase + 128): the constant is
; added to the base before the variable offset is added. The shared checks
; expect the same result as the imm_offset0 variant, a saddr-form load with
; offset:128 on both targets.
763define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
764; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
765; GCN:       ; %bb.0:
766; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
767; GCN-NEXT:    s_waitcnt vmcnt(0)
768; GCN-NEXT:    ; return to shader part epilog
769  %zext.offset = zext i32 %voffset to i64
770  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  ; Constant is folded into the base first, then the variable offset is added.
771  %add.immoffset = add i64 %sbase.as.int, 128
772  %add = add i64 %zext.offset, %add.immoffset
773  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
774  %load = load i8, i8 addrspace(1)* %dirty.gep
775  %zext = zext i8 %load to i32
776  %to.vgpr = bitcast i32 %zext to float
777  ret float %to.vgpr
778}
779
780; --------------------------------------------------------------------------------
781; Uniformity edge cases
782; --------------------------------------------------------------------------------
783
; An addrspace(1) pointer stored in an addrspace(3) global with an undef
; initializer. The tests below load their base pointer from it (read with
; ds_read_b64 in the checks).
784@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
785
786; Base pointer is uniform, but also in VGPRs
; The base pointer is loaded from @ptr.in.lds rather than passed as an inreg
; argument, and is then indexed by zext(%voffset). On both targets the checks
; expect the loaded pointer to be copied into s[0:1] with v_readfirstlane_b32
; and the load to use the saddr form. The gfx9 checks additionally expect an
; "s_nop 4" between the readfirstlane pair and the load.
787define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
788; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
789; GFX9:       ; %bb.0:
790; GFX9-NEXT:    v_mov_b32_e32 v1, 0
791; GFX9-NEXT:    ds_read_b64 v[1:2], v1
792; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
793; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
794; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
795; GFX9-NEXT:    s_nop 4
796; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1]
797; GFX9-NEXT:    s_waitcnt vmcnt(0)
798; GFX9-NEXT:    ; return to shader part epilog
799;
800; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
801; GFX10:       ; %bb.0:
802; GFX10-NEXT:    v_mov_b32_e32 v1, 0
803; GFX10-NEXT:    ds_read_b64 v[1:2], v1
804; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
805; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
806; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
807; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1]
808; GFX10-NEXT:    s_waitcnt vmcnt(0)
809; GFX10-NEXT:    ; return to shader part epilog
  ; Base pointer comes from memory, not from an inreg argument.
810  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
811  %zext.offset = zext i32 %voffset to i64
812  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
813  %load = load i8, i8 addrspace(1)* %gep0
814  %zext = zext i8 %load to i32
815  %to.vgpr = bitcast i32 %zext to float
816  ret float %to.vgpr
817}
818
819; Base pointer is uniform, but also in VGPRs, with imm offset
; Same as the test without the immediate: the base pointer is loaded from
; @ptr.in.lds and indexed by zext(%voffset), then a constant 42 is added. The
; checks expect the readfirstlane copy into s[0:1] and a saddr-form load with
; offset:42 on both targets (gfx9 again with an "s_nop 4" before the load).
820define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
821; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
822; GFX9:       ; %bb.0:
823; GFX9-NEXT:    v_mov_b32_e32 v1, 0
824; GFX9-NEXT:    ds_read_b64 v[1:2], v1
825; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
826; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
827; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
828; GFX9-NEXT:    s_nop 4
829; GFX9-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:42
830; GFX9-NEXT:    s_waitcnt vmcnt(0)
831; GFX9-NEXT:    ; return to shader part epilog
832;
833; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
834; GFX10:       ; %bb.0:
835; GFX10-NEXT:    v_mov_b32_e32 v1, 0
836; GFX10-NEXT:    ds_read_b64 v[1:2], v1
837; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
838; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
839; GFX10-NEXT:    v_readfirstlane_b32 s1, v2
840; GFX10-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:42
841; GFX10-NEXT:    s_waitcnt vmcnt(0)
842; GFX10-NEXT:    ; return to shader part epilog
  ; Base pointer comes from memory, not from an inreg argument.
843  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
844  %zext.offset = zext i32 %voffset to i64
845  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
846  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
847  %load = load i8, i8 addrspace(1)* %gep1
848  %zext = zext i8 %load to i32
849  %to.vgpr = bitcast i32 %zext to float
850  ret float %to.vgpr
851}
852
853; Both 64-bit base and 32-bit offset are scalar
; Both the 64-bit base and the 32-bit offset are inreg arguments. The shared
; checks expect the offset (s4) to be copied into v0 with v_mov_b32 so that the
; saddr form of global_load_ubyte can still be used with s[2:3] as the base.
854define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
855; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset:
856; GCN:       ; %bb.0:
857; GCN-NEXT:    v_mov_b32_e32 v0, s4
858; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
859; GCN-NEXT:    s_waitcnt vmcnt(0)
860; GCN-NEXT:    ; return to shader part epilog
861  %zext.offset = zext i32 %soffset to i64
862  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
863  %load = load i8, i8 addrspace(1)* %gep0
864  %zext = zext i8 %load to i32
865  %to.vgpr = bitcast i32 %zext to float
866  ret float %to.vgpr
867}
868
869; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
; Inreg base plus inreg 32-bit offset, followed by a constant -24. The shared
; checks expect the offset copied from s4 into v0 and a saddr-form load with the
; negative constant in the offset field (offset:-24) on both targets.
870define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
871; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
872; GCN:       ; %bb.0:
873; GCN-NEXT:    v_mov_b32_e32 v0, s4
874; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-24
875; GCN-NEXT:    s_waitcnt vmcnt(0)
876; GCN-NEXT:    ; return to shader part epilog
877  %zext.offset = zext i32 %soffset to i64
878  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
879  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -24
880  %load = load i8, i8 addrspace(1)* %gep1
881  %zext = zext i8 %load to i32
882  %to.vgpr = bitcast i32 %zext to float
883  ret float %to.vgpr
884}
885
886; Both components uniform, zext forced to LHS of addressing expression
; Inreg base and inreg offset combined with integer arithmetic, with the zext
; as the left operand of the add. The shared checks expect the offset copied
; from s4 into v0 and a saddr-form global_load_ubyte on both targets.
887define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
888; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
889; GCN:       ; %bb.0:
890; GCN-NEXT:    v_mov_b32_e32 v0, s4
891; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
892; GCN-NEXT:    s_waitcnt vmcnt(0)
893; GCN-NEXT:    ; return to shader part epilog
894  %zext.offset = zext i32 %soffset to i64
895  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
  ; Zero-extended offset is the left operand here.
896  %add = add i64 %zext.offset, %sbase.as.int
897  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
898  %load = load i8, i8 addrspace(1)* %dirty.gep
899  %zext = zext i8 %load to i32
900  %to.vgpr = bitcast i32 %zext to float
901  ret float %to.vgpr
902}
903
904; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
; Inreg base and inreg offset combined with integer arithmetic as
; (zext(%soffset) + %sbase) + 128. The shared checks expect the offset copied
; from s4 into v0 and a saddr-form load with offset:128 on both targets.
905define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
906; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
907; GCN:       ; %bb.0:
908; GCN-NEXT:    v_mov_b32_e32 v0, s4
909; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
910; GCN-NEXT:    s_waitcnt vmcnt(0)
911; GCN-NEXT:    ; return to shader part epilog
912  %zext.offset = zext i32 %soffset to i64
913  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
914  %add = add i64 %zext.offset, %sbase.as.int
  ; Constant is added to the already-combined base + offset.
915  %add.immoffset = add i64 %add, 128
916  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
917  %load = load i8, i8 addrspace(1)* %dirty.gep
918  %zext = zext i8 %load to i32
919  %to.vgpr = bitcast i32 %zext to float
920  ret float %to.vgpr
921}
922
923; divergent 64-bit base, 32-bit scalar offset.
; The roles are reversed relative to the saddr tests: the 64-bit base %vbase is
; a plain (non-inreg) argument and the 32-bit offset is inreg. The checks on
; both targets expect a 64-bit VALU add of s2 into v[0:1] followed by a load in
; the "off" form, i.e. the saddr form is not expected here.
924define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
925; GFX9-LABEL: global_load_i8_vgpr64_sgpr32:
926; GFX9:       ; %bb.0:
927; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
928; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
929; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
930; GFX9-NEXT:    s_waitcnt vmcnt(0)
931; GFX9-NEXT:    ; return to shader part epilog
932;
933; GFX10-LABEL: global_load_i8_vgpr64_sgpr32:
934; GFX10:       ; %bb.0:
935; GFX10-NEXT:    v_add_co_u32 v0, vcc, v0, s2
936; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
937; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
938; GFX10-NEXT:    s_waitcnt vmcnt(0)
939; GFX10-NEXT:    ; return to shader part epilog
940  %zext.offset = zext i32 %soffset to i64
941  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
942  %load = load i8, i8 addrspace(1)* %gep0
943  %zext = zext i8 %load to i32
944  %to.vgpr = bitcast i32 %zext to float
945  ret float %to.vgpr
946}
947
948; divergent 64-bit base, 32-bit scalar offset, with imm offset
; Non-inreg 64-bit base plus inreg 32-bit offset plus a constant 4095. Both
; targets are expected to add s2 into v[0:1] and load in the "off" form. The
; gfx9 checks put the whole constant in the offset field (offset:4095); the
; gfx10 checks split it into a VALU add of 0x800 plus offset:2047.
949define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
950; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
951; GFX9:       ; %bb.0:
952; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
953; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
954; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
955; GFX9-NEXT:    s_waitcnt vmcnt(0)
956; GFX9-NEXT:    ; return to shader part epilog
957;
958; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
959; GFX10:       ; %bb.0:
960; GFX10-NEXT:    v_add_co_u32 v0, vcc, v0, s2
961; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
962; GFX10-NEXT:    v_add_co_u32 v0, vcc, 0x800, v0
963; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
964; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
965; GFX10-NEXT:    s_waitcnt vmcnt(0)
966; GFX10-NEXT:    ; return to shader part epilog
967  %zext.offset = zext i32 %soffset to i64
968  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
969  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
970  %load = load i8, i8 addrspace(1)* %gep1
971  %zext = zext i8 %load to i32
972  %to.vgpr = bitcast i32 %zext to float
973  ret float %to.vgpr
974}
975
976; --------------------------------------------------------------------------------
977; Natural addressing shifts with restricted range
978; --------------------------------------------------------------------------------
979
980; Cannot push the shift into 32-bits, and cannot match.
; Indexes a float array: the 32-bit index is loaded from %voffset.ptr with no
; range metadata, zero-extended, and used in a float-typed GEP (scaled by 4).
; The checks on both targets expect a 64-bit shift (v_lshlrev_b64 by 2), a
; 64-bit add of s[2:3], and a final load in the "off" form rather than saddr.
981define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
982; GFX9-LABEL: global_load_saddr_f32_natural_addressing:
983; GFX9:       ; %bb.0:
984; GFX9-NEXT:    global_load_dword v0, v[0:1], off
985; GFX9-NEXT:    v_mov_b32_e32 v1, 0
986; GFX9-NEXT:    v_mov_b32_e32 v2, s3
987; GFX9-NEXT:    s_waitcnt vmcnt(0)
988; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
989; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
990; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
991; GFX9-NEXT:    global_load_dword v0, v[0:1], off
992; GFX9-NEXT:    s_waitcnt vmcnt(0)
993; GFX9-NEXT:    ; return to shader part epilog
994;
995; GFX10-LABEL: global_load_saddr_f32_natural_addressing:
996; GFX10:       ; %bb.0:
997; GFX10-NEXT:    global_load_dword v0, v[0:1], off
998; GFX10-NEXT:    v_mov_b32_e32 v1, 0
999; GFX10-NEXT:    s_waitcnt vmcnt(0)
1000; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
1001; GFX10-NEXT:    v_add_co_u32 v0, vcc, s2, v0
1002; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
1003; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1004; GFX10-NEXT:    s_waitcnt vmcnt(0)
1005; GFX10-NEXT:    ; return to shader part epilog
  ; Index has no !range metadata, so any i32 value is possible.
1006  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
1007  %zext.offset = zext i32 %voffset to i64
1008  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
1009  %load = load float, float addrspace(1)* %gep
1010  ret float %load
1011}
1012
1013; Byte-addressed (i8 GEP) offset loaded from memory, with an immediate offset; no shift is involved, so the saddr form is matched.
; The 32-bit offset is loaded from %voffset.ptr, zero-extended, and applied to
; %sbase with i8-typed GEPs, followed by a constant 128; the result is bitcast
; to a float pointer and loaded. The shared checks expect a saddr-form
; global_load_dword with offset:128 on both targets.
; NOTE(review): despite "natural_addressing" in the name, the GEPs here are
; i8-typed, so the index is not scaled -- confirm this is intended.
1014define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
1015; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
1016; GCN:       ; %bb.0:
1017; GCN-NEXT:    global_load_dword v0, v[0:1], off
1018; GCN-NEXT:    s_waitcnt vmcnt(0)
1019; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:128
1020; GCN-NEXT:    s_waitcnt vmcnt(0)
1021; GCN-NEXT:    ; return to shader part epilog
1022  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
1023  %zext.offset = zext i32 %voffset to i64
1024  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1025  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 128
1026  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
1027  %load = load float, float addrspace(1)* %gep1.cast
1028  ret float %load
1029}
1030
1031; Range is sufficiently restricted to push the shift into 32-bits.
; Float-array indexing where the loaded 32-bit index carries !range !0. The
; shared checks expect the scaling to be done as a 32-bit shift
; (v_lshlrev_b32 by 2) and the load to use the saddr form on both targets.
; NOTE(review): !0 is defined outside this part of the file; presumably it
; bounds the index so that index * 4 still fits in 32 bits -- confirm.
1032define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
1033; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
1034; GCN:       ; %bb.0:
1035; GCN-NEXT:    global_load_dword v0, v[0:1], off
1036; GCN-NEXT:    s_waitcnt vmcnt(0)
1037; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1038; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
1039; GCN-NEXT:    s_waitcnt vmcnt(0)
1040; GCN-NEXT:    ; return to shader part epilog
1041  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
1042  %zext.offset = zext i32 %voffset to i64
1043  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
1044  %load = load float, float addrspace(1)* %gep
1045  ret float %load
1046}
1047
1048; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset
; Float-array indexing with a !range !0 index, followed by a constant GEP of
; 100 floats. The shared checks expect a 32-bit shift of the index and a
; saddr-form load with the constant as a byte offset (offset:400 = 100 * 4).
; NOTE(review): !0 is defined outside this part of the file -- confirm its
; bounds when changing this test.
1049define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
1050; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
1051; GCN:       ; %bb.0:
1052; GCN-NEXT:    global_load_dword v0, v[0:1], off
1053; GCN-NEXT:    s_waitcnt vmcnt(0)
1054; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1055; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:400
1056; GCN-NEXT:    s_waitcnt vmcnt(0)
1057; GCN-NEXT:    ; return to shader part epilog
1058  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
1059  %zext.offset = zext i32 %voffset to i64
1060  %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
1061  %gep1 = getelementptr inbounds float, float addrspace(1)* %gep0, i64 100
1062  %load = load float, float addrspace(1)* %gep1
1063  ret float %load
1064}
1065
1066; Range is 1 beyond the limit where we can move the shift into 32-bits.
; Same IR shape as global_load_f32_saddr_zext_vgpr_range, but the index load
; uses !range !1 instead of !0. The checks on both targets expect the same
; output as the test with no range metadata: a 64-bit shift, a 64-bit add of
; s[2:3], and a load in the "off" form.
; NOTE(review): !1 is defined outside this part of the file -- confirm its
; bounds when changing this test.
1067define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
1068; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
1069; GFX9:       ; %bb.0:
1070; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1071; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1072; GFX9-NEXT:    v_mov_b32_e32 v2, s3
1073; GFX9-NEXT:    s_waitcnt vmcnt(0)
1074; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
1075; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
1076; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
1077; GFX9-NEXT:    global_load_dword v0, v[0:1], off
1078; GFX9-NEXT:    s_waitcnt vmcnt(0)
1079; GFX9-NEXT:    ; return to shader part epilog
1080;
1081; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
1082; GFX10:       ; %bb.0:
1083; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1084; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1085; GFX10-NEXT:    s_waitcnt vmcnt(0)
1086; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
1087; GFX10-NEXT:    v_add_co_u32 v0, vcc, s2, v0
1088; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
1089; GFX10-NEXT:    global_load_dword v0, v[0:1], off
1090; GFX10-NEXT:    s_waitcnt vmcnt(0)
1091; GFX10-NEXT:    ; return to shader part epilog
1092  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1
1093  %zext.offset = zext i32 %voffset to i64
1094  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
1095  %load = load float, float addrspace(1)* %gep
1096  ret float %load
1097}
1098
1099; --------------------------------------------------------------------------------
1100; Stress various type loads
1101; --------------------------------------------------------------------------------
1102
; Loads an i16 from %sbase + zext(%voffset) (byte offset) and returns its bits
; as half. The shared checks expect a saddr-form global_load_ushort on both
; targets.
1103define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1104; GCN-LABEL: global_load_saddr_i16:
1105; GCN:       ; %bb.0:
1106; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
1107; GCN-NEXT:    s_waitcnt vmcnt(0)
1108; GCN-NEXT:    ; return to shader part epilog
1109  %zext.offset = zext i32 %voffset to i64
1110  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1111  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
1112  %load = load i16, i16 addrspace(1)* %gep0.cast
1113  %cast.load = bitcast i16 %load to half
1114  ret half %cast.load
1115}
1116
; Loads an i16 from %sbase + zext(%voffset) - 128 (byte offsets) and returns
; its bits as half. The shared checks expect a saddr-form global_load_ushort
; with the constant in the offset field (offset:-128) on both targets.
1117define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1118; GCN-LABEL: global_load_saddr_i16_immneg128:
1119; GCN:       ; %bb.0:
1120; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
1121; GCN-NEXT:    s_waitcnt vmcnt(0)
1122; GCN-NEXT:    ; return to shader part epilog
1123  %zext.offset = zext i32 %voffset to i64
1124  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1125  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1126  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
1127  %load = load i16, i16 addrspace(1)* %gep1.cast
1128  %cast.load = bitcast i16 %load to half
1129  ret half %cast.load
1130}
1131
; Loads a half directly from %sbase + zext(%voffset) (byte offset). The shared
; checks expect a saddr-form global_load_ushort on both targets, the same
; instruction as the i16 variant.
1132define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1133; GCN-LABEL: global_load_saddr_f16:
1134; GCN:       ; %bb.0:
1135; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
1136; GCN-NEXT:    s_waitcnt vmcnt(0)
1137; GCN-NEXT:    ; return to shader part epilog
1138  %zext.offset = zext i32 %voffset to i64
1139  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1140  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)*
1141  %load = load half, half addrspace(1)* %gep0.cast
1142  ret half %load
1143}
1144
; Loads a half from %sbase + zext(%voffset) - 128 (byte offsets). The shared
; checks expect a saddr-form global_load_ushort with offset:-128 on both
; targets.
1145define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1146; GCN-LABEL: global_load_saddr_f16_immneg128:
1147; GCN:       ; %bb.0:
1148; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
1149; GCN-NEXT:    s_waitcnt vmcnt(0)
1150; GCN-NEXT:    ; return to shader part epilog
1151  %zext.offset = zext i32 %voffset to i64
1152  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1153  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1154  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)*
1155  %load = load half, half addrspace(1)* %gep1.cast
1156  ret half %load
1157}
1158
; Loads an i32 from %sbase + zext(%voffset) (byte offset) and returns its bits
; as float. The shared checks expect a saddr-form global_load_dword on both
; targets.
1159define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1160; GCN-LABEL: global_load_saddr_i32:
1161; GCN:       ; %bb.0:
1162; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
1163; GCN-NEXT:    s_waitcnt vmcnt(0)
1164; GCN-NEXT:    ; return to shader part epilog
1165  %zext.offset = zext i32 %voffset to i64
1166  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1167  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1168  %load = load i32, i32 addrspace(1)* %gep0.cast
1169  %cast.load = bitcast i32 %load to float
1170  ret float %cast.load
1171}
1172
; Loads an i32 from %sbase + zext(%voffset) - 128 (byte offsets) and returns
; its bits as float. The shared checks expect a saddr-form global_load_dword
; with offset:-128 on both targets.
1173define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1174; GCN-LABEL: global_load_saddr_i32_immneg128:
1175; GCN:       ; %bb.0:
1176; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
1177; GCN-NEXT:    s_waitcnt vmcnt(0)
1178; GCN-NEXT:    ; return to shader part epilog
1179  %zext.offset = zext i32 %voffset to i64
1180  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1181  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1182  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1183  %load = load i32, i32 addrspace(1)* %gep1.cast
1184  %cast.load = bitcast i32 %load to float
1185  ret float %cast.load
1186}
1187
; Loads a float directly from %sbase + zext(%voffset) (byte offset). The shared
; checks expect a saddr-form global_load_dword on both targets.
1188define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1189; GCN-LABEL: global_load_saddr_f32:
1190; GCN:       ; %bb.0:
1191; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
1192; GCN-NEXT:    s_waitcnt vmcnt(0)
1193; GCN-NEXT:    ; return to shader part epilog
1194  %zext.offset = zext i32 %voffset to i64
1195  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1196  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
1197  %load = load float, float addrspace(1)* %gep0.cast
1198  ret float %load
1199}
1200
; Loads a float from %sbase + zext(%voffset) - 128 (byte offsets). The shared
; checks expect a saddr-form global_load_dword with offset:-128 on both
; targets.
1201define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1202; GCN-LABEL: global_load_saddr_f32_immneg128:
1203; GCN:       ; %bb.0:
1204; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
1205; GCN-NEXT:    s_waitcnt vmcnt(0)
1206; GCN-NEXT:    ; return to shader part epilog
1207  %zext.offset = zext i32 %voffset to i64
1208  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1209  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1210  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
1211  %load = load float, float addrspace(1)* %gep1.cast
1212  ret float %load
1213}
1214
; Loads a <2 x i16> from %sbase + zext(%voffset) (byte offset) and returns its
; bits as <2 x half>. The shared checks expect a single saddr-form
; global_load_dword on both targets.
1215define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1216; GCN-LABEL: global_load_saddr_v2i16:
1217; GCN:       ; %bb.0:
1218; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
1219; GCN-NEXT:    s_waitcnt vmcnt(0)
1220; GCN-NEXT:    ; return to shader part epilog
1221  %zext.offset = zext i32 %voffset to i64
1222  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1223  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)*
1224  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep0.cast
1225  %cast.load = bitcast <2 x i16> %load to <2 x half>
1226  ret <2 x half> %cast.load
1227}
1228
; Loads a <2 x i16> from %sbase + zext(%voffset) - 128 (byte offsets) and
; returns its bits as <2 x half>. The shared checks expect a saddr-form
; global_load_dword with offset:-128 on both targets.
1229define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1230; GCN-LABEL: global_load_saddr_v2i16_immneg128:
1231; GCN:       ; %bb.0:
1232; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
1233; GCN-NEXT:    s_waitcnt vmcnt(0)
1234; GCN-NEXT:    ; return to shader part epilog
1235  %zext.offset = zext i32 %voffset to i64
1236  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1237  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1238  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i16> addrspace(1)*
1239  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep1.cast
1240  %cast.load = bitcast <2 x i16> %load to <2 x half>
1241  ret <2 x half> %cast.load
1242}
1243
; Loads a <2 x half> directly from %sbase + zext(%voffset) (byte offset). The
; shared checks expect a single saddr-form global_load_dword on both targets.
1244define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1245; GCN-LABEL: global_load_saddr_v2f16:
1246; GCN:       ; %bb.0:
1247; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
1248; GCN-NEXT:    s_waitcnt vmcnt(0)
1249; GCN-NEXT:    ; return to shader part epilog
1250  %zext.offset = zext i32 %voffset to i64
1251  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1252  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
1253  %load = load <2 x half>, <2 x half> addrspace(1)* %gep0.cast
1254  ret <2 x half> %load
1255}
1256
; Loads a <2 x half> from %sbase + zext(%voffset) - 128 (byte offsets). The
; shared checks expect a saddr-form global_load_dword with offset:-128 on both
; targets.
1257define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1258; GCN-LABEL: global_load_saddr_v2f16_immneg128:
1259; GCN:       ; %bb.0:
1260; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
1261; GCN-NEXT:    s_waitcnt vmcnt(0)
1262; GCN-NEXT:    ; return to shader part epilog
1263  %zext.offset = zext i32 %voffset to i64
1264  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1265  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1266  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
1267  %load = load <2 x half>, <2 x half> addrspace(1)* %gep1.cast
1268  ret <2 x half> %load
1269}
1270
; Loads an addrspace(3) pointer from %sbase + zext(%voffset) (byte offset),
; converts it to i32 with ptrtoint, and returns those bits as <2 x half>. The
; shared checks expect a single saddr-form global_load_dword on both targets.
1271define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1272; GCN-LABEL: global_load_saddr_p3:
1273; GCN:       ; %bb.0:
1274; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
1275; GCN-NEXT:    s_waitcnt vmcnt(0)
1276; GCN-NEXT:    ; return to shader part epilog
1277  %zext.offset = zext i32 %voffset to i64
1278  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1279  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)*
1280  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep0.cast
1281  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
1282  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
1283  ret <2 x half> %cast.load1
1284}
1285
; Loads an addrspace(3) pointer from %sbase + zext(%voffset) - 128 (byte
; offsets), converts it to i32 with ptrtoint, and returns those bits as
; <2 x half>. The shared checks expect a saddr-form global_load_dword with
; offset:-128 on both targets.
1286define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1287; GCN-LABEL: global_load_saddr_p3_immneg128:
1288; GCN:       ; %bb.0:
1289; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
1290; GCN-NEXT:    s_waitcnt vmcnt(0)
1291; GCN-NEXT:    ; return to shader part epilog
1292  %zext.offset = zext i32 %voffset to i64
1293  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1294  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1295  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)*
1296  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep1.cast
1297  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
1298  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
1299  ret <2 x half> %cast.load1
1300}
1301
; Loads a double from %sbase + zext(%voffset) (byte offset) and returns its
; bits as <2 x float>. The shared checks expect a saddr-form
; global_load_dwordx2 into v[0:1] on both targets.
1302define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1303; GCN-LABEL: global_load_saddr_f64:
1304; GCN:       ; %bb.0:
1305; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
1306; GCN-NEXT:    s_waitcnt vmcnt(0)
1307; GCN-NEXT:    ; return to shader part epilog
1308  %zext.offset = zext i32 %voffset to i64
1309  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1310  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)*
1311  %load = load double, double addrspace(1)* %gep0.cast
1312  %cast.load = bitcast double %load to <2 x float>
1313  ret <2 x float> %cast.load
1314}
1315
; Loads a double from %sbase + zext(%voffset) - 128 (byte offsets) and returns
; its bits as <2 x float>. The shared checks expect a saddr-form
; global_load_dwordx2 with offset:-128 on both targets.
1316define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1317; GCN-LABEL: global_load_saddr_f64_immneg128:
1318; GCN:       ; %bb.0:
1319; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1320; GCN-NEXT:    s_waitcnt vmcnt(0)
1321; GCN-NEXT:    ; return to shader part epilog
1322  %zext.offset = zext i32 %voffset to i64
1323  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1324  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1325  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)*
1326  %load = load double, double addrspace(1)* %gep1.cast
1327  %cast.load = bitcast double %load to <2 x float>
1328  ret <2 x float> %cast.load
1329}
1330
; Loads an i64 from %sbase + zext(%voffset) (byte offset) and returns its bits
; as <2 x float>. The shared checks expect a saddr-form global_load_dwordx2
; into v[0:1] on both targets.
1331define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1332; GCN-LABEL: global_load_saddr_i64:
1333; GCN:       ; %bb.0:
1334; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
1335; GCN-NEXT:    s_waitcnt vmcnt(0)
1336; GCN-NEXT:    ; return to shader part epilog
1337  %zext.offset = zext i32 %voffset to i64
1338  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1339  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1340  %load = load i64, i64 addrspace(1)* %gep0.cast
1341  %cast.load = bitcast i64 %load to <2 x float>
1342  ret <2 x float> %cast.load
1343}
1344
; Loads an i64 from %sbase + zext(%voffset) - 128 (byte offsets) and returns
; its bits as <2 x float>. The shared checks expect a saddr-form
; global_load_dwordx2 with offset:-128 on both targets.
1345define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1346; GCN-LABEL: global_load_saddr_i64_immneg128:
1347; GCN:       ; %bb.0:
1348; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1349; GCN-NEXT:    s_waitcnt vmcnt(0)
1350; GCN-NEXT:    ; return to shader part epilog
1351  %zext.offset = zext i32 %voffset to i64
1352  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1353  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1354  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1355  %load = load i64, i64 addrspace(1)* %gep1.cast
1356  %cast.load = bitcast i64 %load to <2 x float>
1357  ret <2 x float> %cast.load
1358}
1359
; Loads a <2 x float> directly from %sbase + zext(%voffset) (byte offset). The
; shared checks expect a saddr-form global_load_dwordx2 into v[0:1] on both
; targets.
1360define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1361; GCN-LABEL: global_load_saddr_v2f32:
1362; GCN:       ; %bb.0:
1363; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
1364; GCN-NEXT:    s_waitcnt vmcnt(0)
1365; GCN-NEXT:    ; return to shader part epilog
1366  %zext.offset = zext i32 %voffset to i64
1367  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1368  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)*
1369  %load = load <2 x float>, <2 x float> addrspace(1)* %gep0.cast
1370  ret <2 x float> %load
1371}
1372
; Loads a <2 x float> from %sbase + zext(%voffset) - 128 (byte offsets). The
; shared checks expect a saddr-form global_load_dwordx2 with offset:-128 on
; both targets.
1373define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1374; GCN-LABEL: global_load_saddr_v2f32_immneg128:
1375; GCN:       ; %bb.0:
1376; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1377; GCN-NEXT:    s_waitcnt vmcnt(0)
1378; GCN-NEXT:    ; return to shader part epilog
1379  %zext.offset = zext i32 %voffset to i64
1380  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1381  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1382  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)*
1383  %load = load <2 x float>, <2 x float> addrspace(1)* %gep1.cast
1384  ret <2 x float> %load
1385}
1386
; <2 x i32> load from %sbase + zext(%voffset), bitcast to <2 x float> for the
; return; the checks expect one global_load_dwordx2 with the s[2:3] base.
1387define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1388; GCN-LABEL: global_load_saddr_v2i32:
1389; GCN:       ; %bb.0:
1390; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
1391; GCN-NEXT:    s_waitcnt vmcnt(0)
1392; GCN-NEXT:    ; return to shader part epilog
1393  %zext.offset = zext i32 %voffset to i64
1394  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1395  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)*
1396  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep0.cast
1397  %cast.load = bitcast <2 x i32> %load to <2 x float>
1398  ret <2 x float> %cast.load
1399}
1400
; <2 x i32> load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx2.
1401define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1402; GCN-LABEL: global_load_saddr_v2i32_immneg128:
1403; GCN:       ; %bb.0:
1404; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1405; GCN-NEXT:    s_waitcnt vmcnt(0)
1406; GCN-NEXT:    ; return to shader part epilog
1407  %zext.offset = zext i32 %voffset to i64
1408  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1409  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1410  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)*
1411  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep1.cast
1412  %cast.load = bitcast <2 x i32> %load to <2 x float>
1413  ret <2 x float> %cast.load
1414}
1415
; <4 x i16> load from %sbase + zext(%voffset), bitcast to <2 x float> for the
; return; the checks expect one global_load_dwordx2 with the s[2:3] base.
1416define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1417; GCN-LABEL: global_load_saddr_v4i16:
1418; GCN:       ; %bb.0:
1419; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
1420; GCN-NEXT:    s_waitcnt vmcnt(0)
1421; GCN-NEXT:    ; return to shader part epilog
1422  %zext.offset = zext i32 %voffset to i64
1423  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1424  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)*
1425  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep0.cast
1426  %cast.load = bitcast <4 x i16> %load to <2 x float>
1427  ret <2 x float> %cast.load
1428}
1429
; <4 x i16> load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx2.
1430define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1431; GCN-LABEL: global_load_saddr_v4i16_immneg128:
1432; GCN:       ; %bb.0:
1433; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1434; GCN-NEXT:    s_waitcnt vmcnt(0)
1435; GCN-NEXT:    ; return to shader part epilog
1436  %zext.offset = zext i32 %voffset to i64
1437  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1438  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1439  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)*
1440  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep1.cast
1441  %cast.load = bitcast <4 x i16> %load to <2 x float>
1442  ret <2 x float> %cast.load
1443}
1444
; <4 x half> load from %sbase + zext(%voffset), bitcast to <2 x float> for the
; return; the checks expect one global_load_dwordx2 with the s[2:3] base.
1445define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1446; GCN-LABEL: global_load_saddr_v4f16:
1447; GCN:       ; %bb.0:
1448; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
1449; GCN-NEXT:    s_waitcnt vmcnt(0)
1450; GCN-NEXT:    ; return to shader part epilog
1451  %zext.offset = zext i32 %voffset to i64
1452  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1453  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)*
1454  %load = load <4 x half>, <4 x half> addrspace(1)* %gep0.cast
1455  %cast.load = bitcast <4 x half> %load to <2 x float>
1456  ret <2 x float> %cast.load
1457}
1458
; <4 x half> load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx2.
1459define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1460; GCN-LABEL: global_load_saddr_v4f16_immneg128:
1461; GCN:       ; %bb.0:
1462; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1463; GCN-NEXT:    s_waitcnt vmcnt(0)
1464; GCN-NEXT:    ; return to shader part epilog
1465  %zext.offset = zext i32 %voffset to i64
1466  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1467  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1468  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)*
1469  %load = load <4 x half>, <4 x half> addrspace(1)* %gep1.cast
1470  %cast.load = bitcast <4 x half> %load to <2 x float>
1471  ret <2 x float> %cast.load
1472}
1473
; Loads an addrspace(1) pointer value from %sbase + zext(%voffset) and returns
; it via ptrtoint to i64 and a bitcast; the checks expect one global_load_dwordx2.
1474define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1475; GCN-LABEL: global_load_saddr_p1:
1476; GCN:       ; %bb.0:
1477; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
1478; GCN-NEXT:    s_waitcnt vmcnt(0)
1479; GCN-NEXT:    ; return to shader part epilog
1480  %zext.offset = zext i32 %voffset to i64
1481  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1482  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)*
1483  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep0.cast
1484  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
1485  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
1486  ret <2 x float> %cast.load1
1487}
1488
; Loads an addrspace(1) pointer value from %sbase + zext(%voffset) - 128; the
; checks expect the constant to appear as offset:-128 on the global_load_dwordx2.
1489define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1490; GCN-LABEL: global_load_saddr_p1_immneg128:
1491; GCN:       ; %bb.0:
1492; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1493; GCN-NEXT:    s_waitcnt vmcnt(0)
1494; GCN-NEXT:    ; return to shader part epilog
1495  %zext.offset = zext i32 %voffset to i64
1496  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1497  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1498  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)*
1499  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep1.cast
1500  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
1501  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
1502  ret <2 x float> %cast.load1
1503}
1504
; <3 x float> load from %sbase + zext(%voffset); the checks expect one
; global_load_dwordx3 using v0 as the offset and s[2:3] as the base.
1505define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1506; GCN-LABEL: global_load_saddr_v3f32:
1507; GCN:       ; %bb.0:
1508; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
1509; GCN-NEXT:    s_waitcnt vmcnt(0)
1510; GCN-NEXT:    ; return to shader part epilog
1511  %zext.offset = zext i32 %voffset to i64
1512  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1513  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)*
1514  %load = load <3 x float>, <3 x float> addrspace(1)* %gep0.cast
1515  ret <3 x float> %load
1516}
1517
; <3 x float> load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx3.
1518define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1519; GCN-LABEL: global_load_saddr_v3f32_immneg128:
1520; GCN:       ; %bb.0:
1521; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
1522; GCN-NEXT:    s_waitcnt vmcnt(0)
1523; GCN-NEXT:    ; return to shader part epilog
1524  %zext.offset = zext i32 %voffset to i64
1525  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1526  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1527  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)*
1528  %load = load <3 x float>, <3 x float> addrspace(1)* %gep1.cast
1529  ret <3 x float> %load
1530}
1531
; <3 x i32> load from %sbase + zext(%voffset), bitcast to <3 x float> for the
; return; the checks expect one global_load_dwordx3 with the s[2:3] base.
1532define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1533; GCN-LABEL: global_load_saddr_v3i32:
1534; GCN:       ; %bb.0:
1535; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
1536; GCN-NEXT:    s_waitcnt vmcnt(0)
1537; GCN-NEXT:    ; return to shader part epilog
1538  %zext.offset = zext i32 %voffset to i64
1539  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1540  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)*
1541  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep0.cast
1542  %cast.load = bitcast <3 x i32> %load to <3 x float>
1543  ret <3 x float> %cast.load
1544}
1545
; <3 x i32> load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx3.
1546define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1547; GCN-LABEL: global_load_saddr_v3i32_immneg128:
1548; GCN:       ; %bb.0:
1549; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
1550; GCN-NEXT:    s_waitcnt vmcnt(0)
1551; GCN-NEXT:    ; return to shader part epilog
1552  %zext.offset = zext i32 %voffset to i64
1553  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1554  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1555  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)*
1556  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast
1557  %cast.load = bitcast <3 x i32> %load to <3 x float>
1558  ret <3 x float> %cast.load
1559}
1560
; <6 x half> load from %sbase + zext(%voffset), returned directly; the checks
; expect one global_load_dwordx3 with the s[2:3] base.
1561define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1562; GCN-LABEL: global_load_saddr_v6f16:
1563; GCN:       ; %bb.0:
1564; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
1565; GCN-NEXT:    s_waitcnt vmcnt(0)
1566; GCN-NEXT:    ; return to shader part epilog
1567  %zext.offset = zext i32 %voffset to i64
1568  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1569  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)*
1570  %load = load <6 x half>, <6 x half> addrspace(1)* %gep0.cast
1571  ret <6 x half> %load
1572}
1573
; <6 x half> load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx3.
1574define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1575; GCN-LABEL: global_load_saddr_v6f16_immneg128:
1576; GCN:       ; %bb.0:
1577; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
1578; GCN-NEXT:    s_waitcnt vmcnt(0)
1579; GCN-NEXT:    ; return to shader part epilog
1580  %zext.offset = zext i32 %voffset to i64
1581  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1582  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1583  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)*
1584  %load = load <6 x half>, <6 x half> addrspace(1)* %gep1.cast
1585  ret <6 x half> %load
1586}
1587
; <4 x float> load from %sbase + zext(%voffset); the checks expect one
; global_load_dwordx4 using v0 as the offset and s[2:3] as the base.
1588define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1589; GCN-LABEL: global_load_saddr_v4f32:
1590; GCN:       ; %bb.0:
1591; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
1592; GCN-NEXT:    s_waitcnt vmcnt(0)
1593; GCN-NEXT:    ; return to shader part epilog
1594  %zext.offset = zext i32 %voffset to i64
1595  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1596  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)*
1597  %load = load <4 x float>, <4 x float> addrspace(1)* %gep0.cast
1598  ret <4 x float> %load
1599}
1600
; <4 x float> load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx4.
1601define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1602; GCN-LABEL: global_load_saddr_v4f32_immneg128:
1603; GCN:       ; %bb.0:
1604; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1605; GCN-NEXT:    s_waitcnt vmcnt(0)
1606; GCN-NEXT:    ; return to shader part epilog
1607  %zext.offset = zext i32 %voffset to i64
1608  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1609  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1610  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)*
1611  %load = load <4 x float>, <4 x float> addrspace(1)* %gep1.cast
1612  ret <4 x float> %load
1613}
1614
; <4 x i32> load from %sbase + zext(%voffset), bitcast to <4 x float> for the
; return; the checks expect one global_load_dwordx4 with the s[2:3] base.
1615define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1616; GCN-LABEL: global_load_saddr_v4i32:
1617; GCN:       ; %bb.0:
1618; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
1619; GCN-NEXT:    s_waitcnt vmcnt(0)
1620; GCN-NEXT:    ; return to shader part epilog
1621  %zext.offset = zext i32 %voffset to i64
1622  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1623  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)*
1624  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep0.cast
1625  %cast.load = bitcast <4 x i32> %load to <4 x float>
1626  ret <4 x float> %cast.load
1627}
1628
; <4 x i32> load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx4.
1629define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1630; GCN-LABEL: global_load_saddr_v4i32_immneg128:
1631; GCN:       ; %bb.0:
1632; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1633; GCN-NEXT:    s_waitcnt vmcnt(0)
1634; GCN-NEXT:    ; return to shader part epilog
1635  %zext.offset = zext i32 %voffset to i64
1636  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1637  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1638  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)*
1639  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1.cast
1640  %cast.load = bitcast <4 x i32> %load to <4 x float>
1641  ret <4 x float> %cast.load
1642}
1643
; <2 x i64> load from %sbase + zext(%voffset), bitcast to <4 x float> for the
; return; the checks expect one global_load_dwordx4 with the s[2:3] base.
1644define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1645; GCN-LABEL: global_load_saddr_v2i64:
1646; GCN:       ; %bb.0:
1647; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
1648; GCN-NEXT:    s_waitcnt vmcnt(0)
1649; GCN-NEXT:    ; return to shader part epilog
1650  %zext.offset = zext i32 %voffset to i64
1651  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1652  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)*
1653  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep0.cast
1654  %cast.load = bitcast <2 x i64> %load to <4 x float>
1655  ret <4 x float> %cast.load
1656}
1657
; <2 x i64> load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx4.
1658define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1659; GCN-LABEL: global_load_saddr_v2i64_immneg128:
1660; GCN:       ; %bb.0:
1661; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1662; GCN-NEXT:    s_waitcnt vmcnt(0)
1663; GCN-NEXT:    ; return to shader part epilog
1664  %zext.offset = zext i32 %voffset to i64
1665  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1666  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1667  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)*
1668  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep1.cast
1669  %cast.load = bitcast <2 x i64> %load to <4 x float>
1670  ret <4 x float> %cast.load
1671}
1672
; i128 load from %sbase + zext(%voffset), bitcast to <4 x float> for the
; return; the checks expect one global_load_dwordx4 with the s[2:3] base.
1673define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1674; GCN-LABEL: global_load_saddr_i128:
1675; GCN:       ; %bb.0:
1676; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
1677; GCN-NEXT:    s_waitcnt vmcnt(0)
1678; GCN-NEXT:    ; return to shader part epilog
1679  %zext.offset = zext i32 %voffset to i64
1680  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1681  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)*
1682  %load = load i128, i128 addrspace(1)* %gep0.cast
1683  %cast.load = bitcast i128 %load to <4 x float>
1684  ret <4 x float> %cast.load
1685}
1686
; i128 load from %sbase + zext(%voffset) - 128; the checks expect the
; constant to appear as offset:-128 on the global_load_dwordx4.
1687define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1688; GCN-LABEL: global_load_saddr_i128_immneg128:
1689; GCN:       ; %bb.0:
1690; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1691; GCN-NEXT:    s_waitcnt vmcnt(0)
1692; GCN-NEXT:    ; return to shader part epilog
1693  %zext.offset = zext i32 %voffset to i64
1694  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1695  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1696  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i128 addrspace(1)*
1697  %load = load i128, i128 addrspace(1)* %gep1.cast
1698  %cast.load = bitcast i128 %load to <4 x float>
1699  ret <4 x float> %cast.load
1700}
1701
; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset) and
; returns it via ptrtoint to <2 x i64>; the checks expect one global_load_dwordx4.
1702define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1703; GCN-LABEL: global_load_saddr_v2p1:
1704; GCN:       ; %bb.0:
1705; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
1706; GCN-NEXT:    s_waitcnt vmcnt(0)
1707; GCN-NEXT:    ; return to shader part epilog
1708  %zext.offset = zext i32 %voffset to i64
1709  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1710  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)*
1711  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast
1712  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
1713  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
1714  ret <4 x float> %cast.load1
1715}
1716
; Loads a vector of two addrspace(1) pointers from %sbase + zext(%voffset) - 128;
; the checks expect the constant as offset:-128 on the global_load_dwordx4.
1717define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1718; GCN-LABEL: global_load_saddr_v2p1_immneg128:
1719; GCN:       ; %bb.0:
1720; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1721; GCN-NEXT:    s_waitcnt vmcnt(0)
1722; GCN-NEXT:    ; return to shader part epilog
1723  %zext.offset = zext i32 %voffset to i64
1724  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1725  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1726  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)*
1727  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast
1728  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
1729  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
1730  ret <4 x float> %cast.load1
1731}
1732
; Loads a vector of four addrspace(3) pointers from %sbase + zext(%voffset) and
; returns it via ptrtoint to <4 x i32>; the checks expect one global_load_dwordx4.
1733define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1734; GCN-LABEL: global_load_saddr_v4p3:
1735; GCN:       ; %bb.0:
1736; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
1737; GCN-NEXT:    s_waitcnt vmcnt(0)
1738; GCN-NEXT:    ; return to shader part epilog
1739  %zext.offset = zext i32 %voffset to i64
1740  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1741  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)*
1742  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast
1743  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
1744  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
1745  ret <4 x float> %cast.load1
1746}
1747
; Loads a vector of four addrspace(3) pointers from %sbase + zext(%voffset) - 128;
; the checks expect the constant as offset:-128 on the global_load_dwordx4.
1748define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1749; GCN-LABEL: global_load_saddr_v4p3_immneg128:
1750; GCN:       ; %bb.0:
1751; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
1752; GCN-NEXT:    s_waitcnt vmcnt(0)
1753; GCN-NEXT:    ; return to shader part epilog
1754  %zext.offset = zext i32 %voffset to i64
1755  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1756  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1757  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)*
1758  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast
1759  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
1760  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
1761  ret <4 x float> %cast.load1
1762}
1763
1764; --------------------------------------------------------------------------------
1765; Extending loads
1766; --------------------------------------------------------------------------------
1767
; i8 load from %sbase + zext(%voffset) followed by sext to i32; the checks
; expect the extension to be covered by a single global_load_sbyte.
1768define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1769; GCN-LABEL: global_sextload_saddr_i8:
1770; GCN:       ; %bb.0:
1771; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3]
1772; GCN-NEXT:    s_waitcnt vmcnt(0)
1773; GCN-NEXT:    ; return to shader part epilog
1774  %zext.offset = zext i32 %voffset to i64
1775  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1776  %load = load i8, i8 addrspace(1)* %gep0
1777  %sextload = sext i8 %load to i32
1778  %cast.load = bitcast i32 %sextload to float
1779  ret float %cast.load
1780}
1781
; Sign-extending i8 load from %sbase + zext(%voffset) - 128; the checks expect
; a single global_load_sbyte carrying offset:-128.
1782define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1783; GCN-LABEL: global_sextload_saddr_i8_immneg128:
1784; GCN:       ; %bb.0:
1785; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3] offset:-128
1786; GCN-NEXT:    s_waitcnt vmcnt(0)
1787; GCN-NEXT:    ; return to shader part epilog
1788  %zext.offset = zext i32 %voffset to i64
1789  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1790  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1791  %load = load i8, i8 addrspace(1)* %gep1
1792  %sextload = sext i8 %load to i32
1793  %cast.load = bitcast i32 %sextload to float
1794  ret float %cast.load
1795}
1796
; i16 load from %sbase + zext(%voffset) followed by sext to i32; the checks
; expect the extension to be covered by a single global_load_sshort.
1797define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1798; GCN-LABEL: global_sextload_saddr_i16:
1799; GCN:       ; %bb.0:
1800; GCN-NEXT:    global_load_sshort v0, v0, s[2:3]
1801; GCN-NEXT:    s_waitcnt vmcnt(0)
1802; GCN-NEXT:    ; return to shader part epilog
1803  %zext.offset = zext i32 %voffset to i64
1804  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1805  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
1806  %load = load i16, i16 addrspace(1)* %gep0.cast
1807  %sextload = sext i16 %load to i32
1808  %cast.load = bitcast i32 %sextload to float
1809  ret float %cast.load
1810}
1811
; Sign-extending i16 load from %sbase + zext(%voffset) - 128; the checks expect
; a single global_load_sshort carrying offset:-128.
1812define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1813; GCN-LABEL: global_sextload_saddr_i16_immneg128:
1814; GCN:       ; %bb.0:
1815; GCN-NEXT:    global_load_sshort v0, v0, s[2:3] offset:-128
1816; GCN-NEXT:    s_waitcnt vmcnt(0)
1817; GCN-NEXT:    ; return to shader part epilog
1818  %zext.offset = zext i32 %voffset to i64
1819  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1820  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1821  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
1822  %load = load i16, i16 addrspace(1)* %gep1.cast
1823  %sextload = sext i16 %load to i32
1824  %cast.load = bitcast i32 %sextload to float
1825  ret float %cast.load
1826}
1827
; i8 load from %sbase + zext(%voffset) followed by zext to i32; the checks
; expect the extension to be covered by a single global_load_ubyte.
1828define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1829; GCN-LABEL: global_zextload_saddr_i8:
1830; GCN:       ; %bb.0:
1831; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
1832; GCN-NEXT:    s_waitcnt vmcnt(0)
1833; GCN-NEXT:    ; return to shader part epilog
1834  %zext.offset = zext i32 %voffset to i64
1835  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1836  %load = load i8, i8 addrspace(1)* %gep0
1837  %zextload = zext i8 %load to i32
1838  %cast.load = bitcast i32 %zextload to float
1839  ret float %cast.load
1840}
1841
; Zero-extending i8 load from %sbase + zext(%voffset) - 128; the checks expect
; a single global_load_ubyte carrying offset:-128.
1842define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1843; GCN-LABEL: global_zextload_saddr_i8_immneg128:
1844; GCN:       ; %bb.0:
1845; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-128
1846; GCN-NEXT:    s_waitcnt vmcnt(0)
1847; GCN-NEXT:    ; return to shader part epilog
1848  %zext.offset = zext i32 %voffset to i64
1849  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1850  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1851  %load = load i8, i8 addrspace(1)* %gep1
1852  %zextload = zext i8 %load to i32
1853  %cast.load = bitcast i32 %zextload to float
1854  ret float %cast.load
1855}
1856
; i16 load from %sbase + zext(%voffset) followed by zext to i32; the checks
; expect the extension to be covered by a single global_load_ushort.
1857define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1858; GCN-LABEL: global_zextload_saddr_i16:
1859; GCN:       ; %bb.0:
1860; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
1861; GCN-NEXT:    s_waitcnt vmcnt(0)
1862; GCN-NEXT:    ; return to shader part epilog
1863  %zext.offset = zext i32 %voffset to i64
1864  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1865  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
1866  %load = load i16, i16 addrspace(1)* %gep0.cast
1867  %zextload = zext i16 %load to i32
1868  %cast.load = bitcast i32 %zextload to float
1869  ret float %cast.load
1870}
1871
; Zero-extending i16 load from %sbase + zext(%voffset) - 128; the checks expect
; a single global_load_ushort carrying offset:-128.
1872define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1873; GCN-LABEL: global_zextload_saddr_i16_immneg128:
1874; GCN:       ; %bb.0:
1875; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
1876; GCN-NEXT:    s_waitcnt vmcnt(0)
1877; GCN-NEXT:    ; return to shader part epilog
1878  %zext.offset = zext i32 %voffset to i64
1879  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1880  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1881  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
1882  %load = load i16, i16 addrspace(1)* %gep1.cast
1883  %zextload = zext i16 %load to i32
1884  %cast.load = bitcast i32 %zextload to float
1885  ret float %cast.load
1886}
1887
1888; --------------------------------------------------------------------------------
1889; Atomic load
1890; --------------------------------------------------------------------------------
1891
; seq_cst atomic i32 load from %sbase + zext(%voffset). The expected output
; differs per target: gfx9 wraps a glc load with s_waitcnt and buffer_wbinvl1,
; while gfx10 adds s_waitcnt_vscnt, the dlc bit and buffer_gl0_inv/buffer_gl1_inv.
1892define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1893; GFX9-LABEL: atomic_global_load_saddr_i32:
1894; GFX9:       ; %bb.0:
1895; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1896; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
1897; GFX9-NEXT:    s_waitcnt vmcnt(0)
1898; GFX9-NEXT:    buffer_wbinvl1
1899; GFX9-NEXT:    ; return to shader part epilog
1900;
1901; GFX10-LABEL: atomic_global_load_saddr_i32:
1902; GFX10:       ; %bb.0:
1903; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1904; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1905; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
1906; GFX10-NEXT:    s_waitcnt vmcnt(0)
1907; GFX10-NEXT:    buffer_gl0_inv
1908; GFX10-NEXT:    buffer_gl1_inv
1909; GFX10-NEXT:    ; return to shader part epilog
1910  %zext.offset = zext i32 %voffset to i64
1911  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1912  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1913  %load = load atomic i32, i32 addrspace(1)* %gep0.cast seq_cst, align 4
1914  %cast.load = bitcast i32 %load to float
1915  ret float %cast.load
1916}
1917
; seq_cst atomic i32 load from %sbase + zext(%voffset) - 128. Both targets are
; expected to keep offset:-128 on the load; gfx9 uses glc with buffer_wbinvl1,
; gfx10 uses glc dlc with s_waitcnt_vscnt and buffer_gl0_inv/buffer_gl1_inv.
1918define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1919; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128:
1920; GFX9:       ; %bb.0:
1921; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1922; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc
1923; GFX9-NEXT:    s_waitcnt vmcnt(0)
1924; GFX9-NEXT:    buffer_wbinvl1
1925; GFX9-NEXT:    ; return to shader part epilog
1926;
1927; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128:
1928; GFX10:       ; %bb.0:
1929; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1930; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1931; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc dlc
1932; GFX10-NEXT:    s_waitcnt vmcnt(0)
1933; GFX10-NEXT:    buffer_gl0_inv
1934; GFX10-NEXT:    buffer_gl1_inv
1935; GFX10-NEXT:    ; return to shader part epilog
1936  %zext.offset = zext i32 %voffset to i64
1937  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1938  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1939  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1940  %load = load atomic i32, i32 addrspace(1)* %gep1.cast seq_cst, align 4
1941  %cast.load = bitcast i32 %load to float
1942  ret float %cast.load
1943}
1944
; seq_cst atomic i64 load from %sbase + zext(%voffset). The expected output
; differs per target: gfx9 wraps a glc dwordx2 load with s_waitcnt and
; buffer_wbinvl1, while gfx10 adds s_waitcnt_vscnt, dlc and the gl0/gl1 invalidates.
1945define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1946; GFX9-LABEL: atomic_global_load_saddr_i64:
1947; GFX9:       ; %bb.0:
1948; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1949; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc
1950; GFX9-NEXT:    s_waitcnt vmcnt(0)
1951; GFX9-NEXT:    buffer_wbinvl1
1952; GFX9-NEXT:    ; return to shader part epilog
1953;
1954; GFX10-LABEL: atomic_global_load_saddr_i64:
1955; GFX10:       ; %bb.0:
1956; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1957; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1958; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
1959; GFX10-NEXT:    s_waitcnt vmcnt(0)
1960; GFX10-NEXT:    buffer_gl0_inv
1961; GFX10-NEXT:    buffer_gl1_inv
1962; GFX10-NEXT:    ; return to shader part epilog
1963  %zext.offset = zext i32 %voffset to i64
1964  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1965  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1966  %load = load atomic i64, i64 addrspace(1)* %gep0.cast seq_cst, align 8
1967  %cast.load = bitcast i64 %load to <2 x float>
1968  ret <2 x float> %cast.load
1969}
1970
; seq_cst atomic i64 load from %sbase + zext(%voffset) - 128. Both targets are
; expected to keep offset:-128 on the dwordx2 load; gfx9 uses glc with
; buffer_wbinvl1, gfx10 uses glc dlc with s_waitcnt_vscnt and gl0/gl1 invalidates.
1971define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1972; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
1973; GFX9:       ; %bb.0:
1974; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1975; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc
1976; GFX9-NEXT:    s_waitcnt vmcnt(0)
1977; GFX9-NEXT:    buffer_wbinvl1
1978; GFX9-NEXT:    ; return to shader part epilog
1979;
1980; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128:
1981; GFX10:       ; %bb.0:
1982; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1983; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1984; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
1985; GFX10-NEXT:    s_waitcnt vmcnt(0)
1986; GFX10-NEXT:    buffer_gl0_inv
1987; GFX10-NEXT:    buffer_gl1_inv
1988; GFX10-NEXT:    ; return to shader part epilog
1989  %zext.offset = zext i32 %voffset to i64
1990  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1991  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1992  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1993  %load = load atomic i64, i64 addrspace(1)* %gep1.cast seq_cst, align 8
1994  %cast.load = bitcast i64 %load to <2 x float>
1995  ret <2 x float> %cast.load
1996}
1997
1998; --------------------------------------------------------------------------------
1999; D16 load (low 16)
2000; --------------------------------------------------------------------------------
2001
; i16 load from %sbase + zext(%voffset) inserted into element 0 of an undef
; <2 x i16>; the checks expect global_load_short_d16 writing straight into v0.
2002define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2003; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi:
2004; GCN:       ; %bb.0:
2005; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3]
2006; GCN-NEXT:    s_waitcnt vmcnt(0)
2007; GCN-NEXT:    ; return to shader part epilog
2008  %zext.offset = zext i32 %voffset to i64
2009  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2010  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2011  %load = load i16, i16 addrspace(1)* %gep0.cast
2012  %build = insertelement <2 x i16> undef, i16 %load, i32 0
2013  %cast = bitcast <2 x i16> %build to <2 x half>
2014  ret <2 x half> %cast
2015}
2016
; i16 load from %sbase + zext(%voffset) - 128, inserted into element 0 of an
; undef <2 x i16>. The checks expect a single saddr-form global_load_short_d16
; into v0 with the -128 carried in the immediate offset field.
2017define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2018; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
2019; GCN:       ; %bb.0:
2020; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3] offset:-128
2021; GCN-NEXT:    s_waitcnt vmcnt(0)
2022; GCN-NEXT:    ; return to shader part epilog
2023  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2024  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2025  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2026  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2027  %load = load i16, i16 addrspace(1)* %gep1.cast
2028  %build = insertelement <2 x i16> undef, i16 %load, i32 0 ; element 1 stays undef
2029  %cast = bitcast <2 x i16> %build to <2 x half>
2030  ret <2 x half> %cast
2031}
2032
; i16 load from %sbase + zext(%voffset), inserted into element 0 of an all-zero
; <2 x i16>. The checks expect v1 to be set to 0 first, the saddr-form
; global_load_short_d16 to write v1, and the result to be copied to v0.
2033define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2034; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi:
2035; GCN:       ; %bb.0:
2036; GCN-NEXT:    v_mov_b32_e32 v1, 0
2037; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3]
2038; GCN-NEXT:    s_waitcnt vmcnt(0)
2039; GCN-NEXT:    v_mov_b32_e32 v0, v1
2040; GCN-NEXT:    ; return to shader part epilog
2041  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2042  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2043  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2044  %load = load i16, i16 addrspace(1)* %gep0.cast
2045  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 ; element 1 stays zero
2046  %cast = bitcast <2 x i16> %build to <2 x half>
2047  ret <2 x half> %cast
2048}
2049
; i16 load from %sbase + zext(%voffset) - 128, inserted into element 0 of an
; all-zero <2 x i16>. The checks expect v1 to be set to 0 first, the saddr-form
; global_load_short_d16 (offset:-128) to write v1, and a copy to v0.
2050define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2051; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
2052; GCN:       ; %bb.0:
2053; GCN-NEXT:    v_mov_b32_e32 v1, 0
2054; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:-128
2055; GCN-NEXT:    s_waitcnt vmcnt(0)
2056; GCN-NEXT:    v_mov_b32_e32 v0, v1
2057; GCN-NEXT:    ; return to shader part epilog
2058  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2059  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2060  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2061  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2062  %load = load i16, i16 addrspace(1)* %gep1.cast
2063  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 ; element 1 stays zero
2064  %cast = bitcast <2 x i16> %build to <2 x half>
2065  ret <2 x half> %cast
2066}
2067
; i16 load from %sbase + zext(%voffset), inserted into element 0 of the %reg
; argument so that element 1 of %reg is preserved. The checks expect the
; saddr-form global_load_short_d16 to write v1 with no prior initialization
; (presumably the register %reg arrives in; the calling convention is not
; shown here), followed by a copy to v0.
2068define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2069; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi:
2070; GCN:       ; %bb.0:
2071; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3]
2072; GCN-NEXT:    s_waitcnt vmcnt(0)
2073; GCN-NEXT:    v_mov_b32_e32 v0, v1
2074; GCN-NEXT:    ; return to shader part epilog
2075  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2076  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2077  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2078  %load = load i16, i16 addrspace(1)* %gep0.cast
2079  %build = insertelement <2 x i16> %reg, i16 %load, i32 0 ; element 1 keeps the value from %reg
2080  %cast = bitcast <2 x i16> %build to <2 x half>
2081  ret <2 x half> %cast
2082}
2083
; i16 load from %sbase + zext(%voffset) - 128, inserted into element 0 of the
; %reg argument so that element 1 of %reg is preserved. The checks expect the
; saddr-form global_load_short_d16 (offset:-128) to write v1 with no prior
; initialization (presumably the register %reg arrives in; the calling
; convention is not shown here), followed by a copy to v0.
2084define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2085; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
2086; GCN:       ; %bb.0:
2087; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:-128
2088; GCN-NEXT:    s_waitcnt vmcnt(0)
2089; GCN-NEXT:    v_mov_b32_e32 v0, v1
2090; GCN-NEXT:    ; return to shader part epilog
2091  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2092  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2093  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2094  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2095  %load = load i16, i16 addrspace(1)* %gep1.cast
2096  %build = insertelement <2 x i16> %reg, i16 %load, i32 0 ; element 1 keeps the value from %reg
2097  %cast = bitcast <2 x i16> %build to <2 x half>
2098  ret <2 x half> %cast
2099}
2100
; i8 load from %sbase + zext(%voffset), zero-extended to i16 and inserted into
; element 0 of the %reg argument. The checks expect the load plus zero
; extension to become one saddr-form global_load_ubyte_d16 writing v1,
; followed by a copy to v0.
2101define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2102; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
2103; GCN:       ; %bb.0:
2104; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3]
2105; GCN-NEXT:    s_waitcnt vmcnt(0)
2106; GCN-NEXT:    v_mov_b32_e32 v0, v1
2107; GCN-NEXT:    ; return to shader part epilog
2108  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2109  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2110  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* ; source and destination types are identical (no-op cast)
2111  %load = load i8, i8 addrspace(1)* %gep0.cast
2112  %zext.load = zext i8 %load to i16
2113  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 ; element 1 keeps the value from %reg
2114  %cast = bitcast <2 x i16> %build to <2 x half>
2115  ret <2 x half> %cast
2116}
2117
; i8 load from %sbase + zext(%voffset) - 128, zero-extended to i16 and inserted
; into element 0 of the %reg argument. The checks expect one saddr-form
; global_load_ubyte_d16 (offset:-128) writing v1, followed by a copy to v0.
2118define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2119; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
2120; GCN:       ; %bb.0:
2121; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3] offset:-128
2122; GCN-NEXT:    s_waitcnt vmcnt(0)
2123; GCN-NEXT:    v_mov_b32_e32 v0, v1
2124; GCN-NEXT:    ; return to shader part epilog
2125  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2126  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2127  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2128  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* ; source and destination types are identical (no-op cast)
2129  %load = load i8, i8 addrspace(1)* %gep1.cast
2130  %zext.load = zext i8 %load to i16
2131  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 ; element 1 keeps the value from %reg
2132  %cast = bitcast <2 x i16> %build to <2 x half>
2133  ret <2 x half> %cast
2134}
2135
; i8 load from %sbase + zext(%voffset), sign-extended to i16 and inserted into
; element 0 of the %reg argument. The checks expect the load plus sign
; extension to become one saddr-form global_load_sbyte_d16 writing v1,
; followed by a copy to v0.
2136define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2137; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
2138; GCN:       ; %bb.0:
2139; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3]
2140; GCN-NEXT:    s_waitcnt vmcnt(0)
2141; GCN-NEXT:    v_mov_b32_e32 v0, v1
2142; GCN-NEXT:    ; return to shader part epilog
2143  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2144  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2145  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* ; source and destination types are identical (no-op cast)
2146  %load = load i8, i8 addrspace(1)* %gep0.cast
2147  %sext.load = sext i8 %load to i16
2148  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 ; element 1 keeps the value from %reg
2149  %cast = bitcast <2 x i16> %build to <2 x half>
2150  ret <2 x half> %cast
2151}
2152
; i8 load from %sbase + zext(%voffset) - 128, sign-extended to i16 and inserted
; into element 0 of the %reg argument. The checks expect one saddr-form
; global_load_sbyte_d16 (offset:-128) writing v1, followed by a copy to v0.
2153define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2154; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
2155; GCN:       ; %bb.0:
2156; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
2157; GCN-NEXT:    s_waitcnt vmcnt(0)
2158; GCN-NEXT:    v_mov_b32_e32 v0, v1
2159; GCN-NEXT:    ; return to shader part epilog
2160  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2161  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2162  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2163  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* ; source and destination types are identical (no-op cast)
2164  %load = load i8, i8 addrspace(1)* %gep1.cast
2165  %sext.load = sext i8 %load to i16
2166  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 ; element 1 keeps the value from %reg
2167  %cast = bitcast <2 x i16> %build to <2 x half>
2168  ret <2 x half> %cast
2169}
2170
2171; --------------------------------------------------------------------------------
2172; D16 hi load (hi16)
2173; --------------------------------------------------------------------------------
2174
; i16 load from %sbase + zext(%voffset), inserted into element 1 of an undef
; <2 x i16> (so here it is element 0 that is left undef). The checks expect a
; single saddr-form global_load_short_d16_hi written straight into v0.
2175define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2176; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
2177; GCN:       ; %bb.0:
2178; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3]
2179; GCN-NEXT:    s_waitcnt vmcnt(0)
2180; GCN-NEXT:    ; return to shader part epilog
2181  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2182  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2183  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2184  %load = load i16, i16 addrspace(1)* %gep0.cast
2185  %build = insertelement <2 x i16> undef, i16 %load, i32 1 ; element 0 stays undef
2186  %cast = bitcast <2 x i16> %build to <2 x half>
2187  ret <2 x half> %cast
2188}
2189
; i16 load from %sbase + zext(%voffset) - 128, inserted into element 1 of an
; undef <2 x i16> (element 0 is left undef). The checks expect a single
; saddr-form global_load_short_d16_hi into v0 with offset:-128.
2190define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2191; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
2192; GCN:       ; %bb.0:
2193; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3] offset:-128
2194; GCN-NEXT:    s_waitcnt vmcnt(0)
2195; GCN-NEXT:    ; return to shader part epilog
2196  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2197  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2198  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2199  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2200  %load = load i16, i16 addrspace(1)* %gep1.cast
2201  %build = insertelement <2 x i16> undef, i16 %load, i32 1 ; element 0 stays undef
2202  %cast = bitcast <2 x i16> %build to <2 x half>
2203  ret <2 x half> %cast
2204}
2205
; i16 load from %sbase + zext(%voffset), inserted into element 1 of an all-zero
; <2 x i16> (element 0 stays zero). The checks expect v1 to be set to 0 first,
; the saddr-form global_load_short_d16_hi to write v1, and a copy to v0.
2206define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2207; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi:
2208; GCN:       ; %bb.0:
2209; GCN-NEXT:    v_mov_b32_e32 v1, 0
2210; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3]
2211; GCN-NEXT:    s_waitcnt vmcnt(0)
2212; GCN-NEXT:    v_mov_b32_e32 v0, v1
2213; GCN-NEXT:    ; return to shader part epilog
2214  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2215  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2216  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2217  %load = load i16, i16 addrspace(1)* %gep0.cast
2218  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 ; element 0 stays zero
2219  %cast = bitcast <2 x i16> %build to <2 x half>
2220  ret <2 x half> %cast
2221}
2222
; i16 load from %sbase + zext(%voffset) - 128, inserted into element 1 of an
; all-zero <2 x i16> (element 0 stays zero). The checks expect v1 to be set to
; 0 first, the saddr-form global_load_short_d16_hi (offset:-128) to write v1,
; and a copy to v0.
2223define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
2224; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
2225; GCN:       ; %bb.0:
2226; GCN-NEXT:    v_mov_b32_e32 v1, 0
2227; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3] offset:-128
2228; GCN-NEXT:    s_waitcnt vmcnt(0)
2229; GCN-NEXT:    v_mov_b32_e32 v0, v1
2230; GCN-NEXT:    ; return to shader part epilog
2231  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2232  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2233  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2234  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2235  %load = load i16, i16 addrspace(1)* %gep1.cast
2236  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 ; element 0 stays zero
2237  %cast = bitcast <2 x i16> %build to <2 x half>
2238  ret <2 x half> %cast
2239}
2240
; i16 load from %sbase + zext(%voffset), inserted into element 1 of the %reg
; argument so that element 0 of %reg is preserved. The checks expect the
; saddr-form global_load_short_d16_hi to write v1 with no prior initialization
; (presumably the register %reg arrives in; the calling convention is not
; shown here), followed by a copy to v0.
2241define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2242; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi:
2243; GCN:       ; %bb.0:
2244; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3]
2245; GCN-NEXT:    s_waitcnt vmcnt(0)
2246; GCN-NEXT:    v_mov_b32_e32 v0, v1
2247; GCN-NEXT:    ; return to shader part epilog
2248  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2249  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2250  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
2251  %load = load i16, i16 addrspace(1)* %gep0.cast
2252  %build = insertelement <2 x i16> %reg, i16 %load, i32 1 ; element 0 keeps the value from %reg
2253  %cast = bitcast <2 x i16> %build to <2 x half>
2254  ret <2 x half> %cast
2255}
2256
; i16 load from %sbase + zext(%voffset) - 128, inserted into element 1 of the
; %reg argument so that element 0 of %reg is preserved. The checks expect the
; saddr-form global_load_short_d16_hi (offset:-128) to write v1 with no prior
; initialization (presumably the register %reg arrives in; the calling
; convention is not shown here), followed by a copy to v0.
2257define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2258; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
2259; GCN:       ; %bb.0:
2260; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3] offset:-128
2261; GCN-NEXT:    s_waitcnt vmcnt(0)
2262; GCN-NEXT:    v_mov_b32_e32 v0, v1
2263; GCN-NEXT:    ; return to shader part epilog
2264  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2265  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2266  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2267  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
2268  %load = load i16, i16 addrspace(1)* %gep1.cast
2269  %build = insertelement <2 x i16> %reg, i16 %load, i32 1 ; element 0 keeps the value from %reg
2270  %cast = bitcast <2 x i16> %build to <2 x half>
2271  ret <2 x half> %cast
2272}
2273
; i8 load from %sbase + zext(%voffset), zero-extended to i16 and inserted into
; element 1 of the %reg argument. The checks expect the load plus zero
; extension to become one saddr-form global_load_ubyte_d16_hi writing v1,
; followed by a copy to v0.
2274define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2275; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
2276; GCN:       ; %bb.0:
2277; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3]
2278; GCN-NEXT:    s_waitcnt vmcnt(0)
2279; GCN-NEXT:    v_mov_b32_e32 v0, v1
2280; GCN-NEXT:    ; return to shader part epilog
2281  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2282  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2283  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* ; source and destination types are identical (no-op cast)
2284  %load = load i8, i8 addrspace(1)* %gep0.cast
2285  %zext.load = zext i8 %load to i16
2286  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 ; element 0 keeps the value from %reg
2287  %cast = bitcast <2 x i16> %build to <2 x half>
2288  ret <2 x half> %cast
2289}
2290
; i8 load from %sbase + zext(%voffset) - 128, zero-extended to i16 and inserted
; into element 1 of the %reg argument. The checks expect one saddr-form
; global_load_ubyte_d16_hi (offset:-128) writing v1, followed by a copy to v0.
2291define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2292; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
2293; GCN:       ; %bb.0:
2294; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128
2295; GCN-NEXT:    s_waitcnt vmcnt(0)
2296; GCN-NEXT:    v_mov_b32_e32 v0, v1
2297; GCN-NEXT:    ; return to shader part epilog
2298  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2299  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2300  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2301  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* ; source and destination types are identical (no-op cast)
2302  %load = load i8, i8 addrspace(1)* %gep1.cast
2303  %zext.load = zext i8 %load to i16
2304  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 ; element 0 keeps the value from %reg
2305  %cast = bitcast <2 x i16> %build to <2 x half>
2306  ret <2 x half> %cast
2307}
2308
; i8 load from %sbase + zext(%voffset), sign-extended to i16 and inserted into
; element 1 of the %reg argument. The checks expect the load plus sign
; extension to become one saddr-form global_load_sbyte_d16_hi writing v1,
; followed by a copy to v0.
2309define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2310; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
2311; GCN:       ; %bb.0:
2312; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3]
2313; GCN-NEXT:    s_waitcnt vmcnt(0)
2314; GCN-NEXT:    v_mov_b32_e32 v0, v1
2315; GCN-NEXT:    ; return to shader part epilog
2316  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2317  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2318  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* ; source and destination types are identical (no-op cast)
2319  %load = load i8, i8 addrspace(1)* %gep0.cast
2320  %sext.load = sext i8 %load to i16
2321  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 ; element 0 keeps the value from %reg
2322  %cast = bitcast <2 x i16> %build to <2 x half>
2323  ret <2 x half> %cast
2324}
2325
; i8 load from %sbase + zext(%voffset) - 128, sign-extended to i16 and inserted
; into element 1 of the %reg argument. The checks expect one saddr-form
; global_load_sbyte_d16_hi (offset:-128) writing v1, followed by a copy to v0.
2326define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
2327; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
2328; GCN:       ; %bb.0:
2329; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128
2330; GCN-NEXT:    s_waitcnt vmcnt(0)
2331; GCN-NEXT:    v_mov_b32_e32 v0, v1
2332; GCN-NEXT:    ; return to shader part epilog
2333  %zext.offset = zext i32 %voffset to i64 ; 32-bit offset widened by zero-extension
2334  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2335  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 ; constant byte offset applied on top of base + offset
2336  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* ; source and destination types are identical (no-op cast)
2337  %load = load i8, i8 addrspace(1)* %gep1.cast
2338  %sext.load = sext i8 %load to i16
2339  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 ; element 0 keeps the value from %reg
2340  %cast = bitcast <2 x i16> %build to <2 x half>
2341  ret <2 x half> %cast
2342}
2343
2344; --------------------------------------------------------------------------------
2345; or-with-constant as add
2346; --------------------------------------------------------------------------------
2347
2348; Check add-as-or with split 64-bit or.
; The address is built as inttoptr(zext(%idx) | 16); the %sbase argument is
; not used by the body. The checks expect no saddr form here: a 32-bit
; v_or_b32 produces the low half, v1 is set to 0 for the high half, and the
; byte is loaded through the VGPR pair v[0:1] with "off" as the scalar base.
2349define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_16(i8 addrspace(6)* inreg %sbase, i32 %idx) {
2350; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_16:
2351; GCN:       ; %bb.0:
2352; GCN-NEXT:    v_or_b32_e32 v0, 16, v0
2353; GCN-NEXT:    v_mov_b32_e32 v1, 0
2354; GCN-NEXT:    global_load_ubyte v0, v[0:1], off
2355; GCN-NEXT:    s_waitcnt vmcnt(0)
2356; GCN-NEXT:    ; return to shader part epilog
2357  %zext.idx = zext i32 %idx to i64 ; upper 32 bits of the integer address are zero
2358  %or = or i64 %zext.idx, 16
2359  %addr = inttoptr i64 %or to i8 addrspace(1)* ; integer used directly as an addrspace(1) pointer
2360  %load = load i8, i8 addrspace(1)* %addr
2361  %zext = zext i8 %load to i32
2362  %to.vgpr = bitcast i32 %zext to float ; float return type; checks show the result in v0
2363  ret float %to.vgpr
2364}
2365
; Same shape as the or-with-16 test, but with the constant 4160 (0x1040): the
; address is inttoptr(zext(%idx) | 4160) and %sbase is not used by the body.
; The checks expect a 32-bit v_or_b32 with the literal 0x1040 for the low
; half, v1 = 0 for the high half, and a load through v[0:1] with "off".
2366define amdgpu_ps float @global_load_saddr_i8_offset_or_i64_imm_offset_4160(i8 addrspace(6)* inreg %sbase, i32 %idx) {
2367; GCN-LABEL: global_load_saddr_i8_offset_or_i64_imm_offset_4160:
2368; GCN:       ; %bb.0:
2369; GCN-NEXT:    v_or_b32_e32 v0, 0x1040, v0
2370; GCN-NEXT:    v_mov_b32_e32 v1, 0
2371; GCN-NEXT:    global_load_ubyte v0, v[0:1], off
2372; GCN-NEXT:    s_waitcnt vmcnt(0)
2373; GCN-NEXT:    ; return to shader part epilog
2374  %zext.idx = zext i32 %idx to i64 ; upper 32 bits of the integer address are zero
2375  %or = or i64 %zext.idx, 4160
2376  %addr = inttoptr i64 %or to i8 addrspace(1)* ; integer used directly as an addrspace(1) pointer
2377  %load = load i8, i8 addrspace(1)* %addr
2378  %zext = zext i8 %load to i32
2379  %to.vgpr = bitcast i32 %zext to float ; float return type; checks show the result in v0
2380  ret float %to.vgpr
2381}
2382
2383; --------------------------------------------------------------------------------
2384; Full 64-bit scalar add.
2385; --------------------------------------------------------------------------------
2386
; Loop of 256 iterations, each doing a volatile float load from %arg[%i] where
; %i is an i32 counter zero-extended to i64. The checks expect the address to
; be formed by a full 64-bit scalar add (s_add_u32 / s_addc_u32 of the base in
; s[2:3] and a byte offset kept in s[0:1]), the load to use the resulting SGPR
; pair s[4:5] as saddr with a zero VGPR offset set up once before the loop, and
; the byte offset to advance by 4 until it reaches 0x400.
2387define amdgpu_ps void @global_addr_64bit_lsr_iv(float addrspace(1)* inreg %arg) {
2388; GFX9-LABEL: global_addr_64bit_lsr_iv:
2389; GFX9:       ; %bb.0: ; %bb
2390; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2391; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2392; GFX9-NEXT:  BB128_1: ; %bb3
2393; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2394; GFX9-NEXT:    s_add_u32 s4, s2, s0
2395; GFX9-NEXT:    s_addc_u32 s5, s3, s1
2396; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
2397; GFX9-NEXT:    s_waitcnt vmcnt(0)
2398; GFX9-NEXT:    s_add_u32 s0, s0, 4
2399; GFX9-NEXT:    s_addc_u32 s1, s1, 0
2400; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x400
2401; GFX9-NEXT:    s_cbranch_scc0 BB128_1
2402; GFX9-NEXT:  ; %bb.2: ; %bb2
2403; GFX9-NEXT:    s_endpgm
2404;
2405; GFX10-LABEL: global_addr_64bit_lsr_iv:
2406; GFX10:       ; %bb.0: ; %bb
2407; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2408; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2409; GFX10-NEXT:  BB128_1: ; %bb3
2410; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2411; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2412; GFX10-NEXT:    s_add_u32 s4, s2, s0
2413; GFX10-NEXT:    s_addc_u32 s5, s3, s1
2414; GFX10-NEXT:    s_add_u32 s0, s0, 4
2415; GFX10-NEXT:    global_load_dword v1, v0, s[4:5] glc dlc
2416; GFX10-NEXT:    s_waitcnt vmcnt(0)
2417; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2418; GFX10-NEXT:    s_cmpk_eq_i32 s0, 0x400
2419; GFX10-NEXT:    s_cbranch_scc0 BB128_1
2420; GFX10-NEXT:  ; %bb.2: ; %bb2
2421; GFX10-NEXT:    s_endpgm
2422bb:
2423  br label %bb3
2424
2425bb2:                                              ; preds = %bb3
2426  ret void
2427
2428bb3:                                              ; preds = %bb3, %bb
2429  %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ] ; loop counter, starts at 0
2430  %i4 = zext i32 %i to i64
2431  %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4 ; address of %arg[%i]
2432  %i6 = load volatile float, float addrspace(1)* %i5, align 4 ; result unused; volatile keeps the load
2433  %i8 = add nuw nsw i32 %i, 1
2434  %i9 = icmp eq i32 %i8, 256 ; leave the loop after 256 loads
2435  br i1 %i9, label %bb2, label %bb3
2436}
2437
2438; Make sure we only have a single zero vaddr initialization.
2439
; Loop of 256 iterations with two volatile float loads per iteration. The
; checks expect a single zero VGPR offset (v0) set up before the loop and
; shared by both loads, and both loads to use the same SGPR pair s[4:5]
; produced by one 64-bit scalar add per iteration.
; NOTE(review): the second load reads %i5 (the %arg-based address) again, so
; %i5.1 and the %arg.1 argument are dead in this body. That is why both checked
; loads use s[4:5]. Confirm whether the second load was meant to use %i5.1; if
; so, the CHECK lines must be regenerated after changing it.
2440define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(float addrspace(1)* inreg %arg, float addrspace(1)* inreg %arg.1) {
2441; GFX9-LABEL: global_addr_64bit_lsr_iv_multiload:
2442; GFX9:       ; %bb.0: ; %bb
2443; GFX9-NEXT:    s_mov_b64 s[0:1], 0
2444; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2445; GFX9-NEXT:  BB129_1: ; %bb3
2446; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
2447; GFX9-NEXT:    s_add_u32 s4, s2, s0
2448; GFX9-NEXT:    s_addc_u32 s5, s3, s1
2449; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
2450; GFX9-NEXT:    s_waitcnt vmcnt(0)
2451; GFX9-NEXT:    global_load_dword v1, v0, s[4:5] glc
2452; GFX9-NEXT:    s_waitcnt vmcnt(0)
2453; GFX9-NEXT:    s_add_u32 s0, s0, 4
2454; GFX9-NEXT:    s_addc_u32 s1, s1, 0
2455; GFX9-NEXT:    s_cmpk_eq_i32 s0, 0x400
2456; GFX9-NEXT:    ; kill: killed $sgpr4 killed $sgpr5
2457; GFX9-NEXT:    s_cbranch_scc0 BB129_1
2458; GFX9-NEXT:  ; %bb.2: ; %bb2
2459; GFX9-NEXT:    s_endpgm
2460;
2461; GFX10-LABEL: global_addr_64bit_lsr_iv_multiload:
2462; GFX10:       ; %bb.0: ; %bb
2463; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2464; GFX10-NEXT:    s_mov_b64 s[0:1], 0
2465; GFX10-NEXT:  BB129_1: ; %bb3
2466; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
2467; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
2468; GFX10-NEXT:    s_add_u32 s4, s2, s0
2469; GFX10-NEXT:    s_addc_u32 s5, s3, s1
2470; GFX10-NEXT:    s_add_u32 s0, s0, 4
2471; GFX10-NEXT:    global_load_dword v1, v0, s[4:5] glc dlc
2472; GFX10-NEXT:    s_waitcnt vmcnt(0)
2473; GFX10-NEXT:    global_load_dword v1, v0, s[4:5] glc dlc
2474; GFX10-NEXT:    s_waitcnt vmcnt(0)
2475; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2476; GFX10-NEXT:    s_cmpk_eq_i32 s0, 0x400
2477; GFX10-NEXT:    ; kill: killed $sgpr4 killed $sgpr5
2478; GFX10-NEXT:    s_cbranch_scc0 BB129_1
2479; GFX10-NEXT:  ; %bb.2: ; %bb2
2480; GFX10-NEXT:    s_endpgm
2481bb:
2482  br label %bb3
2483
2484bb2:                                              ; preds = %bb3
2485  ret void
2486
2487bb3:                                              ; preds = %bb3, %bb
2488  %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ] ; loop counter, starts at 0
2489  %i4 = zext i32 %i to i64
2490  %i5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %i4 ; address of %arg[%i]
2491  %i6 = load volatile float, float addrspace(1)* %i5, align 4 ; result unused; volatile keeps the load
2492  %i5.1 = getelementptr inbounds float, float addrspace(1)* %arg.1, i64 %i4 ; address of %arg.1[%i]; has no users in this body
2493  %i6.1 = load volatile float, float addrspace(1)* %i5, align 4 ; reads %i5 again, not %i5.1 (see NOTE above the function)
2494  %i8 = add nuw nsw i32 %i, 1
2495  %i9 = icmp eq i32 %i8, 256 ; leave the loop after 256 iterations
2496  br i1 %i9, label %bb2, label %bb3
2497}
2498
; Numbered metadata nodes, each a pair of i32 constants. No function in this
; part of the file references them; presumably they are attached as !range
; bounds by tests earlier in the file (TODO confirm).
2499!0 = !{i32 0, i32 1073741824} ; (1 << 30)
2500!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1
2501