1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
4
5; Test using saddr addressing mode of global_*load_* flat instructions.
6
7; --------------------------------------------------------------------------------
8; Basic addressing patterns
9; --------------------------------------------------------------------------------
10
11; Basic pattern, no immediate offset.
12define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
13; GCN-LABEL: global_load_saddr_i8_zext_vgpr:
14; GCN:       ; %bb.0:
15; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
16; GCN-NEXT:    s_waitcnt vmcnt(0)
17; GCN-NEXT:    ; return to shader part epilog
18  %zext.offset = zext i32 %voffset to i64
19  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
20  %load = load i8, i8 addrspace(1)* %gep0
21  %zext = zext i8 %load to i32
22  %to.vgpr = bitcast i32 %zext to float
23  ret float %to.vgpr
24}
25
26; Maximum positive offset on gfx9
27define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
28; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
29; GFX9:       ; %bb.0:
30; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
31; GFX9-NEXT:    s_waitcnt vmcnt(0)
32; GFX9-NEXT:    ; return to shader part epilog
33;
34; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095:
35; GFX10:       ; %bb.0:
36; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
37; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
38; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x800, v0
39; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
40; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
41; GFX10-NEXT:    s_waitcnt vmcnt(0)
42; GFX10-NEXT:    ; return to shader part epilog
43  %zext.offset = zext i32 %voffset to i64
44  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
45  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
46  %load = load i8, i8 addrspace(1)* %gep1
47  %zext = zext i8 %load to i32
48  %to.vgpr = bitcast i32 %zext to float
49  ret float %to.vgpr
50}
51
52; Maximum positive offset on gfx9 + 1
53define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
54; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
55; GFX9:       ; %bb.0:
56; GFX9-NEXT:    v_mov_b32_e32 v1, s3
57; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
58; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
59; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
60; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
61; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
62; GFX9-NEXT:    s_waitcnt vmcnt(0)
63; GFX9-NEXT:    ; return to shader part epilog
64;
65; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096:
66; GFX10:       ; %bb.0:
67; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
68; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
69; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x1000, v0
70; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
71; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
72; GFX10-NEXT:    s_waitcnt vmcnt(0)
73; GFX10-NEXT:    ; return to shader part epilog
74  %zext.offset = zext i32 %voffset to i64
75  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
76  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096
77  %load = load i8, i8 addrspace(1)* %gep1
78  %zext = zext i8 %load to i32
79  %to.vgpr = bitcast i32 %zext to float
80  ret float %to.vgpr
81}
82
83; Maximum negative offset on gfx9
84define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
85; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
86; GFX9:       ; %bb.0:
87; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-4096
88; GFX9-NEXT:    s_waitcnt vmcnt(0)
89; GFX9-NEXT:    ; return to shader part epilog
90;
91; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096:
92; GFX10:       ; %bb.0:
93; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
94; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
95; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0xfffff000, v0
96; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
97; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
98; GFX10-NEXT:    s_waitcnt vmcnt(0)
99; GFX10-NEXT:    ; return to shader part epilog
100  %zext.offset = zext i32 %voffset to i64
101  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
102  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096
103  %load = load i8, i8 addrspace(1)* %gep1
104  %zext = zext i8 %load to i32
105  %to.vgpr = bitcast i32 %zext to float
106  ret float %to.vgpr
107}
108
109; Maximum negative offset on gfx9 - 1
110define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
111; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
112; GFX9:       ; %bb.0:
113; GFX9-NEXT:    v_mov_b32_e32 v1, s3
114; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
115; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
116; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
117; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
118; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
119; GFX9-NEXT:    s_waitcnt vmcnt(0)
120; GFX9-NEXT:    ; return to shader part epilog
121;
122; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097:
123; GFX10:       ; %bb.0:
124; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
125; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
126; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0xfffff000, v0
127; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
128; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
129; GFX10-NEXT:    s_waitcnt vmcnt(0)
130; GFX10-NEXT:    ; return to shader part epilog
131  %zext.offset = zext i32 %voffset to i64
132  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
133  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097
134  %load = load i8, i8 addrspace(1)* %gep1
135  %zext = zext i8 %load to i32
136  %to.vgpr = bitcast i32 %zext to float
137  ret float %to.vgpr
138}
139
140; Maximum positive offset on gfx10
141define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
142; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047:
143; GCN:       ; %bb.0:
144; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2047
145; GCN-NEXT:    s_waitcnt vmcnt(0)
146; GCN-NEXT:    ; return to shader part epilog
147  %zext.offset = zext i32 %voffset to i64
148  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
149  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
150  %load = load i8, i8 addrspace(1)* %gep1
151  %zext = zext i8 %load to i32
152  %to.vgpr = bitcast i32 %zext to float
153  ret float %to.vgpr
154}
155
156; Maximum positive offset on gfx10 + 1
157define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
158; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
159; GFX9:       ; %bb.0:
160; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:2048
161; GFX9-NEXT:    s_waitcnt vmcnt(0)
162; GFX9-NEXT:    ; return to shader part epilog
163;
164; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048:
165; GFX10:       ; %bb.0:
166; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
167; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
168; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x800, v0
169; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
170; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
171; GFX10-NEXT:    s_waitcnt vmcnt(0)
172; GFX10-NEXT:    ; return to shader part epilog
173  %zext.offset = zext i32 %voffset to i64
174  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
175  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
176  %load = load i8, i8 addrspace(1)* %gep1
177  %zext = zext i8 %load to i32
178  %to.vgpr = bitcast i32 %zext to float
179  ret float %to.vgpr
180}
181
182; Maximum negative offset on gfx10
183define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
184; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048:
185; GCN:       ; %bb.0:
186; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2048
187; GCN-NEXT:    s_waitcnt vmcnt(0)
188; GCN-NEXT:    ; return to shader part epilog
189  %zext.offset = zext i32 %voffset to i64
190  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
191  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
192  %load = load i8, i8 addrspace(1)* %gep1
193  %zext = zext i8 %load to i32
194  %to.vgpr = bitcast i32 %zext to float
195  ret float %to.vgpr
196}
197
198; Maximum negative offset on gfx10 - 1
199define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
200; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
201; GFX9:       ; %bb.0:
202; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-2049
203; GFX9-NEXT:    s_waitcnt vmcnt(0)
204; GFX9-NEXT:    ; return to shader part epilog
205;
206; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049:
207; GFX10:       ; %bb.0:
208; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
209; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
210; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0xfffff800, v0
211; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc
212; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:-1
213; GFX10-NEXT:    s_waitcnt vmcnt(0)
214; GFX10-NEXT:    ; return to shader part epilog
215  %zext.offset = zext i32 %voffset to i64
216  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
217  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2049
218  %load = load i8, i8 addrspace(1)* %gep1
219  %zext = zext i8 %load to i32
220  %to.vgpr = bitcast i32 %zext to float
221  ret float %to.vgpr
222}
223
224; Maximum positive offset on gfx9, and immediate needs to be moved lower.
225define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
226; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
227; GFX9:       ; %bb.0:
228; GFX9-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:4095
229; GFX9-NEXT:    s_waitcnt vmcnt(0)
230; GFX9-NEXT:    ; return to shader part epilog
231;
232; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order:
233; GFX10:       ; %bb.0:
234; GFX10-NEXT:    v_add_co_u32_e64 v0, s[0:1], s2, v0
235; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1]
236; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x800, v0
237; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
238; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
239; GFX10-NEXT:    s_waitcnt vmcnt(0)
240; GFX10-NEXT:    ; return to shader part epilog
241  %zext.offset = zext i32 %voffset to i64
242  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095
243  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 %zext.offset
244  %load = load i8, i8 addrspace(1)* %gep1
245  %zext = zext i8 %load to i32
246  %to.vgpr = bitcast i32 %zext to float
247  ret float %to.vgpr
248}
249
250; pointer addressing done in integers
251define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
252; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint:
253; GCN:       ; %bb.0:
254; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
255; GCN-NEXT:    s_waitcnt vmcnt(0)
256; GCN-NEXT:    ; return to shader part epilog
257  %zext.offset = zext i32 %voffset to i64
258  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
259  %add = add i64 %sbase.as.int, %zext.offset
260  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
261  %load = load i8, i8 addrspace(1)* %dirty.gep
262  %zext = zext i8 %load to i32
263  %to.vgpr = bitcast i32 %zext to float
264  ret float %to.vgpr
265}
266
267; zext forced to LHS of addressing expression
268define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
269; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
270; GCN:       ; %bb.0:
271; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
272; GCN-NEXT:    s_waitcnt vmcnt(0)
273; GCN-NEXT:    ; return to shader part epilog
274  %zext.offset = zext i32 %voffset to i64
275  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
276  %add = add i64 %zext.offset, %sbase.as.int
277  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
278  %load = load i8, i8 addrspace(1)* %dirty.gep
279  %zext = zext i8 %load to i32
280  %to.vgpr = bitcast i32 %zext to float
281  ret float %to.vgpr
282}
283
284; zext forced to LHS of addressing expression, with immediate offset
285define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
286; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
287; GCN:       ; %bb.0:
288; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
289; GCN-NEXT:    s_waitcnt vmcnt(0)
290; GCN-NEXT:    ; return to shader part epilog
291  %zext.offset = zext i32 %voffset to i64
292  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
293  %add = add i64 %zext.offset, %sbase.as.int
294  %add.immoffset = add i64 %add, 128
295  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
296  %load = load i8, i8 addrspace(1)* %dirty.gep
297  %zext = zext i8 %load to i32
298  %to.vgpr = bitcast i32 %zext to float
299  ret float %to.vgpr
300}
301
302; zext forced to LHS of addressing expression, with immediate offset in non-canonical position
303define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
304; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
305; GCN:       ; %bb.0:
306; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
307; GCN-NEXT:    s_waitcnt vmcnt(0)
308; GCN-NEXT:    ; return to shader part epilog
309  %zext.offset = zext i32 %voffset to i64
310  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
311  %add.immoffset = add i64 %sbase.as.int, 128
312  %add = add i64 %zext.offset, %add.immoffset
313  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
314  %load = load i8, i8 addrspace(1)* %dirty.gep
315  %zext = zext i8 %load to i32
316  %to.vgpr = bitcast i32 %zext to float
317  ret float %to.vgpr
318}
319
320; --------------------------------------------------------------------------------
321; Uniformity edge cases
322; --------------------------------------------------------------------------------
323
324@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
325
326; Base pointer is uniform, but also in VGPRs
327define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
328; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs:
329; GCN:       ; %bb.0:
330; GCN-NEXT:    v_mov_b32_e32 v1, 0
331; GCN-NEXT:    ds_read_b64 v[1:2], v1
332; GCN-NEXT:    s_waitcnt lgkmcnt(0)
333; GCN-NEXT:    v_readfirstlane_b32 s0, v1
334; GCN-NEXT:    v_readfirstlane_b32 s1, v2
335; GCN-NEXT:    s_nop 4
336; GCN-NEXT:    global_load_ubyte v0, v0, s[0:1]
337; GCN-NEXT:    s_waitcnt vmcnt(0)
338; GCN-NEXT:    ; return to shader part epilog
339  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
340  %zext.offset = zext i32 %voffset to i64
341  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
342  %load = load i8, i8 addrspace(1)* %gep0
343  %zext = zext i8 %load to i32
344  %to.vgpr = bitcast i32 %zext to float
345  ret float %to.vgpr
346}
347
348; Base pointer is uniform, but also in VGPRs, with imm offset
349define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
350; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset:
351; GCN:       ; %bb.0:
352; GCN-NEXT:    v_mov_b32_e32 v1, 0
353; GCN-NEXT:    ds_read_b64 v[1:2], v1
354; GCN-NEXT:    s_waitcnt lgkmcnt(0)
355; GCN-NEXT:    v_readfirstlane_b32 s0, v1
356; GCN-NEXT:    v_readfirstlane_b32 s1, v2
357; GCN-NEXT:    s_nop 4
358; GCN-NEXT:    global_load_ubyte v0, v0, s[0:1] offset:42
359; GCN-NEXT:    s_waitcnt vmcnt(0)
360; GCN-NEXT:    ; return to shader part epilog
361  %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
362  %zext.offset = zext i32 %voffset to i64
363  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
364  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
365  %load = load i8, i8 addrspace(1)* %gep1
366  %zext = zext i8 %load to i32
367  %to.vgpr = bitcast i32 %zext to float
368  ret float %to.vgpr
369}
370
371; Both 64-bit base and 32-bit offset are scalar
372define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
373; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset:
374; GCN:       ; %bb.0:
375; GCN-NEXT:    v_mov_b32_e32 v0, s4
376; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
377; GCN-NEXT:    s_waitcnt vmcnt(0)
378; GCN-NEXT:    ; return to shader part epilog
379  %zext.offset = zext i32 %soffset to i64
380  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
381  %load = load i8, i8 addrspace(1)* %gep0
382  %zext = zext i8 %load to i32
383  %to.vgpr = bitcast i32 %zext to float
384  ret float %to.vgpr
385}
386
387; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
388define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
389; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset:
390; GCN:       ; %bb.0:
391; GCN-NEXT:    v_mov_b32_e32 v0, s4
392; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-24
393; GCN-NEXT:    s_waitcnt vmcnt(0)
394; GCN-NEXT:    ; return to shader part epilog
395  %zext.offset = zext i32 %soffset to i64
396  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
397  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -24
398  %load = load i8, i8 addrspace(1)* %gep1
399  %zext = zext i8 %load to i32
400  %to.vgpr = bitcast i32 %zext to float
401  ret float %to.vgpr
402}
403
404; Both components uniform, zext forced to LHS of addressing expression
405define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
406; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
407; GCN:       ; %bb.0:
408; GCN-NEXT:    v_mov_b32_e32 v0, s4
409; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
410; GCN-NEXT:    s_waitcnt vmcnt(0)
411; GCN-NEXT:    ; return to shader part epilog
412  %zext.offset = zext i32 %soffset to i64
413  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
414  %add = add i64 %zext.offset, %sbase.as.int
415  %dirty.gep = inttoptr i64 %add to i8 addrspace(1)*
416  %load = load i8, i8 addrspace(1)* %dirty.gep
417  %zext = zext i8 %load to i32
418  %to.vgpr = bitcast i32 %zext to float
419  ret float %to.vgpr
420}
421
422; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
423define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) {
424; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
425; GCN:       ; %bb.0:
426; GCN-NEXT:    v_mov_b32_e32 v0, s4
427; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:128
428; GCN-NEXT:    s_waitcnt vmcnt(0)
429; GCN-NEXT:    ; return to shader part epilog
430  %zext.offset = zext i32 %soffset to i64
431  %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64
432  %add = add i64 %zext.offset, %sbase.as.int
433  %add.immoffset = add i64 %add, 128
434  %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)*
435  %load = load i8, i8 addrspace(1)* %dirty.gep
436  %zext = zext i8 %load to i32
437  %to.vgpr = bitcast i32 %zext to float
438  ret float %to.vgpr
439}
440
441; divergent 64-bit base, 32-bit scalar offset.
442define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
443; GFX9-LABEL: global_load_i8_vgpr64_sgpr32:
444; GFX9:       ; %bb.0:
445; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
446; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
447; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off
448; GFX9-NEXT:    s_waitcnt vmcnt(0)
449; GFX9-NEXT:    ; return to shader part epilog
450;
451; GFX10-LABEL: global_load_i8_vgpr64_sgpr32:
452; GFX10:       ; %bb.0:
453; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, v0, s2
454; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
455; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off
456; GFX10-NEXT:    s_waitcnt vmcnt(0)
457; GFX10-NEXT:    ; return to shader part epilog
458  %zext.offset = zext i32 %soffset to i64
459  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
460  %load = load i8, i8 addrspace(1)* %gep0
461  %zext = zext i8 %load to i32
462  %to.vgpr = bitcast i32 %zext to float
463  ret float %to.vgpr
464}
465
466; divergent 64-bit base, 32-bit scalar offset, with imm offset
467define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)* %vbase, i32 inreg %soffset) {
468; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
469; GFX9:       ; %bb.0:
470; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
471; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
472; GFX9-NEXT:    global_load_ubyte v0, v[0:1], off offset:4095
473; GFX9-NEXT:    s_waitcnt vmcnt(0)
474; GFX9-NEXT:    ; return to shader part epilog
475;
476; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095:
477; GFX10:       ; %bb.0:
478; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, v0, s2
479; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
480; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, 0x800, v0
481; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc
482; GFX10-NEXT:    global_load_ubyte v0, v[0:1], off offset:2047
483; GFX10-NEXT:    s_waitcnt vmcnt(0)
484; GFX10-NEXT:    ; return to shader part epilog
485  %zext.offset = zext i32 %soffset to i64
486  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset
487  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095
488  %load = load i8, i8 addrspace(1)* %gep1
489  %zext = zext i8 %load to i32
490  %to.vgpr = bitcast i32 %zext to float
491  ret float %to.vgpr
492}
493
494; --------------------------------------------------------------------------------
495; Natural addressing shifts with restricted range
496; --------------------------------------------------------------------------------
497
498; Cannot push the shift into 32-bits, and cannot match.
499define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
500; GFX9-LABEL: global_load_saddr_f32_natural_addressing:
501; GFX9:       ; %bb.0:
502; GFX9-NEXT:    global_load_dword v0, v[0:1], off
503; GFX9-NEXT:    v_mov_b32_e32 v1, 0
504; GFX9-NEXT:    v_mov_b32_e32 v2, s3
505; GFX9-NEXT:    s_waitcnt vmcnt(0)
506; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
507; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
508; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
509; GFX9-NEXT:    global_load_dword v0, v[0:1], off
510; GFX9-NEXT:    s_waitcnt vmcnt(0)
511; GFX9-NEXT:    ; return to shader part epilog
512;
513; GFX10-LABEL: global_load_saddr_f32_natural_addressing:
514; GFX10:       ; %bb.0:
515; GFX10-NEXT:    global_load_dword v0, v[0:1], off
516; GFX10-NEXT:    v_mov_b32_e32 v1, 0
517; GFX10-NEXT:    s_waitcnt vmcnt(0)
518; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
519; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, s2, v0
520; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
521; GFX10-NEXT:    global_load_dword v0, v[0:1], off
522; GFX10-NEXT:    s_waitcnt vmcnt(0)
523; GFX10-NEXT:    ; return to shader part epilog
524  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
525  %zext.offset = zext i32 %voffset to i64
526  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
527  %load = load float, float addrspace(1)* %gep
528  ret float %load
529}
530
531; Cannot push the shift into 32-bits, with an immediate offset.
532define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
533; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset:
534; GCN:       ; %bb.0:
535; GCN-NEXT:    global_load_dword v0, v[0:1], off
536; GCN-NEXT:    s_waitcnt vmcnt(0)
537; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:128
538; GCN-NEXT:    s_waitcnt vmcnt(0)
539; GCN-NEXT:    ; return to shader part epilog
540  %voffset = load i32, i32 addrspace(1)* %voffset.ptr
541  %zext.offset = zext i32 %voffset to i64
542  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
543  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 128
544  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
545  %load = load float, float addrspace(1)* %gep1.cast
546  ret float %load
547}
548
549; Range is sufficiently restricted to push the shift into 32-bits.
550define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
551; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range:
552; GCN:       ; %bb.0:
553; GCN-NEXT:    global_load_dword v0, v[0:1], off
554; GCN-NEXT:    s_waitcnt vmcnt(0)
555; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
556; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
557; GCN-NEXT:    s_waitcnt vmcnt(0)
558; GCN-NEXT:    ; return to shader part epilog
559  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
560  %zext.offset = zext i32 %voffset to i64
561  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
562  %load = load float, float addrspace(1)* %gep
563  ret float %load
564}
565
566; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset
567define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
568; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset:
569; GCN:       ; %bb.0:
570; GCN-NEXT:    global_load_dword v0, v[0:1], off
571; GCN-NEXT:    s_waitcnt vmcnt(0)
572; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
573; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:400
574; GCN-NEXT:    s_waitcnt vmcnt(0)
575; GCN-NEXT:    ; return to shader part epilog
576  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0
577  %zext.offset = zext i32 %voffset to i64
578  %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
579  %gep1 = getelementptr inbounds float, float addrspace(1)* %gep0, i64 100
580  %load = load float, float addrspace(1)* %gep1
581  ret float %load
582}
583
584; Range is 1 beyond the limit where we can move the shift into 32-bits.
585define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) {
586; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
587; GFX9:       ; %bb.0:
588; GFX9-NEXT:    global_load_dword v0, v[0:1], off
589; GFX9-NEXT:    v_mov_b32_e32 v1, 0
590; GFX9-NEXT:    v_mov_b32_e32 v2, s3
591; GFX9-NEXT:    s_waitcnt vmcnt(0)
592; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
593; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s2, v0
594; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
595; GFX9-NEXT:    global_load_dword v0, v[0:1], off
596; GFX9-NEXT:    s_waitcnt vmcnt(0)
597; GFX9-NEXT:    ; return to shader part epilog
598;
599; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large:
600; GFX10:       ; %bb.0:
601; GFX10-NEXT:    global_load_dword v0, v[0:1], off
602; GFX10-NEXT:    v_mov_b32_e32 v1, 0
603; GFX10-NEXT:    s_waitcnt vmcnt(0)
604; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
605; GFX10-NEXT:    v_add_co_u32_e64 v0, vcc, s2, v0
606; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
607; GFX10-NEXT:    global_load_dword v0, v[0:1], off
608; GFX10-NEXT:    s_waitcnt vmcnt(0)
609; GFX10-NEXT:    ; return to shader part epilog
610  %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1
611  %zext.offset = zext i32 %voffset to i64
612  %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset
613  %load = load float, float addrspace(1)* %gep
614  ret float %load
615}
616
617; --------------------------------------------------------------------------------
618; Stress various type loads
619; --------------------------------------------------------------------------------
620
621define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
622; GCN-LABEL: global_load_saddr_i16:
623; GCN:       ; %bb.0:
624; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
625; GCN-NEXT:    s_waitcnt vmcnt(0)
626; GCN-NEXT:    ; return to shader part epilog
627  %zext.offset = zext i32 %voffset to i64
628  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
629  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
630  %load = load i16, i16 addrspace(1)* %gep0.cast
631  %cast.load = bitcast i16 %load to half
632  ret half %cast.load
633}
634
635define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
636; GCN-LABEL: global_load_saddr_i16_immneg128:
637; GCN:       ; %bb.0:
638; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
639; GCN-NEXT:    s_waitcnt vmcnt(0)
640; GCN-NEXT:    ; return to shader part epilog
641  %zext.offset = zext i32 %voffset to i64
642  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
643  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
644  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
645  %load = load i16, i16 addrspace(1)* %gep1.cast
646  %cast.load = bitcast i16 %load to half
647  ret half %cast.load
648}
649
650define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
651; GCN-LABEL: global_load_saddr_f16:
652; GCN:       ; %bb.0:
653; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
654; GCN-NEXT:    s_waitcnt vmcnt(0)
655; GCN-NEXT:    ; return to shader part epilog
656  %zext.offset = zext i32 %voffset to i64
657  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
658  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)*
659  %load = load half, half addrspace(1)* %gep0.cast
660  ret half %load
661}
662
663define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
664; GCN-LABEL: global_load_saddr_f16_immneg128:
665; GCN:       ; %bb.0:
666; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
667; GCN-NEXT:    s_waitcnt vmcnt(0)
668; GCN-NEXT:    ; return to shader part epilog
669  %zext.offset = zext i32 %voffset to i64
670  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
671  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
672  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)*
673  %load = load half, half addrspace(1)* %gep1.cast
674  ret half %load
675}
676
677define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
678; GCN-LABEL: global_load_saddr_i32:
679; GCN:       ; %bb.0:
680; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
681; GCN-NEXT:    s_waitcnt vmcnt(0)
682; GCN-NEXT:    ; return to shader part epilog
683  %zext.offset = zext i32 %voffset to i64
684  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
685  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
686  %load = load i32, i32 addrspace(1)* %gep0.cast
687  %cast.load = bitcast i32 %load to float
688  ret float %cast.load
689}
690
691define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
692; GCN-LABEL: global_load_saddr_i32_immneg128:
693; GCN:       ; %bb.0:
694; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
695; GCN-NEXT:    s_waitcnt vmcnt(0)
696; GCN-NEXT:    ; return to shader part epilog
697  %zext.offset = zext i32 %voffset to i64
698  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
699  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
700  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
701  %load = load i32, i32 addrspace(1)* %gep1.cast
702  %cast.load = bitcast i32 %load to float
703  ret float %cast.load
704}
705
706define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
707; GCN-LABEL: global_load_saddr_f32:
708; GCN:       ; %bb.0:
709; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
710; GCN-NEXT:    s_waitcnt vmcnt(0)
711; GCN-NEXT:    ; return to shader part epilog
712  %zext.offset = zext i32 %voffset to i64
713  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
714  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
715  %load = load float, float addrspace(1)* %gep0.cast
716  ret float %load
717}
718
719define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
720; GCN-LABEL: global_load_saddr_f32_immneg128:
721; GCN:       ; %bb.0:
722; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
723; GCN-NEXT:    s_waitcnt vmcnt(0)
724; GCN-NEXT:    ; return to shader part epilog
725  %zext.offset = zext i32 %voffset to i64
726  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
727  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
728  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
729  %load = load float, float addrspace(1)* %gep1.cast
730  ret float %load
731}
732
733define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
734; GCN-LABEL: global_load_saddr_v2i16:
735; GCN:       ; %bb.0:
736; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
737; GCN-NEXT:    s_waitcnt vmcnt(0)
738; GCN-NEXT:    ; return to shader part epilog
739  %zext.offset = zext i32 %voffset to i64
740  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
741  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)*
742  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep0.cast
743  %cast.load = bitcast <2 x i16> %load to <2 x half>
744  ret <2 x half> %cast.load
745}
746
747define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
748; GCN-LABEL: global_load_saddr_v2i16_immneg128:
749; GCN:       ; %bb.0:
750; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
751; GCN-NEXT:    s_waitcnt vmcnt(0)
752; GCN-NEXT:    ; return to shader part epilog
753  %zext.offset = zext i32 %voffset to i64
754  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
755  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
756  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i16> addrspace(1)*
757  %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep1.cast
758  %cast.load = bitcast <2 x i16> %load to <2 x half>
759  ret <2 x half> %cast.load
760}
761
762define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
763; GCN-LABEL: global_load_saddr_v2f16:
764; GCN:       ; %bb.0:
765; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
766; GCN-NEXT:    s_waitcnt vmcnt(0)
767; GCN-NEXT:    ; return to shader part epilog
768  %zext.offset = zext i32 %voffset to i64
769  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
770  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
771  %load = load <2 x half>, <2 x half> addrspace(1)* %gep0.cast
772  ret <2 x half> %load
773}
774
775define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
776; GCN-LABEL: global_load_saddr_v2f16_immneg128:
777; GCN:       ; %bb.0:
778; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
779; GCN-NEXT:    s_waitcnt vmcnt(0)
780; GCN-NEXT:    ; return to shader part epilog
781  %zext.offset = zext i32 %voffset to i64
782  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
783  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
784  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
785  %load = load <2 x half>, <2 x half> addrspace(1)* %gep1.cast
786  ret <2 x half> %load
787}
788
789define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
790; GCN-LABEL: global_load_saddr_p3:
791; GCN:       ; %bb.0:
792; GCN-NEXT:    global_load_dword v0, v0, s[2:3]
793; GCN-NEXT:    s_waitcnt vmcnt(0)
794; GCN-NEXT:    ; return to shader part epilog
795  %zext.offset = zext i32 %voffset to i64
796  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
797  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)*
798  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep0.cast
799  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
800  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
801  ret <2 x half> %cast.load1
802}
803
804define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
805; GCN-LABEL: global_load_saddr_p3_immneg128:
806; GCN:       ; %bb.0:
807; GCN-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128
808; GCN-NEXT:    s_waitcnt vmcnt(0)
809; GCN-NEXT:    ; return to shader part epilog
810  %zext.offset = zext i32 %voffset to i64
811  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
812  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
813  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)*
814  %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep1.cast
815  %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32
816  %cast.load1 = bitcast i32 %cast.load0 to <2 x half>
817  ret <2 x half> %cast.load1
818}
819
820define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
821; GCN-LABEL: global_load_saddr_f64:
822; GCN:       ; %bb.0:
823; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
824; GCN-NEXT:    s_waitcnt vmcnt(0)
825; GCN-NEXT:    ; return to shader part epilog
826  %zext.offset = zext i32 %voffset to i64
827  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
828  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)*
829  %load = load double, double addrspace(1)* %gep0.cast
830  %cast.load = bitcast double %load to <2 x float>
831  ret <2 x float> %cast.load
832}
833
834define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
835; GCN-LABEL: global_load_saddr_f64_immneg128:
836; GCN:       ; %bb.0:
837; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
838; GCN-NEXT:    s_waitcnt vmcnt(0)
839; GCN-NEXT:    ; return to shader part epilog
840  %zext.offset = zext i32 %voffset to i64
841  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
842  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
843  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)*
844  %load = load double, double addrspace(1)* %gep1.cast
845  %cast.load = bitcast double %load to <2 x float>
846  ret <2 x float> %cast.load
847}
848
849define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
850; GCN-LABEL: global_load_saddr_i64:
851; GCN:       ; %bb.0:
852; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
853; GCN-NEXT:    s_waitcnt vmcnt(0)
854; GCN-NEXT:    ; return to shader part epilog
855  %zext.offset = zext i32 %voffset to i64
856  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
857  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
858  %load = load i64, i64 addrspace(1)* %gep0.cast
859  %cast.load = bitcast i64 %load to <2 x float>
860  ret <2 x float> %cast.load
861}
862
863define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
864; GCN-LABEL: global_load_saddr_i64_immneg128:
865; GCN:       ; %bb.0:
866; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
867; GCN-NEXT:    s_waitcnt vmcnt(0)
868; GCN-NEXT:    ; return to shader part epilog
869  %zext.offset = zext i32 %voffset to i64
870  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
871  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
872  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
873  %load = load i64, i64 addrspace(1)* %gep1.cast
874  %cast.load = bitcast i64 %load to <2 x float>
875  ret <2 x float> %cast.load
876}
877
878define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
879; GCN-LABEL: global_load_saddr_v2f32:
880; GCN:       ; %bb.0:
881; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
882; GCN-NEXT:    s_waitcnt vmcnt(0)
883; GCN-NEXT:    ; return to shader part epilog
884  %zext.offset = zext i32 %voffset to i64
885  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
886  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)*
887  %load = load <2 x float>, <2 x float> addrspace(1)* %gep0.cast
888  ret <2 x float> %load
889}
890
891define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
892; GCN-LABEL: global_load_saddr_v2f32_immneg128:
893; GCN:       ; %bb.0:
894; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
895; GCN-NEXT:    s_waitcnt vmcnt(0)
896; GCN-NEXT:    ; return to shader part epilog
897  %zext.offset = zext i32 %voffset to i64
898  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
899  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
900  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)*
901  %load = load <2 x float>, <2 x float> addrspace(1)* %gep1.cast
902  ret <2 x float> %load
903}
904
905define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
906; GCN-LABEL: global_load_saddr_v2i32:
907; GCN:       ; %bb.0:
908; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
909; GCN-NEXT:    s_waitcnt vmcnt(0)
910; GCN-NEXT:    ; return to shader part epilog
911  %zext.offset = zext i32 %voffset to i64
912  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
913  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)*
914  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep0.cast
915  %cast.load = bitcast <2 x i32> %load to <2 x float>
916  ret <2 x float> %cast.load
917}
918
919define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
920; GCN-LABEL: global_load_saddr_v2i32_immneg128:
921; GCN:       ; %bb.0:
922; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
923; GCN-NEXT:    s_waitcnt vmcnt(0)
924; GCN-NEXT:    ; return to shader part epilog
925  %zext.offset = zext i32 %voffset to i64
926  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
927  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
928  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)*
929  %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep1.cast
930  %cast.load = bitcast <2 x i32> %load to <2 x float>
931  ret <2 x float> %cast.load
932}
933
934define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
935; GCN-LABEL: global_load_saddr_v4i16:
936; GCN:       ; %bb.0:
937; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
938; GCN-NEXT:    s_waitcnt vmcnt(0)
939; GCN-NEXT:    ; return to shader part epilog
940  %zext.offset = zext i32 %voffset to i64
941  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
942  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)*
943  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep0.cast
944  %cast.load = bitcast <4 x i16> %load to <2 x float>
945  ret <2 x float> %cast.load
946}
947
948define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
949; GCN-LABEL: global_load_saddr_v4i16_immneg128:
950; GCN:       ; %bb.0:
951; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
952; GCN-NEXT:    s_waitcnt vmcnt(0)
953; GCN-NEXT:    ; return to shader part epilog
954  %zext.offset = zext i32 %voffset to i64
955  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
956  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
957  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)*
958  %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep1.cast
959  %cast.load = bitcast <4 x i16> %load to <2 x float>
960  ret <2 x float> %cast.load
961}
962
963define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
964; GCN-LABEL: global_load_saddr_v4f16:
965; GCN:       ; %bb.0:
966; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
967; GCN-NEXT:    s_waitcnt vmcnt(0)
968; GCN-NEXT:    ; return to shader part epilog
969  %zext.offset = zext i32 %voffset to i64
970  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
971  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)*
972  %load = load <4 x half>, <4 x half> addrspace(1)* %gep0.cast
973  %cast.load = bitcast <4 x half> %load to <2 x float>
974  ret <2 x float> %cast.load
975}
976
977define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
978; GCN-LABEL: global_load_saddr_v4f16_immneg128:
979; GCN:       ; %bb.0:
980; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
981; GCN-NEXT:    s_waitcnt vmcnt(0)
982; GCN-NEXT:    ; return to shader part epilog
983  %zext.offset = zext i32 %voffset to i64
984  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
985  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
986  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)*
987  %load = load <4 x half>, <4 x half> addrspace(1)* %gep1.cast
988  %cast.load = bitcast <4 x half> %load to <2 x float>
989  ret <2 x float> %cast.load
990}
991
992define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
993; GCN-LABEL: global_load_saddr_p1:
994; GCN:       ; %bb.0:
995; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
996; GCN-NEXT:    s_waitcnt vmcnt(0)
997; GCN-NEXT:    ; return to shader part epilog
998  %zext.offset = zext i32 %voffset to i64
999  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1000  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)*
1001  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep0.cast
1002  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
1003  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
1004  ret <2 x float> %cast.load1
1005}
1006
1007define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1008; GCN-LABEL: global_load_saddr_p1_immneg128:
1009; GCN:       ; %bb.0:
1010; GCN-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128
1011; GCN-NEXT:    s_waitcnt vmcnt(0)
1012; GCN-NEXT:    ; return to shader part epilog
1013  %zext.offset = zext i32 %voffset to i64
1014  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1015  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1016  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)*
1017  %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep1.cast
1018  %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64
1019  %cast.load1 = bitcast i64 %cast.load0 to <2 x float>
1020  ret <2 x float> %cast.load1
1021}
1022
1023define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1024; GCN-LABEL: global_load_saddr_v3f32:
1025; GCN:       ; %bb.0:
1026; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
1027; GCN-NEXT:    s_waitcnt vmcnt(0)
1028; GCN-NEXT:    ; return to shader part epilog
1029  %zext.offset = zext i32 %voffset to i64
1030  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1031  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)*
1032  %load = load <3 x float>, <3 x float> addrspace(1)* %gep0.cast
1033  ret <3 x float> %load
1034}
1035
1036define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1037; GCN-LABEL: global_load_saddr_v3f32_immneg128:
1038; GCN:       ; %bb.0:
1039; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
1040; GCN-NEXT:    s_waitcnt vmcnt(0)
1041; GCN-NEXT:    ; return to shader part epilog
1042  %zext.offset = zext i32 %voffset to i64
1043  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1044  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1045  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)*
1046  %load = load <3 x float>, <3 x float> addrspace(1)* %gep1.cast
1047  ret <3 x float> %load
1048}
1049
1050define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1051; GCN-LABEL: global_load_saddr_v3i32:
1052; GCN:       ; %bb.0:
1053; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
1054; GCN-NEXT:    s_waitcnt vmcnt(0)
1055; GCN-NEXT:    ; return to shader part epilog
1056  %zext.offset = zext i32 %voffset to i64
1057  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1058  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)*
1059  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep0.cast
1060  %cast.load = bitcast <3 x i32> %load to <3 x float>
1061  ret <3 x float> %cast.load
1062}
1063
1064define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1065; GCN-LABEL: global_load_saddr_v3i32_immneg128:
1066; GCN:       ; %bb.0:
1067; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
1068; GCN-NEXT:    s_waitcnt vmcnt(0)
1069; GCN-NEXT:    ; return to shader part epilog
1070  %zext.offset = zext i32 %voffset to i64
1071  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1072  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1073  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)*
1074  %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast
1075  %cast.load = bitcast <3 x i32> %load to <3 x float>
1076  ret <3 x float> %cast.load
1077}
1078
1079define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1080; GCN-LABEL: global_load_saddr_v6f16:
1081; GCN:       ; %bb.0:
1082; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3]
1083; GCN-NEXT:    s_waitcnt vmcnt(0)
1084; GCN-NEXT:    ; return to shader part epilog
1085  %zext.offset = zext i32 %voffset to i64
1086  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1087  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)*
1088  %load = load <6 x half>, <6 x half> addrspace(1)* %gep0.cast
1089  ret <6 x half> %load
1090}
1091
1092define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
1093; GCN-LABEL: global_load_saddr_v6f16_immneg128:
1094; GCN:       ; %bb.0:
1095; GCN-NEXT:    global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128
1096; GCN-NEXT:    s_waitcnt vmcnt(0)
1097; GCN-NEXT:    ; return to shader part epilog
1098  %zext.offset = zext i32 %voffset to i64
1099  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1100  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1101  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)*
1102  %load = load <6 x half>, <6 x half> addrspace(1)* %gep1.cast
1103  ret <6 x half> %load
1104}
1105
define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)*
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep0.cast
  ret <4 x float> %load
}

define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4f32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)*
  %load = load <4 x float>, <4 x float> addrspace(1)* %gep1.cast
  ret <4 x float> %load
}

define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)*
  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep0.cast
  %cast.load = bitcast <4 x i32> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4i32_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)*
  %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1.cast
  %cast.load = bitcast <4 x i32> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i64:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)*
  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep0.cast
  %cast.load = bitcast <2 x i64> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2i64_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)*
  %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep1.cast
  %cast.load = bitcast <2 x i64> %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)*
  %load = load i128, i128 addrspace(1)* %gep0.cast
  %cast.load = bitcast i128 %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i128_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i128 addrspace(1)*
  %load = load i128, i128 addrspace(1)* %gep1.cast
  %cast.load = bitcast i128 %load to <4 x float>
  ret <4 x float> %cast.load
}

define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2p1:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)*
  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast
  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v2p1_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)*
  %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast
  %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64>
  %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4p3:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)*
  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast
  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_v4p3_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)*
  %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast
  %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32>
  %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float>
  ret <4 x float> %cast.load1
}

; --------------------------------------------------------------------------------
; Extending loads
; --------------------------------------------------------------------------------
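; Sign- and zero-extending sub-dword loads should select the sbyte/ubyte and
; sshort/ushort saddr forms, folding the -128 immediate offset where present.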

define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %sextload = sext i8 %load to i32
  %cast.load = bitcast i32 %sextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i8_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %load = load i8, i8 addrspace(1)* %gep1
  %sextload = sext i8 %load to i32
  %cast.load = bitcast i32 %sextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sshort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %sextload = sext i16 %load to i32
  %cast.load = bitcast i32 %sextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_sextload_saddr_i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sshort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %sextload = sext i16 %load to i32
  %cast.load = bitcast i32 %sextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %load = load i8, i8 addrspace(1)* %gep0
  %zextload = zext i8 %load to i32
  %cast.load = bitcast i32 %zextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i8_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %load = load i8, i8 addrspace(1)* %gep1
  %zextload = zext i8 %load to i32
  %cast.load = bitcast i32 %zextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %zextload = zext i16 %load to i32
  %cast.load = bitcast i32 %zextload to float
  ret float %cast.load
}

define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_zextload_saddr_i16_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ushort v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %zextload = zext i16 %load to i32
  %cast.load = bitcast i32 %zextload to float
  ret float %cast.load
}

; --------------------------------------------------------------------------------
; Atomic load
; --------------------------------------------------------------------------------
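; seq_cst atomic loads should still use the saddr form; the load is marked glc
; (plus dlc on gfx10) and is bracketed by the required waits and cache invalidates.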

define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
  %load = load atomic i32, i32 addrspace(1)* %gep0.cast seq_cst, align 4
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}

define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dword v0, v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
  %load = load atomic i32, i32 addrspace(1)* %gep1.cast seq_cst, align 4
  %cast.load = bitcast i32 %load to float
  ret float %cast.load
}

define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i64:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
  %load = load atomic i64, i64 addrspace(1)* %gep0.cast seq_cst, align 8
  %cast.load = bitcast i64 %load to <2 x float>
  ret <2 x float> %cast.load
}

define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
  %load = load atomic i64, i64 addrspace(1)* %gep1.cast seq_cst, align 8
  %cast.load = bitcast i64 %load to <2 x float>
  ret <2 x float> %cast.load
}

; --------------------------------------------------------------------------------
; D16 load (low 16)
; --------------------------------------------------------------------------------
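; Loads inserted into the low element of a <2 x i16> should select the d16
; (low half) load variants, leaving the high 16 bits of the destination VGPR intact.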

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16 v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> undef, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> %reg, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> %reg, i16 %load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

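; Byte loads extended to i16 should select the ubyte_d16/sbyte_d16 forms.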
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep0.cast
  %zext.load = zext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep1.cast
  %zext.load = zext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep0.cast
  %sext.load = sext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte_d16 v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep1.cast
  %sext.load = sext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

; --------------------------------------------------------------------------------
; D16 hi load (hi16)
; --------------------------------------------------------------------------------
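; Inserting into the high element of a <2 x i16> should select the _d16_hi
; variants, which write the high 16 bits and preserve the low half.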

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> undef, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v0, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> undef, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep0.cast
  %build = insertelement <2 x i16> %reg, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_short_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
  %load = load i16, i16 addrspace(1)* %gep1.cast
  %build = insertelement <2 x i16> %reg, i16 %load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

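; Byte loads extended to i16 should select the ubyte_d16_hi/sbyte_d16_hi forms.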
define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep0.cast
  %zext.load = zext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep1.cast
  %zext.load = zext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep0.cast
  %sext.load = sext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) {
; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
; GCN:       ; %bb.0:
; GCN-NEXT:    global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, v1
; GCN-NEXT:    ; return to shader part epilog
  %zext.offset = zext i32 %voffset to i64
  %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
  %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
  %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)*
  %load = load i8, i8 addrspace(1)* %gep1.cast
  %sext.load = sext i8 %load to i16
  %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1
  %cast = bitcast <2 x i16> %build to <2 x half>
  ret <2 x half> %cast
}

!0 = !{i32 0, i32 1073741824} ; (1 << 30)
!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1