1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
3; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
4
; Baseline gather4 test: 2D image, first operand (dmask) 1, two float
; coordinates %s and %t, trailing operands `i1 false, i32 0, i32 0`.
; The checks for both prefixes expect the inreg arguments in s2..s13 to be
; copied down into s[0:7] (resource) and s[8:11] (sampler), exec to be saved,
; rewritten by s_wqm and then ANDed with the saved copy before the
; image_gather4. The GFX10NSA output additionally carries dim:SQ_RSRC_IMG_2D.
5define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
6; GFX6-LABEL: gather4_2d:
7; GFX6:       ; %bb.0: ; %main_body
8; GFX6-NEXT:    s_mov_b64 s[14:15], exec
9; GFX6-NEXT:    s_mov_b32 s0, s2
10; GFX6-NEXT:    s_mov_b32 s1, s3
11; GFX6-NEXT:    s_mov_b32 s2, s4
12; GFX6-NEXT:    s_mov_b32 s3, s5
13; GFX6-NEXT:    s_mov_b32 s4, s6
14; GFX6-NEXT:    s_mov_b32 s5, s7
15; GFX6-NEXT:    s_mov_b32 s6, s8
16; GFX6-NEXT:    s_mov_b32 s7, s9
17; GFX6-NEXT:    s_mov_b32 s8, s10
18; GFX6-NEXT:    s_mov_b32 s9, s11
19; GFX6-NEXT:    s_mov_b32 s10, s12
20; GFX6-NEXT:    s_mov_b32 s11, s13
21; GFX6-NEXT:    s_wqm_b64 exec, exec
22; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
23; GFX6-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1
24; GFX6-NEXT:    s_waitcnt vmcnt(0)
25; GFX6-NEXT:    ; return to shader part epilog
26;
27; GFX10NSA-LABEL: gather4_2d:
28; GFX10NSA:       ; %bb.0: ; %main_body
29; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
30; GFX10NSA-NEXT:    s_mov_b32 s0, s2
31; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
32; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
33; GFX10NSA-NEXT:    s_mov_b32 s1, s3
34; GFX10NSA-NEXT:    s_mov_b32 s2, s4
35; GFX10NSA-NEXT:    s_mov_b32 s3, s5
36; GFX10NSA-NEXT:    s_mov_b32 s4, s6
37; GFX10NSA-NEXT:    s_mov_b32 s5, s7
38; GFX10NSA-NEXT:    s_mov_b32 s6, s8
39; GFX10NSA-NEXT:    s_mov_b32 s7, s9
40; GFX10NSA-NEXT:    s_mov_b32 s8, s10
41; GFX10NSA-NEXT:    s_mov_b32 s9, s11
42; GFX10NSA-NEXT:    s_mov_b32 s10, s12
43; GFX10NSA-NEXT:    s_mov_b32 s11, s13
44; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
45; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
46; GFX10NSA-NEXT:    ; return to shader part epilog
47main_body:
48  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
49  ret <4 x float> %v
50}
51
; 2D gather4 using the struct-returning form of the intrinsic
; ({ <4 x float>, i32 }) with the second-to-last i32 operand set to 1.
; Only element 0 of the struct (the vector) is returned from the shader.
; The checks expect a `tfe` modifier on the instruction, a five-register
; result v[0:4] whose registers are all written with 0 before the gather, and
; the coordinates moved out of the way into v[5:6].
; NOTE(review): the i32 operand set to 1 is presumably the texfailctrl
; argument (bit 0 = TFE) - confirm against the intrinsic definition.
52define amdgpu_ps <4 x float> @gather4_2d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
53; GFX6-LABEL: gather4_2d_tfe:
54; GFX6:       ; %bb.0: ; %main_body
55; GFX6-NEXT:    s_mov_b64 s[14:15], exec
56; GFX6-NEXT:    s_mov_b32 s0, s2
57; GFX6-NEXT:    s_mov_b32 s1, s3
58; GFX6-NEXT:    s_mov_b32 s2, s4
59; GFX6-NEXT:    s_mov_b32 s3, s5
60; GFX6-NEXT:    s_mov_b32 s4, s6
61; GFX6-NEXT:    s_mov_b32 s5, s7
62; GFX6-NEXT:    s_mov_b32 s6, s8
63; GFX6-NEXT:    s_mov_b32 s7, s9
64; GFX6-NEXT:    s_mov_b32 s8, s10
65; GFX6-NEXT:    s_mov_b32 s9, s11
66; GFX6-NEXT:    s_mov_b32 s10, s12
67; GFX6-NEXT:    s_mov_b32 s11, s13
68; GFX6-NEXT:    s_wqm_b64 exec, exec
69; GFX6-NEXT:    v_mov_b32_e32 v5, v0
70; GFX6-NEXT:    v_mov_b32_e32 v0, 0
71; GFX6-NEXT:    v_mov_b32_e32 v6, v1
72; GFX6-NEXT:    v_mov_b32_e32 v1, v0
73; GFX6-NEXT:    v_mov_b32_e32 v2, v0
74; GFX6-NEXT:    v_mov_b32_e32 v3, v0
75; GFX6-NEXT:    v_mov_b32_e32 v4, v0
76; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
77; GFX6-NEXT:    image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 tfe
78; GFX6-NEXT:    s_waitcnt vmcnt(0)
79; GFX6-NEXT:    ; return to shader part epilog
80;
81; GFX10NSA-LABEL: gather4_2d_tfe:
82; GFX10NSA:       ; %bb.0: ; %main_body
83; GFX10NSA-NEXT:    s_mov_b32 s14, exec_lo
84; GFX10NSA-NEXT:    s_mov_b32 s0, s2
85; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
86; GFX10NSA-NEXT:    v_mov_b32_e32 v5, v0
87; GFX10NSA-NEXT:    v_mov_b32_e32 v0, 0
88; GFX10NSA-NEXT:    v_mov_b32_e32 v6, v1
89; GFX10NSA-NEXT:    s_mov_b32 s1, s3
90; GFX10NSA-NEXT:    s_mov_b32 s2, s4
91; GFX10NSA-NEXT:    s_mov_b32 s3, s5
92; GFX10NSA-NEXT:    s_mov_b32 s4, s6
93; GFX10NSA-NEXT:    s_mov_b32 s5, s7
94; GFX10NSA-NEXT:    s_mov_b32 s6, s8
95; GFX10NSA-NEXT:    s_mov_b32 s7, s9
96; GFX10NSA-NEXT:    s_mov_b32 s8, s10
97; GFX10NSA-NEXT:    s_mov_b32 s9, s11
98; GFX10NSA-NEXT:    s_mov_b32 s10, s12
99; GFX10NSA-NEXT:    s_mov_b32 s11, s13
100; GFX10NSA-NEXT:    v_mov_b32_e32 v1, v0
101; GFX10NSA-NEXT:    v_mov_b32_e32 v2, v0
102; GFX10NSA-NEXT:    v_mov_b32_e32 v3, v0
103; GFX10NSA-NEXT:    v_mov_b32_e32 v4, v0
104; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s14
105; GFX10NSA-NEXT:    image_gather4 v[0:4], v[5:6], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe
106; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
107; GFX10NSA-NEXT:    ; return to shader part epilog
108main_body:
109  %v = call { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
110  %r = extractvalue { <4 x float>, i32 } %v, 0
111  ret <4 x float> %r
112}
113
; gather4 on a cube image: three float address operands (%s, %t, %face), so
; the checks expect a three-register address v[0:2]. The GFX6 output carries
; the `da` modifier; the GFX10NSA output carries dim:SQ_RSRC_IMG_CUBE instead.
; Both outputs save exec, apply s_wqm and AND exec with the saved copy before
; the gather.
114define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %face) {
115; GFX6-LABEL: gather4_cube:
116; GFX6:       ; %bb.0: ; %main_body
117; GFX6-NEXT:    s_mov_b64 s[14:15], exec
118; GFX6-NEXT:    s_mov_b32 s0, s2
119; GFX6-NEXT:    s_mov_b32 s1, s3
120; GFX6-NEXT:    s_mov_b32 s2, s4
121; GFX6-NEXT:    s_mov_b32 s3, s5
122; GFX6-NEXT:    s_mov_b32 s4, s6
123; GFX6-NEXT:    s_mov_b32 s5, s7
124; GFX6-NEXT:    s_mov_b32 s6, s8
125; GFX6-NEXT:    s_mov_b32 s7, s9
126; GFX6-NEXT:    s_mov_b32 s8, s10
127; GFX6-NEXT:    s_mov_b32 s9, s11
128; GFX6-NEXT:    s_mov_b32 s10, s12
129; GFX6-NEXT:    s_mov_b32 s11, s13
130; GFX6-NEXT:    s_wqm_b64 exec, exec
131; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
132; GFX6-NEXT:    image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da
133; GFX6-NEXT:    s_waitcnt vmcnt(0)
134; GFX6-NEXT:    ; return to shader part epilog
135;
136; GFX10NSA-LABEL: gather4_cube:
137; GFX10NSA:       ; %bb.0: ; %main_body
138; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
139; GFX10NSA-NEXT:    s_mov_b32 s0, s2
140; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
141; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
142; GFX10NSA-NEXT:    s_mov_b32 s1, s3
143; GFX10NSA-NEXT:    s_mov_b32 s2, s4
144; GFX10NSA-NEXT:    s_mov_b32 s3, s5
145; GFX10NSA-NEXT:    s_mov_b32 s4, s6
146; GFX10NSA-NEXT:    s_mov_b32 s5, s7
147; GFX10NSA-NEXT:    s_mov_b32 s6, s8
148; GFX10NSA-NEXT:    s_mov_b32 s7, s9
149; GFX10NSA-NEXT:    s_mov_b32 s8, s10
150; GFX10NSA-NEXT:    s_mov_b32 s9, s11
151; GFX10NSA-NEXT:    s_mov_b32 s10, s12
152; GFX10NSA-NEXT:    s_mov_b32 s11, s13
153; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
154; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
155; GFX10NSA-NEXT:    ; return to shader part epilog
156main_body:
157  %v = call <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 1, float %s, float %t, float %face, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
158  ret <4 x float> %v
159}
160
; gather4 on a 2D array image: three float address operands (%s, %t, %slice),
; so the checks expect a three-register address v[0:2]. The GFX6 output
; carries the `da` modifier; the GFX10NSA output carries
; dim:SQ_RSRC_IMG_2D_ARRAY instead. Both outputs save exec, apply s_wqm and
; AND exec with the saved copy before the gather.
161define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %slice) {
162; GFX6-LABEL: gather4_2darray:
163; GFX6:       ; %bb.0: ; %main_body
164; GFX6-NEXT:    s_mov_b64 s[14:15], exec
165; GFX6-NEXT:    s_mov_b32 s0, s2
166; GFX6-NEXT:    s_mov_b32 s1, s3
167; GFX6-NEXT:    s_mov_b32 s2, s4
168; GFX6-NEXT:    s_mov_b32 s3, s5
169; GFX6-NEXT:    s_mov_b32 s4, s6
170; GFX6-NEXT:    s_mov_b32 s5, s7
171; GFX6-NEXT:    s_mov_b32 s6, s8
172; GFX6-NEXT:    s_mov_b32 s7, s9
173; GFX6-NEXT:    s_mov_b32 s8, s10
174; GFX6-NEXT:    s_mov_b32 s9, s11
175; GFX6-NEXT:    s_mov_b32 s10, s12
176; GFX6-NEXT:    s_mov_b32 s11, s13
177; GFX6-NEXT:    s_wqm_b64 exec, exec
178; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
179; GFX6-NEXT:    image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 da
180; GFX6-NEXT:    s_waitcnt vmcnt(0)
181; GFX6-NEXT:    ; return to shader part epilog
182;
183; GFX10NSA-LABEL: gather4_2darray:
184; GFX10NSA:       ; %bb.0: ; %main_body
185; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
186; GFX10NSA-NEXT:    s_mov_b32 s0, s2
187; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
188; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
189; GFX10NSA-NEXT:    s_mov_b32 s1, s3
190; GFX10NSA-NEXT:    s_mov_b32 s2, s4
191; GFX10NSA-NEXT:    s_mov_b32 s3, s5
192; GFX10NSA-NEXT:    s_mov_b32 s4, s6
193; GFX10NSA-NEXT:    s_mov_b32 s5, s7
194; GFX10NSA-NEXT:    s_mov_b32 s6, s8
195; GFX10NSA-NEXT:    s_mov_b32 s7, s9
196; GFX10NSA-NEXT:    s_mov_b32 s8, s10
197; GFX10NSA-NEXT:    s_mov_b32 s9, s11
198; GFX10NSA-NEXT:    s_mov_b32 s10, s12
199; GFX10NSA-NEXT:    s_mov_b32 s11, s13
200; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
201; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
202; GFX10NSA-NEXT:    ; return to shader part epilog
203main_body:
204  %v = call <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 1, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
205  ret <4 x float> %v
206}
207
; gather4.c variant on a 2D image: %zcompare is passed ahead of the %s/%t
; coordinates, giving three address operands. The checks expect the
; image_gather4_c opcode with a three-register address v[0:2], preceded by the
; save-exec / s_wqm / AND-exec sequence. The GFX10NSA output adds
; dim:SQ_RSRC_IMG_2D.
208define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
209; GFX6-LABEL: gather4_c_2d:
210; GFX6:       ; %bb.0: ; %main_body
211; GFX6-NEXT:    s_mov_b64 s[14:15], exec
212; GFX6-NEXT:    s_mov_b32 s0, s2
213; GFX6-NEXT:    s_mov_b32 s1, s3
214; GFX6-NEXT:    s_mov_b32 s2, s4
215; GFX6-NEXT:    s_mov_b32 s3, s5
216; GFX6-NEXT:    s_mov_b32 s4, s6
217; GFX6-NEXT:    s_mov_b32 s5, s7
218; GFX6-NEXT:    s_mov_b32 s6, s8
219; GFX6-NEXT:    s_mov_b32 s7, s9
220; GFX6-NEXT:    s_mov_b32 s8, s10
221; GFX6-NEXT:    s_mov_b32 s9, s11
222; GFX6-NEXT:    s_mov_b32 s10, s12
223; GFX6-NEXT:    s_mov_b32 s11, s13
224; GFX6-NEXT:    s_wqm_b64 exec, exec
225; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
226; GFX6-NEXT:    image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
227; GFX6-NEXT:    s_waitcnt vmcnt(0)
228; GFX6-NEXT:    ; return to shader part epilog
229;
230; GFX10NSA-LABEL: gather4_c_2d:
231; GFX10NSA:       ; %bb.0: ; %main_body
232; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
233; GFX10NSA-NEXT:    s_mov_b32 s0, s2
234; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
235; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
236; GFX10NSA-NEXT:    s_mov_b32 s1, s3
237; GFX10NSA-NEXT:    s_mov_b32 s2, s4
238; GFX10NSA-NEXT:    s_mov_b32 s3, s5
239; GFX10NSA-NEXT:    s_mov_b32 s4, s6
240; GFX10NSA-NEXT:    s_mov_b32 s5, s7
241; GFX10NSA-NEXT:    s_mov_b32 s6, s8
242; GFX10NSA-NEXT:    s_mov_b32 s7, s9
243; GFX10NSA-NEXT:    s_mov_b32 s8, s10
244; GFX10NSA-NEXT:    s_mov_b32 s9, s11
245; GFX10NSA-NEXT:    s_mov_b32 s10, s12
246; GFX10NSA-NEXT:    s_mov_b32 s11, s13
247; GFX10NSA-NEXT:    image_gather4_c v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
248; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
249; GFX10NSA-NEXT:    ; return to shader part epilog
250main_body:
251  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
252  ret <4 x float> %v
253}
254
; gather4.cl variant on a 2D image: %clamp is passed after the %s/%t
; coordinates, giving three address operands. The checks expect the
; image_gather4_cl opcode with a three-register address v[0:2], preceded by
; the save-exec / s_wqm / AND-exec sequence. The GFX10NSA output adds
; dim:SQ_RSRC_IMG_2D.
255define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %clamp) {
256; GFX6-LABEL: gather4_cl_2d:
257; GFX6:       ; %bb.0: ; %main_body
258; GFX6-NEXT:    s_mov_b64 s[14:15], exec
259; GFX6-NEXT:    s_mov_b32 s0, s2
260; GFX6-NEXT:    s_mov_b32 s1, s3
261; GFX6-NEXT:    s_mov_b32 s2, s4
262; GFX6-NEXT:    s_mov_b32 s3, s5
263; GFX6-NEXT:    s_mov_b32 s4, s6
264; GFX6-NEXT:    s_mov_b32 s5, s7
265; GFX6-NEXT:    s_mov_b32 s6, s8
266; GFX6-NEXT:    s_mov_b32 s7, s9
267; GFX6-NEXT:    s_mov_b32 s8, s10
268; GFX6-NEXT:    s_mov_b32 s9, s11
269; GFX6-NEXT:    s_mov_b32 s10, s12
270; GFX6-NEXT:    s_mov_b32 s11, s13
271; GFX6-NEXT:    s_wqm_b64 exec, exec
272; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
273; GFX6-NEXT:    image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
274; GFX6-NEXT:    s_waitcnt vmcnt(0)
275; GFX6-NEXT:    ; return to shader part epilog
276;
277; GFX10NSA-LABEL: gather4_cl_2d:
278; GFX10NSA:       ; %bb.0: ; %main_body
279; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
280; GFX10NSA-NEXT:    s_mov_b32 s0, s2
281; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
282; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
283; GFX10NSA-NEXT:    s_mov_b32 s1, s3
284; GFX10NSA-NEXT:    s_mov_b32 s2, s4
285; GFX10NSA-NEXT:    s_mov_b32 s3, s5
286; GFX10NSA-NEXT:    s_mov_b32 s4, s6
287; GFX10NSA-NEXT:    s_mov_b32 s5, s7
288; GFX10NSA-NEXT:    s_mov_b32 s6, s8
289; GFX10NSA-NEXT:    s_mov_b32 s7, s9
290; GFX10NSA-NEXT:    s_mov_b32 s8, s10
291; GFX10NSA-NEXT:    s_mov_b32 s9, s11
292; GFX10NSA-NEXT:    s_mov_b32 s10, s12
293; GFX10NSA-NEXT:    s_mov_b32 s11, s13
294; GFX10NSA-NEXT:    image_gather4_cl v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
295; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
296; GFX10NSA-NEXT:    ; return to shader part epilog
297main_body:
298  %v = call <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 1, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
299  ret <4 x float> %v
300}
301
; gather4.c.cl variant on a 2D image: address operands are passed in the
; order %zcompare, %s, %t, %clamp. The checks expect the image_gather4_c_cl
; opcode with a four-register address v[0:3], preceded by the save-exec /
; s_wqm / AND-exec sequence. The GFX10NSA output adds dim:SQ_RSRC_IMG_2D.
302define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %clamp) {
303; GFX6-LABEL: gather4_c_cl_2d:
304; GFX6:       ; %bb.0: ; %main_body
305; GFX6-NEXT:    s_mov_b64 s[14:15], exec
306; GFX6-NEXT:    s_mov_b32 s0, s2
307; GFX6-NEXT:    s_mov_b32 s1, s3
308; GFX6-NEXT:    s_mov_b32 s2, s4
309; GFX6-NEXT:    s_mov_b32 s3, s5
310; GFX6-NEXT:    s_mov_b32 s4, s6
311; GFX6-NEXT:    s_mov_b32 s5, s7
312; GFX6-NEXT:    s_mov_b32 s6, s8
313; GFX6-NEXT:    s_mov_b32 s7, s9
314; GFX6-NEXT:    s_mov_b32 s8, s10
315; GFX6-NEXT:    s_mov_b32 s9, s11
316; GFX6-NEXT:    s_mov_b32 s10, s12
317; GFX6-NEXT:    s_mov_b32 s11, s13
318; GFX6-NEXT:    s_wqm_b64 exec, exec
319; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
320; GFX6-NEXT:    image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
321; GFX6-NEXT:    s_waitcnt vmcnt(0)
322; GFX6-NEXT:    ; return to shader part epilog
323;
324; GFX10NSA-LABEL: gather4_c_cl_2d:
325; GFX10NSA:       ; %bb.0: ; %main_body
326; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
327; GFX10NSA-NEXT:    s_mov_b32 s0, s2
328; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
329; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
330; GFX10NSA-NEXT:    s_mov_b32 s1, s3
331; GFX10NSA-NEXT:    s_mov_b32 s2, s4
332; GFX10NSA-NEXT:    s_mov_b32 s3, s5
333; GFX10NSA-NEXT:    s_mov_b32 s4, s6
334; GFX10NSA-NEXT:    s_mov_b32 s5, s7
335; GFX10NSA-NEXT:    s_mov_b32 s6, s8
336; GFX10NSA-NEXT:    s_mov_b32 s7, s9
337; GFX10NSA-NEXT:    s_mov_b32 s8, s10
338; GFX10NSA-NEXT:    s_mov_b32 s9, s11
339; GFX10NSA-NEXT:    s_mov_b32 s10, s12
340; GFX10NSA-NEXT:    s_mov_b32 s11, s13
341; GFX10NSA-NEXT:    image_gather4_c_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
342; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
343; GFX10NSA-NEXT:    ; return to shader part epilog
344main_body:
345  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
346  ret <4 x float> %v
347}
348
; gather4.b variant on a 2D image: %bias is passed ahead of the %s/%t
; coordinates, giving three address operands. The checks expect the
; image_gather4_b opcode with a three-register address v[0:2], preceded by the
; save-exec / s_wqm / AND-exec sequence. The GFX10NSA output adds
; dim:SQ_RSRC_IMG_2D.
; NOTE(review): the second `.f32` in the mangled intrinsic name is presumably
; the overloaded type of the bias operand - confirm against the intrinsic
; definition.
349define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
350; GFX6-LABEL: gather4_b_2d:
351; GFX6:       ; %bb.0: ; %main_body
352; GFX6-NEXT:    s_mov_b64 s[14:15], exec
353; GFX6-NEXT:    s_mov_b32 s0, s2
354; GFX6-NEXT:    s_mov_b32 s1, s3
355; GFX6-NEXT:    s_mov_b32 s2, s4
356; GFX6-NEXT:    s_mov_b32 s3, s5
357; GFX6-NEXT:    s_mov_b32 s4, s6
358; GFX6-NEXT:    s_mov_b32 s5, s7
359; GFX6-NEXT:    s_mov_b32 s6, s8
360; GFX6-NEXT:    s_mov_b32 s7, s9
361; GFX6-NEXT:    s_mov_b32 s8, s10
362; GFX6-NEXT:    s_mov_b32 s9, s11
363; GFX6-NEXT:    s_mov_b32 s10, s12
364; GFX6-NEXT:    s_mov_b32 s11, s13
365; GFX6-NEXT:    s_wqm_b64 exec, exec
366; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
367; GFX6-NEXT:    image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
368; GFX6-NEXT:    s_waitcnt vmcnt(0)
369; GFX6-NEXT:    ; return to shader part epilog
370;
371; GFX10NSA-LABEL: gather4_b_2d:
372; GFX10NSA:       ; %bb.0: ; %main_body
373; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
374; GFX10NSA-NEXT:    s_mov_b32 s0, s2
375; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
376; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
377; GFX10NSA-NEXT:    s_mov_b32 s1, s3
378; GFX10NSA-NEXT:    s_mov_b32 s2, s4
379; GFX10NSA-NEXT:    s_mov_b32 s3, s5
380; GFX10NSA-NEXT:    s_mov_b32 s4, s6
381; GFX10NSA-NEXT:    s_mov_b32 s5, s7
382; GFX10NSA-NEXT:    s_mov_b32 s6, s8
383; GFX10NSA-NEXT:    s_mov_b32 s7, s9
384; GFX10NSA-NEXT:    s_mov_b32 s8, s10
385; GFX10NSA-NEXT:    s_mov_b32 s9, s11
386; GFX10NSA-NEXT:    s_mov_b32 s10, s12
387; GFX10NSA-NEXT:    s_mov_b32 s11, s13
388; GFX10NSA-NEXT:    image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
389; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
390; GFX10NSA-NEXT:    ; return to shader part epilog
391main_body:
392  %v = call <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
393  ret <4 x float> %v
394}
395
; gather4.c.b variant on a 2D image: address operands are passed in the order
; %bias, %zcompare, %s, %t. The checks expect the image_gather4_c_b opcode
; with a four-register address v[0:3], preceded by the save-exec / s_wqm /
; AND-exec sequence. The GFX10NSA output adds dim:SQ_RSRC_IMG_2D.
; NOTE(review): the second `.f32` in the mangled intrinsic name is presumably
; the overloaded type of the bias operand - confirm against the intrinsic
; definition.
396define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t) {
397; GFX6-LABEL: gather4_c_b_2d:
398; GFX6:       ; %bb.0: ; %main_body
399; GFX6-NEXT:    s_mov_b64 s[14:15], exec
400; GFX6-NEXT:    s_mov_b32 s0, s2
401; GFX6-NEXT:    s_mov_b32 s1, s3
402; GFX6-NEXT:    s_mov_b32 s2, s4
403; GFX6-NEXT:    s_mov_b32 s3, s5
404; GFX6-NEXT:    s_mov_b32 s4, s6
405; GFX6-NEXT:    s_mov_b32 s5, s7
406; GFX6-NEXT:    s_mov_b32 s6, s8
407; GFX6-NEXT:    s_mov_b32 s7, s9
408; GFX6-NEXT:    s_mov_b32 s8, s10
409; GFX6-NEXT:    s_mov_b32 s9, s11
410; GFX6-NEXT:    s_mov_b32 s10, s12
411; GFX6-NEXT:    s_mov_b32 s11, s13
412; GFX6-NEXT:    s_wqm_b64 exec, exec
413; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
414; GFX6-NEXT:    image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
415; GFX6-NEXT:    s_waitcnt vmcnt(0)
416; GFX6-NEXT:    ; return to shader part epilog
417;
418; GFX10NSA-LABEL: gather4_c_b_2d:
419; GFX10NSA:       ; %bb.0: ; %main_body
420; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
421; GFX10NSA-NEXT:    s_mov_b32 s0, s2
422; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
423; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
424; GFX10NSA-NEXT:    s_mov_b32 s1, s3
425; GFX10NSA-NEXT:    s_mov_b32 s2, s4
426; GFX10NSA-NEXT:    s_mov_b32 s3, s5
427; GFX10NSA-NEXT:    s_mov_b32 s4, s6
428; GFX10NSA-NEXT:    s_mov_b32 s5, s7
429; GFX10NSA-NEXT:    s_mov_b32 s6, s8
430; GFX10NSA-NEXT:    s_mov_b32 s7, s9
431; GFX10NSA-NEXT:    s_mov_b32 s8, s10
432; GFX10NSA-NEXT:    s_mov_b32 s9, s11
433; GFX10NSA-NEXT:    s_mov_b32 s10, s12
434; GFX10NSA-NEXT:    s_mov_b32 s11, s13
435; GFX10NSA-NEXT:    image_gather4_c_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
436; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
437; GFX10NSA-NEXT:    ; return to shader part epilog
438main_body:
439  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
440  ret <4 x float> %v
441}
442
; gather4.b.cl variant on a 2D image: address operands are passed in the
; order %bias, %s, %t, %clamp. The checks expect the image_gather4_b_cl opcode
; with a four-register address v[0:3], preceded by the save-exec / s_wqm /
; AND-exec sequence. The GFX10NSA output adds dim:SQ_RSRC_IMG_2D.
; NOTE(review): the second `.f32` in the mangled intrinsic name is presumably
; the overloaded type of the bias operand - confirm against the intrinsic
; definition.
443define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t, float %clamp) {
444; GFX6-LABEL: gather4_b_cl_2d:
445; GFX6:       ; %bb.0: ; %main_body
446; GFX6-NEXT:    s_mov_b64 s[14:15], exec
447; GFX6-NEXT:    s_mov_b32 s0, s2
448; GFX6-NEXT:    s_mov_b32 s1, s3
449; GFX6-NEXT:    s_mov_b32 s2, s4
450; GFX6-NEXT:    s_mov_b32 s3, s5
451; GFX6-NEXT:    s_mov_b32 s4, s6
452; GFX6-NEXT:    s_mov_b32 s5, s7
453; GFX6-NEXT:    s_mov_b32 s6, s8
454; GFX6-NEXT:    s_mov_b32 s7, s9
455; GFX6-NEXT:    s_mov_b32 s8, s10
456; GFX6-NEXT:    s_mov_b32 s9, s11
457; GFX6-NEXT:    s_mov_b32 s10, s12
458; GFX6-NEXT:    s_mov_b32 s11, s13
459; GFX6-NEXT:    s_wqm_b64 exec, exec
460; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
461; GFX6-NEXT:    image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
462; GFX6-NEXT:    s_waitcnt vmcnt(0)
463; GFX6-NEXT:    ; return to shader part epilog
464;
465; GFX10NSA-LABEL: gather4_b_cl_2d:
466; GFX10NSA:       ; %bb.0: ; %main_body
467; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
468; GFX10NSA-NEXT:    s_mov_b32 s0, s2
469; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
470; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
471; GFX10NSA-NEXT:    s_mov_b32 s1, s3
472; GFX10NSA-NEXT:    s_mov_b32 s2, s4
473; GFX10NSA-NEXT:    s_mov_b32 s3, s5
474; GFX10NSA-NEXT:    s_mov_b32 s4, s6
475; GFX10NSA-NEXT:    s_mov_b32 s5, s7
476; GFX10NSA-NEXT:    s_mov_b32 s6, s8
477; GFX10NSA-NEXT:    s_mov_b32 s7, s9
478; GFX10NSA-NEXT:    s_mov_b32 s8, s10
479; GFX10NSA-NEXT:    s_mov_b32 s9, s11
480; GFX10NSA-NEXT:    s_mov_b32 s10, s12
481; GFX10NSA-NEXT:    s_mov_b32 s11, s13
482; GFX10NSA-NEXT:    image_gather4_b_cl v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
483; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
484; GFX10NSA-NEXT:    ; return to shader part epilog
485main_body:
486  %v = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
487  ret <4 x float> %v
488}
489
; gather4.c.b.cl variant on a 2D image: address operands are passed in the
; order %bias, %zcompare, %s, %t, %clamp. The checks expect the
; image_gather4_c_b_cl opcode with a five-register address v[0:4], preceded by
; the save-exec / s_wqm / AND-exec sequence. The GFX10NSA output adds
; dim:SQ_RSRC_IMG_2D.
; NOTE(review): the second `.f32` in the mangled intrinsic name is presumably
; the overloaded type of the bias operand - confirm against the intrinsic
; definition.
490define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
491; GFX6-LABEL: gather4_c_b_cl_2d:
492; GFX6:       ; %bb.0: ; %main_body
493; GFX6-NEXT:    s_mov_b64 s[14:15], exec
494; GFX6-NEXT:    s_mov_b32 s0, s2
495; GFX6-NEXT:    s_mov_b32 s1, s3
496; GFX6-NEXT:    s_mov_b32 s2, s4
497; GFX6-NEXT:    s_mov_b32 s3, s5
498; GFX6-NEXT:    s_mov_b32 s4, s6
499; GFX6-NEXT:    s_mov_b32 s5, s7
500; GFX6-NEXT:    s_mov_b32 s6, s8
501; GFX6-NEXT:    s_mov_b32 s7, s9
502; GFX6-NEXT:    s_mov_b32 s8, s10
503; GFX6-NEXT:    s_mov_b32 s9, s11
504; GFX6-NEXT:    s_mov_b32 s10, s12
505; GFX6-NEXT:    s_mov_b32 s11, s13
506; GFX6-NEXT:    s_wqm_b64 exec, exec
507; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
508; GFX6-NEXT:    image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1
509; GFX6-NEXT:    s_waitcnt vmcnt(0)
510; GFX6-NEXT:    ; return to shader part epilog
511;
512; GFX10NSA-LABEL: gather4_c_b_cl_2d:
513; GFX10NSA:       ; %bb.0: ; %main_body
514; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
515; GFX10NSA-NEXT:    s_mov_b32 s0, s2
516; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
517; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
518; GFX10NSA-NEXT:    s_mov_b32 s1, s3
519; GFX10NSA-NEXT:    s_mov_b32 s2, s4
520; GFX10NSA-NEXT:    s_mov_b32 s3, s5
521; GFX10NSA-NEXT:    s_mov_b32 s4, s6
522; GFX10NSA-NEXT:    s_mov_b32 s5, s7
523; GFX10NSA-NEXT:    s_mov_b32 s6, s8
524; GFX10NSA-NEXT:    s_mov_b32 s7, s9
525; GFX10NSA-NEXT:    s_mov_b32 s8, s10
526; GFX10NSA-NEXT:    s_mov_b32 s9, s11
527; GFX10NSA-NEXT:    s_mov_b32 s10, s12
528; GFX10NSA-NEXT:    s_mov_b32 s11, s13
529; GFX10NSA-NEXT:    image_gather4_c_b_cl v[0:3], v[0:4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
530; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
531; GFX10NSA-NEXT:    ; return to shader part epilog
532main_body:
533  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
534  ret <4 x float> %v
535}
536
; gather4.l variant on a 2D image: %lod is passed after the %s/%t
; coordinates, giving three address operands. The checks expect the
; image_gather4_l opcode with a three-register address v[0:2]. Unlike the
; gather4 / .c / .cl / .b tests earlier in this file, the expected output
; contains no s_wqm and no exec save/restore - only the SGPR copies, the
; gather and the s_waitcnt. The GFX10NSA output adds dim:SQ_RSRC_IMG_2D.
537define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, float %lod) {
538; GFX6-LABEL: gather4_l_2d:
539; GFX6:       ; %bb.0: ; %main_body
540; GFX6-NEXT:    s_mov_b32 s0, s2
541; GFX6-NEXT:    s_mov_b32 s1, s3
542; GFX6-NEXT:    s_mov_b32 s2, s4
543; GFX6-NEXT:    s_mov_b32 s3, s5
544; GFX6-NEXT:    s_mov_b32 s4, s6
545; GFX6-NEXT:    s_mov_b32 s5, s7
546; GFX6-NEXT:    s_mov_b32 s6, s8
547; GFX6-NEXT:    s_mov_b32 s7, s9
548; GFX6-NEXT:    s_mov_b32 s8, s10
549; GFX6-NEXT:    s_mov_b32 s9, s11
550; GFX6-NEXT:    s_mov_b32 s10, s12
551; GFX6-NEXT:    s_mov_b32 s11, s13
552; GFX6-NEXT:    image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
553; GFX6-NEXT:    s_waitcnt vmcnt(0)
554; GFX6-NEXT:    ; return to shader part epilog
555;
556; GFX10NSA-LABEL: gather4_l_2d:
557; GFX10NSA:       ; %bb.0: ; %main_body
558; GFX10NSA-NEXT:    s_mov_b32 s0, s2
559; GFX10NSA-NEXT:    s_mov_b32 s1, s3
560; GFX10NSA-NEXT:    s_mov_b32 s2, s4
561; GFX10NSA-NEXT:    s_mov_b32 s3, s5
562; GFX10NSA-NEXT:    s_mov_b32 s4, s6
563; GFX10NSA-NEXT:    s_mov_b32 s5, s7
564; GFX10NSA-NEXT:    s_mov_b32 s6, s8
565; GFX10NSA-NEXT:    s_mov_b32 s7, s9
566; GFX10NSA-NEXT:    s_mov_b32 s8, s10
567; GFX10NSA-NEXT:    s_mov_b32 s9, s11
568; GFX10NSA-NEXT:    s_mov_b32 s10, s12
569; GFX10NSA-NEXT:    s_mov_b32 s11, s13
570; GFX10NSA-NEXT:    image_gather4_l v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
571; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
572; GFX10NSA-NEXT:    ; return to shader part epilog
573main_body:
574  %v = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
575  ret <4 x float> %v
576}
577
; gather4.c.l variant on a 2D image: address operands are passed in the order
; %zcompare, %s, %t, %lod. The checks expect the image_gather4_c_l opcode with
; a four-register address v[0:3]. The expected output contains no s_wqm and no
; exec save/restore - only the SGPR copies, the gather and the s_waitcnt.
; The GFX10NSA output adds dim:SQ_RSRC_IMG_2D.
578define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t, float %lod) {
579; GFX6-LABEL: gather4_c_l_2d:
580; GFX6:       ; %bb.0: ; %main_body
581; GFX6-NEXT:    s_mov_b32 s0, s2
582; GFX6-NEXT:    s_mov_b32 s1, s3
583; GFX6-NEXT:    s_mov_b32 s2, s4
584; GFX6-NEXT:    s_mov_b32 s3, s5
585; GFX6-NEXT:    s_mov_b32 s4, s6
586; GFX6-NEXT:    s_mov_b32 s5, s7
587; GFX6-NEXT:    s_mov_b32 s6, s8
588; GFX6-NEXT:    s_mov_b32 s7, s9
589; GFX6-NEXT:    s_mov_b32 s8, s10
590; GFX6-NEXT:    s_mov_b32 s9, s11
591; GFX6-NEXT:    s_mov_b32 s10, s12
592; GFX6-NEXT:    s_mov_b32 s11, s13
593; GFX6-NEXT:    image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1
594; GFX6-NEXT:    s_waitcnt vmcnt(0)
595; GFX6-NEXT:    ; return to shader part epilog
596;
597; GFX10NSA-LABEL: gather4_c_l_2d:
598; GFX10NSA:       ; %bb.0: ; %main_body
599; GFX10NSA-NEXT:    s_mov_b32 s0, s2
600; GFX10NSA-NEXT:    s_mov_b32 s1, s3
601; GFX10NSA-NEXT:    s_mov_b32 s2, s4
602; GFX10NSA-NEXT:    s_mov_b32 s3, s5
603; GFX10NSA-NEXT:    s_mov_b32 s4, s6
604; GFX10NSA-NEXT:    s_mov_b32 s5, s7
605; GFX10NSA-NEXT:    s_mov_b32 s6, s8
606; GFX10NSA-NEXT:    s_mov_b32 s7, s9
607; GFX10NSA-NEXT:    s_mov_b32 s8, s10
608; GFX10NSA-NEXT:    s_mov_b32 s9, s11
609; GFX10NSA-NEXT:    s_mov_b32 s10, s12
610; GFX10NSA-NEXT:    s_mov_b32 s11, s13
611; GFX10NSA-NEXT:    image_gather4_c_l v[0:3], v[0:3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
612; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
613; GFX10NSA-NEXT:    ; return to shader part epilog
614main_body:
615  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, float %lod, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
616  ret <4 x float> %v
617}
618
; gather4.lz variant on a 2D image: only the %s/%t coordinates are passed as
; address operands. The checks expect the image_gather4_lz opcode with a
; two-register address v[0:1]. The expected output contains no s_wqm and no
; exec save/restore - only the SGPR copies, the gather and the s_waitcnt.
; The GFX10NSA output adds dim:SQ_RSRC_IMG_2D.
619define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
620; GFX6-LABEL: gather4_lz_2d:
621; GFX6:       ; %bb.0: ; %main_body
622; GFX6-NEXT:    s_mov_b32 s0, s2
623; GFX6-NEXT:    s_mov_b32 s1, s3
624; GFX6-NEXT:    s_mov_b32 s2, s4
625; GFX6-NEXT:    s_mov_b32 s3, s5
626; GFX6-NEXT:    s_mov_b32 s4, s6
627; GFX6-NEXT:    s_mov_b32 s5, s7
628; GFX6-NEXT:    s_mov_b32 s6, s8
629; GFX6-NEXT:    s_mov_b32 s7, s9
630; GFX6-NEXT:    s_mov_b32 s8, s10
631; GFX6-NEXT:    s_mov_b32 s9, s11
632; GFX6-NEXT:    s_mov_b32 s10, s12
633; GFX6-NEXT:    s_mov_b32 s11, s13
634; GFX6-NEXT:    image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1
635; GFX6-NEXT:    s_waitcnt vmcnt(0)
636; GFX6-NEXT:    ; return to shader part epilog
637;
638; GFX10NSA-LABEL: gather4_lz_2d:
639; GFX10NSA:       ; %bb.0: ; %main_body
640; GFX10NSA-NEXT:    s_mov_b32 s0, s2
641; GFX10NSA-NEXT:    s_mov_b32 s1, s3
642; GFX10NSA-NEXT:    s_mov_b32 s2, s4
643; GFX10NSA-NEXT:    s_mov_b32 s3, s5
644; GFX10NSA-NEXT:    s_mov_b32 s4, s6
645; GFX10NSA-NEXT:    s_mov_b32 s5, s7
646; GFX10NSA-NEXT:    s_mov_b32 s6, s8
647; GFX10NSA-NEXT:    s_mov_b32 s7, s9
648; GFX10NSA-NEXT:    s_mov_b32 s8, s10
649; GFX10NSA-NEXT:    s_mov_b32 s9, s11
650; GFX10NSA-NEXT:    s_mov_b32 s10, s12
651; GFX10NSA-NEXT:    s_mov_b32 s11, s13
652; GFX10NSA-NEXT:    image_gather4_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
653; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
654; GFX10NSA-NEXT:    ; return to shader part epilog
655main_body:
656  %v = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
657  ret <4 x float> %v
658}
659
; gather4.c.lz variant on a 2D image: %zcompare is passed ahead of the %s/%t
; coordinates, giving three address operands. The checks expect the
; image_gather4_c_lz opcode with a three-register address v[0:2]. The expected
; output contains no s_wqm and no exec save/restore - only the SGPR copies,
; the gather and the s_waitcnt. The GFX10NSA output adds dim:SQ_RSRC_IMG_2D.
660define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s, float %t) {
661; GFX6-LABEL: gather4_c_lz_2d:
662; GFX6:       ; %bb.0: ; %main_body
663; GFX6-NEXT:    s_mov_b32 s0, s2
664; GFX6-NEXT:    s_mov_b32 s1, s3
665; GFX6-NEXT:    s_mov_b32 s2, s4
666; GFX6-NEXT:    s_mov_b32 s3, s5
667; GFX6-NEXT:    s_mov_b32 s4, s6
668; GFX6-NEXT:    s_mov_b32 s5, s7
669; GFX6-NEXT:    s_mov_b32 s6, s8
670; GFX6-NEXT:    s_mov_b32 s7, s9
671; GFX6-NEXT:    s_mov_b32 s8, s10
672; GFX6-NEXT:    s_mov_b32 s9, s11
673; GFX6-NEXT:    s_mov_b32 s10, s12
674; GFX6-NEXT:    s_mov_b32 s11, s13
675; GFX6-NEXT:    image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1
676; GFX6-NEXT:    s_waitcnt vmcnt(0)
677; GFX6-NEXT:    ; return to shader part epilog
678;
679; GFX10NSA-LABEL: gather4_c_lz_2d:
680; GFX10NSA:       ; %bb.0: ; %main_body
681; GFX10NSA-NEXT:    s_mov_b32 s0, s2
682; GFX10NSA-NEXT:    s_mov_b32 s1, s3
683; GFX10NSA-NEXT:    s_mov_b32 s2, s4
684; GFX10NSA-NEXT:    s_mov_b32 s3, s5
685; GFX10NSA-NEXT:    s_mov_b32 s4, s6
686; GFX10NSA-NEXT:    s_mov_b32 s5, s7
687; GFX10NSA-NEXT:    s_mov_b32 s6, s8
688; GFX10NSA-NEXT:    s_mov_b32 s7, s9
689; GFX10NSA-NEXT:    s_mov_b32 s8, s10
690; GFX10NSA-NEXT:    s_mov_b32 s9, s11
691; GFX10NSA-NEXT:    s_mov_b32 s10, s12
692; GFX10NSA-NEXT:    s_mov_b32 s11, s13
693; GFX10NSA-NEXT:    image_gather4_c_lz v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
694; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
695; GFX10NSA-NEXT:    ; return to shader part epilog
696main_body:
697  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float %zcompare, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
698  ret <4 x float> %v
699}
700
; 2D gather4 with the first intrinsic operand set to 2 instead of 1. The
; checks expect the same instruction sequence as for @gather4_2d (SGPR copies,
; save-exec / s_wqm / AND-exec, two-register address v[0:1]) but with
; dmask:0x2 on the image_gather4. The GFX10NSA output adds dim:SQ_RSRC_IMG_2D.
701define amdgpu_ps <4 x float> @gather4_2d_dmask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
702; GFX6-LABEL: gather4_2d_dmask_2:
703; GFX6:       ; %bb.0: ; %main_body
704; GFX6-NEXT:    s_mov_b64 s[14:15], exec
705; GFX6-NEXT:    s_mov_b32 s0, s2
706; GFX6-NEXT:    s_mov_b32 s1, s3
707; GFX6-NEXT:    s_mov_b32 s2, s4
708; GFX6-NEXT:    s_mov_b32 s3, s5
709; GFX6-NEXT:    s_mov_b32 s4, s6
710; GFX6-NEXT:    s_mov_b32 s5, s7
711; GFX6-NEXT:    s_mov_b32 s6, s8
712; GFX6-NEXT:    s_mov_b32 s7, s9
713; GFX6-NEXT:    s_mov_b32 s8, s10
714; GFX6-NEXT:    s_mov_b32 s9, s11
715; GFX6-NEXT:    s_mov_b32 s10, s12
716; GFX6-NEXT:    s_mov_b32 s11, s13
717; GFX6-NEXT:    s_wqm_b64 exec, exec
718; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
719; GFX6-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2
720; GFX6-NEXT:    s_waitcnt vmcnt(0)
721; GFX6-NEXT:    ; return to shader part epilog
722;
723; GFX10NSA-LABEL: gather4_2d_dmask_2:
724; GFX10NSA:       ; %bb.0: ; %main_body
725; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
726; GFX10NSA-NEXT:    s_mov_b32 s0, s2
727; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
728; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
729; GFX10NSA-NEXT:    s_mov_b32 s1, s3
730; GFX10NSA-NEXT:    s_mov_b32 s2, s4
731; GFX10NSA-NEXT:    s_mov_b32 s3, s5
732; GFX10NSA-NEXT:    s_mov_b32 s4, s6
733; GFX10NSA-NEXT:    s_mov_b32 s5, s7
734; GFX10NSA-NEXT:    s_mov_b32 s6, s8
735; GFX10NSA-NEXT:    s_mov_b32 s7, s9
736; GFX10NSA-NEXT:    s_mov_b32 s8, s10
737; GFX10NSA-NEXT:    s_mov_b32 s9, s11
738; GFX10NSA-NEXT:    s_mov_b32 s10, s12
739; GFX10NSA-NEXT:    s_mov_b32 s11, s13
740; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x2 dim:SQ_RSRC_IMG_2D
741; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
742; GFX10NSA-NEXT:    ; return to shader part epilog
743main_body:
744  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 2, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
745  ret <4 x float> %v
746}
747
; gather4_2d_dmask_4: calls the 2D gather4 intrinsic with its first operand
; (the dmask immediate) set to 4.
; For both check prefixes the expected code copies s2..s13 down into s0..s11
; and ends in one image_gather4 that takes v[0:1] as its address operand,
; writes v[0:3], uses s[0:7] and s[8:11] as its scalar operands, and carries
; dmask:0x4.
; The GFX6 checks expect the 64-bit exec sequence (s_mov_b64 / s_wqm_b64 /
; s_and_b64); the GFX10NSA checks expect the 32-bit exec_lo forms and an extra
; dim:SQ_RSRC_IMG_2D modifier on the instruction.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
748define amdgpu_ps <4 x float> @gather4_2d_dmask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
749; GFX6-LABEL: gather4_2d_dmask_4:
750; GFX6:       ; %bb.0: ; %main_body
751; GFX6-NEXT:    s_mov_b64 s[14:15], exec
752; GFX6-NEXT:    s_mov_b32 s0, s2
753; GFX6-NEXT:    s_mov_b32 s1, s3
754; GFX6-NEXT:    s_mov_b32 s2, s4
755; GFX6-NEXT:    s_mov_b32 s3, s5
756; GFX6-NEXT:    s_mov_b32 s4, s6
757; GFX6-NEXT:    s_mov_b32 s5, s7
758; GFX6-NEXT:    s_mov_b32 s6, s8
759; GFX6-NEXT:    s_mov_b32 s7, s9
760; GFX6-NEXT:    s_mov_b32 s8, s10
761; GFX6-NEXT:    s_mov_b32 s9, s11
762; GFX6-NEXT:    s_mov_b32 s10, s12
763; GFX6-NEXT:    s_mov_b32 s11, s13
764; GFX6-NEXT:    s_wqm_b64 exec, exec
765; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
766; GFX6-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4
767; GFX6-NEXT:    s_waitcnt vmcnt(0)
768; GFX6-NEXT:    ; return to shader part epilog
769;
770; GFX10NSA-LABEL: gather4_2d_dmask_4:
771; GFX10NSA:       ; %bb.0: ; %main_body
772; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
773; GFX10NSA-NEXT:    s_mov_b32 s0, s2
774; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
775; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
776; GFX10NSA-NEXT:    s_mov_b32 s1, s3
777; GFX10NSA-NEXT:    s_mov_b32 s2, s4
778; GFX10NSA-NEXT:    s_mov_b32 s3, s5
779; GFX10NSA-NEXT:    s_mov_b32 s4, s6
780; GFX10NSA-NEXT:    s_mov_b32 s5, s7
781; GFX10NSA-NEXT:    s_mov_b32 s6, s8
782; GFX10NSA-NEXT:    s_mov_b32 s7, s9
783; GFX10NSA-NEXT:    s_mov_b32 s8, s10
784; GFX10NSA-NEXT:    s_mov_b32 s9, s11
785; GFX10NSA-NEXT:    s_mov_b32 s10, s12
786; GFX10NSA-NEXT:    s_mov_b32 s11, s13
787; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D
788; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
789; GFX10NSA-NEXT:    ; return to shader part epilog
790main_body:
  ; The leading i32 4 is the immediate that the checks above expect to see
  ; printed as dmask:0x4; the three trailing immediates are all zero/false.
791  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 4, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
792  ret <4 x float> %v
793}
794
; gather4_2d_dmask_8: calls the 2D gather4 intrinsic with its first operand
; (the dmask immediate) set to 8.
; For both check prefixes the expected code copies s2..s13 down into s0..s11
; and ends in one image_gather4 that takes v[0:1] as its address operand,
; writes v[0:3], uses s[0:7] and s[8:11] as its scalar operands, and carries
; dmask:0x8.
; The GFX6 checks expect the 64-bit exec sequence (s_mov_b64 / s_wqm_b64 /
; s_and_b64); the GFX10NSA checks expect the 32-bit exec_lo forms and an extra
; dim:SQ_RSRC_IMG_2D modifier on the instruction.
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing them by hand.
795define amdgpu_ps <4 x float> @gather4_2d_dmask_8(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
796; GFX6-LABEL: gather4_2d_dmask_8:
797; GFX6:       ; %bb.0: ; %main_body
798; GFX6-NEXT:    s_mov_b64 s[14:15], exec
799; GFX6-NEXT:    s_mov_b32 s0, s2
800; GFX6-NEXT:    s_mov_b32 s1, s3
801; GFX6-NEXT:    s_mov_b32 s2, s4
802; GFX6-NEXT:    s_mov_b32 s3, s5
803; GFX6-NEXT:    s_mov_b32 s4, s6
804; GFX6-NEXT:    s_mov_b32 s5, s7
805; GFX6-NEXT:    s_mov_b32 s6, s8
806; GFX6-NEXT:    s_mov_b32 s7, s9
807; GFX6-NEXT:    s_mov_b32 s8, s10
808; GFX6-NEXT:    s_mov_b32 s9, s11
809; GFX6-NEXT:    s_mov_b32 s10, s12
810; GFX6-NEXT:    s_mov_b32 s11, s13
811; GFX6-NEXT:    s_wqm_b64 exec, exec
812; GFX6-NEXT:    s_and_b64 exec, exec, s[14:15]
813; GFX6-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8
814; GFX6-NEXT:    s_waitcnt vmcnt(0)
815; GFX6-NEXT:    ; return to shader part epilog
816;
817; GFX10NSA-LABEL: gather4_2d_dmask_8:
818; GFX10NSA:       ; %bb.0: ; %main_body
819; GFX10NSA-NEXT:    s_mov_b32 s1, exec_lo
820; GFX10NSA-NEXT:    s_mov_b32 s0, s2
821; GFX10NSA-NEXT:    s_wqm_b32 exec_lo, exec_lo
822; GFX10NSA-NEXT:    s_and_b32 exec_lo, exec_lo, s1
823; GFX10NSA-NEXT:    s_mov_b32 s1, s3
824; GFX10NSA-NEXT:    s_mov_b32 s2, s4
825; GFX10NSA-NEXT:    s_mov_b32 s3, s5
826; GFX10NSA-NEXT:    s_mov_b32 s4, s6
827; GFX10NSA-NEXT:    s_mov_b32 s5, s7
828; GFX10NSA-NEXT:    s_mov_b32 s6, s8
829; GFX10NSA-NEXT:    s_mov_b32 s7, s9
830; GFX10NSA-NEXT:    s_mov_b32 s8, s10
831; GFX10NSA-NEXT:    s_mov_b32 s9, s11
832; GFX10NSA-NEXT:    s_mov_b32 s10, s12
833; GFX10NSA-NEXT:    s_mov_b32 s11, s13
834; GFX10NSA-NEXT:    image_gather4 v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x8 dim:SQ_RSRC_IMG_2D
835; GFX10NSA-NEXT:    s_waitcnt vmcnt(0)
836; GFX10NSA-NEXT:    ; return to shader part epilog
837main_body:
  ; The leading i32 8 is the immediate that the checks above expect to see
  ; printed as dmask:0x8; the three trailing immediates are all zero/false.
838  %v = call <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 8, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
839  ret <4 x float> %v
840}
841
; Declarations of the llvm.amdgcn.image.gather4.* intrinsics called by the
; test functions in this file. Every declaration carries attribute group #0.
; Common operand layout, as visible in the signatures below: a leading
; `i32 immarg` (passed as the dmask value by the callers in this file), then
; the float address operands (count varies with the variant), then the
; <8 x i32> and <4 x i32> operands the callers pass %rsrc and %samp for, and
; finally three immediates (i1, i32, i32).
; NOTE(review): the meaning of the three trailing immediates is not visible
; here; presumably unorm / texfailctrl / cachepolicy -- TODO confirm against
; the AMDGPU intrinsic definitions.
; All variants return <4 x float> except the `sl_v4f32i32s` one, which returns
; a { <4 x float>, i32 } struct.
842declare <4 x float> @llvm.amdgcn.image.gather4.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
843declare { <4 x float>, i32 } @llvm.amdgcn.image.gather4.2d.sl_v4f32i32s.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
844declare <4 x float> @llvm.amdgcn.image.gather4.cube.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
845declare <4 x float> @llvm.amdgcn.image.gather4.2darray.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
846declare <4 x float> @llvm.amdgcn.image.gather4.c.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
847declare <4 x float> @llvm.amdgcn.image.gather4.cl.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
848declare <4 x float> @llvm.amdgcn.image.gather4.c.cl.2d.v4f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
849declare <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
850declare <4 x float> @llvm.amdgcn.image.gather4.c.b.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
851declare <4 x float> @llvm.amdgcn.image.gather4.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
852declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
853declare <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
854declare <4 x float> @llvm.amdgcn.image.gather4.c.l.2d.v4f32.f32(i32 immarg, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
855declare <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
856declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 immarg, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #0
857
; Attribute group #0, referenced by each gather4 intrinsic declaration above:
; the intrinsics are marked nounwind and readonly.
858attributes #0 = { nounwind readonly }
859