1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
3
; VALU form: a signed bitfield extract whose source, offset and width are all
; variable function arguments is expected to select a single v_bfe_i32.
define i32 @v_bfe_i32_arg_arg_arg(i32 %src0, i32 %src1, i32 %src2) #0 {
; GFX6-LABEL: v_bfe_i32_arg_arg_arg:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_i32 v0, v0, v1, v2
; GFX6-NEXT:    s_setpc_b64 s[30:31]
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2)
  ret i32 %sbfe
}
13
; Scalar form with inreg operands: the offset is masked with 63 and ORed with
; the width shifted left by 16 to build the single packed control operand that
; s_bfe_i32 takes.
define amdgpu_ps i32 @s_bfe_i32_arg_arg_arg(i32 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i32_arg_arg_arg:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_and_b32 s1, s1, 63
; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
; GFX6-NEXT:    s_or_b32 s1, s1, s2
; GFX6-NEXT:    s_bfe_i32 s0, s0, s1
; GFX6-NEXT:    ; return to shader part epilog
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2)
  ret i32 %sbfe
}
25
; TODO: The VGPR i64 case still needs to be expanded before this test can be
; enabled.
; define i64 @v_bfe_i64_arg_arg_arg(i64 %src0, i32 %src1, i32 %src2) #0 {
;   %bfe_i64 = call i64 @llvm.amdgcn.sbfe.i64(i64 %src0, i32 %src1, i32 %src2)
;   ret i64 %bfe_i64
; }
31
; Scalar 64-bit form with inreg operands: the packed control operand is built
; the same way as for the 32-bit case (offset & 63, ORed with width << 16) and
; fed to s_bfe_i64 on the 64-bit source pair.
define amdgpu_ps i64 @s_bfe_i64_arg_arg_arg(i64 inreg %src0, i32 inreg %src1, i32 inreg %src2) #0 {
; GFX6-LABEL: s_bfe_i64_arg_arg_arg:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_and_b32 s2, s2, 63
; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
; GFX6-NEXT:    s_or_b32 s2, s2, s3
; GFX6-NEXT:    s_bfe_i64 s[0:1], s[0:1], s2
; GFX6-NEXT:    ; return to shader part epilog
  ; The result is an i64, so name it accordingly.
  %bfe_i64 = call i64 @llvm.amdgcn.sbfe.i64(i64 %src0, i32 %src1, i32 %src2)
  ret i64 %bfe_i64
}
43
; Variable source and offset, constant width 123: the width shows up
; pre-shifted in the packed operand as 0x7b0000 (123 << 16), ORed with the
; masked offset.
define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
; GFX6-LABEL: bfe_i32_arg_arg_imm:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xc
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_and_b32 s0, s0, 63
; GFX6-NEXT:    s_or_b32 s0, s0, 0x7b0000
; GFX6-NEXT:    s_bfe_i32 s0, s2, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
63
; Variable source and width, constant offset 123: the offset appears in the
; packed operand already masked to 59 (123 & 63), ORed with width << 16.
define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
; GFX6-LABEL: bfe_i32_arg_imm_arg:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xc
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
; GFX6-NEXT:    s_or_b32 s0, 59, s0
; GFX6-NEXT:    s_bfe_i32 s0, s2, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
83
; Constant source 123 (0x7b) with variable offset and width: the packed
; operand is still computed at run time and the source is used as a literal.
define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
; GFX6-LABEL: bfe_i32_imm_arg_arg:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xc
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_and_b32 s1, s2, 63
; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
; GFX6-NEXT:    s_or_b32 s0, s1, s0
; GFX6-NEXT:    s_bfe_i32 s0, 0x7b, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
104
; Source loaded from memory, constant offset 2 and width 8: both constants are
; combined into the packed immediate 0x80002.
define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 {
; GFX6-LABEL: v_bfe_print_arg:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x80002
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %src0, align 4
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %val, i32 2, i32 8)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
124
; Zero width with a variable offset: only the masked offset remains in the
; packed operand (the width field contributes nothing), and the s_bfe_i32 is
; still emitted.
define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
; GFX6-LABEL: bfe_i32_arg_0_width_reg_offset:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xc
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_and_b32 s0, s0, 63
; GFX6-NEXT:    s_bfe_i32 s0, s2, s0
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  ; This is the signed extract, so the value is named bfe_i32 like its siblings.
  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0)
  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
  ret void
}
143
; Zero width with constant offset 8: the packed operand collapses to the
; immediate 8. %src1 is intentionally left unused by the body.
define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
; GFX6-LABEL: bfe_i32_arg_0_width_imm_offset:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s0, s0, 8
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  ; This is the signed extract, so the value is named bfe_i32 like its siblings.
  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0)
  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
  ret void
}
160
; Loaded value shifted left by 31, then sbfe(offset 1, width 31). Both the
; shift and the extract (packed immediate 0x1f0001) remain in the output.
define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_i32_test_6:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshl_b32 s0, s0, 31
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x1f0001
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %shifted = shl i32 %val, 31
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shifted, i32 1, i32 31)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
182
; Loaded value shifted left by 31, then sbfe(offset 0, width 31). Both the
; shift and the extract (packed immediate 0x1f0000) remain in the output.
define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_i32_test_7:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshl_b32 s0, s0, 31
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x1f0000
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %shifted = shl i32 %val, 31
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shifted, i32 0, i32 31)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
204
; Loaded value shifted left by 31, then sbfe(offset 31, width 1). Both the
; shift and the extract (packed immediate 0x1001f) remain in the output.
define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_i32_test_8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshl_b32 s0, s0, 31
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x1001f
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %shifted = shl i32 %val, 31
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shifted, i32 31, i32 1)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
226
; sbfe(offset 31, width 1) applied directly to a loaded value; selected as
; s_bfe_i32 with the packed immediate 0x1001f.
define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_i32_test_9:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x1001f
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %val, i32 31, i32 1)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
246
; sbfe(offset 1, width 31) applied directly to a loaded value; selected as
; s_bfe_i32 with the packed immediate 0x1f0001.
define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_i32_test_10:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x1f0001
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %val, i32 1, i32 31)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
266
; sbfe(offset 8, width 24) applied directly to a loaded value; selected as
; s_bfe_i32 with the packed immediate 0x180008.
define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_i32_test_11:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x180008
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %val, i32 8, i32 24)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
286
; sbfe(offset 24, width 8) applied directly to a loaded value; selected as
; s_bfe_i32 with the packed immediate 0x80018.
define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_i32_test_12:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x80018
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %val, i32 24, i32 8)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
306
; Arithmetic shift right by 31 followed by sbfe(offset 31, width 1). Both the
; s_ashr_i32 and the s_bfe_i32 (packed immediate 0x1001f) remain in the output.
define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_i32_test_13:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_ashr_i32 s0, s0, 31
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x1001f
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  ; Named after the operation actually performed (ashr, not shl).
  %ashr = ashr i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %ashr, i32 31, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}
327
; Logical shift right by 31 followed by sbfe(offset 31, width 1). Both the
; s_lshr_b32 and the s_bfe_i32 (packed immediate 0x1001f) remain in the output.
define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_i32_test_14:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshr_b32 s0, s0, 31
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x1001f
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %x = load i32, i32 addrspace(1)* %in, align 4
  ; Named after the operation actually performed (lshr, not shl).
  %lshr = lshr i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %lshr, i32 31, i32 1)
  store i32 %bfe, i32 addrspace(1)* %out, align 4
  ret void
}
348
; All-constant operands: sbfe(src 0, offset 0, width 0). The expected output
; still contains an s_bfe_i32 on immediates, i.e. nothing is folded here.
define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_0:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_bfe_i32 s2, 0, 0
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
364
; All-constant operands: sbfe(src 12334 = 0x302e, offset 0, width 0). The
; expected output still contains an s_bfe_i32 on immediates (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_1:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_bfe_i32 s2, 0x302e, 0
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
380
; All-constant operands: sbfe(src 0, offset 0, width 1), packed as 0x10000.
; The expected output still contains an s_bfe_i32 on immediates (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_2:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_bfe_i32 s2, 0, 0x10000
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
396
; All-constant operands: sbfe(src 1, offset 0, width 1), packed as 0x10000.
; The expected output still contains an s_bfe_i32 on immediates (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_3:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_bfe_i32 s2, 1, 0x10000
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
412
; All-constant operands: sbfe(src 4294967295, offset 0, width 1). The source
; is printed as -1 in the output; the s_bfe_i32 is still emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_4:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_bfe_i32 s2, -1, 0x10000
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
428
; All-constant operands: sbfe(src 128 = 0x80, offset 7, width 1). The packed
; control word 0x10007 is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_5:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x10007
; GFX6-NEXT:    s_bfe_i32 s2, 0x80, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
445
; All-constant operands: sbfe(src 128 = 0x80, offset 0, width 8). The packed
; control word 0x80000 is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_6:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x80000
; GFX6-NEXT:    s_bfe_i32 s2, 0x80, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
462
; All-constant operands: sbfe(src 127 = 0x7f, offset 0, width 8). The packed
; control word 0x80000 is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_7:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x80000
; GFX6-NEXT:    s_bfe_i32 s2, 0x7f, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
479
; All-constant operands: sbfe(src 127 = 0x7f, offset 6, width 8). The packed
; control word 0x80006 is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x80006
; GFX6-NEXT:    s_bfe_i32 s2, 0x7f, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
496
; All-constant operands: sbfe(src 65536 = 0x10000, offset 16, width 8). The
; packed control word 0x80010 is materialized in an SGPR and the s_bfe_i32 is
; still emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_9:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x80010
; GFX6-NEXT:    s_bfe_i32 s2, 0x10000, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
513
; All-constant operands: sbfe(src 65535 = 0xffff, offset 16, width 16). The
; packed control word 0x100010 is materialized in an SGPR and the s_bfe_i32 is
; still emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_10:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x100010
; GFX6-NEXT:    s_bfe_i32 s2, 0xffff, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
530
; All-constant operands: sbfe(src 160 = 0xa0, offset 4, width 4). The packed
; control word 0x40004 is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_11:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x40004
; GFX6-NEXT:    s_bfe_i32 s2, 0xa0, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
547
; All-constant operands: sbfe(src 160 = 0xa0, offset 31, width 1). The packed
; control word 0x1001f is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_12:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x1001f
; GFX6-NEXT:    s_bfe_i32 s2, 0xa0, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
564
; All-constant operands: sbfe(src 131070 = 0x1fffe, offset 16, width 16). The
; packed control word 0x100010 is materialized in an SGPR and the s_bfe_i32 is
; still emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_13:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x100010
; GFX6-NEXT:    s_bfe_i32 s2, 0x1fffe, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
581
; All-constant operands: sbfe(src 160 = 0xa0, offset 2, width 30). The packed
; control word 0x1e0002 is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_14:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x1e0002
; GFX6-NEXT:    s_bfe_i32 s2, 0xa0, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
598
; All-constant operands: sbfe(src 160 = 0xa0, offset 4, width 28). The packed
; control word 0x1c0004 is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_15:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x1c0004
; GFX6-NEXT:    s_bfe_i32 s2, 0xa0, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
615
; All-constant operands: sbfe(src 4294967295, offset 1, width 7). The source
; is printed as -1 and the control word as 0x70001; the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_bfe_i32 s2, -1, 0x70001
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
631
; All-constant operands: sbfe(src 255 = 0xff, offset 1, width 31). The packed
; control word 0x1f0001 is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_17:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x1f0001
; GFX6-NEXT:    s_bfe_i32 s2, 0xff, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
648
; All-constant operands: sbfe(src 255 = 0xff, offset 31, width 1). The packed
; control word 0x1001f is materialized in an SGPR and the s_bfe_i32 is still
; emitted (no fold).
define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
; GFX6-LABEL: bfe_i32_constant_fold_test_18:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s2, 0x1001f
; GFX6-NEXT:    s_bfe_i32 s2, 0xff, s2
; GFX6-NEXT:    v_mov_b32_e32 v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
665
; sbfe(offset 0, width 24) followed by a shl 8 / ashr 8 pair. The expected
; output contains two identical s_bfe_i32 ..., 0x180000 instructions, so the
; second (redundant) extract is not removed.
define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: bfe_sext_in_reg_i24:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x180000
; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x180000
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %val, i32 0, i32 24)
  %hi = shl i32 %sbfe, 8
  %sext = ashr i32 %hi, 8
  store i32 %sext, i32 addrspace(1)* %out, align 4
  ret void
}
688
; sbfe(offset 1, width 16) feeding a signed division by 2. The extract is
; selected as s_bfe_i32 ..., 0x100001 and the sdiv is expanded into the
; reciprocal-based instruction sequence checked below.
define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
; GFX6-LABEL: simplify_demanded_bfe_sdiv:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, 2.0
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s2, s[2:3], 0x0
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    v_mul_lo_u32 v1, -2, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s2, s2, 0x100001
; GFX6-NEXT:    s_ashr_i32 s3, s2, 31
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    s_add_i32 s2, s2, s3
; GFX6-NEXT:    s_xor_b32 s2, s2, s3
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 2, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    v_subrev_i32_e64 v2, s[0:1], 2, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 2, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    v_xor_b32_e32 v0, s3, v0
; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %in, align 4
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %val, i32 1, i32 16)
  %quot = sdiv i32 %sbfe, 2
  store i32 %quot, i32 addrspace(1)* %out, align 4
  ret void
}
730
; Zero-width extract (offset 8, width 0) of a loaded value: the s_bfe_i32 is
; still emitted, with the packed operand reduced to the immediate 8.
define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
; GFX6-LABEL: bfe_0_width:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s0, s0, 8
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %ptr, align 4
  %sbfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %val, i32 8, i32 0)
  store i32 %sbfe, i32 addrspace(1)* %out, align 4
  ret void
}
750
; Two chained sbfe(offset 0, width 8) calls. Both extracts remain in the
; output and share the packed control word 0x80000 held in s1.
define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
; GFX6-LABEL: bfe_8_bfe_8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT:    s_mov_b32 s1, 0x80000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_bfe_i32 s0, s0, s1
; GFX6-NEXT:    s_bfe_i32 s0, s0, s1
; GFX6-NEXT:    v_mov_b32_e32 v0, s0
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(1)* %ptr, align 4
  %inner = call i32 @llvm.amdgcn.sbfe.i32(i32 %val, i32 0, i32 8)
  %outer = call i32 @llvm.amdgcn.sbfe.i32(i32 %inner, i32 0, i32 8)
  store i32 %outer, i32 addrspace(1)* %out, align 4
  ret void
}
773
; Narrow-then-wide chain: an 8-bit extract at offset 0 followed by a 16-bit
; extract at offset 0 of that result. The expected output keeps both
; instructions, with packed control words 0x80000 (width 8) and 0x100000
; (width 16).
774define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
775; GFX6-LABEL: bfe_8_bfe_16:
776; GFX6:       ; %bb.0:
777; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
778; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
779; GFX6-NEXT:    s_mov_b32 s6, -1
780; GFX6-NEXT:    s_mov_b32 s7, 0xf000
781; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
782; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
783; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
784; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x80000
785; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x100000
786; GFX6-NEXT:    v_mov_b32_e32 v0, s0
787; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
788; GFX6-NEXT:    s_endpgm
789  %load = load i32, i32 addrspace(1)* %ptr, align 4
790  %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8) ; offset 0, width 8
791  %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 16) ; offset 0, width 16
792  store i32 %bfe1, i32 addrspace(1)* %out, align 4
793  ret void
794}
795
796; This really should be folded into 1
; Wide-then-narrow chain: a 16-bit extract at offset 0 followed by an 8-bit
; extract at offset 0 of that result. The expected output below still has two
; s_bfe_i32 instructions (0x100000, then 0x80000), i.e. the fold into a single
; extract mentioned above is not performed in the recorded output.
797define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
798; GFX6-LABEL: bfe_16_bfe_8:
799; GFX6:       ; %bb.0:
800; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
801; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
802; GFX6-NEXT:    s_mov_b32 s6, -1
803; GFX6-NEXT:    s_mov_b32 s7, 0xf000
804; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
805; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
806; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
807; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x100000
808; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x80000
809; GFX6-NEXT:    v_mov_b32_e32 v0, s0
810; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
811; GFX6-NEXT:    s_endpgm
812  %load = load i32, i32 addrspace(1)* %ptr, align 4
813  %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 16) ; offset 0, width 16
814  %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8) ; offset 0, width 8
815  store i32 %bfe1, i32 addrspace(1)* %out, align 4
816  ret void
817}
818
819; Make sure there isn't a redundant BFE
; An 8-bit signed extract at offset 0 of (%a + %b), followed by a shl/ashr by
; 24 pair that sign-extends the low byte again.
; NOTE(review): the recorded output contains both an s_bfe_i32 (0x80000) and an
; s_sext_i32_i8, so the second sign extension is not eliminated in this
; output - confirm whether that is the intended baseline.
820define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
821; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe:
822; GFX6:       ; %bb.0:
823; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
824; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
825; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xc
826; GFX6-NEXT:    s_mov_b32 s6, -1
827; GFX6-NEXT:    s_mov_b32 s7, 0xf000
828; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
829; GFX6-NEXT:    s_add_i32 s2, s2, s0
830; GFX6-NEXT:    s_bfe_i32 s0, s2, 0x80000
831; GFX6-NEXT:    s_sext_i32_i8 s0, s0
832; GFX6-NEXT:    v_mov_b32_e32 v0, s0
833; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
834; GFX6-NEXT:    s_endpgm
835  %c = add i32 %a, %b ; add to prevent folding into extload
836  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 0, i32 8) ; offset 0, width 8
837  %shl = shl i32 %bfe, 24
838  %ashr = ashr i32 %shl, 24
839  store i32 %ashr, i32 addrspace(1)* %out, align 4
840  ret void
841}
842
; Variant of sext_in_reg_i8_to_i32_bfe with the last two intrinsic operands
; swapped: offset 8 and width 0 instead of offset 0 and width 8. The expected
; output keeps the s_bfe_i32 (packed control operand 8) followed by the
; s_sext_i32_i8 produced from the shl/ashr by 24 pair.
843define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
844; GFX6-LABEL: sext_in_reg_i8_to_i32_bfe_wrong:
845; GFX6:       ; %bb.0:
846; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
847; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
848; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xc
849; GFX6-NEXT:    s_mov_b32 s6, -1
850; GFX6-NEXT:    s_mov_b32 s7, 0xf000
851; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX6-NEXT:    s_add_i32 s2, s2, s0
853; GFX6-NEXT:    s_bfe_i32 s0, s2, 8
854; GFX6-NEXT:    s_sext_i32_i8 s0, s0
855; GFX6-NEXT:    v_mov_b32_e32 v0, s0
856; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
857; GFX6-NEXT:    s_endpgm
858  %c = add i32 %a, %b ; add to prevent folding into extload
859  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 8, i32 0) ; offset 8, width 0
860  %shl = shl i32 %bfe, 24
861  %ashr = ashr i32 %shl, 24
862  store i32 %ashr, i32 addrspace(1)* %out, align 4
863  ret void
864}
865
; Sign-extending i8 load feeding an 8-bit extract at offset 0, followed by a
; shl/ashr by 24 pair. The loaded value arrives in a VGPR via
; buffer_load_sbyte, and the expected output shows two identical
; v_bfe_i32 v0, v0, 0, 8 instructions before the store.
866define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
867; GFX6-LABEL: sextload_i8_to_i32_bfe:
868; GFX6:       ; %bb.0:
869; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
870; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
871; GFX6-NEXT:    s_mov_b32 s2, -1
872; GFX6-NEXT:    s_mov_b32 s3, 0xf000
873; GFX6-NEXT:    s_mov_b64 s[6:7], s[2:3]
874; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
875; GFX6-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
876; GFX6-NEXT:    s_waitcnt vmcnt(0)
877; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
878; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
879; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
880; GFX6-NEXT:    s_endpgm
881  %load = load i8, i8 addrspace(1)* %ptr, align 1
882  %sext = sext i8 %load to i32
883  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 0, i32 8) ; offset 0, width 8
884  %shl = shl i32 %bfe, 24
885  %ashr = ashr i32 %shl, 24
886  store i32 %ashr, i32 addrspace(1)* %out, align 4
887  ret void
888}
889
; Same shape as sextload_i8_to_i32_bfe, but the intrinsic is called with
; offset 8 and width 0. The expected output shows v_bfe_i32 v0, v0, 8, 0 for
; the intrinsic and then v_bfe_i32 v0, v0, 0, 8 for the shl/ashr by 24 pair.
890define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
891; GFX6-LABEL: sextload_i8_to_i32_bfe_0:
892; GFX6:       ; %bb.0:
893; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
894; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
895; GFX6-NEXT:    s_mov_b32 s2, -1
896; GFX6-NEXT:    s_mov_b32 s3, 0xf000
897; GFX6-NEXT:    s_mov_b64 s[6:7], s[2:3]
898; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX6-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0
900; GFX6-NEXT:    s_waitcnt vmcnt(0)
901; GFX6-NEXT:    v_bfe_i32 v0, v0, 8, 0
902; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 8
903; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
904; GFX6-NEXT:    s_endpgm
905  %load = load i8, i8 addrspace(1)* %ptr, align 1
906  %sext = sext i8 %load to i32
907  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 8, i32 0) ; offset 8, width 0
908  %shl = shl i32 %bfe, 24
909  %ashr = ashr i32 %shl, 24
910  store i32 %ashr, i32 addrspace(1)* %out, align 4
911  ret void
912}
913
; A shl/ashr by 31 pair (sign-extends bit 0 of the loaded value) feeding a
; 1-bit extract at offset 0. The expected output has two s_bfe_i32
; instructions with the same packed control word 0x10000 (width 1, offset 0):
; one for the shift pair and one for the intrinsic.
914define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
915; GFX6-LABEL: sext_in_reg_i1_bfe_offset_0:
916; GFX6:       ; %bb.0:
917; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
918; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
919; GFX6-NEXT:    s_mov_b32 s6, -1
920; GFX6-NEXT:    s_mov_b32 s7, 0xf000
921; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
922; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
923; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
924; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10000
925; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10000
926; GFX6-NEXT:    v_mov_b32_e32 v0, s0
927; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
928; GFX6-NEXT:    s_endpgm
929  %x = load i32, i32 addrspace(1)* %in, align 4
930  %shl = shl i32 %x, 31
931  %shr = ashr i32 %shl, 31
932  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 0, i32 1) ; offset 0, width 1
933  store i32 %bfe, i32 addrspace(1)* %out, align 4
934  ret void
935}
936
; A shl/ashr by 30 pair (sign-extends the low 2 bits of the loaded value; note
; the shift amount is 30 even though the test name says i1) feeding a 1-bit
; extract at offset 1. The expected output has s_bfe_i32 with 0x20000
; (width 2, offset 0) for the shift pair, then 0x10001 (width 1, offset 1)
; for the intrinsic.
937define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
938; GFX6-LABEL: sext_in_reg_i1_bfe_offset_1:
939; GFX6:       ; %bb.0:
940; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
941; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
942; GFX6-NEXT:    s_mov_b32 s6, -1
943; GFX6-NEXT:    s_mov_b32 s7, 0xf000
944; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
945; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
946; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
947; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20000
948; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x10001
949; GFX6-NEXT:    v_mov_b32_e32 v0, s0
950; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
951; GFX6-NEXT:    s_endpgm
952  %x = load i32, i32 addrspace(1)* %in, align 4
953  %shl = shl i32 %x, 30
954  %shr = ashr i32 %shl, 30
955  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 1) ; offset 1, width 1
956  store i32 %bfe, i32 addrspace(1)* %out, align 4
957  ret void
958}
959
; A shl/ashr by 30 pair (sign-extends the low 2 bits of the loaded value)
; feeding a 2-bit extract at offset 1. The expected output has s_bfe_i32 with
; 0x20000 (width 2, offset 0) for the shift pair, then 0x20001
; (width 2, offset 1) for the intrinsic.
960define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
961; GFX6-LABEL: sext_in_reg_i2_bfe_offset_1:
962; GFX6:       ; %bb.0:
963; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
964; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
965; GFX6-NEXT:    s_mov_b32 s6, -1
966; GFX6-NEXT:    s_mov_b32 s7, 0xf000
967; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
968; GFX6-NEXT:    s_load_dword s0, s[0:1], 0x0
969; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
970; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20000
971; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20001
972; GFX6-NEXT:    v_mov_b32_e32 v0, s0
973; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
974; GFX6-NEXT:    s_endpgm
975  %x = load i32, i32 addrspace(1)* %in, align 4
976  %shl = shl i32 %x, 30
977  %shr = ashr i32 %shl, 30
978  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 2) ; offset 1, width 2
979  store i32 %bfe, i32 addrspace(1)* %out, align 4
980  ret void
981}
982
; Signed bitfield-extract intrinsics called by the tests in this file. Each
; takes the source value followed by two i32 operands, which the calls in this
; file use as (offset, width).
983declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) #1
984declare i64 @llvm.amdgcn.sbfe.i64(i64, i32, i32) #1
985
; Attribute group #0 is attached to the test functions; #1 is attached to the
; intrinsic declarations.
986attributes #0 = { nounwind }
987attributes #1 = { nounwind readnone }
988