1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
4; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
5
6declare i32 @llvm.amdgcn.workitem.id.x() #1
7
8declare i16 @llvm.bitreverse.i16(i16) #1
9declare i32 @llvm.bitreverse.i32(i32) #1
10declare i64 @llvm.bitreverse.i64(i64) #1
11
12declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
13declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1
14
15declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
16declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
17
18define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
19; SI-LABEL: s_brev_i16:
20; SI:       ; %bb.0:
21; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
22; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
23; SI-NEXT:    s_mov_b32 s7, 0xf000
24; SI-NEXT:    s_mov_b32 s6, -1
25; SI-NEXT:    s_waitcnt lgkmcnt(0)
26; SI-NEXT:    s_brev_b32 s0, s0
27; SI-NEXT:    s_lshr_b32 s0, s0, 16
28; SI-NEXT:    v_mov_b32_e32 v0, s0
29; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
30; SI-NEXT:    s_endpgm
31;
32; FLAT-LABEL: s_brev_i16:
33; FLAT:       ; %bb.0:
34; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
35; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x2c
36; FLAT-NEXT:    s_mov_b32 s7, 0xf000
37; FLAT-NEXT:    s_mov_b32 s6, -1
38; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
39; FLAT-NEXT:    s_brev_b32 s0, s0
40; FLAT-NEXT:    s_lshr_b32 s0, s0, 16
41; FLAT-NEXT:    v_mov_b32_e32 v0, s0
42; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
43; FLAT-NEXT:    s_endpgm
; Uniform (scalar kernel-argument) i16 bitreverse. Per the checks above, it is
; selected as a 32-bit s_brev_b32 plus a logical shift right of 16 to extract
; the reversed low half, then stored as a short.
44  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
45  store i16 %brev, i16 addrspace(1)* %out
46  ret void
47}
48
49define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
50; SI-LABEL: v_brev_i16:
51; SI:       ; %bb.0:
52; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
53; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
54; SI-NEXT:    s_mov_b32 s7, 0xf000
55; SI-NEXT:    s_mov_b32 s6, -1
56; SI-NEXT:    s_mov_b32 s2, s6
57; SI-NEXT:    s_mov_b32 s3, s7
58; SI-NEXT:    s_waitcnt lgkmcnt(0)
59; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
60; SI-NEXT:    s_waitcnt vmcnt(0)
61; SI-NEXT:    v_bfrev_b32_e32 v0, v0
62; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
63; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
64; SI-NEXT:    s_endpgm
65;
66; FLAT-LABEL: v_brev_i16:
67; FLAT:       ; %bb.0:
68; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
69; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
70; FLAT-NEXT:    s_mov_b32 s7, 0xf000
71; FLAT-NEXT:    s_mov_b32 s6, -1
72; FLAT-NEXT:    s_mov_b32 s2, s6
73; FLAT-NEXT:    s_mov_b32 s3, s7
74; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
75; FLAT-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
76; FLAT-NEXT:    s_waitcnt vmcnt(0)
77; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
78; FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
79; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
80; FLAT-NEXT:    s_endpgm
; Divergent i16 bitreverse of a value loaded from memory. Per the checks
; above, the zero-extended ushort load is reversed with v_bfrev_b32 and the
; result shifted right by 16 before the short store.
81  %val = load i16, i16 addrspace(1)* %valptr
82  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
83  store i16 %brev, i16 addrspace(1)* %out
84  ret void
85}
86
87define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
88; SI-LABEL: s_brev_i32:
89; SI:       ; %bb.0:
90; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
91; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
92; SI-NEXT:    s_mov_b32 s7, 0xf000
93; SI-NEXT:    s_mov_b32 s6, -1
94; SI-NEXT:    s_waitcnt lgkmcnt(0)
95; SI-NEXT:    s_brev_b32 s0, s0
96; SI-NEXT:    v_mov_b32_e32 v0, s0
97; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
98; SI-NEXT:    s_endpgm
99;
100; FLAT-LABEL: s_brev_i32:
101; FLAT:       ; %bb.0:
102; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
103; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x2c
104; FLAT-NEXT:    s_mov_b32 s7, 0xf000
105; FLAT-NEXT:    s_mov_b32 s6, -1
106; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
107; FLAT-NEXT:    s_brev_b32 s0, s0
108; FLAT-NEXT:    v_mov_b32_e32 v0, s0
109; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
110; FLAT-NEXT:    s_endpgm
; Uniform i32 bitreverse: maps directly to a single s_brev_b32 on both
; generations, per the checks above.
111  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
112  store i32 %brev, i32 addrspace(1)* %out
113  ret void
114}
115
116define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
117; SI-LABEL: v_brev_i32:
118; SI:       ; %bb.0:
119; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
120; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
121; SI-NEXT:    s_mov_b32 s7, 0xf000
122; SI-NEXT:    s_mov_b32 s2, 0
123; SI-NEXT:    s_mov_b32 s3, s7
124; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
125; SI-NEXT:    v_mov_b32_e32 v1, 0
126; SI-NEXT:    s_waitcnt lgkmcnt(0)
127; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
128; SI-NEXT:    s_mov_b32 s6, -1
129; SI-NEXT:    s_waitcnt vmcnt(0)
130; SI-NEXT:    v_bfrev_b32_e32 v0, v0
131; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
132; SI-NEXT:    s_endpgm
133;
134; FLAT-LABEL: v_brev_i32:
135; FLAT:       ; %bb.0:
136; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
137; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
138; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
139; FLAT-NEXT:    s_mov_b32 s7, 0xf000
140; FLAT-NEXT:    s_mov_b32 s6, -1
141; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
142; FLAT-NEXT:    v_mov_b32_e32 v1, s1
143; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
144; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
145; FLAT-NEXT:    flat_load_dword v0, v[0:1]
146; FLAT-NEXT:    s_waitcnt vmcnt(0)
147; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
148; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
149; FLAT-NEXT:    s_endpgm
; Divergent i32 bitreverse: each lane loads element %tid and reverses it with
; a single v_bfrev_b32, per the checks above.
150  %tid = call i32 @llvm.amdgcn.workitem.id.x()
151  %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
152  %val = load i32, i32 addrspace(1)* %gep
153  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
154  store i32 %brev, i32 addrspace(1)* %out
155  ret void
156}
157
158define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
159; SI-LABEL: s_brev_v2i32:
160; SI:       ; %bb.0:
161; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
162; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
163; SI-NEXT:    s_mov_b32 s7, 0xf000
164; SI-NEXT:    s_mov_b32 s6, -1
165; SI-NEXT:    s_waitcnt lgkmcnt(0)
166; SI-NEXT:    s_brev_b32 s1, s1
167; SI-NEXT:    s_brev_b32 s0, s0
168; SI-NEXT:    v_mov_b32_e32 v0, s0
169; SI-NEXT:    v_mov_b32_e32 v1, s1
170; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
171; SI-NEXT:    s_endpgm
172;
173; FLAT-LABEL: s_brev_v2i32:
174; FLAT:       ; %bb.0:
175; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
176; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
177; FLAT-NEXT:    s_mov_b32 s7, 0xf000
178; FLAT-NEXT:    s_mov_b32 s6, -1
179; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
180; FLAT-NEXT:    s_brev_b32 s1, s1
181; FLAT-NEXT:    s_brev_b32 s0, s0
182; FLAT-NEXT:    v_mov_b32_e32 v0, s0
183; FLAT-NEXT:    v_mov_b32_e32 v1, s1
184; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
185; FLAT-NEXT:    s_endpgm
; Uniform <2 x i32> bitreverse: scalarized into one s_brev_b32 per element,
; per the checks above.
186  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
187  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
188  ret void
189}
190
191define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
192; SI-LABEL: v_brev_v2i32:
193; SI:       ; %bb.0:
194; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
195; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
196; SI-NEXT:    s_mov_b32 s7, 0xf000
197; SI-NEXT:    s_mov_b32 s2, 0
198; SI-NEXT:    s_mov_b32 s3, s7
199; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
200; SI-NEXT:    v_mov_b32_e32 v1, 0
201; SI-NEXT:    s_waitcnt lgkmcnt(0)
202; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
203; SI-NEXT:    s_mov_b32 s6, -1
204; SI-NEXT:    s_waitcnt vmcnt(0)
205; SI-NEXT:    v_bfrev_b32_e32 v1, v1
206; SI-NEXT:    v_bfrev_b32_e32 v0, v0
207; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
208; SI-NEXT:    s_endpgm
209;
210; FLAT-LABEL: v_brev_v2i32:
211; FLAT:       ; %bb.0:
212; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
213; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
214; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
215; FLAT-NEXT:    s_mov_b32 s7, 0xf000
216; FLAT-NEXT:    s_mov_b32 s6, -1
217; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
218; FLAT-NEXT:    v_mov_b32_e32 v1, s1
219; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
220; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
221; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
222; FLAT-NEXT:    s_waitcnt vmcnt(0)
223; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
224; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
225; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
226; FLAT-NEXT:    s_endpgm
; Divergent <2 x i32> bitreverse: a dwordx2 load of element %tid followed by
; one v_bfrev_b32 per component, per the checks above.
227  %tid = call i32 @llvm.amdgcn.workitem.id.x()
228  %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
229  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
230  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
231  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
232  ret void
233}
234
235define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
236; SI-LABEL: s_brev_i64:
237; SI:       ; %bb.0:
238; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
239; SI-NEXT:    s_mov_b32 s4, 0xff00ff
240; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
241; SI-NEXT:    s_waitcnt lgkmcnt(0)
242; SI-NEXT:    v_alignbit_b32 v0, s2, s2, 8
243; SI-NEXT:    v_alignbit_b32 v1, s2, s2, 24
244; SI-NEXT:    v_alignbit_b32 v2, s3, s3, 8
245; SI-NEXT:    v_alignbit_b32 v3, s3, s3, 24
246; SI-NEXT:    v_bfi_b32 v4, s4, v1, v0
247; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f
248; SI-NEXT:    v_bfi_b32 v2, s4, v3, v2
249; SI-NEXT:    v_and_b32_e32 v1, s2, v4
250; SI-NEXT:    v_and_b32_e32 v0, s2, v2
251; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
252; SI-NEXT:    v_and_b32_e32 v3, s2, v4
253; SI-NEXT:    v_and_b32_e32 v2, s2, v2
254; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
255; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
256; SI-NEXT:    s_mov_b32 s2, 0x33333333
257; SI-NEXT:    v_or_b32_e32 v2, v2, v0
258; SI-NEXT:    v_or_b32_e32 v3, v3, v1
259; SI-NEXT:    v_and_b32_e32 v1, s2, v3
260; SI-NEXT:    v_and_b32_e32 v0, s2, v2
261; SI-NEXT:    s_mov_b32 s2, 0xcccccccc
262; SI-NEXT:    v_and_b32_e32 v3, s2, v3
263; SI-NEXT:    v_and_b32_e32 v2, s2, v2
264; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
265; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
266; SI-NEXT:    s_mov_b32 s2, 0x55555555
267; SI-NEXT:    v_or_b32_e32 v2, v2, v0
268; SI-NEXT:    v_or_b32_e32 v3, v3, v1
269; SI-NEXT:    v_and_b32_e32 v1, s2, v3
270; SI-NEXT:    v_and_b32_e32 v0, s2, v2
271; SI-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
272; SI-NEXT:    v_and_b32_e32 v3, s2, v3
273; SI-NEXT:    v_and_b32_e32 v2, s2, v2
274; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
275; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
276; SI-NEXT:    s_mov_b32 s3, 0xf000
277; SI-NEXT:    s_mov_b32 s2, -1
278; SI-NEXT:    v_or_b32_e32 v0, v2, v0
279; SI-NEXT:    v_or_b32_e32 v1, v3, v1
280; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
281; SI-NEXT:    s_endpgm
282;
283; FLAT-LABEL: s_brev_i64:
284; FLAT:       ; %bb.0:
285; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
286; FLAT-NEXT:    v_mov_b32_e32 v0, 0x10203
287; FLAT-NEXT:    s_mov_b32 s4, 0xf0f0f0f
288; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
289; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
290; FLAT-NEXT:    v_perm_b32 v2, 0, s2, v0
291; FLAT-NEXT:    v_perm_b32 v4, 0, s3, v0
292; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
293; FLAT-NEXT:    v_and_b32_e32 v1, s4, v2
294; FLAT-NEXT:    v_and_b32_e32 v0, s4, v4
295; FLAT-NEXT:    v_and_b32_e32 v3, s2, v2
296; FLAT-NEXT:    v_and_b32_e32 v2, s2, v4
297; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
298; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
299; FLAT-NEXT:    s_mov_b32 s2, 0x33333333
300; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
301; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
302; FLAT-NEXT:    v_and_b32_e32 v1, s2, v3
303; FLAT-NEXT:    v_and_b32_e32 v0, s2, v2
304; FLAT-NEXT:    s_mov_b32 s2, 0xcccccccc
305; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
306; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
307; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
308; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
309; FLAT-NEXT:    s_mov_b32 s2, 0x55555555
310; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
311; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
312; FLAT-NEXT:    v_and_b32_e32 v1, s2, v3
313; FLAT-NEXT:    v_and_b32_e32 v0, s2, v2
314; FLAT-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
315; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
316; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
317; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
318; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
319; FLAT-NEXT:    s_mov_b32 s3, 0xf000
320; FLAT-NEXT:    s_mov_b32 s2, -1
321; FLAT-NEXT:    v_or_b32_e32 v0, v2, v0
322; FLAT-NEXT:    v_or_b32_e32 v1, v3, v1
323; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
324; FLAT-NEXT:    s_endpgm
; i64 bitreverse has no single instruction here, so the checks above pin an
; expansion: a byte swap (v_alignbit/v_bfi on the older target, v_perm with
; selector 0x10203 on the newer one) followed by masked nibble (0x0f0f0f0f /
; 0xf0f0f0f0), pair (0x33333333 / 0xcccccccc) and single-bit (0x55555555 /
; 0xaaaaaaaa) swap stages using 64-bit shifts.
325  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
326  store i64 %brev, i64 addrspace(1)* %out
327  ret void
328}
329
330define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
331; SI-LABEL: v_brev_i64:
332; SI:       ; %bb.0:
333; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
334; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
335; SI-NEXT:    s_mov_b32 s7, 0xf000
336; SI-NEXT:    s_mov_b32 s2, 0
337; SI-NEXT:    s_mov_b32 s3, s7
338; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
339; SI-NEXT:    v_mov_b32_e32 v1, 0
340; SI-NEXT:    s_waitcnt lgkmcnt(0)
341; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
342; SI-NEXT:    s_mov_b32 s0, 0xff00ff
343; SI-NEXT:    s_mov_b32 s1, 0xf0f0f0f
344; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
345; SI-NEXT:    s_mov_b32 s3, 0x33333333
346; SI-NEXT:    s_mov_b32 s6, 0xcccccccc
347; SI-NEXT:    s_waitcnt vmcnt(0)
348; SI-NEXT:    v_alignbit_b32 v2, v0, v0, 8
349; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
350; SI-NEXT:    v_alignbit_b32 v3, v1, v1, 8
351; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
352; SI-NEXT:    v_bfi_b32 v2, s0, v0, v2
353; SI-NEXT:    v_bfi_b32 v4, s0, v1, v3
354; SI-NEXT:    v_and_b32_e32 v1, s1, v2
355; SI-NEXT:    v_and_b32_e32 v0, s1, v4
356; SI-NEXT:    v_and_b32_e32 v3, s2, v2
357; SI-NEXT:    v_and_b32_e32 v2, s2, v4
358; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
359; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
360; SI-NEXT:    s_mov_b32 s0, 0x55555555
361; SI-NEXT:    v_or_b32_e32 v3, v3, v1
362; SI-NEXT:    v_or_b32_e32 v2, v2, v0
363; SI-NEXT:    v_and_b32_e32 v1, s3, v3
364; SI-NEXT:    v_and_b32_e32 v0, s3, v2
365; SI-NEXT:    v_and_b32_e32 v3, s6, v3
366; SI-NEXT:    v_and_b32_e32 v2, s6, v2
367; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
368; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
369; SI-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
370; SI-NEXT:    v_or_b32_e32 v3, v3, v1
371; SI-NEXT:    v_or_b32_e32 v2, v2, v0
372; SI-NEXT:    v_and_b32_e32 v1, s0, v3
373; SI-NEXT:    v_and_b32_e32 v0, s0, v2
374; SI-NEXT:    v_and_b32_e32 v3, s1, v3
375; SI-NEXT:    v_and_b32_e32 v2, s1, v2
376; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
377; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
378; SI-NEXT:    s_mov_b32 s6, -1
379; SI-NEXT:    v_or_b32_e32 v1, v3, v1
380; SI-NEXT:    v_or_b32_e32 v0, v2, v0
381; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
382; SI-NEXT:    s_endpgm
383;
384; FLAT-LABEL: v_brev_i64:
385; FLAT:       ; %bb.0:
386; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
387; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
388; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
389; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
390; FLAT-NEXT:    s_mov_b32 s3, 0x33333333
391; FLAT-NEXT:    s_mov_b32 s6, 0xcccccccc
392; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
393; FLAT-NEXT:    v_mov_b32_e32 v1, s1
394; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
395; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
396; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
397; FLAT-NEXT:    s_mov_b32 s0, 0x10203
398; FLAT-NEXT:    s_mov_b32 s1, 0xf0f0f0f
399; FLAT-NEXT:    s_mov_b32 s7, 0xf000
400; FLAT-NEXT:    s_waitcnt vmcnt(0)
401; FLAT-NEXT:    v_perm_b32 v2, 0, v0, s0
402; FLAT-NEXT:    v_perm_b32 v4, 0, v1, s0
403; FLAT-NEXT:    v_and_b32_e32 v1, s1, v2
404; FLAT-NEXT:    v_and_b32_e32 v0, s1, v4
405; FLAT-NEXT:    v_and_b32_e32 v3, s2, v2
406; FLAT-NEXT:    v_and_b32_e32 v2, s2, v4
407; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
408; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
409; FLAT-NEXT:    s_mov_b32 s0, 0x55555555
410; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
411; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
412; FLAT-NEXT:    v_and_b32_e32 v1, s3, v3
413; FLAT-NEXT:    v_and_b32_e32 v0, s3, v2
414; FLAT-NEXT:    v_and_b32_e32 v3, s6, v3
415; FLAT-NEXT:    v_and_b32_e32 v2, s6, v2
416; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
417; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
418; FLAT-NEXT:    s_mov_b32 s1, 0xaaaaaaaa
419; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
420; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
421; FLAT-NEXT:    v_and_b32_e32 v1, s0, v3
422; FLAT-NEXT:    v_and_b32_e32 v0, s0, v2
423; FLAT-NEXT:    v_and_b32_e32 v3, s1, v3
424; FLAT-NEXT:    v_and_b32_e32 v2, s1, v2
425; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
426; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
427; FLAT-NEXT:    s_mov_b32 s6, -1
428; FLAT-NEXT:    v_or_b32_e32 v1, v3, v1
429; FLAT-NEXT:    v_or_b32_e32 v0, v2, v0
430; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
431; FLAT-NEXT:    s_endpgm
; Divergent i64 bitreverse of a loaded value: same byte-swap plus masked
; nibble/pair/bit swap-stage expansion as the scalar i64 case above, applied
; per lane to the dwordx2 load of element %tid.
432  %tid = call i32 @llvm.amdgcn.workitem.id.x()
433  %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
434  %val = load i64, i64 addrspace(1)* %gep
435  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
436  store i64 %brev, i64 addrspace(1)* %out
437  ret void
438}
439
440define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
441; SI-LABEL: s_brev_v2i64:
442; SI:       ; %bb.0:
443; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
444; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
445; SI-NEXT:    s_mov_b32 s8, 0xff00ff
446; SI-NEXT:    s_mov_b32 s9, 0x33333333
447; SI-NEXT:    s_mov_b32 s10, 0xcccccccc
448; SI-NEXT:    s_mov_b32 s11, 0x55555555
449; SI-NEXT:    s_waitcnt lgkmcnt(0)
450; SI-NEXT:    v_alignbit_b32 v0, s2, s2, 8
451; SI-NEXT:    v_alignbit_b32 v1, s2, s2, 24
452; SI-NEXT:    v_bfi_b32 v3, s8, v1, v0
453; SI-NEXT:    v_alignbit_b32 v2, s3, s3, 8
454; SI-NEXT:    v_alignbit_b32 v0, s3, s3, 24
455; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f
456; SI-NEXT:    v_bfi_b32 v2, s8, v0, v2
457; SI-NEXT:    s_mov_b32 s3, 0xf0f0f0f0
458; SI-NEXT:    v_and_b32_e32 v0, s2, v2
459; SI-NEXT:    v_and_b32_e32 v1, s2, v3
460; SI-NEXT:    v_and_b32_e32 v2, s3, v2
461; SI-NEXT:    v_and_b32_e32 v3, s3, v3
462; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
463; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
464; SI-NEXT:    v_alignbit_b32 v4, s0, s0, 8
465; SI-NEXT:    v_alignbit_b32 v5, s0, s0, 24
466; SI-NEXT:    v_bfi_b32 v7, s8, v5, v4
467; SI-NEXT:    v_alignbit_b32 v4, s1, s1, 8
468; SI-NEXT:    v_alignbit_b32 v5, s1, s1, 24
469; SI-NEXT:    v_bfi_b32 v6, s8, v5, v4
470; SI-NEXT:    v_or_b32_e32 v2, v2, v0
471; SI-NEXT:    v_or_b32_e32 v3, v3, v1
472; SI-NEXT:    v_and_b32_e32 v0, s9, v2
473; SI-NEXT:    v_and_b32_e32 v1, s9, v3
474; SI-NEXT:    v_and_b32_e32 v4, s2, v6
475; SI-NEXT:    v_and_b32_e32 v5, s2, v7
476; SI-NEXT:    v_and_b32_e32 v2, s10, v2
477; SI-NEXT:    v_and_b32_e32 v3, s10, v3
478; SI-NEXT:    v_and_b32_e32 v6, s3, v6
479; SI-NEXT:    v_and_b32_e32 v7, s3, v7
480; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
481; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
482; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 4
483; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 4
484; SI-NEXT:    v_or_b32_e32 v2, v2, v0
485; SI-NEXT:    v_or_b32_e32 v3, v3, v1
486; SI-NEXT:    v_or_b32_e32 v6, v6, v4
487; SI-NEXT:    v_or_b32_e32 v7, v7, v5
488; SI-NEXT:    s_mov_b32 s12, 0xaaaaaaaa
489; SI-NEXT:    v_and_b32_e32 v0, s11, v2
490; SI-NEXT:    v_and_b32_e32 v1, s11, v3
491; SI-NEXT:    v_and_b32_e32 v4, s9, v6
492; SI-NEXT:    v_and_b32_e32 v5, s9, v7
493; SI-NEXT:    v_and_b32_e32 v2, s12, v2
494; SI-NEXT:    v_and_b32_e32 v3, s12, v3
495; SI-NEXT:    v_and_b32_e32 v6, s10, v6
496; SI-NEXT:    v_and_b32_e32 v7, s10, v7
497; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
498; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
499; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 2
500; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 2
501; SI-NEXT:    v_or_b32_e32 v2, v2, v0
502; SI-NEXT:    v_or_b32_e32 v0, v6, v4
503; SI-NEXT:    v_or_b32_e32 v7, v7, v5
504; SI-NEXT:    v_and_b32_e32 v5, s11, v7
505; SI-NEXT:    v_and_b32_e32 v4, s11, v0
506; SI-NEXT:    v_and_b32_e32 v6, s12, v0
507; SI-NEXT:    v_and_b32_e32 v7, s12, v7
508; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
509; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 1
510; SI-NEXT:    v_or_b32_e32 v3, v3, v1
511; SI-NEXT:    s_mov_b32 s7, 0xf000
512; SI-NEXT:    s_mov_b32 s6, -1
513; SI-NEXT:    v_or_b32_e32 v0, v6, v4
514; SI-NEXT:    v_or_b32_e32 v1, v7, v5
515; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
516; SI-NEXT:    s_endpgm
517;
518; FLAT-LABEL: s_brev_v2i64:
519; FLAT:       ; %bb.0:
520; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
521; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
522; FLAT-NEXT:    v_mov_b32_e32 v4, 0x10203
523; FLAT-NEXT:    s_mov_b32 s8, 0xf0f0f0f
524; FLAT-NEXT:    s_mov_b32 s9, 0xcccccccc
525; FLAT-NEXT:    s_mov_b32 s10, 0x55555555
526; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
527; FLAT-NEXT:    v_perm_b32 v3, 0, s2, v4
528; FLAT-NEXT:    v_perm_b32 v2, 0, s3, v4
529; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
530; FLAT-NEXT:    v_and_b32_e32 v0, s8, v2
531; FLAT-NEXT:    v_and_b32_e32 v1, s8, v3
532; FLAT-NEXT:    v_and_b32_e32 v2, s2, v2
533; FLAT-NEXT:    v_and_b32_e32 v3, s2, v3
534; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
535; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
536; FLAT-NEXT:    v_perm_b32 v7, 0, s0, v4
537; FLAT-NEXT:    v_perm_b32 v6, 0, s1, v4
538; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
539; FLAT-NEXT:    s_mov_b32 s3, 0x33333333
540; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
541; FLAT-NEXT:    v_and_b32_e32 v0, s3, v2
542; FLAT-NEXT:    v_and_b32_e32 v1, s3, v3
543; FLAT-NEXT:    v_and_b32_e32 v4, s8, v6
544; FLAT-NEXT:    v_and_b32_e32 v5, s8, v7
545; FLAT-NEXT:    v_and_b32_e32 v2, s9, v2
546; FLAT-NEXT:    v_and_b32_e32 v3, s9, v3
547; FLAT-NEXT:    v_and_b32_e32 v6, s2, v6
548; FLAT-NEXT:    v_and_b32_e32 v7, s2, v7
549; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
550; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
551; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 4, v[4:5]
552; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 4, v[6:7]
553; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
554; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
555; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
556; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
557; FLAT-NEXT:    s_mov_b32 s11, 0xaaaaaaaa
558; FLAT-NEXT:    v_and_b32_e32 v0, s10, v2
559; FLAT-NEXT:    v_and_b32_e32 v1, s10, v3
560; FLAT-NEXT:    v_and_b32_e32 v4, s3, v6
561; FLAT-NEXT:    v_and_b32_e32 v5, s3, v7
562; FLAT-NEXT:    v_and_b32_e32 v2, s11, v2
563; FLAT-NEXT:    v_and_b32_e32 v3, s11, v3
564; FLAT-NEXT:    v_and_b32_e32 v6, s9, v6
565; FLAT-NEXT:    v_and_b32_e32 v7, s9, v7
566; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
567; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
568; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 2, v[4:5]
569; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 2, v[6:7]
570; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
571; FLAT-NEXT:    v_or_b32_e32 v0, v6, v4
572; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
573; FLAT-NEXT:    v_and_b32_e32 v5, s10, v7
574; FLAT-NEXT:    v_and_b32_e32 v4, s10, v0
575; FLAT-NEXT:    v_and_b32_e32 v6, s11, v0
576; FLAT-NEXT:    v_and_b32_e32 v7, s11, v7
577; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
578; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
579; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
580; FLAT-NEXT:    s_mov_b32 s7, 0xf000
581; FLAT-NEXT:    s_mov_b32 s6, -1
582; FLAT-NEXT:    v_or_b32_e32 v1, v7, v5
583; FLAT-NEXT:    v_or_b32_e32 v1, v7, v5
584; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
585; FLAT-NEXT:    s_endpgm
; Uniform <2 x i64> bitreverse: the i64 expansion from s_brev_i64 above is
; duplicated per element, with the two elements' swap stages interleaved by
; the scheduler. Stored as a single dwordx4.
586  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
587  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
588  ret void
589}
590
591define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
592; SI-LABEL: v_brev_v2i64:
593; SI:       ; %bb.0:
594; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
595; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
596; SI-NEXT:    s_mov_b32 s7, 0xf000
597; SI-NEXT:    s_mov_b32 s2, 0
598; SI-NEXT:    s_mov_b32 s3, s7
599; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
600; SI-NEXT:    v_mov_b32_e32 v1, 0
601; SI-NEXT:    s_waitcnt lgkmcnt(0)
602; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
603; SI-NEXT:    s_mov_b32 s0, 0xff00ff
604; SI-NEXT:    s_mov_b32 s1, 0xf0f0f0f
605; SI-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
606; SI-NEXT:    s_mov_b32 s3, 0x33333333
607; SI-NEXT:    s_mov_b32 s8, 0xcccccccc
608; SI-NEXT:    s_mov_b32 s9, 0x55555555
609; SI-NEXT:    s_mov_b32 s10, 0xaaaaaaaa
610; SI-NEXT:    s_mov_b32 s6, -1
611; SI-NEXT:    s_waitcnt vmcnt(0)
612; SI-NEXT:    v_alignbit_b32 v4, v2, v2, 8
613; SI-NEXT:    v_alignbit_b32 v2, v2, v2, 24
614; SI-NEXT:    v_alignbit_b32 v5, v3, v3, 8
615; SI-NEXT:    v_alignbit_b32 v6, v0, v0, 8
616; SI-NEXT:    v_alignbit_b32 v0, v0, v0, 24
617; SI-NEXT:    v_alignbit_b32 v7, v1, v1, 8
618; SI-NEXT:    v_alignbit_b32 v1, v1, v1, 24
619; SI-NEXT:    v_alignbit_b32 v3, v3, v3, 24
620; SI-NEXT:    v_bfi_b32 v2, s0, v2, v4
621; SI-NEXT:    v_bfi_b32 v4, s0, v3, v5
622; SI-NEXT:    v_bfi_b32 v6, s0, v0, v6
623; SI-NEXT:    v_bfi_b32 v8, s0, v1, v7
624; SI-NEXT:    v_and_b32_e32 v1, s1, v2
625; SI-NEXT:    v_and_b32_e32 v0, s1, v4
626; SI-NEXT:    v_and_b32_e32 v3, s2, v2
627; SI-NEXT:    v_and_b32_e32 v2, s2, v4
628; SI-NEXT:    v_and_b32_e32 v5, s1, v6
629; SI-NEXT:    v_and_b32_e32 v4, s1, v8
630; SI-NEXT:    v_and_b32_e32 v7, s2, v6
631; SI-NEXT:    v_and_b32_e32 v6, s2, v8
632; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 4
633; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 4
634; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 4
635; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 4
636; SI-NEXT:    v_or_b32_e32 v3, v3, v1
637; SI-NEXT:    v_or_b32_e32 v2, v2, v0
638; SI-NEXT:    v_or_b32_e32 v7, v7, v5
639; SI-NEXT:    v_or_b32_e32 v6, v6, v4
640; SI-NEXT:    v_and_b32_e32 v1, s3, v3
641; SI-NEXT:    v_and_b32_e32 v0, s3, v2
642; SI-NEXT:    v_and_b32_e32 v5, s3, v7
643; SI-NEXT:    v_and_b32_e32 v4, s3, v6
644; SI-NEXT:    v_and_b32_e32 v3, s8, v3
645; SI-NEXT:    v_and_b32_e32 v2, s8, v2
646; SI-NEXT:    v_and_b32_e32 v7, s8, v7
647; SI-NEXT:    v_and_b32_e32 v6, s8, v6
648; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
649; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 2
650; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 2
651; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 2
652; SI-NEXT:    v_or_b32_e32 v3, v3, v1
653; SI-NEXT:    v_or_b32_e32 v2, v2, v0
654; SI-NEXT:    v_or_b32_e32 v7, v7, v5
655; SI-NEXT:    v_or_b32_e32 v6, v6, v4
656; SI-NEXT:    v_and_b32_e32 v1, s9, v3
657; SI-NEXT:    v_and_b32_e32 v0, s9, v2
658; SI-NEXT:    v_and_b32_e32 v5, s9, v7
659; SI-NEXT:    v_and_b32_e32 v4, s9, v6
660; SI-NEXT:    v_and_b32_e32 v3, s10, v3
661; SI-NEXT:    v_and_b32_e32 v2, s10, v2
662; SI-NEXT:    v_and_b32_e32 v7, s10, v7
663; SI-NEXT:    v_and_b32_e32 v6, s10, v6
664; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
665; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], 1
666; SI-NEXT:    v_lshl_b64 v[4:5], v[4:5], 1
667; SI-NEXT:    v_lshr_b64 v[6:7], v[6:7], 1
668; SI-NEXT:    v_or_b32_e32 v3, v3, v1
669; SI-NEXT:    v_or_b32_e32 v2, v2, v0
670; SI-NEXT:    v_or_b32_e32 v1, v7, v5
671; SI-NEXT:    v_or_b32_e32 v0, v6, v4
672; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
673; SI-NEXT:    s_endpgm
674;
675; FLAT-LABEL: v_brev_v2i64:
676; FLAT:       ; %bb.0:
677; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
678; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
679; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
680; FLAT-NEXT:    s_mov_b32 s2, 0xf0f0f0f0
681; FLAT-NEXT:    s_mov_b32 s3, 0x33333333
682; FLAT-NEXT:    s_mov_b32 s8, 0xcccccccc
683; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
684; FLAT-NEXT:    v_mov_b32_e32 v1, s1
685; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
686; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
687; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
688; FLAT-NEXT:    s_mov_b32 s0, 0x10203
689; FLAT-NEXT:    s_mov_b32 s1, 0xf0f0f0f
690; FLAT-NEXT:    s_mov_b32 s9, 0x55555555
691; FLAT-NEXT:    s_mov_b32 s10, 0xaaaaaaaa
692; FLAT-NEXT:    s_mov_b32 s7, 0xf000
693; FLAT-NEXT:    s_mov_b32 s6, -1
694; FLAT-NEXT:    s_waitcnt vmcnt(0)
695; FLAT-NEXT:    v_perm_b32 v6, 0, v0, s0
696; FLAT-NEXT:    v_perm_b32 v4, 0, v3, s0
697; FLAT-NEXT:    v_perm_b32 v2, 0, v2, s0
698; FLAT-NEXT:    v_perm_b32 v8, 0, v1, s0
699; FLAT-NEXT:    v_and_b32_e32 v1, s1, v2
700; FLAT-NEXT:    v_and_b32_e32 v0, s1, v4
701; FLAT-NEXT:    v_and_b32_e32 v3, s2, v2
702; FLAT-NEXT:    v_and_b32_e32 v2, s2, v4
703; FLAT-NEXT:    v_and_b32_e32 v5, s1, v6
704; FLAT-NEXT:    v_and_b32_e32 v4, s1, v8
705; FLAT-NEXT:    v_and_b32_e32 v7, s2, v6
706; FLAT-NEXT:    v_and_b32_e32 v6, s2, v8
707; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 4, v[0:1]
708; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 4, v[2:3]
709; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 4, v[4:5]
710; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 4, v[6:7]
711; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
712; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
713; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
714; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
715; FLAT-NEXT:    v_and_b32_e32 v1, s3, v3
716; FLAT-NEXT:    v_and_b32_e32 v0, s3, v2
717; FLAT-NEXT:    v_and_b32_e32 v5, s3, v7
718; FLAT-NEXT:    v_and_b32_e32 v4, s3, v6
719; FLAT-NEXT:    v_and_b32_e32 v3, s8, v3
720; FLAT-NEXT:    v_and_b32_e32 v2, s8, v2
721; FLAT-NEXT:    v_and_b32_e32 v7, s8, v7
722; FLAT-NEXT:    v_and_b32_e32 v6, s8, v6
723; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 2, v[0:1]
724; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 2, v[2:3]
725; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 2, v[4:5]
726; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 2, v[6:7]
727; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
728; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
729; FLAT-NEXT:    v_or_b32_e32 v7, v7, v5
730; FLAT-NEXT:    v_or_b32_e32 v6, v6, v4
731; FLAT-NEXT:    v_and_b32_e32 v1, s9, v3
732; FLAT-NEXT:    v_and_b32_e32 v0, s9, v2
733; FLAT-NEXT:    v_and_b32_e32 v5, s9, v7
734; FLAT-NEXT:    v_and_b32_e32 v4, s9, v6
735; FLAT-NEXT:    v_and_b32_e32 v3, s10, v3
736; FLAT-NEXT:    v_and_b32_e32 v2, s10, v2
737; FLAT-NEXT:    v_and_b32_e32 v7, s10, v7
738; FLAT-NEXT:    v_and_b32_e32 v6, s10, v6
739; FLAT-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
740; FLAT-NEXT:    v_lshrrev_b64 v[2:3], 1, v[2:3]
741; FLAT-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
742; FLAT-NEXT:    v_lshrrev_b64 v[6:7], 1, v[6:7]
743; FLAT-NEXT:    v_or_b32_e32 v3, v3, v1
744; FLAT-NEXT:    v_or_b32_e32 v2, v2, v0
745; FLAT-NEXT:    v_or_b32_e32 v1, v7, v5
746; FLAT-NEXT:    v_or_b32_e32 v0, v6, v4
747; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
748; FLAT-NEXT:    s_endpgm
; Divergent <2 x i64> bitreverse: dwordx4 load of element %tid, then the same
; per-element byte swap plus masked swap-stage expansion as above.
; NOTE(review): the stray space in "<2 x i64> ," below is cosmetic and parses
; fine; kept as-is because the assertions are autogenerated.
749  %tid = call i32 @llvm.amdgcn.workitem.id.x()
750  %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid
751  %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
752  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
753  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
754  ret void
755}
756
757define float @missing_truncate_promote_bitreverse(i32 %arg) {
758; SI-LABEL: missing_truncate_promote_bitreverse:
759; SI:       ; %bb.0: ; %bb
760; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
761; SI-NEXT:    v_bfrev_b32_e32 v0, v0
762; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
763; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
764; SI-NEXT:    s_setpc_b64 s[30:31]
765;
766; FLAT-LABEL: missing_truncate_promote_bitreverse:
767; FLAT:       ; %bb.0: ; %bb
768; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
769; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
770; FLAT-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
771; FLAT-NEXT:    s_setpc_b64 s[30:31]
; Regular (non-kernel) function: trunc-to-i16 + bitreverse + bitcast-to-half
; + fpext. The 16-bit reverse is promoted to a 32-bit v_bfrev_b32; the
; explicit shift-right-16 is folded into the f16->f32 convert's WORD_1
; source selection (SDWA) on the newer target, per the checks above.
772bb:
773  %tmp = trunc i32 %arg to i16
774  %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
775  %tmp2 = bitcast i16 %tmp1 to half
776  %tmp3 = fpext half %tmp2 to float
777  ret float %tmp3
778}
779
780attributes #0 = { nounwind }
781attributes #1 = { nounwind readnone }
782