1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
6; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
7
; Declarations of the llvm.fshr (funnel shift right) intrinsic overloads.
; The i32 and i16 scalar/vector forms are called by the functions below;
; the i64 and i24 forms are presumably exercised further down the file
; (their callers are not visible in this section).
8declare i32 @llvm.fshr.i32(i32, i32, i32)
9declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
10declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
11declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
12declare i16 @llvm.fshr.i16(i16, i16, i16)
13declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
14declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
15declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
16declare i64 @llvm.fshr.i64(i64, i64, i64)
17declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
18declare i24 @llvm.fshr.i24(i24, i24, i24)
19declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
20
; Kernel test: scalar i32 funnel shift right where %x, %y and the shift
; amount %z all arrive as kernel arguments and the result is stored to %in.
; The expected output for every amdgcn run contains a single v_alignbit_b32;
; the expected r600 output contains a single BIT_ALIGN_INT.
21define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
22; SI-LABEL: fshr_i32:
23; SI:       ; %bb.0: ; %entry
24; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
25; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
26; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
27; SI-NEXT:    s_mov_b32 s7, 0xf000
28; SI-NEXT:    s_mov_b32 s6, -1
29; SI-NEXT:    s_waitcnt lgkmcnt(0)
30; SI-NEXT:    v_mov_b32_e32 v0, s3
31; SI-NEXT:    v_mov_b32_e32 v1, s0
32; SI-NEXT:    v_alignbit_b32 v0, s2, v0, v1
33; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: fshr_i32:
37; VI:       ; %bb.0: ; %entry
38; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
39; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
40; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
41; VI-NEXT:    s_waitcnt lgkmcnt(0)
42; VI-NEXT:    v_mov_b32_e32 v0, s5
43; VI-NEXT:    v_mov_b32_e32 v1, s0
44; VI-NEXT:    v_alignbit_b32 v2, s4, v0, v1
45; VI-NEXT:    v_mov_b32_e32 v0, s2
46; VI-NEXT:    v_mov_b32_e32 v1, s3
47; VI-NEXT:    flat_store_dword v[0:1], v2
48; VI-NEXT:    s_endpgm
49;
50; GFX9-LABEL: fshr_i32:
51; GFX9:       ; %bb.0: ; %entry
52; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
53; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
54; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
55; GFX9-NEXT:    v_mov_b32_e32 v0, 0
56; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
57; GFX9-NEXT:    v_mov_b32_e32 v1, s5
58; GFX9-NEXT:    v_mov_b32_e32 v2, s6
59; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, v2
60; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
61; GFX9-NEXT:    s_endpgm
62;
63; R600-LABEL: fshr_i32:
64; R600:       ; %bb.0: ; %entry
65; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
66; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
67; R600-NEXT:    CF_END
68; R600-NEXT:    PAD
69; R600-NEXT:    ALU clause starting at 4:
70; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
71; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
72; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
73;
74; GFX10-LABEL: fshr_i32:
75; GFX10:       ; %bb.0: ; %entry
76; GFX10-NEXT:    s_clause 0x2
77; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
78; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
79; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
80; GFX10-NEXT:    v_mov_b32_e32 v1, 0
81; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX10-NEXT:    v_mov_b32_e32 v0, s6
83; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, v0
84; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
85; GFX10-NEXT:    s_endpgm
86entry:
87  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
88  store i32 %0, i32 addrspace(1)* %in
89  ret void
90}
91
; Kernel test: scalar i32 funnel shift right by the constant 7.
; In the expected amdgcn output the 7 appears directly as the third source
; operand of v_alignbit_b32; in the expected r600 output it is supplied to
; BIT_ALIGN_INT through a literal slot.
92define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
93; SI-LABEL: fshr_i32_imm:
94; SI:       ; %bb.0: ; %entry
95; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
96; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
97; SI-NEXT:    s_mov_b32 s7, 0xf000
98; SI-NEXT:    s_mov_b32 s6, -1
99; SI-NEXT:    s_waitcnt lgkmcnt(0)
100; SI-NEXT:    v_mov_b32_e32 v0, s1
101; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 7
102; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
103; SI-NEXT:    s_endpgm
104;
105; VI-LABEL: fshr_i32_imm:
106; VI:       ; %bb.0: ; %entry
107; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
108; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
109; VI-NEXT:    s_waitcnt lgkmcnt(0)
110; VI-NEXT:    v_mov_b32_e32 v0, s1
111; VI-NEXT:    v_alignbit_b32 v2, s0, v0, 7
112; VI-NEXT:    v_mov_b32_e32 v0, s2
113; VI-NEXT:    v_mov_b32_e32 v1, s3
114; VI-NEXT:    flat_store_dword v[0:1], v2
115; VI-NEXT:    s_endpgm
116;
117; GFX9-LABEL: fshr_i32_imm:
118; GFX9:       ; %bb.0: ; %entry
119; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
120; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
121; GFX9-NEXT:    v_mov_b32_e32 v0, 0
122; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX9-NEXT:    v_mov_b32_e32 v1, s5
124; GFX9-NEXT:    v_alignbit_b32 v1, s4, v1, 7
125; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
126; GFX9-NEXT:    s_endpgm
127;
128; R600-LABEL: fshr_i32_imm:
129; R600:       ; %bb.0: ; %entry
130; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
131; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
132; R600-NEXT:    CF_END
133; R600-NEXT:    PAD
134; R600-NEXT:    ALU clause starting at 4:
135; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
136; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
137; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
138; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
139;
140; GFX10-LABEL: fshr_i32_imm:
141; GFX10:       ; %bb.0: ; %entry
142; GFX10-NEXT:    s_clause 0x1
143; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
144; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
145; GFX10-NEXT:    v_mov_b32_e32 v0, 0
146; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
148; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
149; GFX10-NEXT:    s_endpgm
150entry:
151  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
152  store i32 %0, i32 addrspace(1)* %in
153  ret void
154}
155
; Kernel test: <2 x i32> funnel shift right with a variable per-element
; shift amount vector %z. The expected output has one v_alignbit_b32 (amdgcn)
; or one BIT_ALIGN_INT (r600) per vector element, followed by a single
; two-dword store.
156define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
157; SI-LABEL: fshr_v2i32:
158; SI:       ; %bb.0: ; %entry
159; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
160; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
161; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
162; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
163; SI-NEXT:    s_mov_b32 s7, 0xf000
164; SI-NEXT:    s_mov_b32 s6, -1
165; SI-NEXT:    s_waitcnt lgkmcnt(0)
166; SI-NEXT:    v_mov_b32_e32 v0, s9
167; SI-NEXT:    v_mov_b32_e32 v1, s1
168; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
169; SI-NEXT:    v_mov_b32_e32 v0, s8
170; SI-NEXT:    v_mov_b32_e32 v2, s0
171; SI-NEXT:    v_alignbit_b32 v0, s2, v0, v2
172; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
173; SI-NEXT:    s_endpgm
174;
175; VI-LABEL: fshr_v2i32:
176; VI:       ; %bb.0: ; %entry
177; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
178; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
179; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
180; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
181; VI-NEXT:    s_waitcnt lgkmcnt(0)
182; VI-NEXT:    v_mov_b32_e32 v0, s7
183; VI-NEXT:    v_mov_b32_e32 v1, s1
184; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
185; VI-NEXT:    v_mov_b32_e32 v0, s6
186; VI-NEXT:    v_mov_b32_e32 v2, s0
187; VI-NEXT:    v_alignbit_b32 v0, s4, v0, v2
188; VI-NEXT:    v_mov_b32_e32 v2, s2
189; VI-NEXT:    v_mov_b32_e32 v3, s3
190; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
191; VI-NEXT:    s_endpgm
192;
193; GFX9-LABEL: fshr_v2i32:
194; GFX9:       ; %bb.0: ; %entry
195; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
196; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
197; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
198; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
199; GFX9-NEXT:    v_mov_b32_e32 v2, 0
200; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX9-NEXT:    v_mov_b32_e32 v0, s7
202; GFX9-NEXT:    v_mov_b32_e32 v1, s9
203; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
204; GFX9-NEXT:    v_mov_b32_e32 v0, s6
205; GFX9-NEXT:    v_mov_b32_e32 v3, s8
206; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v3
207; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
208; GFX9-NEXT:    s_endpgm
209;
210; R600-LABEL: fshr_v2i32:
211; R600:       ; %bb.0: ; %entry
212; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
213; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
214; R600-NEXT:    CF_END
215; R600-NEXT:    PAD
216; R600-NEXT:    ALU clause starting at 4:
217; R600-NEXT:     MOV * T0.W, KC0[4].X,
218; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
219; R600-NEXT:     MOV * T0.W, KC0[3].W,
220; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
221; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
222; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
223;
224; GFX10-LABEL: fshr_v2i32:
225; GFX10:       ; %bb.0: ; %entry
226; GFX10-NEXT:    s_clause 0x3
227; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
228; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
229; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
230; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
231; GFX10-NEXT:    v_mov_b32_e32 v3, 0
232; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX10-NEXT:    v_mov_b32_e32 v0, s3
234; GFX10-NEXT:    v_mov_b32_e32 v2, s2
235; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, v0
236; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, v2
237; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
238; GFX10-NEXT:    s_endpgm
239entry:
240  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
241  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
242  ret void
243}
244
; Kernel test: <2 x i32> funnel shift right by the constant vector <7, 9>.
; The expected output uses shift amount 7 for element 0 and 9 for element 1,
; one v_alignbit_b32 (amdgcn) or BIT_ALIGN_INT (r600) per element.
245define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
246; SI-LABEL: fshr_v2i32_imm:
247; SI:       ; %bb.0: ; %entry
248; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
249; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
250; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
251; SI-NEXT:    s_mov_b32 s7, 0xf000
252; SI-NEXT:    s_mov_b32 s6, -1
253; SI-NEXT:    s_waitcnt lgkmcnt(0)
254; SI-NEXT:    v_mov_b32_e32 v0, s1
255; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 9
256; SI-NEXT:    v_mov_b32_e32 v0, s0
257; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 7
258; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
259; SI-NEXT:    s_endpgm
260;
261; VI-LABEL: fshr_v2i32_imm:
262; VI:       ; %bb.0: ; %entry
263; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
264; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
265; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
266; VI-NEXT:    s_waitcnt lgkmcnt(0)
267; VI-NEXT:    v_mov_b32_e32 v0, s1
268; VI-NEXT:    v_mov_b32_e32 v2, s0
269; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
270; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
271; VI-NEXT:    v_mov_b32_e32 v2, s2
272; VI-NEXT:    v_mov_b32_e32 v3, s3
273; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
274; VI-NEXT:    s_endpgm
275;
276; GFX9-LABEL: fshr_v2i32_imm:
277; GFX9:       ; %bb.0: ; %entry
278; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
279; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
280; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
281; GFX9-NEXT:    v_mov_b32_e32 v2, 0
282; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
283; GFX9-NEXT:    v_mov_b32_e32 v0, s7
284; GFX9-NEXT:    v_mov_b32_e32 v3, s6
285; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
286; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 7
287; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
288; GFX9-NEXT:    s_endpgm
289;
290; R600-LABEL: fshr_v2i32_imm:
291; R600:       ; %bb.0: ; %entry
292; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
293; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
294; R600-NEXT:    CF_END
295; R600-NEXT:    PAD
296; R600-NEXT:    ALU clause starting at 4:
297; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
298; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
299; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
300; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
301; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
302; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
303;
304; GFX10-LABEL: fshr_v2i32_imm:
305; GFX10:       ; %bb.0: ; %entry
306; GFX10-NEXT:    s_clause 0x2
307; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
308; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
309; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
310; GFX10-NEXT:    v_mov_b32_e32 v2, 0
311; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX10-NEXT:    v_alignbit_b32 v1, s3, s5, 9
313; GFX10-NEXT:    v_alignbit_b32 v0, s2, s4, 7
314; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
315; GFX10-NEXT:    s_endpgm
316entry:
317  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
318  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
319  ret void
320}
321
; Kernel test: <4 x i32> funnel shift right with a variable per-element
; shift amount vector %z. The expected output has four v_alignbit_b32
; (amdgcn) or four BIT_ALIGN_INT (r600) instructions, one per element, and a
; single four-dword store of the result.
322define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
323; SI-LABEL: fshr_v4i32:
324; SI:       ; %bb.0: ; %entry
325; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
326; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
327; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x11
328; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
329; SI-NEXT:    s_mov_b32 s7, 0xf000
330; SI-NEXT:    s_mov_b32 s6, -1
331; SI-NEXT:    s_waitcnt lgkmcnt(0)
332; SI-NEXT:    v_mov_b32_e32 v0, s15
333; SI-NEXT:    v_mov_b32_e32 v1, s3
334; SI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
335; SI-NEXT:    v_mov_b32_e32 v0, s14
336; SI-NEXT:    v_mov_b32_e32 v1, s2
337; SI-NEXT:    v_alignbit_b32 v2, s10, v0, v1
338; SI-NEXT:    v_mov_b32_e32 v0, s13
339; SI-NEXT:    v_mov_b32_e32 v1, s1
340; SI-NEXT:    v_alignbit_b32 v1, s9, v0, v1
341; SI-NEXT:    v_mov_b32_e32 v0, s12
342; SI-NEXT:    v_mov_b32_e32 v4, s0
343; SI-NEXT:    v_alignbit_b32 v0, s8, v0, v4
344; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
345; SI-NEXT:    s_endpgm
346;
347; VI-LABEL: fshr_v4i32:
348; VI:       ; %bb.0: ; %entry
349; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
350; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
351; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
352; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
353; VI-NEXT:    s_waitcnt lgkmcnt(0)
354; VI-NEXT:    v_mov_b32_e32 v0, s11
355; VI-NEXT:    v_mov_b32_e32 v1, s3
356; VI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
357; VI-NEXT:    v_mov_b32_e32 v0, s10
358; VI-NEXT:    v_mov_b32_e32 v1, s2
359; VI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
360; VI-NEXT:    v_mov_b32_e32 v0, s9
361; VI-NEXT:    v_mov_b32_e32 v1, s1
362; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
363; VI-NEXT:    v_mov_b32_e32 v0, s8
364; VI-NEXT:    v_mov_b32_e32 v4, s0
365; VI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
366; VI-NEXT:    v_mov_b32_e32 v4, s12
367; VI-NEXT:    v_mov_b32_e32 v5, s13
368; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
369; VI-NEXT:    s_endpgm
370;
371; GFX9-LABEL: fshr_v4i32:
372; GFX9:       ; %bb.0: ; %entry
373; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
374; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
375; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
376; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
377; GFX9-NEXT:    v_mov_b32_e32 v4, 0
378; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
379; GFX9-NEXT:    v_mov_b32_e32 v0, s11
380; GFX9-NEXT:    v_mov_b32_e32 v1, s15
381; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, v1
382; GFX9-NEXT:    v_mov_b32_e32 v0, s10
383; GFX9-NEXT:    v_mov_b32_e32 v1, s14
384; GFX9-NEXT:    v_alignbit_b32 v2, s6, v0, v1
385; GFX9-NEXT:    v_mov_b32_e32 v0, s9
386; GFX9-NEXT:    v_mov_b32_e32 v1, s13
387; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
388; GFX9-NEXT:    v_mov_b32_e32 v0, s8
389; GFX9-NEXT:    v_mov_b32_e32 v5, s12
390; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v5
391; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
392; GFX9-NEXT:    s_endpgm
393;
394; R600-LABEL: fshr_v4i32:
395; R600:       ; %bb.0: ; %entry
396; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
397; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
398; R600-NEXT:    CF_END
399; R600-NEXT:    PAD
400; R600-NEXT:    ALU clause starting at 4:
401; R600-NEXT:     MOV * T0.W, KC0[6].X,
402; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
403; R600-NEXT:     MOV * T1.W, KC0[5].W,
404; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
405; R600-NEXT:     MOV * T1.W, KC0[5].Z,
406; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
407; R600-NEXT:     MOV * T1.W, KC0[5].Y,
408; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
409; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
410; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
411;
412; GFX10-LABEL: fshr_v4i32:
413; GFX10:       ; %bb.0: ; %entry
414; GFX10-NEXT:    s_clause 0x3
415; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x54
416; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
417; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
418; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
419; GFX10-NEXT:    v_mov_b32_e32 v6, 0
420; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX10-NEXT:    v_mov_b32_e32 v0, s7
422; GFX10-NEXT:    v_mov_b32_e32 v1, s6
423; GFX10-NEXT:    v_mov_b32_e32 v4, s5
424; GFX10-NEXT:    v_mov_b32_e32 v5, s4
425; GFX10-NEXT:    v_alignbit_b32 v3, s15, s11, v0
426; GFX10-NEXT:    v_alignbit_b32 v2, s14, s10, v1
427; GFX10-NEXT:    v_alignbit_b32 v1, s13, s9, v4
428; GFX10-NEXT:    v_alignbit_b32 v0, s12, s8, v5
429; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
430; GFX10-NEXT:    s_endpgm
431entry:
432  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
433  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
434  ret void
435}
436
; Kernel test: <4 x i32> funnel shift right by the constant vector
; <1, 7, 9, 33>. The last amount exceeds the 32-bit element width; in the
; expected output for every target it is emitted as a shift by 1, the same
; operand used for element 0.
437define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
438; SI-LABEL: fshr_v4i32_imm:
439; SI:       ; %bb.0: ; %entry
440; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
441; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
442; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
443; SI-NEXT:    s_mov_b32 s7, 0xf000
444; SI-NEXT:    s_mov_b32 s6, -1
445; SI-NEXT:    s_waitcnt lgkmcnt(0)
446; SI-NEXT:    v_mov_b32_e32 v0, s3
447; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
448; SI-NEXT:    v_mov_b32_e32 v0, s2
449; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 9
450; SI-NEXT:    v_mov_b32_e32 v0, s1
451; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
452; SI-NEXT:    v_mov_b32_e32 v0, s0
453; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
454; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
455; SI-NEXT:    s_endpgm
456;
457; VI-LABEL: fshr_v4i32_imm:
458; VI:       ; %bb.0: ; %entry
459; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
460; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
461; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
462; VI-NEXT:    s_waitcnt lgkmcnt(0)
463; VI-NEXT:    v_mov_b32_e32 v4, s8
464; VI-NEXT:    v_mov_b32_e32 v5, s9
465; VI-NEXT:    v_mov_b32_e32 v0, s3
466; VI-NEXT:    v_mov_b32_e32 v1, s2
467; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
468; VI-NEXT:    v_mov_b32_e32 v0, s1
469; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
470; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
471; VI-NEXT:    v_mov_b32_e32 v0, s0
472; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
473; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
474; VI-NEXT:    s_endpgm
475;
476; GFX9-LABEL: fshr_v4i32_imm:
477; GFX9:       ; %bb.0: ; %entry
478; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
479; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
480; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
481; GFX9-NEXT:    v_mov_b32_e32 v4, 0
482; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
483; GFX9-NEXT:    v_mov_b32_e32 v0, s11
484; GFX9-NEXT:    v_mov_b32_e32 v1, s10
485; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
486; GFX9-NEXT:    v_mov_b32_e32 v0, s9
487; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
488; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
489; GFX9-NEXT:    v_mov_b32_e32 v0, s8
490; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
491; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
492; GFX9-NEXT:    s_endpgm
493;
494; R600-LABEL: fshr_v4i32_imm:
495; R600:       ; %bb.0: ; %entry
496; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
497; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
498; R600-NEXT:    CF_END
499; R600-NEXT:    PAD
500; R600-NEXT:    ALU clause starting at 4:
501; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
502; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
503; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
504; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
505; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
506; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
507; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
508; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
509;
510; GFX10-LABEL: fshr_v4i32_imm:
511; GFX10:       ; %bb.0: ; %entry
512; GFX10-NEXT:    s_clause 0x2
513; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
514; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
515; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
516; GFX10-NEXT:    v_mov_b32_e32 v4, 0
517; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
518; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 1
519; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 9
520; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 7
521; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 1
522; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
523; GFX10-NEXT:    s_endpgm
524entry:
525  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
526  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
527  ret void
528}
529
; Non-kernel function test: scalar i32 funnel shift right with all operands
; passed as ordinary function arguments and the result returned.
; The expected amdgcn output is a single v_alignbit_b32 on v0, v1, v2 followed
; by s_setpc_b64; the SI, VI and GFX9 runs share one set of checks. The
; expected r600 output for this function contains only CF_END and PAD.
530define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
531; GFX89-LABEL: v_fshr_i32:
532; GFX89:       ; %bb.0:
533; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
535; GFX89-NEXT:    s_setpc_b64 s[30:31]
536;
537; R600-LABEL: v_fshr_i32:
538; R600:       ; %bb.0:
539; R600-NEXT:    CF_END
540; R600-NEXT:    PAD
541;
542; GFX10-LABEL: v_fshr_i32:
543; GFX10:       ; %bb.0:
544; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
545; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
546; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
547; GFX10-NEXT:    s_setpc_b64 s[30:31]
548  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
549  ret i32 %ret
550}
551
; Non-kernel function test: <2 x i32> funnel shift right on function
; arguments. The expected amdgcn output is two v_alignbit_b32 instructions,
; one per element, reading (v0, v2, v4) and (v1, v3, v5). The expected r600
; output contains only CF_END and PAD.
552define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
553; GFX89-LABEL: v_fshr_v2i32:
554; GFX89:       ; %bb.0:
555; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
556; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
557; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v5
558; GFX89-NEXT:    s_setpc_b64 s[30:31]
559;
560; R600-LABEL: v_fshr_v2i32:
561; R600:       ; %bb.0:
562; R600-NEXT:    CF_END
563; R600-NEXT:    PAD
564;
565; GFX10-LABEL: v_fshr_v2i32:
566; GFX10:       ; %bb.0:
567; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
568; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
569; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
570; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
571; GFX10-NEXT:    s_setpc_b64 s[30:31]
572  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
573  ret <2 x i32> %ret
574}
575
; Non-kernel function test: <3 x i32> funnel shift right on function
; arguments. The expected amdgcn output is exactly three v_alignbit_b32
; instructions, one per element, with no extra instruction for the odd
; element count. The expected r600 output contains only CF_END and PAD.
576define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
577; GFX89-LABEL: v_fshr_v3i32:
578; GFX89:       ; %bb.0:
579; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
580; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
581; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v7
582; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v8
583; GFX89-NEXT:    s_setpc_b64 s[30:31]
584;
585; R600-LABEL: v_fshr_v3i32:
586; R600:       ; %bb.0:
587; R600-NEXT:    CF_END
588; R600-NEXT:    PAD
589;
590; GFX10-LABEL: v_fshr_v3i32:
591; GFX10:       ; %bb.0:
592; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
594; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
595; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
596; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
597; GFX10-NEXT:    s_setpc_b64 s[30:31]
598  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
599  ret <3 x i32> %ret
600}
601
; Non-kernel function test: <4 x i32> funnel shift right on function
; arguments. The expected amdgcn output is four v_alignbit_b32 instructions,
; one per element, writing v0 through v3 in place. The expected r600 output
; contains only CF_END and PAD.
602define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
603; GFX89-LABEL: v_fshr_v4i32:
604; GFX89:       ; %bb.0:
605; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
606; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
607; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v9
608; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v10
609; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v11
610; GFX89-NEXT:    s_setpc_b64 s[30:31]
611;
612; R600-LABEL: v_fshr_v4i32:
613; R600:       ; %bb.0:
614; R600-NEXT:    CF_END
615; R600-NEXT:    PAD
616;
617; GFX10-LABEL: v_fshr_v4i32:
618; GFX10:       ; %bb.0:
619; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
621; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
622; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
623; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
624; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
625; GFX10-NEXT:    s_setpc_b64 s[30:31]
626  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
627  ret <4 x i32> %ret
628}
629
; Non-kernel function test: scalar i16 funnel shift right.
; The expected tahiti output reuses the 32-bit v_alignbit_b32 after shifting
; %src1 left by 16 and OR-ing 16 into the shift amount. The expected tonga,
; gfx900 and gfx1010 output instead expands to 16-bit shifts, computing
;   ((src0 << 1) << (~amt & 15)) | (src1 >> (amt & 15)).
; The expected r600 output contains only CF_END and PAD.
630define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
631; SI-LABEL: v_fshr_i16:
632; SI:       ; %bb.0:
633; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634; SI-NEXT:    v_or_b32_e32 v2, 16, v2
635; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
636; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
637; SI-NEXT:    s_setpc_b64 s[30:31]
638;
639; VI-LABEL: v_fshr_i16:
640; VI:       ; %bb.0:
641; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
642; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
643; VI-NEXT:    v_and_b32_e32 v2, 15, v2
644; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
645; VI-NEXT:    v_and_b32_e32 v3, 15, v3
646; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
647; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
648; VI-NEXT:    v_or_b32_e32 v0, v0, v1
649; VI-NEXT:    s_setpc_b64 s[30:31]
650;
651; GFX9-LABEL: v_fshr_i16:
652; GFX9:       ; %bb.0:
653; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
654; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
655; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
656; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
657; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
658; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
659; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
660; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
661; GFX9-NEXT:    s_setpc_b64 s[30:31]
662;
663; R600-LABEL: v_fshr_i16:
664; R600:       ; %bb.0:
665; R600-NEXT:    CF_END
666; R600-NEXT:    PAD
667;
668; GFX10-LABEL: v_fshr_i16:
669; GFX10:       ; %bb.0:
670; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
671; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
672; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
673; GFX10-NEXT:    v_and_b32_e32 v2, 15, v2
674; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
675; GFX10-NEXT:    v_and_b32_e32 v3, 15, v3
676; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
677; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
678; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
679; GFX10-NEXT:    s_setpc_b64 s[30:31]
680  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
681  ret i16 %ret
682}
683
; Non-kernel function test: <2 x i16> funnel shift right.
; Expected output differs by target:
;  - tahiti handles each element with the 32-bit v_alignbit_b32 sequence
;    (shift %src1 left by 16, OR 16 into the amount) and then repacks the
;    two results with shift/and/or;
;  - tonga expands to 16-bit shifts, using SDWA forms to read and write the
;    high half (WORD_1) of the packed registers;
;  - gfx900 and gfx1010 use the packed v_pk_lshlrev_b16 / v_pk_lshrrev_b16
;    instructions with the mask 0xf000f applied to both shift amounts.
; The expected r600 output contains only CF_END and PAD.
684define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
685; SI-LABEL: v_fshr_v2i16:
686; SI:       ; %bb.0:
687; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
688; SI-NEXT:    v_or_b32_e32 v5, 16, v5
689; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
690; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
691; SI-NEXT:    v_or_b32_e32 v3, 16, v4
692; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
693; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v3
694; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
695; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
696; SI-NEXT:    v_or_b32_e32 v0, v0, v1
697; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
698; SI-NEXT:    s_setpc_b64 s[30:31]
699;
700; VI-LABEL: v_fshr_v2i16:
701; VI:       ; %bb.0:
702; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
703; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
704; VI-NEXT:    v_and_b32_e32 v4, 15, v3
705; VI-NEXT:    v_mov_b32_e32 v5, 1
706; VI-NEXT:    v_xor_b32_e32 v3, -1, v3
707; VI-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
708; VI-NEXT:    v_and_b32_e32 v3, 15, v3
709; VI-NEXT:    v_lshrrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
710; VI-NEXT:    v_lshlrev_b16_e32 v3, v3, v5
711; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
712; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
713; VI-NEXT:    v_and_b32_e32 v2, 15, v2
714; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
715; VI-NEXT:    v_and_b32_e32 v4, 15, v4
716; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
717; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
718; VI-NEXT:    v_or_b32_e32 v0, v0, v1
719; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
720; VI-NEXT:    s_setpc_b64 s[30:31]
721;
722; GFX9-LABEL: v_fshr_v2i16:
723; GFX9:       ; %bb.0:
724; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
725; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
726; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
727; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
728; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
729; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
730; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
731; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
732; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
733; GFX9-NEXT:    s_setpc_b64 s[30:31]
734;
735; R600-LABEL: v_fshr_v2i16:
736; R600:       ; %bb.0:
737; R600-NEXT:    CF_END
738; R600-NEXT:    PAD
739;
740; GFX10-LABEL: v_fshr_v2i16:
741; GFX10:       ; %bb.0:
742; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
743; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
744; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
745; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
746; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
747; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
748; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
749; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
750; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
751; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
752; GFX10-NEXT:    s_setpc_b64 s[30:31]
753  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
754  ret <2 x i16> %ret
755}
756
; v_fshr_v3i16 - funnel shift right on <3 x i16> where both value operands and
; the shift amount are plain function arguments (nothing is constant).
; The per-target expectations below are autogenerated (see the NOTE on the
; first line of this file); refresh them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected code shows
;  - tahiti run (SI checks) - one v_alignbit_b32 per lane, fed with the second
;    operand shifted left by 16 and the amount OR'd with 16, followed by
;    repacking of the 16-bit results.
;  - tonga, gfx900 and gfx1010 runs - 16-bit shifts per lane, first operand
;    shifted left by 1 and then by (~amount & 15), second operand shifted
;    right by (amount & 15), the two halves OR'd together.
;  - redwood run (R600 checks) - only CF_END and PAD are expected.
757define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
758; SI-LABEL: v_fshr_v3i16:
759; SI:       ; %bb.0:
760; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
761; SI-NEXT:    v_or_b32_e32 v7, 16, v7
762; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
763; SI-NEXT:    v_alignbit_b32 v1, v1, v4, v7
764; SI-NEXT:    v_or_b32_e32 v4, 16, v6
765; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
766; SI-NEXT:    v_alignbit_b32 v0, v0, v3, v4
767; SI-NEXT:    s_mov_b32 s4, 0xffff
768; SI-NEXT:    v_or_b32_e32 v3, 16, v8
769; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
770; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
771; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
772; SI-NEXT:    v_and_b32_e32 v0, s4, v0
773; SI-NEXT:    v_or_b32_e32 v0, v0, v1
774; SI-NEXT:    v_and_b32_e32 v2, s4, v3
775; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
776; SI-NEXT:    s_setpc_b64 s[30:31]
777;
778; VI-LABEL: v_fshr_v3i16:
779; VI:       ; %bb.0:
780; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
781; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
782; VI-NEXT:    v_and_b32_e32 v7, 15, v6
783; VI-NEXT:    v_mov_b32_e32 v8, 1
784; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
785; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
786; VI-NEXT:    v_and_b32_e32 v6, 15, v6
787; VI-NEXT:    v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
788; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
789; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
790; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
791; VI-NEXT:    v_and_b32_e32 v5, 15, v5
792; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
793; VI-NEXT:    v_and_b32_e32 v7, 15, v7
794; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
795; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
796; VI-NEXT:    v_or_b32_e32 v1, v1, v3
797; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
798; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
799; VI-NEXT:    v_and_b32_e32 v3, 15, v3
800; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
801; VI-NEXT:    v_and_b32_e32 v3, 15, v4
802; VI-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
803; VI-NEXT:    v_or_b32_e32 v0, v0, v2
804; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
805; VI-NEXT:    s_setpc_b64 s[30:31]
806;
807; GFX9-LABEL: v_fshr_v3i16:
808; GFX9:       ; %bb.0:
809; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
810; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
811; GFX9-NEXT:    v_and_b32_e32 v7, 15, v6
812; GFX9-NEXT:    v_mov_b32_e32 v8, 1
813; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
814; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
815; GFX9-NEXT:    v_and_b32_e32 v6, 15, v6
816; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
817; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
818; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
819; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
820; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
821; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
822; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
823; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
824; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
825; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
826; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
827; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
828; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
829; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
830; GFX9-NEXT:    v_and_b32_e32 v3, 15, v4
831; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
832; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
833; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
834; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
835; GFX9-NEXT:    s_setpc_b64 s[30:31]
836;
837; R600-LABEL: v_fshr_v3i16:
838; R600:       ; %bb.0:
839; R600-NEXT:    CF_END
840; R600-NEXT:    PAD
841;
842; GFX10-LABEL: v_fshr_v3i16:
843; GFX10:       ; %bb.0:
844; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
845; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
846; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
847; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
848; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
849; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
850; GFX10-NEXT:    v_and_b32_e32 v4, 15, v4
851; GFX10-NEXT:    v_and_b32_e32 v8, 15, v8
852; GFX10-NEXT:    v_and_b32_e32 v9, 15, v6
853; GFX10-NEXT:    v_xor_b32_e32 v6, -1, v6
854; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
855; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
856; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
857; GFX10-NEXT:    v_lshlrev_b16 v10, 1, v10
858; GFX10-NEXT:    v_and_b32_e32 v6, 15, v6
859; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v5
860; GFX10-NEXT:    v_lshrrev_b16 v4, v9, v7
861; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
862; GFX10-NEXT:    v_and_b32_e32 v2, 15, v5
863; GFX10-NEXT:    v_lshlrev_b16 v6, v6, v10
864; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
865; GFX10-NEXT:    v_and_b32_e32 v7, 15, v11
866; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
867; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v3
868; GFX10-NEXT:    v_or_b32_e32 v4, v6, v4
869; GFX10-NEXT:    v_lshlrev_b16 v1, v7, v1
870; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
871; GFX10-NEXT:    v_or_b32_e32 v1, v1, v2
872; GFX10-NEXT:    s_setpc_b64 s[30:31]
; Lane-wise fshr with all three operands forwarded straight from the arguments.
873  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
874  ret <3 x i16> %ret
875}
876
; v_fshr_v4i16 - funnel shift right on <4 x i16> with both value operands and
; the shift amount supplied as function arguments.
; The per-target expectations below are autogenerated (see the NOTE on the
; first line of this file); refresh them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected code shows
;  - tahiti run (SI checks) - four v_alignbit_b32, one per lane, each fed with
;    the second operand shifted left by 16 and the amount OR'd with 16, then
;    the lanes are repacked with a 0xffff mask.
;  - tonga, gfx900 and gfx1010 runs - per-lane 16-bit shifts, first operand
;    shifted left by 1 and then by (~amount & 15), second operand shifted
;    right by (amount & 15), halves OR'd and packed two lanes per register.
;  - redwood run (R600 checks) - only CF_END and PAD are expected.
877define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
878; SI-LABEL: v_fshr_v4i16:
879; SI:       ; %bb.0:
880; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
881; SI-NEXT:    v_or_b32_e32 v9, 16, v9
882; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
883; SI-NEXT:    v_alignbit_b32 v1, v1, v5, v9
884; SI-NEXT:    v_or_b32_e32 v5, 16, v8
885; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
886; SI-NEXT:    v_alignbit_b32 v0, v0, v4, v5
887; SI-NEXT:    v_or_b32_e32 v4, 16, v11
888; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
889; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
890; SI-NEXT:    v_or_b32_e32 v4, 16, v10
891; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
892; SI-NEXT:    s_mov_b32 s4, 0xffff
893; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
894; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
895; SI-NEXT:    v_and_b32_e32 v2, s4, v2
896; SI-NEXT:    v_or_b32_e32 v2, v2, v3
897; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
898; SI-NEXT:    v_and_b32_e32 v0, s4, v0
899; SI-NEXT:    v_or_b32_e32 v0, v0, v1
900; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
901; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
902; SI-NEXT:    s_setpc_b64 s[30:31]
903;
904; VI-LABEL: v_fshr_v4i16:
905; VI:       ; %bb.0:
906; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
907; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
908; VI-NEXT:    v_and_b32_e32 v7, 15, v6
909; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
910; VI-NEXT:    v_mov_b32_e32 v8, 1
911; VI-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
912; VI-NEXT:    v_and_b32_e32 v6, 15, v6
913; VI-NEXT:    v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
914; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
915; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
916; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
917; VI-NEXT:    v_and_b32_e32 v9, 15, v7
918; VI-NEXT:    v_xor_b32_e32 v7, -1, v7
919; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
920; VI-NEXT:    v_and_b32_e32 v7, 15, v7
921; VI-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
922; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
923; VI-NEXT:    v_and_b32_e32 v5, 15, v5
924; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
925; VI-NEXT:    v_and_b32_e32 v8, 15, v8
926; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
927; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
928; VI-NEXT:    v_or_b32_e32 v1, v1, v3
929; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
930; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
931; VI-NEXT:    v_and_b32_e32 v3, 15, v3
932; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
933; VI-NEXT:    v_and_b32_e32 v3, 15, v4
934; VI-NEXT:    v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
935; VI-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
936; VI-NEXT:    v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
937; VI-NEXT:    v_or_b32_e32 v0, v0, v2
938; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
939; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
940; VI-NEXT:    s_setpc_b64 s[30:31]
941;
942; GFX9-LABEL: v_fshr_v4i16:
943; GFX9:       ; %bb.0:
944; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
945; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
946; GFX9-NEXT:    v_and_b32_e32 v7, 15, v6
947; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
948; GFX9-NEXT:    v_mov_b32_e32 v8, 1
949; GFX9-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
950; GFX9-NEXT:    v_and_b32_e32 v6, 15, v6
951; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
952; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
953; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
954; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
955; GFX9-NEXT:    v_and_b32_e32 v9, 15, v7
956; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v7
957; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
958; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
959; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
960; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v5
961; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
962; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
963; GFX9-NEXT:    v_and_b32_e32 v8, 15, v8
964; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
965; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
966; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
967; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
968; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
969; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
970; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
971; GFX9-NEXT:    v_and_b32_e32 v3, 15, v4
972; GFX9-NEXT:    v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
973; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
974; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
975; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
976; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
977; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
978; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
979; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
980; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
981; GFX9-NEXT:    s_setpc_b64 s[30:31]
982;
983; R600-LABEL: v_fshr_v4i16:
984; R600:       ; %bb.0:
985; R600-NEXT:    CF_END
986; R600-NEXT:    PAD
987;
988; GFX10-LABEL: v_fshr_v4i16:
989; GFX10:       ; %bb.0:
990; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
991; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
992; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
993; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
994; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
995; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
996; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
997; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v6
998; GFX10-NEXT:    v_and_b32_e32 v6, 15, v6
999; GFX10-NEXT:    v_lshlrev_b16 v8, 1, v8
1000; GFX10-NEXT:    v_and_b32_e32 v13, 15, v10
1001; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
1002; GFX10-NEXT:    v_and_b32_e32 v9, 15, v9
1003; GFX10-NEXT:    v_lshrrev_b16 v6, v6, v7
1004; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
1005; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
1006; GFX10-NEXT:    v_lshlrev_b16 v11, 1, v11
1007; GFX10-NEXT:    v_lshlrev_b16 v7, v9, v8
1008; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
1009; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v10
1010; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v5
1011; GFX10-NEXT:    v_and_b32_e32 v4, 15, v4
1012; GFX10-NEXT:    v_and_b32_e32 v5, 15, v5
1013; GFX10-NEXT:    v_and_b32_e32 v8, 15, v8
1014; GFX10-NEXT:    v_and_b32_e32 v9, 15, v9
1015; GFX10-NEXT:    v_and_b32_e32 v10, 15, v10
1016; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
1017; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
1018; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
1019; GFX10-NEXT:    v_lshrrev_b16 v4, v13, v12
1020; GFX10-NEXT:    v_lshlrev_b16 v1, v10, v1
1021; GFX10-NEXT:    v_lshlrev_b16 v5, v9, v11
1022; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
1023; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
1024; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
1025; GFX10-NEXT:    v_or_b32_e32 v3, v7, v6
1026; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
1027; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
1028; GFX10-NEXT:    v_and_b32_e32 v1, v2, v1
1029; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
1030; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
1031; GFX10-NEXT:    s_setpc_b64 s[30:31]
; Lane-wise fshr with all three operands forwarded straight from the arguments.
1032  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
1033  ret <4 x i16> %ret
1034}
1035
; v_fshr_i64 - scalar 64-bit funnel shift right with both value operands and
; the shift amount supplied as function arguments.
; The per-target expectations below are autogenerated (see the NOTE on the
; first line of this file); refresh them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected code shows
;  - every amdgcn run expects the same shape - the first operand is shifted
;    left by 1 and then by (~amount & 63), the second operand is shifted right
;    by (amount & 63), and the two 64-bit results are OR'd one 32-bit half at
;    a time.
;  - the tahiti run uses v_lshl_b64 / v_lshr_b64 while the later targets use
;    the v_lshlrev_b64 / v_lshrrev_b64 forms with the operands swapped.
;  - redwood run (R600 checks) - only CF_END and PAD are expected.
1036define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
1037; SI-LABEL: v_fshr_i64:
1038; SI:       ; %bb.0:
1039; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1040; SI-NEXT:    v_and_b32_e32 v5, 63, v4
1041; SI-NEXT:    v_not_b32_e32 v4, v4
1042; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
1043; SI-NEXT:    v_and_b32_e32 v4, 63, v4
1044; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v5
1045; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
1046; SI-NEXT:    v_or_b32_e32 v1, v1, v3
1047; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1048; SI-NEXT:    s_setpc_b64 s[30:31]
1049;
1050; VI-LABEL: v_fshr_i64:
1051; VI:       ; %bb.0:
1052; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1053; VI-NEXT:    v_and_b32_e32 v5, 63, v4
1054; VI-NEXT:    v_not_b32_e32 v4, v4
1055; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1056; VI-NEXT:    v_and_b32_e32 v4, 63, v4
1057; VI-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
1058; VI-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
1059; VI-NEXT:    v_or_b32_e32 v1, v1, v3
1060; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1061; VI-NEXT:    s_setpc_b64 s[30:31]
1062;
1063; GFX9-LABEL: v_fshr_i64:
1064; GFX9:       ; %bb.0:
1065; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1066; GFX9-NEXT:    v_and_b32_e32 v5, 63, v4
1067; GFX9-NEXT:    v_not_b32_e32 v4, v4
1068; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1069; GFX9-NEXT:    v_and_b32_e32 v4, 63, v4
1070; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
1071; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
1072; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
1073; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
1074; GFX9-NEXT:    s_setpc_b64 s[30:31]
1075;
1076; R600-LABEL: v_fshr_i64:
1077; R600:       ; %bb.0:
1078; R600-NEXT:    CF_END
1079; R600-NEXT:    PAD
1080;
1081; GFX10-LABEL: v_fshr_i64:
1082; GFX10:       ; %bb.0:
1083; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1084; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1085; GFX10-NEXT:    v_not_b32_e32 v5, v4
1086; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1087; GFX10-NEXT:    v_and_b32_e32 v4, 63, v4
1088; GFX10-NEXT:    v_and_b32_e32 v5, 63, v5
1089; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1090; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
1091; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
1092; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
1093; GFX10-NEXT:    s_setpc_b64 s[30:31]
; 64-bit fshr with all three operands forwarded straight from the arguments.
1094  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
1095  ret i64 %ret
1096}
1097
; v_fshr_v2i64 - funnel shift right on <2 x i64> with both value operands and
; the shift amount supplied as function arguments.
; The per-target expectations below are autogenerated (see the NOTE on the
; first line of this file); refresh them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected code shows
;  - every amdgcn run expects, for each of the two lanes, the first operand
;    shifted left by 1 and then by (~amount & 63), the second operand shifted
;    right by (amount & 63), and the 64-bit results OR'd one 32-bit half at a
;    time.
;  - only the low 32-bit register of each 64-bit amount lane (v8 and v10) is
;    read by the expected code.
;  - redwood run (R600 checks) - only CF_END and PAD are expected.
1098define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1099; SI-LABEL: v_fshr_v2i64:
1100; SI:       ; %bb.0:
1101; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102; SI-NEXT:    v_and_b32_e32 v9, 63, v8
1103; SI-NEXT:    v_not_b32_e32 v8, v8
1104; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
1105; SI-NEXT:    v_and_b32_e32 v8, 63, v8
1106; SI-NEXT:    v_lshr_b64 v[4:5], v[4:5], v9
1107; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
1108; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1109; SI-NEXT:    v_or_b32_e32 v1, v1, v5
1110; SI-NEXT:    v_and_b32_e32 v5, 63, v10
1111; SI-NEXT:    v_lshr_b64 v[5:6], v[6:7], v5
1112; SI-NEXT:    v_not_b32_e32 v7, v10
1113; SI-NEXT:    v_and_b32_e32 v7, 63, v7
1114; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
1115; SI-NEXT:    v_or_b32_e32 v0, v0, v4
1116; SI-NEXT:    v_or_b32_e32 v3, v3, v6
1117; SI-NEXT:    v_or_b32_e32 v2, v2, v5
1118; SI-NEXT:    s_setpc_b64 s[30:31]
1119;
1120; VI-LABEL: v_fshr_v2i64:
1121; VI:       ; %bb.0:
1122; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1123; VI-NEXT:    v_and_b32_e32 v9, 63, v8
1124; VI-NEXT:    v_not_b32_e32 v8, v8
1125; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1126; VI-NEXT:    v_and_b32_e32 v8, 63, v8
1127; VI-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
1128; VI-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1129; VI-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1130; VI-NEXT:    v_or_b32_e32 v1, v1, v5
1131; VI-NEXT:    v_and_b32_e32 v5, 63, v10
1132; VI-NEXT:    v_lshrrev_b64 v[5:6], v5, v[6:7]
1133; VI-NEXT:    v_not_b32_e32 v7, v10
1134; VI-NEXT:    v_and_b32_e32 v7, 63, v7
1135; VI-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1136; VI-NEXT:    v_or_b32_e32 v0, v0, v4
1137; VI-NEXT:    v_or_b32_e32 v3, v3, v6
1138; VI-NEXT:    v_or_b32_e32 v2, v2, v5
1139; VI-NEXT:    s_setpc_b64 s[30:31]
1140;
1141; GFX9-LABEL: v_fshr_v2i64:
1142; GFX9:       ; %bb.0:
1143; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1144; GFX9-NEXT:    v_and_b32_e32 v9, 63, v8
1145; GFX9-NEXT:    v_not_b32_e32 v8, v8
1146; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1147; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
1148; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
1149; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1150; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1151; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
1152; GFX9-NEXT:    v_and_b32_e32 v5, 63, v10
1153; GFX9-NEXT:    v_lshrrev_b64 v[5:6], v5, v[6:7]
1154; GFX9-NEXT:    v_not_b32_e32 v7, v10
1155; GFX9-NEXT:    v_and_b32_e32 v7, 63, v7
1156; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1157; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
1158; GFX9-NEXT:    v_or_b32_e32 v3, v3, v6
1159; GFX9-NEXT:    v_or_b32_e32 v2, v2, v5
1160; GFX9-NEXT:    s_setpc_b64 s[30:31]
1161;
1162; R600-LABEL: v_fshr_v2i64:
1163; R600:       ; %bb.0:
1164; R600-NEXT:    CF_END
1165; R600-NEXT:    PAD
1166;
1167; GFX10-LABEL: v_fshr_v2i64:
1168; GFX10:       ; %bb.0:
1169; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1170; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1171; GFX10-NEXT:    v_not_b32_e32 v9, v8
1172; GFX10-NEXT:    v_not_b32_e32 v11, v10
1173; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1174; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1175; GFX10-NEXT:    v_and_b32_e32 v8, 63, v8
1176; GFX10-NEXT:    v_and_b32_e32 v9, 63, v9
1177; GFX10-NEXT:    v_and_b32_e32 v10, 63, v10
1178; GFX10-NEXT:    v_and_b32_e32 v11, 63, v11
1179; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1180; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1181; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
1182; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
1183; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
1184; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
1185; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
1186; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
1187; GFX10-NEXT:    s_setpc_b64 s[30:31]
; Lane-wise fshr with all three operands forwarded straight from the arguments.
1188  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
1189  ret <2 x i64> %ret
1190}
1191
; v_fshr_i24 - scalar funnel shift right on the non-power-of-two type i24 with
; both value operands and the shift amount supplied as function arguments.
; The per-target expectations below are autogenerated (see the NOTE on the
; first line of this file); refresh them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected code shows
;  - every amdgcn run reduces the amount to its remainder modulo 24 without a
;    divide instruction - multiply-high by 0xaaaaaaab, shift right by 4,
;    multiply by 24, subtract from the original amount.
;  - 8 is then added to that remainder and a single v_alignbit_b32 combines
;    the first operand with the second operand shifted left by 8.
;  - the gfx1010 run folds 0xaaaaaaab into v_mul_hi_u32 as a literal, while
;    the other amdgcn runs first move it into s4.
;  - redwood run (R600 checks) - only CF_END and PAD are expected.
1192define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1193; SI-LABEL: v_fshr_i24:
1194; SI:       ; %bb.0:
1195; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1196; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1197; SI-NEXT:    v_mul_hi_u32 v3, v2, s4
1198; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1199; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1200; SI-NEXT:    v_mul_lo_u32 v3, v3, 24
1201; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1202; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
1203; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1204; SI-NEXT:    s_setpc_b64 s[30:31]
1205;
1206; VI-LABEL: v_fshr_i24:
1207; VI:       ; %bb.0:
1208; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1209; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1210; VI-NEXT:    v_mul_hi_u32 v3, v2, s4
1211; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1212; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1213; VI-NEXT:    v_mul_lo_u32 v3, v3, 24
1214; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1215; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
1216; VI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1217; VI-NEXT:    s_setpc_b64 s[30:31]
1218;
1219; GFX9-LABEL: v_fshr_i24:
1220; GFX9:       ; %bb.0:
1221; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1222; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1223; GFX9-NEXT:    v_mul_hi_u32 v3, v2, s4
1224; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1225; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1226; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
1227; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
1228; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
1229; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1230; GFX9-NEXT:    s_setpc_b64 s[30:31]
1231;
1232; R600-LABEL: v_fshr_i24:
1233; R600:       ; %bb.0:
1234; R600-NEXT:    CF_END
1235; R600-NEXT:    PAD
1236;
1237; GFX10-LABEL: v_fshr_i24:
1238; GFX10:       ; %bb.0:
1239; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1240; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1241; GFX10-NEXT:    v_mul_hi_u32 v3, 0xaaaaaaab, v2
1242; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1243; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1244; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 24
1245; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
1246; GFX10-NEXT:    v_add_nc_u32_e32 v2, 8, v2
1247; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1248; GFX10-NEXT:    s_setpc_b64 s[30:31]
; 24-bit fshr with all three operands forwarded straight from the arguments.
1249  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
1250  ret i24 %ret
1251}
1252
; v_fshr_v2i24 - funnel shift right on <2 x i24> (non-power-of-two element
; type) with both value operands and the shift amount supplied as function
; arguments.
; The per-target expectations below are autogenerated (see the NOTE on the
; first line of this file); refresh them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected code shows
;  - every amdgcn run handles the two lanes independently and reduces each
;    amount to its remainder modulo 24 without a divide instruction -
;    multiply-high by 0xaaaaaaab (held in s4 and shared by both lanes), shift
;    right by 4, multiply by 24, subtract from the original amount.
;  - 8 is then added to each remainder and one v_alignbit_b32 per lane
;    combines the first operand with the second operand shifted left by 8.
;  - redwood run (R600 checks) - only CF_END and PAD are expected.
1253define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1254; SI-LABEL: v_fshr_v2i24:
1255; SI:       ; %bb.0:
1256; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1257; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1258; SI-NEXT:    v_mul_hi_u32 v6, v4, s4
1259; SI-NEXT:    v_mul_hi_u32 v7, v5, s4
1260; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1261; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1262; SI-NEXT:    v_mul_lo_u32 v6, v6, 24
1263; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
1264; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v7
1265; SI-NEXT:    v_mul_lo_u32 v6, v6, 24
1266; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v4
1267; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1268; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1269; SI-NEXT:    v_sub_i32_e32 v3, vcc, v5, v6
1270; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
1271; SI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1272; SI-NEXT:    s_setpc_b64 s[30:31]
1273;
1274; VI-LABEL: v_fshr_v2i24:
1275; VI:       ; %bb.0:
1276; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1277; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1278; VI-NEXT:    v_mul_hi_u32 v6, v4, s4
1279; VI-NEXT:    v_mul_hi_u32 v7, v5, s4
1280; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1281; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1282; VI-NEXT:    v_mul_lo_u32 v6, v6, 24
1283; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
1284; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v7
1285; VI-NEXT:    v_mul_lo_u32 v6, v6, 24
1286; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
1287; VI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1288; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1289; VI-NEXT:    v_sub_u32_e32 v3, vcc, v5, v6
1290; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
1291; VI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1292; VI-NEXT:    s_setpc_b64 s[30:31]
1293;
1294; GFX9-LABEL: v_fshr_v2i24:
1295; GFX9:       ; %bb.0:
1296; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1297; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1298; GFX9-NEXT:    v_mul_hi_u32 v6, v4, s4
1299; GFX9-NEXT:    v_mul_hi_u32 v7, v5, s4
1300; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1301; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1302; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
1303; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
1304; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v7
1305; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
1306; GFX9-NEXT:    v_add_u32_e32 v4, 8, v4
1307; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1308; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1309; GFX9-NEXT:    v_sub_u32_e32 v3, v5, v6
1310; GFX9-NEXT:    v_add_u32_e32 v3, 8, v3
1311; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1312; GFX9-NEXT:    s_setpc_b64 s[30:31]
1313;
1314; R600-LABEL: v_fshr_v2i24:
1315; R600:       ; %bb.0:
1316; R600-NEXT:    CF_END
1317; R600-NEXT:    PAD
1318;
1319; GFX10-LABEL: v_fshr_v2i24:
1320; GFX10:       ; %bb.0:
1321; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1322; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1323; GFX10-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1324; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1325; GFX10-NEXT:    v_mul_hi_u32 v6, v4, s4
1326; GFX10-NEXT:    v_mul_hi_u32 v7, v5, s4
1327; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1328; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1329; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
1330; GFX10-NEXT:    v_mul_lo_u32 v6, v6, 24
1331; GFX10-NEXT:    v_mul_lo_u32 v7, v7, 24
1332; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
1333; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
1334; GFX10-NEXT:    v_add_nc_u32_e32 v4, 8, v4
1335; GFX10-NEXT:    v_add_nc_u32_e32 v5, 8, v5
1336; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1337; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
1338; GFX10-NEXT:    s_setpc_b64 s[30:31]
; Lane-wise fshr with all three operands forwarded straight from the arguments.
1339  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
1340  ret <2 x i24> %ret
1341}
1342