; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s

; shl of two <2 x i16> kernel arguments, so both operands are uniform.
; The gfx900 and gfx1010 runs expect a single v_pk_lshlrev_b16. The tonga and
; bonaire runs expect each 16-bit half to be shifted with s_lshl_b32 and the
; halves to be re-packed with s_and_b32 / s_or_b32.
define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
; GFX9-LABEL: s_shl_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x30
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: s_shl_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
; VI-NEXT:    s_mov_b32 s3, 0xffff
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s2, 16
; VI-NEXT:    s_lshr_b32 s8, s0, 16
; VI-NEXT:    s_and_b32 s2, s2, s3
; VI-NEXT:    s_and_b32 s0, s0, s3
; VI-NEXT:    s_lshl_b32 s0, s2, s0
; VI-NEXT:    s_lshl_b32 s1, s1, s8
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, s3
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; CI-LABEL: s_shl_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; CI-NEXT:    s_load_dword s2, s[0:1], 0xb
; CI-NEXT:    s_load_dword s0, s[0:1], 0xc
; CI-NEXT:    s_mov_b32 s3, 0xffff
; CI-NEXT:    s_mov_b32 s7, 0xf000
; CI-NEXT:    s_mov_b32 s6, -1
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_lshr_b32 s1, s2, 16
; CI-NEXT:    s_and_b32 s8, s0, s3
; CI-NEXT:    s_lshr_b32 s0, s0, 16
; CI-NEXT:    s_lshl_b32 s0, s1, s0
; CI-NEXT:    s_lshl_b32 s1, s2, s8
; CI-NEXT:    s_lshl_b32 s0, s0, 16
; CI-NEXT:    s_and_b32 s1, s1, s3
; CI-NEXT:    s_or_b32 s0, s1, s0
; CI-NEXT:    v_mov_b32_e32 v0, s0
; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: s_shl_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_clause 0x2
; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT:    s_load_dword s3, s[0:1], 0x30
; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
  %result = shl <2 x i16> %lhs, %rhs
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; shl where both <2 x i16> operands are loaded from adjacent elements of %in,
; indexed by the workitem id. The gfx900 and gfx1010 runs expect one
; v_pk_lshlrev_b16. The tonga run expects v_lshlrev_b16 for the low half plus
; its SDWA form for the high half. The bonaire run expects 32-bit shifts with
; mask-and-or repacking.
define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v5, v[0:1]
; VI-NEXT:    flat_load_dword v2, v[2:3]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v5
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_shl_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v5, s0, v3
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_lshl_b32_e32 v3, v4, v3
; CI-NEXT:    v_lshl_b32_e32 v2, v2, v5
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_shl_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    global_load_dword v2, v0, s[2:3] offset:4
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v2, v1
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = shl <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; shl of a loaded <2 x i16> value by a shift amount passed as a kernel
; argument (%sgpr). In the gfx900 and gfx1010 checks the scalar register is the
; first source operand of v_pk_lshlrev_b16. The tonga and bonaire checks split
; the scalar into its two 16-bit halves with s_lshr_b32 before shifting.
define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_v_s_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, s2, v1
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_s_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v4, s0, v3
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s8, 16
; CI-NEXT:    s_and_b32 s8, s8, s0
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT:    v_lshlrev_b32_e32 v2, s8, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, s1, v3
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT:    v_or_b32_e32 v2, v2, v3
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_v_s_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, s0, v1
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, %sgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; shl of a kernel-argument <2 x i16> value (%sgpr) by a shift amount loaded
; from memory, i.e. the operand roles of shl_v_s_v2i16 swapped. In the gfx900
; and gfx1010 checks the scalar register is the second source operand of
; v_pk_lshlrev_b16.
define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
; GFX9-LABEL: shl_s_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, s2
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_s_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT:    v_mov_b32_e32 v2, s1
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, s0
; VI-NEXT:    v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v4, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_load_dword s8, s[0:1], 0xd
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_lshr_b32 s1, s8, 16
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v3, s0, v2
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_lshl_b32_e32 v2, s1, v2
; CI-NEXT:    v_lshl_b32_e32 v3, s8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_s_v_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, s0
; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %sgpr, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; shl of the constant vector <8, 8> by a shift amount loaded from memory.
; The gfx900 and gfx1010 checks expect the constant as an inline operand of
; v_pk_lshlrev_b16 with op_sel_hi:[1,0]. The tonga check materialises 8 in a
; VGPR for the SDWA high-half shift.
define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    v_mov_b32_e32 v4, 8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e64 v2, v3, 8
; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v3, 0xffff, v2
; CI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_lshl_b32_e32 v2, 8, v2
; CI-NEXT:    v_lshl_b32_e32 v3, 8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT:    v_and_b32_e32 v3, 0xfff8, v3
; CI-NEXT:    v_or_b32_e32 v2, v3, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_imm_v_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; shl of a loaded <2 x i16> value by the constant amount <8, 8>.
; The gfx900 and gfx1010 checks expect v_pk_lshlrev_b16 with an inline 8 and
; op_sel_hi:[0,1]. The bonaire check expects one 32-bit shift of the whole
; dword followed by a 0xff00ff00 mask. The tonga check combines a masked 32-bit
; shift for the high half with a 16-bit shift for the low half.
define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
; VI-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT:    v_or_b32_e32 v2, v3, v2
; VI-NEXT:    flat_store_dword v[0:1], v2
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_v_imm_v2i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
  ret void
}

; <4 x i16> variant of the all-vector-operand shl: both operands are loaded
; from adjacent <4 x i16> elements of %in, indexed by the workitem id.
; The gfx900 and gfx1010 checks expect two v_pk_lshlrev_b16, one per dword.
; The tonga check expects a v_lshlrev_b16 / SDWA pair per dword. The bonaire
; check expects four 32-bit shifts with mask-and-or repacking.
define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT:    v_mov_b32_e32 v5, s1
; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_e32 v1, v6, v1
; VI-NEXT:    v_or_b32_e32 v0, v3, v0
; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: v_shl_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT:    s_mov_b32 s0, 0xffff
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(1)
; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_and_b32_e32 v8, s0, v4
; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_and_b32_e32 v9, s0, v5
; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
; CI-NEXT:    v_lshl_b32_e32 v5, v7, v5
; CI-NEXT:    v_lshl_b32_e32 v3, v3, v9
; CI-NEXT:    v_lshl_b32_e32 v4, v6, v4
; CI-NEXT:    v_lshl_b32_e32 v2, v2, v8
; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_and_b32_e32 v2, s0, v2
; CI-NEXT:    v_or_b32_e32 v3, v3, v5
; CI-NEXT:    v_or_b32_e32 v2, v2, v4
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: v_shl_v4i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[2:3] offset:8
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = shl <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; shl of a loaded <4 x i16> value by the constant amount <8, 8, 8, 8>.
; The gfx900 and gfx1010 checks expect two v_pk_lshlrev_b16 with an inline 8
; and op_sel_hi:[0,1]. The tonga and bonaire checks expect 32-bit shifts
; combined with byte masks (0xff000000, 0xff00, 0xff00ff00) to keep each
; 16-bit lane's result inside its own half.
define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; VI-LABEL: shl_v_imm_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT:    s_mov_b32 s2, 0xff000000
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
; VI-NEXT:    v_and_b32_e32 v0, s2, v0
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT:    v_and_b32_e32 v4, s2, v4
; VI-NEXT:    v_or_b32_e32 v1, v1, v4
; VI-NEXT:    v_or_b32_e32 v0, v5, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; CI-LABEL: shl_v_imm_v4i16:
; CI:       ; %bb.0:
; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT:    s_mov_b32 s3, 0xf000
; CI-NEXT:    s_mov_b32 s2, 0
; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; CI-NEXT:    v_mov_b32_e32 v1, 0
; CI-NEXT:    s_waitcnt lgkmcnt(0)
; CI-NEXT:    s_mov_b64 s[0:1], s[6:7]
; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT:    s_mov_b32 s0, 0xff00
; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
; CI-NEXT:    s_waitcnt vmcnt(0)
; CI-NEXT:    v_lshrrev_b32_e32 v4, 8, v3
; CI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; CI-NEXT:    v_and_b32_e32 v4, s0, v4
; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT:    v_and_b32_e32 v3, s0, v3
; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT:    v_or_b32_e32 v3, v3, v4
; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT:    s_endpgm
;
; GFX10-LABEL: shl_v_imm_v4i16:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
  %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
  ret void
}

; Workitem-id intrinsic called by the kernels in this file that index their
; input and output pointers per workitem.
declare i32 @llvm.amdgcn.workitem.id.x() #1

; Attribute group #0 is attached to every kernel definition in this file and
; group #1 to the intrinsic declaration.
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }