; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; Baseline case: %a, %b and %c are plain function arguments and the clamp
; immarg is false. The checks expect one v_dot2_i32_i16 reading v0, v1, v2
; with no modifiers on every tested subtarget.
define i32 @v_sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Same operands as the baseline case but with the clamp immarg set to true.
; The checks expect the `clamp` modifier on the selected v_dot2_i32_i16.
define i32 @v_sdot2_clamp(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_clamp:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_clamp:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_clamp:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 clamp
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 true)
  ret i32 %r
}

; amdgpu_ps variant where all three operands are `inreg` (arriving in s0-s2).
; The i32 result is bitcast to float for the return value.
; Expected output: gfx906/gfx908 copy s1 and s2 into VGPRs and pass only s0
; directly, while gfx10 passes s0 and s1 directly and copies only s2.
; NOTE(review): presumably this reflects differing scalar-operand (constant
; bus) limits between the subtargets - confirm against the target docs.
define amdgpu_ps float @v_sdot2_sgpr_sgpr_sgpr(<2 x i16> inreg %a, <2 x i16> inreg %b, i32 inreg %c) {
; GFX906-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    v_mov_b32_e32 v0, s1
; GFX906-NEXT:    v_mov_b32_e32 v1, s2
; GFX906-NEXT:    v_dot2_i32_i16 v0, s0, v0, v1
; GFX906-NEXT:    ; return to shader part epilog
;
; GFX908-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    v_mov_b32_e32 v0, s1
; GFX908-NEXT:    v_mov_b32_e32 v1, s2
; GFX908-NEXT:    v_dot2_i32_i16 v0, s0, v0, v1
; GFX908-NEXT:    ; return to shader part epilog
;
; GFX10-LABEL: v_sdot2_sgpr_sgpr_sgpr:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v0, s2
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, s0, s1, v0
; GFX10-NEXT:    ; return to shader part epilog
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 false)
  %cast = bitcast i32 %r to float
  ret float %cast
}

; Operand a is the constant vector <4, 4>; b and c are function arguments.
; The checks expect the constant to be built with s_pack_ll_b32_b16 into s4
; and used as the first source of v_dot2_i32_i16.
define i32 @v_sdot2_inline_literal_a(<2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 4, i16 4>, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Operand b is the constant vector <4, 4>; a and c are function arguments.
; The checks expect the constant to be built with s_pack_ll_b32_b16 into s4
; and used as the second source of v_dot2_i32_i16.
define i32 @v_sdot2_inline_literal_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, s4, v1
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, s4, v1
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 4, 4
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, s4, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
  ret i32 %r
}

; Both vector operands are constants (a = <8, 8>, b = <4, 4>); only %c is read
; from the arguments. The %a parameter is declared but never referenced in
; the body.
; Expected output: gfx906/gfx908 copy the packed b constant into v0 before the
; dot instruction, while gfx10 feeds both packed SGPRs directly.
define i32 @v_sdot2_inline_literal_a_b(<2 x i16> %a, i32 %c) {
; GFX906-LABEL: v_sdot2_inline_literal_a_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT:    v_mov_b32_e32 v0, s5
; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT:    v_mov_b32_e32 v0, s5
; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, v1
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, s4, s5, v1
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 %c, i1 false)
  ret i32 %r
}

; All three operands are constants: a = <8, 8>, b = <4, 4>, c = 8.
; The checks expect the vector constants to be packed into SGPRs and the
; scalar 8 to appear directly as the third source operand.
define i32 @v_sdot2_inline_literal_a_b_c() {
; GFX906-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX906-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX906-NEXT:    v_mov_b32_e32 v0, s5
; GFX906-NEXT:    v_dot2_i32_i16 v0, s4, v0, 8
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX908-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX908-NEXT:    v_mov_b32_e32 v0, s5
; GFX908-NEXT:    v_dot2_i32_i16 v0, s4, v0, 8
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_a_b_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_pack_ll_b32_b16 s4, 8, 8
; GFX10-NEXT:    s_pack_ll_b32_b16 s5, 4, 4
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, s4, s5, 8
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> <i16 8, i16 8>, <2 x i16> <i16 4, i16 4>, i32 8, i1 false)
  ret i32 %r
}

; Only the accumulator operand is constant (c = 7); a and b are arguments.
; The checks expect 7 to be encoded directly as the third source operand,
; with no separate materialization.
define i32 @v_sdot2_inline_literal_c(<2 x i16> %a, <2 x i16> %b) {
; GFX906-LABEL: v_sdot2_inline_literal_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_inline_literal_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_inline_literal_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, 7
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 7, i1 false)
  ret i32 %r
}

; Operand a arrives as <2 x half>, is negated with fneg, and is then bitcast
; to <2 x i16> before being passed to sdot2.
; The checks expect the fneg to be folded into the dot instruction as
; neg_lo:[1,0,0] neg_hi:[1,0,0] rather than emitted as a separate instruction.
; NOTE(review): a floating-point sign flip is being folded into an integer dot
; product here - confirm the hardware honors neg modifiers on this opcode.
define i32 @v_sdot2_fneg_a(<2 x half> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_fneg_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fneg_a:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fneg_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.a = fneg <2 x half> %a
  %cast.neg.a = bitcast <2 x half> %neg.a to <2 x i16>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %cast.neg.a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Operand b arrives as <2 x half>, is negated with fneg, and is then bitcast
; to <2 x i16> before being passed to sdot2.
; The checks expect the fneg to be folded into the dot instruction as
; neg_lo:[0,1,0] neg_hi:[0,1,0] rather than emitted as a separate instruction.
; NOTE(review): a floating-point sign flip is being folded into an integer dot
; product here - confirm the hardware honors neg modifiers on this opcode.
define i32 @v_sdot2_fneg_b(<2 x i16> %a, <2 x half> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_fneg_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fneg_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fneg_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.b = fneg <2 x half> %b
  %cast.neg.b = bitcast <2 x half> %neg.b to <2 x i16>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %cast.neg.b, i32 %c, i1 false)
  ret i32 %r
}

; The accumulator arrives as float, is negated with fneg, and is then bitcast
; to i32 before being passed as operand c.
; Unlike the a/b fneg cases, the checks expect an explicit sign-bit flip
; (v_xor_b32 with 0x80000000) ahead of an unmodified v_dot2_i32_i16.
define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_sdot2_fnegf32_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegf32_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fnegf32_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.c = fneg float %c
  %cast.neg.c = bitcast float %neg.c to i32
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
  ret i32 %r
}

; The accumulator arrives as <2 x half>, is negated with fneg, and is then
; bitcast to i32 before being passed as operand c.
; The checks expect an explicit flip of both 16-bit sign bits (v_xor_b32 with
; 0x80008000) ahead of an unmodified v_dot2_i32_i16.
define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_sdot2_fnegv2f16_c:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegv2f16_c:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_xor_b32_e32 v2, 0x80008000, v2
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %neg.c = fneg <2 x half> %c
  %cast.neg.c = bitcast <2 x half> %neg.c to i32
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %cast.neg.c, i1 false)
  ret i32 %r
}

; Operand a has its two i16 lanes swapped (shuffle mask <1, 0>) before the
; call. The checks expect the swap to be a separate
; v_alignbit_b32 v0, v0, v0, 16 ahead of the dot instruction, not an operand
; modifier on the dot itself.
define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_a:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_a:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_shuffle10_a:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v0, 16
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
  ret i32 %r
}

; Operand b has its two i16 lanes swapped (shuffle mask <1, 0>) before the
; call. The checks expect the swap to be a separate
; v_alignbit_b32 v1, v1, v1, 16 ahead of the dot instruction, not an operand
; modifier on the dot itself.
define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_b:
; GFX906:       ; %bb.0:
; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT:    v_alignbit_b32 v1, v1, v1, 16
; GFX906-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX906-NEXT:    s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_b:
; GFX908:       ; %bb.0:
; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT:    v_alignbit_b32 v1, v1, v1, 16
; GFX908-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX908-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sdot2_shuffle10_b:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v1, 16
; GFX10-NEXT:    ; implicit-def: $vcc_hi
; GFX10-NEXT:    v_dot2_i32_i16 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
  %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
  ret i32 %r
}

; Intrinsic under test. The call sites in this file pass two <2 x i16> vector
; operands, an i32 accumulator, and an i1 flag that selects the `clamp`
; modifier. The flag is marked immarg, so it must be a constant at every call.
declare i32 @llvm.amdgcn.sdot2(<2 x i16>, <2 x i16>, i32, i1 immarg) #0

attributes #0 = { nounwind readnone speculatable }
