1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
5; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6
; Dynamic insertelement into <2 x i16> with every operand passed inreg: the
; vector is loaded through the inreg addrspace(4) pointer (s_load_dword in the
; checks below), %val and %idx are inreg, and the result is stored to a null
; addrspace(1) pointer.
; Per the checks, the insert is expected to stay on scalar instructions:
; bit offset = (%idx & 1) << 4, the 0xffff lane mask and the masked %val are
; shifted by that offset, and the loaded dword is combined with them using
; s_andn2_b32 followed by s_or_b32.
7define amdgpu_ps void @insertelement_s_v2i16_s_s(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
8; GFX9-LABEL: insertelement_s_v2i16_s_s:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
11; GFX9-NEXT:    s_and_b32 s1, s5, 1
12; GFX9-NEXT:    s_mov_b32 s2, 0xffff
13; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
14; GFX9-NEXT:    s_and_b32 s3, s4, s2
15; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
16; GFX9-NEXT:    s_lshl_b32 s1, s2, s1
17; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
18; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
19; GFX9-NEXT:    s_or_b32 s0, s0, s3
20; GFX9-NEXT:    v_mov_b32_e32 v0, 0
21; GFX9-NEXT:    v_mov_b32_e32 v1, 0
22; GFX9-NEXT:    v_mov_b32_e32 v2, s0
23; GFX9-NEXT:    global_store_dword v[0:1], v2, off
24; GFX9-NEXT:    s_endpgm
25;
26; GFX8-LABEL: insertelement_s_v2i16_s_s:
27; GFX8:       ; %bb.0:
28; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
29; GFX8-NEXT:    s_and_b32 s1, s5, 1
30; GFX8-NEXT:    s_mov_b32 s2, 0xffff
31; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
32; GFX8-NEXT:    s_and_b32 s3, s4, s2
33; GFX8-NEXT:    s_lshl_b32 s3, s3, s1
34; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
35; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
36; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
37; GFX8-NEXT:    s_or_b32 s0, s0, s3
38; GFX8-NEXT:    v_mov_b32_e32 v0, 0
39; GFX8-NEXT:    v_mov_b32_e32 v1, 0
40; GFX8-NEXT:    v_mov_b32_e32 v2, s0
41; GFX8-NEXT:    flat_store_dword v[0:1], v2
42; GFX8-NEXT:    s_endpgm
43;
44; GFX7-LABEL: insertelement_s_v2i16_s_s:
45; GFX7:       ; %bb.0:
46; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
47; GFX7-NEXT:    s_and_b32 s1, s5, 1
48; GFX7-NEXT:    s_mov_b32 s2, 0xffff
49; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
50; GFX7-NEXT:    s_and_b32 s3, s4, s2
51; GFX7-NEXT:    s_lshl_b32 s3, s3, s1
52; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
53; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
55; GFX7-NEXT:    s_or_b32 s0, s0, s3
56; GFX7-NEXT:    v_mov_b32_e32 v0, 0
57; GFX7-NEXT:    v_mov_b32_e32 v1, 0
58; GFX7-NEXT:    v_mov_b32_e32 v2, s0
59; GFX7-NEXT:    flat_store_dword v[0:1], v2
60; GFX7-NEXT:    s_endpgm
61;
62; GFX10-LABEL: insertelement_s_v2i16_s_s:
63; GFX10:       ; %bb.0:
64; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
65; GFX10-NEXT:    s_and_b32 s1, s5, 1
66; GFX10-NEXT:    s_mov_b32 s2, 0xffff
67; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
68; GFX10-NEXT:    s_and_b32 s3, s4, s2
69; GFX10-NEXT:    s_lshl_b32 s2, s2, s1
70; GFX10-NEXT:    s_lshl_b32 s1, s3, s1
71; GFX10-NEXT:    v_mov_b32_e32 v0, 0
72; GFX10-NEXT:    v_mov_b32_e32 v1, 0
73; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
75; GFX10-NEXT:    s_or_b32 s0, s0, s1
76; GFX10-NEXT:    v_mov_b32_e32 v2, s0
77; GFX10-NEXT:    global_store_dword v[0:1], v2, off
78; GFX10-NEXT:    s_endpgm
79  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
80  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
81  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
82  ret void
83}
84
; Dynamic insertelement into <2 x i16>: the vector is loaded through a
; non-inreg addrspace(1) pointer (global_load_dword / flat_load_dword from
; v[0:1] in the checks below), while %val and %idx are inreg. The result is
; stored to a null addrspace(1) pointer.
; Per the checks, the bit offset ((%idx & 1) << 4), the shifted %val and the
; inverted 0xffff lane mask (s_not_b32) are computed with scalar instructions;
; only the final merge with the loaded dword uses vector instructions.
85define amdgpu_ps void @insertelement_v_v2i16_s_s(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
86; GFX9-LABEL: insertelement_v_v2i16_s_s:
87; GFX9:       ; %bb.0:
88; GFX9-NEXT:    global_load_dword v2, v[0:1], off
89; GFX9-NEXT:    s_and_b32 s0, s3, 1
90; GFX9-NEXT:    s_mov_b32 s1, 0xffff
91; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
92; GFX9-NEXT:    s_and_b32 s2, s2, s1
93; GFX9-NEXT:    s_lshl_b32 s2, s2, s0
94; GFX9-NEXT:    s_lshl_b32 s0, s1, s0
95; GFX9-NEXT:    s_not_b32 s0, s0
96; GFX9-NEXT:    v_mov_b32_e32 v3, s2
97; GFX9-NEXT:    v_mov_b32_e32 v0, 0
98; GFX9-NEXT:    v_mov_b32_e32 v1, 0
99; GFX9-NEXT:    s_waitcnt vmcnt(0)
100; GFX9-NEXT:    v_and_or_b32 v2, v2, s0, v3
101; GFX9-NEXT:    global_store_dword v[0:1], v2, off
102; GFX9-NEXT:    s_endpgm
103;
104; GFX8-LABEL: insertelement_v_v2i16_s_s:
105; GFX8:       ; %bb.0:
106; GFX8-NEXT:    flat_load_dword v0, v[0:1]
107; GFX8-NEXT:    s_and_b32 s1, s3, 1
108; GFX8-NEXT:    s_mov_b32 s0, 0xffff
109; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
110; GFX8-NEXT:    s_and_b32 s2, s2, s0
111; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
112; GFX8-NEXT:    s_not_b32 s0, s0
113; GFX8-NEXT:    s_lshl_b32 s2, s2, s1
114; GFX8-NEXT:    s_waitcnt vmcnt(0)
115; GFX8-NEXT:    v_and_b32_e32 v2, s0, v0
116; GFX8-NEXT:    v_mov_b32_e32 v0, 0
117; GFX8-NEXT:    v_mov_b32_e32 v1, 0
118; GFX8-NEXT:    v_or_b32_e32 v2, s2, v2
119; GFX8-NEXT:    flat_store_dword v[0:1], v2
120; GFX8-NEXT:    s_endpgm
121;
122; GFX7-LABEL: insertelement_v_v2i16_s_s:
123; GFX7:       ; %bb.0:
124; GFX7-NEXT:    flat_load_dword v0, v[0:1]
125; GFX7-NEXT:    s_and_b32 s1, s3, 1
126; GFX7-NEXT:    s_mov_b32 s0, 0xffff
127; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
128; GFX7-NEXT:    s_and_b32 s2, s2, s0
129; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
130; GFX7-NEXT:    s_not_b32 s0, s0
131; GFX7-NEXT:    s_lshl_b32 s2, s2, s1
132; GFX7-NEXT:    s_waitcnt vmcnt(0)
133; GFX7-NEXT:    v_and_b32_e32 v2, s0, v0
134; GFX7-NEXT:    v_mov_b32_e32 v0, 0
135; GFX7-NEXT:    v_mov_b32_e32 v1, 0
136; GFX7-NEXT:    v_or_b32_e32 v2, s2, v2
137; GFX7-NEXT:    flat_store_dword v[0:1], v2
138; GFX7-NEXT:    s_endpgm
139;
140; GFX10-LABEL: insertelement_v_v2i16_s_s:
141; GFX10:       ; %bb.0:
142; GFX10-NEXT:    global_load_dword v2, v[0:1], off
143; GFX10-NEXT:    s_and_b32 s0, s3, 1
144; GFX10-NEXT:    s_mov_b32 s1, 0xffff
145; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
146; GFX10-NEXT:    s_and_b32 s2, s2, s1
147; GFX10-NEXT:    s_lshl_b32 s1, s1, s0
148; GFX10-NEXT:    s_lshl_b32 s0, s2, s0
149; GFX10-NEXT:    s_not_b32 s1, s1
150; GFX10-NEXT:    v_mov_b32_e32 v0, 0
151; GFX10-NEXT:    v_mov_b32_e32 v1, 0
152; GFX10-NEXT:    s_waitcnt vmcnt(0)
153; GFX10-NEXT:    v_and_or_b32 v2, v2, s1, s0
154; GFX10-NEXT:    global_store_dword v[0:1], v2, off
155; GFX10-NEXT:    s_endpgm
156  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
157  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
158  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
159  ret void
160}
161
; Dynamic insertelement into <2 x i16>: the vector is loaded through the inreg
; addrspace(4) pointer, %val is not inreg (read from v0 in the checks below)
; and %idx is inreg. The result is stored to a null addrspace(1) pointer.
; Per the checks, the bit offset ((%idx & 1) << 4) and the clearing of the
; target lane in the loaded dword (s_andn2_b32 with the shifted 0xffff mask)
; stay scalar, while %val is shifted into place and merged with vector
; instructions.
162define amdgpu_ps void @insertelement_s_v2i16_v_s(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
163; GFX9-LABEL: insertelement_s_v2i16_v_s:
164; GFX9:       ; %bb.0:
165; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
166; GFX9-NEXT:    s_and_b32 s1, s4, 1
167; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
168; GFX9-NEXT:    s_mov_b32 s2, 0xffff
169; GFX9-NEXT:    v_and_b32_e32 v2, s2, v0
170; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
171; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
172; GFX9-NEXT:    s_andn2_b32 s0, s0, s2
173; GFX9-NEXT:    v_mov_b32_e32 v3, s0
174; GFX9-NEXT:    v_mov_b32_e32 v0, 0
175; GFX9-NEXT:    v_mov_b32_e32 v1, 0
176; GFX9-NEXT:    v_lshl_or_b32 v2, v2, s1, v3
177; GFX9-NEXT:    global_store_dword v[0:1], v2, off
178; GFX9-NEXT:    s_endpgm
179;
180; GFX8-LABEL: insertelement_s_v2i16_v_s:
181; GFX8:       ; %bb.0:
182; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
183; GFX8-NEXT:    s_and_b32 s1, s4, 1
184; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
185; GFX8-NEXT:    s_mov_b32 s2, 0xffff
186; GFX8-NEXT:    v_mov_b32_e32 v1, s1
187; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
188; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
189; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
190; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
191; GFX8-NEXT:    v_mov_b32_e32 v0, 0
192; GFX8-NEXT:    v_mov_b32_e32 v1, 0
193; GFX8-NEXT:    v_or_b32_e32 v2, s0, v2
194; GFX8-NEXT:    flat_store_dword v[0:1], v2
195; GFX8-NEXT:    s_endpgm
196;
197; GFX7-LABEL: insertelement_s_v2i16_v_s:
198; GFX7:       ; %bb.0:
199; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
200; GFX7-NEXT:    s_and_b32 s1, s4, 1
201; GFX7-NEXT:    s_mov_b32 s2, 0xffff
202; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
203; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
204; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v0
205; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
206; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
207; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
208; GFX7-NEXT:    v_mov_b32_e32 v0, 0
209; GFX7-NEXT:    v_mov_b32_e32 v1, 0
210; GFX7-NEXT:    v_or_b32_e32 v2, s0, v2
211; GFX7-NEXT:    flat_store_dword v[0:1], v2
212; GFX7-NEXT:    s_endpgm
213;
214; GFX10-LABEL: insertelement_s_v2i16_v_s:
215; GFX10:       ; %bb.0:
216; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
217; GFX10-NEXT:    s_and_b32 s1, s4, 1
218; GFX10-NEXT:    s_mov_b32 s2, 0xffff
219; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
220; GFX10-NEXT:    v_and_b32_e32 v2, s2, v0
221; GFX10-NEXT:    s_lshl_b32 s2, s2, s1
222; GFX10-NEXT:    v_mov_b32_e32 v0, 0
223; GFX10-NEXT:    v_mov_b32_e32 v1, 0
224; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
225; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
226; GFX10-NEXT:    v_lshl_or_b32 v2, v2, s1, s0
227; GFX10-NEXT:    global_store_dword v[0:1], v2, off
228; GFX10-NEXT:    s_endpgm
229  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
230  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
231  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
232  ret void
233}
234
; Dynamic insertelement into <2 x i16>: the vector is loaded through the inreg
; addrspace(4) pointer, %val is inreg and %idx is not inreg (read from v0 in
; the checks below). The result is stored to a null addrspace(1) pointer.
; Per the checks, the bit offset ((%idx & 1) << 4) is computed with vector
; instructions, so both the shifted %val and the shifted 0xffff lane mask end
; up in vector registers; the mask is inverted with v_xor_b32 -1 before being
; merged with the scalar-loaded dword.
235define amdgpu_ps void @insertelement_s_v2i16_s_v(<2 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
236; GFX9-LABEL: insertelement_s_v2i16_s_v:
237; GFX9:       ; %bb.0:
238; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
239; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
240; GFX9-NEXT:    s_mov_b32 s1, 0xffff
241; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
242; GFX9-NEXT:    s_and_b32 s2, s4, s1
243; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
244; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
245; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
246; GFX9-NEXT:    v_mov_b32_e32 v0, 0
247; GFX9-NEXT:    v_mov_b32_e32 v1, 0
248; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX9-NEXT:    v_and_or_b32 v2, s0, v3, v2
250; GFX9-NEXT:    global_store_dword v[0:1], v2, off
251; GFX9-NEXT:    s_endpgm
252;
253; GFX8-LABEL: insertelement_s_v2i16_s_v:
254; GFX8:       ; %bb.0:
255; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
256; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
257; GFX8-NEXT:    s_mov_b32 s1, 0xffff
258; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
259; GFX8-NEXT:    s_and_b32 s2, s4, s1
260; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
261; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
262; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
263; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
265; GFX8-NEXT:    v_mov_b32_e32 v0, 0
266; GFX8-NEXT:    v_mov_b32_e32 v1, 0
267; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
268; GFX8-NEXT:    flat_store_dword v[0:1], v2
269; GFX8-NEXT:    s_endpgm
270;
271; GFX7-LABEL: insertelement_s_v2i16_s_v:
272; GFX7:       ; %bb.0:
273; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
274; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
275; GFX7-NEXT:    s_mov_b32 s1, 0xffff
276; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
277; GFX7-NEXT:    s_and_b32 s2, s4, s1
278; GFX7-NEXT:    v_lshl_b32_e32 v2, s2, v0
279; GFX7-NEXT:    v_lshl_b32_e32 v0, s1, v0
280; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
281; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
283; GFX7-NEXT:    v_mov_b32_e32 v0, 0
284; GFX7-NEXT:    v_mov_b32_e32 v1, 0
285; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
286; GFX7-NEXT:    flat_store_dword v[0:1], v2
287; GFX7-NEXT:    s_endpgm
288;
289; GFX10-LABEL: insertelement_s_v2i16_s_v:
290; GFX10:       ; %bb.0:
291; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
292; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
293; GFX10-NEXT:    s_mov_b32 s1, 0xffff
294; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
295; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s1
296; GFX10-NEXT:    s_and_b32 s1, s4, s1
297; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
298; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v1
299; GFX10-NEXT:    v_mov_b32_e32 v0, 0
300; GFX10-NEXT:    v_mov_b32_e32 v1, 0
301; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX10-NEXT:    v_and_or_b32 v2, s0, v3, v2
303; GFX10-NEXT:    global_store_dword v[0:1], v2, off
304; GFX10-NEXT:    s_endpgm
305  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
306  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
307  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
308  ret void
309}
310
; Dynamic insertelement into <2 x i16>: the vector is loaded through the inreg
; addrspace(4) pointer, while neither %val nor %idx is inreg (read from v0 and
; v1 respectively in the checks below). The result is stored to a null
; addrspace(1) pointer.
; Per the checks, the bit offset ((%idx & 1) << 4), the shifted %val and the
; inverted 0xffff lane mask are all produced with vector instructions and then
; merged with the scalar-loaded dword.
311define amdgpu_ps void @insertelement_s_v2i16_v_v(<2 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
312; GFX9-LABEL: insertelement_s_v2i16_v_v:
313; GFX9:       ; %bb.0:
314; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
315; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
316; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
317; GFX9-NEXT:    s_mov_b32 s1, 0xffff
318; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
319; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
320; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
321; GFX9-NEXT:    v_mov_b32_e32 v0, 0
322; GFX9-NEXT:    v_mov_b32_e32 v1, 0
323; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
324; GFX9-NEXT:    v_and_or_b32 v2, s0, v3, v2
325; GFX9-NEXT:    global_store_dword v[0:1], v2, off
326; GFX9-NEXT:    s_endpgm
327;
328; GFX8-LABEL: insertelement_s_v2i16_v_v:
329; GFX8:       ; %bb.0:
330; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
331; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
332; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
333; GFX8-NEXT:    s_mov_b32 s1, 0xffff
334; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
335; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
336; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
337; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
339; GFX8-NEXT:    v_mov_b32_e32 v0, 0
340; GFX8-NEXT:    v_mov_b32_e32 v1, 0
341; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
342; GFX8-NEXT:    flat_store_dword v[0:1], v2
343; GFX8-NEXT:    s_endpgm
344;
345; GFX7-LABEL: insertelement_s_v2i16_v_v:
346; GFX7:       ; %bb.0:
347; GFX7-NEXT:    s_load_dword s0, s[2:3], 0x0
348; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
349; GFX7-NEXT:    s_mov_b32 s1, 0xffff
350; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
351; GFX7-NEXT:    v_and_b32_e32 v0, s1, v0
352; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v0
353; GFX7-NEXT:    v_lshl_b32_e32 v0, s1, v1
354; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
355; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
356; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
357; GFX7-NEXT:    v_mov_b32_e32 v0, 0
358; GFX7-NEXT:    v_mov_b32_e32 v1, 0
359; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
360; GFX7-NEXT:    flat_store_dword v[0:1], v2
361; GFX7-NEXT:    s_endpgm
362;
363; GFX10-LABEL: insertelement_s_v2i16_v_v:
364; GFX10:       ; %bb.0:
365; GFX10-NEXT:    s_load_dword s0, s[2:3], 0x0
366; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
367; GFX10-NEXT:    s_mov_b32 s1, 0xffff
368; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
369; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
370; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
371; GFX10-NEXT:    v_mov_b32_e32 v0, 0
372; GFX10-NEXT:    v_mov_b32_e32 v1, 0
373; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
374; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX10-NEXT:    v_and_or_b32 v2, s0, v2, v3
376; GFX10-NEXT:    global_store_dword v[0:1], v2, off
377; GFX10-NEXT:    s_endpgm
378  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %ptr
379  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
380  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
381  ret void
382}
383
; Dynamic insertelement into <2 x i16>: the vector is loaded through a
; non-inreg addrspace(1) pointer (global_load_dword / flat_load_dword from
; v[0:1] in the checks below), %val is inreg and %idx is not inreg (read from
; v2). The result is stored to a null addrspace(1) pointer.
; Per the checks, %val is masked with 0xffff using a scalar instruction, but
; the bit offset ((%idx & 1) << 4), both shifts, the mask inversion
; (v_xor_b32 -1) and the final merge use vector instructions.
384define amdgpu_ps void @insertelement_v_v2i16_s_v(<2 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
385; GFX9-LABEL: insertelement_v_v2i16_s_v:
386; GFX9:       ; %bb.0:
387; GFX9-NEXT:    global_load_dword v3, v[0:1], off
388; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
389; GFX9-NEXT:    s_mov_b32 s0, 0xffff
390; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
391; GFX9-NEXT:    s_and_b32 s1, s2, s0
392; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
393; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
394; GFX9-NEXT:    v_xor_b32_e32 v4, -1, v0
395; GFX9-NEXT:    v_mov_b32_e32 v0, 0
396; GFX9-NEXT:    v_mov_b32_e32 v1, 0
397; GFX9-NEXT:    s_waitcnt vmcnt(0)
398; GFX9-NEXT:    v_and_or_b32 v2, v3, v4, v2
399; GFX9-NEXT:    global_store_dword v[0:1], v2, off
400; GFX9-NEXT:    s_endpgm
401;
402; GFX8-LABEL: insertelement_v_v2i16_s_v:
403; GFX8:       ; %bb.0:
404; GFX8-NEXT:    flat_load_dword v0, v[0:1]
405; GFX8-NEXT:    s_mov_b32 s0, 0xffff
406; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
407; GFX8-NEXT:    s_and_b32 s1, s2, s0
408; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
409; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
410; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
411; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
412; GFX8-NEXT:    s_waitcnt vmcnt(0)
413; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
414; GFX8-NEXT:    v_mov_b32_e32 v0, 0
415; GFX8-NEXT:    v_mov_b32_e32 v1, 0
416; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
417; GFX8-NEXT:    flat_store_dword v[0:1], v2
418; GFX8-NEXT:    s_endpgm
419;
420; GFX7-LABEL: insertelement_v_v2i16_s_v:
421; GFX7:       ; %bb.0:
422; GFX7-NEXT:    flat_load_dword v0, v[0:1]
423; GFX7-NEXT:    s_mov_b32 s0, 0xffff
424; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
425; GFX7-NEXT:    s_and_b32 s1, s2, s0
426; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
427; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
428; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
429; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
430; GFX7-NEXT:    s_waitcnt vmcnt(0)
431; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
432; GFX7-NEXT:    v_mov_b32_e32 v0, 0
433; GFX7-NEXT:    v_mov_b32_e32 v1, 0
434; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
435; GFX7-NEXT:    flat_store_dword v[0:1], v2
436; GFX7-NEXT:    s_endpgm
437;
438; GFX10-LABEL: insertelement_v_v2i16_s_v:
439; GFX10:       ; %bb.0:
440; GFX10-NEXT:    global_load_dword v3, v[0:1], off
441; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
442; GFX10-NEXT:    s_mov_b32 s0, 0xffff
443; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
444; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s0
445; GFX10-NEXT:    s_and_b32 s0, s2, s0
446; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
447; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v1
448; GFX10-NEXT:    v_mov_b32_e32 v0, 0
449; GFX10-NEXT:    v_mov_b32_e32 v1, 0
450; GFX10-NEXT:    s_waitcnt vmcnt(0)
451; GFX10-NEXT:    v_and_or_b32 v2, v3, v4, v2
452; GFX10-NEXT:    global_store_dword v[0:1], v2, off
453; GFX10-NEXT:    s_endpgm
454  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
455  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
456  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
457  ret void
458}
459
; Dynamic insertelement into <2 x i16>: the vector is loaded through a
; non-inreg addrspace(1) pointer (global_load_dword / flat_load_dword from
; v[0:1] in the checks below), %val is not inreg (read from v2) and %idx is
; inreg. The result is stored to a null addrspace(1) pointer.
; Per the checks, the bit offset ((%idx & 1) << 4) and the inverted 0xffff
; lane mask (s_not_b32) are computed with scalar instructions, while %val is
; shifted into place and merged with the loaded dword using vector
; instructions.
460define amdgpu_ps void @insertelement_v_v2i16_v_s(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
461; GFX9-LABEL: insertelement_v_v2i16_v_s:
462; GFX9:       ; %bb.0:
463; GFX9-NEXT:    global_load_dword v3, v[0:1], off
464; GFX9-NEXT:    s_and_b32 s0, s2, 1
465; GFX9-NEXT:    s_lshl_b32 s0, s0, 4
466; GFX9-NEXT:    s_mov_b32 s1, 0xffff
467; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
468; GFX9-NEXT:    s_lshl_b32 s0, s1, s0
469; GFX9-NEXT:    s_not_b32 s0, s0
470; GFX9-NEXT:    v_mov_b32_e32 v0, 0
471; GFX9-NEXT:    v_mov_b32_e32 v1, 0
472; GFX9-NEXT:    s_waitcnt vmcnt(0)
473; GFX9-NEXT:    v_and_or_b32 v2, v3, s0, v2
474; GFX9-NEXT:    global_store_dword v[0:1], v2, off
475; GFX9-NEXT:    s_endpgm
476;
477; GFX8-LABEL: insertelement_v_v2i16_v_s:
478; GFX8:       ; %bb.0:
479; GFX8-NEXT:    flat_load_dword v0, v[0:1]
480; GFX8-NEXT:    s_and_b32 s1, s2, 1
481; GFX8-NEXT:    s_mov_b32 s0, 0xffff
482; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
483; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
484; GFX8-NEXT:    v_mov_b32_e32 v1, s1
485; GFX8-NEXT:    s_not_b32 s0, s0
486; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
487; GFX8-NEXT:    s_waitcnt vmcnt(0)
488; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
489; GFX8-NEXT:    v_mov_b32_e32 v0, 0
490; GFX8-NEXT:    v_mov_b32_e32 v1, 0
491; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
492; GFX8-NEXT:    flat_store_dword v[0:1], v2
493; GFX8-NEXT:    s_endpgm
494;
495; GFX7-LABEL: insertelement_v_v2i16_v_s:
496; GFX7:       ; %bb.0:
497; GFX7-NEXT:    flat_load_dword v0, v[0:1]
498; GFX7-NEXT:    s_and_b32 s1, s2, 1
499; GFX7-NEXT:    s_mov_b32 s0, 0xffff
500; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
501; GFX7-NEXT:    v_and_b32_e32 v1, s0, v2
502; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
503; GFX7-NEXT:    s_not_b32 s0, s0
504; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s1, v1
505; GFX7-NEXT:    s_waitcnt vmcnt(0)
506; GFX7-NEXT:    v_and_b32_e32 v3, s0, v0
507; GFX7-NEXT:    v_mov_b32_e32 v0, 0
508; GFX7-NEXT:    v_mov_b32_e32 v1, 0
509; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
510; GFX7-NEXT:    flat_store_dword v[0:1], v2
511; GFX7-NEXT:    s_endpgm
512;
513; GFX10-LABEL: insertelement_v_v2i16_v_s:
514; GFX10:       ; %bb.0:
515; GFX10-NEXT:    global_load_dword v3, v[0:1], off
516; GFX10-NEXT:    s_and_b32 s0, s2, 1
517; GFX10-NEXT:    s_mov_b32 s1, 0xffff
518; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
519; GFX10-NEXT:    v_mov_b32_e32 v0, 0
520; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
521; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
522; GFX10-NEXT:    v_mov_b32_e32 v1, 0
523; GFX10-NEXT:    s_not_b32 s0, s0
524; GFX10-NEXT:    s_waitcnt vmcnt(0)
525; GFX10-NEXT:    v_and_or_b32 v2, v3, s0, v2
526; GFX10-NEXT:    global_store_dword v[0:1], v2, off
527; GFX10-NEXT:    s_endpgm
528  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
529  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
530  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
531  ret void
532}
533
; Dynamic insertelement into <2 x i16> with no inreg operands: the vector is
; loaded through a non-inreg addrspace(1) pointer (global_load_dword /
; flat_load_dword from v[0:1] in the checks below), %val is read from v2 and
; %idx from v3. The result is stored to a null addrspace(1) pointer.
; Per the checks, only the 0xffff constant is materialized in a scalar
; register; the bit offset ((%idx & 1) << 4), both shifts, the mask inversion
; (v_xor_b32 -1) and the final merge all use vector instructions.
534define amdgpu_ps void @insertelement_v_v2i16_v_v(<2 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
535; GFX9-LABEL: insertelement_v_v2i16_v_v:
536; GFX9:       ; %bb.0:
537; GFX9-NEXT:    global_load_dword v4, v[0:1], off
538; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
539; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
540; GFX9-NEXT:    s_mov_b32 s0, 0xffff
541; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
542; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
543; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v0
544; GFX9-NEXT:    v_mov_b32_e32 v0, 0
545; GFX9-NEXT:    v_mov_b32_e32 v1, 0
546; GFX9-NEXT:    s_waitcnt vmcnt(0)
547; GFX9-NEXT:    v_and_or_b32 v2, v4, v3, v2
548; GFX9-NEXT:    global_store_dword v[0:1], v2, off
549; GFX9-NEXT:    s_endpgm
550;
551; GFX8-LABEL: insertelement_v_v2i16_v_v:
552; GFX8:       ; %bb.0:
553; GFX8-NEXT:    flat_load_dword v0, v[0:1]
554; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
555; GFX8-NEXT:    s_mov_b32 s0, 0xffff
556; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
557; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
558; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
559; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
560; GFX8-NEXT:    s_waitcnt vmcnt(0)
561; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
562; GFX8-NEXT:    v_mov_b32_e32 v0, 0
563; GFX8-NEXT:    v_mov_b32_e32 v1, 0
564; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
565; GFX8-NEXT:    flat_store_dword v[0:1], v2
566; GFX8-NEXT:    s_endpgm
567;
568; GFX7-LABEL: insertelement_v_v2i16_v_v:
569; GFX7:       ; %bb.0:
570; GFX7-NEXT:    flat_load_dword v0, v[0:1]
571; GFX7-NEXT:    s_mov_b32 s0, 0xffff
572; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
573; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
574; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
575; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
576; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
577; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
578; GFX7-NEXT:    s_waitcnt vmcnt(0)
579; GFX7-NEXT:    v_and_b32_e32 v3, v0, v1
580; GFX7-NEXT:    v_mov_b32_e32 v0, 0
581; GFX7-NEXT:    v_mov_b32_e32 v1, 0
582; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
583; GFX7-NEXT:    flat_store_dword v[0:1], v2
584; GFX7-NEXT:    s_endpgm
585;
586; GFX10-LABEL: insertelement_v_v2i16_v_v:
587; GFX10:       ; %bb.0:
588; GFX10-NEXT:    global_load_dword v4, v[0:1], off
589; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
590; GFX10-NEXT:    s_mov_b32 s0, 0xffff
591; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
592; GFX10-NEXT:    v_lshlrev_b32_e64 v1, v0, s0
593; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
594; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v1
595; GFX10-NEXT:    v_mov_b32_e32 v0, 0
596; GFX10-NEXT:    v_mov_b32_e32 v1, 0
597; GFX10-NEXT:    s_waitcnt vmcnt(0)
598; GFX10-NEXT:    v_and_or_b32 v2, v4, v3, v2
599; GFX10-NEXT:    global_store_dword v[0:1], v2, off
600; GFX10-NEXT:    s_endpgm
601  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %ptr
602  %insert = insertelement <2 x i16> %vec, i16 %val, i32 %idx
603  store <2 x i16> %insert, <2 x i16> addrspace(1)* null
604  ret void
605}
606
607; FIXME: The <3 x i16> tests below are disabled pending 3 element load/store legalization.
608; define amdgpu_ps void @insertelement_s_v3i16_s_s(<3 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
609;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
610;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
611;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
612;   ret void
613; }
614
615; define amdgpu_ps void @insertelement_v_v3i16_s_s(<3 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
616;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
617;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
618;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
619;   ret void
620; }
621
622; define amdgpu_ps void @insertelement_s_v3i16_v_s(<3 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
623;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
624;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
625;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
626;   ret void
627; }
628
629; define amdgpu_ps void @insertelement_s_v3i16_s_v(<3 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
630;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
631;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
632;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
633;   ret void
634; }
635
636; define amdgpu_ps void @insertelement_s_v3i16_v_v(<3 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
637;   %vec = load <3 x i16>, <3 x i16> addrspace(4)* %ptr
638;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
639;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
640;   ret void
641; }
642
643; define amdgpu_ps void @insertelement_v_v3i16_s_v(<3 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
644;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
645;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
646;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
647;   ret void
648; }
649
650; define amdgpu_ps void @insertelement_v_v3i16_v_s(<3 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
651;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
652;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
653;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
654;   ret void
655; }
656
657; define amdgpu_ps void @insertelement_v_v3i16_v_v(<3 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
658;   %vec = load <3 x i16>, <3 x i16> addrspace(1)* %ptr
659;   %insert = insertelement <3 x i16> %vec, i16 %val, i32 %idx
660;   store <3 x i16> %insert, <3 x i16> addrspace(1)* null
661;   ret void
662; }
663
; Dynamic insertelement into <4 x i16>: the vector is loaded as two dwords
; through a non-inreg addrspace(1) pointer (global_load_dwordx2 /
; flat_load_dwordx2 from v[0:1] in the checks below), while %val and %idx are
; inreg. The result is stored to a null addrspace(1) pointer.
; Per the checks, %idx >> 1 picks which of the two loaded dwords holds the
; target lane (compared against 1 and 0 with v_cmp_eq_u32) and
; (%idx & 1) << 4 is the bit offset inside that dword. The selected dword
; (v_cndmask_b32) has the lane cleared with the inverted 0xffff mask and the
; shifted %val or'ed in; two further v_cndmask_b32 then write the updated
; dword back into whichever half was selected, leaving the other unchanged.
664define amdgpu_ps void @insertelement_v_v4i16_s_s(<4 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
665; GFX9-LABEL: insertelement_v_v4i16_s_s:
666; GFX9:       ; %bb.0:
667; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
668; GFX9-NEXT:    s_mov_b32 s0, 0xffff
669; GFX9-NEXT:    s_lshr_b32 s1, s3, 1
670; GFX9-NEXT:    s_and_b32 s3, s3, 1
671; GFX9-NEXT:    s_and_b32 s2, s2, s0
672; GFX9-NEXT:    s_lshl_b32 s3, s3, 4
673; GFX9-NEXT:    s_lshl_b32 s2, s2, s3
674; GFX9-NEXT:    s_lshl_b32 s0, s0, s3
675; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
676; GFX9-NEXT:    s_not_b32 s0, s0
677; GFX9-NEXT:    v_mov_b32_e32 v4, s2
678; GFX9-NEXT:    v_mov_b32_e32 v2, 0
679; GFX9-NEXT:    v_mov_b32_e32 v3, 0
680; GFX9-NEXT:    s_waitcnt vmcnt(0)
681; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
682; GFX9-NEXT:    v_and_or_b32 v4, v5, s0, v4
683; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
684; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
685; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
686; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
687; GFX9-NEXT:    s_endpgm
688;
689; GFX8-LABEL: insertelement_v_v4i16_s_s:
690; GFX8:       ; %bb.0:
691; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
692; GFX8-NEXT:    s_lshr_b32 s1, s3, 1
693; GFX8-NEXT:    s_and_b32 s3, s3, 1
694; GFX8-NEXT:    s_mov_b32 s0, 0xffff
695; GFX8-NEXT:    s_lshl_b32 s3, s3, 4
696; GFX8-NEXT:    s_and_b32 s2, s2, s0
697; GFX8-NEXT:    s_lshl_b32 s0, s0, s3
698; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
699; GFX8-NEXT:    s_not_b32 s0, s0
700; GFX8-NEXT:    s_lshl_b32 s2, s2, s3
701; GFX8-NEXT:    v_mov_b32_e32 v2, 0
702; GFX8-NEXT:    v_mov_b32_e32 v3, 0
703; GFX8-NEXT:    s_waitcnt vmcnt(0)
704; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
705; GFX8-NEXT:    v_and_b32_e32 v4, s0, v4
706; GFX8-NEXT:    v_or_b32_e32 v4, s2, v4
707; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
708; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
709; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
710; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
711; GFX8-NEXT:    s_endpgm
712;
713; GFX7-LABEL: insertelement_v_v4i16_s_s:
714; GFX7:       ; %bb.0:
715; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
716; GFX7-NEXT:    s_lshr_b32 s1, s3, 1
717; GFX7-NEXT:    s_and_b32 s3, s3, 1
718; GFX7-NEXT:    s_mov_b32 s0, 0xffff
719; GFX7-NEXT:    s_lshl_b32 s3, s3, 4
720; GFX7-NEXT:    s_and_b32 s2, s2, s0
721; GFX7-NEXT:    s_lshl_b32 s0, s0, s3
722; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
723; GFX7-NEXT:    s_not_b32 s0, s0
724; GFX7-NEXT:    s_lshl_b32 s2, s2, s3
725; GFX7-NEXT:    v_mov_b32_e32 v2, 0
726; GFX7-NEXT:    v_mov_b32_e32 v3, 0
727; GFX7-NEXT:    s_waitcnt vmcnt(0)
728; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
729; GFX7-NEXT:    v_and_b32_e32 v4, s0, v4
730; GFX7-NEXT:    v_or_b32_e32 v4, s2, v4
731; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
732; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
733; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
734; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
735; GFX7-NEXT:    s_endpgm
736;
737; GFX10-LABEL: insertelement_v_v4i16_s_s:
738; GFX10:       ; %bb.0:
739; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
740; GFX10-NEXT:    s_lshr_b32 s1, s3, 1
741; GFX10-NEXT:    s_and_b32 s3, s3, 1
742; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s1, 1
743; GFX10-NEXT:    s_mov_b32 s0, 0xffff
744; GFX10-NEXT:    s_lshl_b32 s3, s3, 4
745; GFX10-NEXT:    s_and_b32 s2, s2, s0
746; GFX10-NEXT:    s_lshl_b32 s0, s0, s3
747; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
748; GFX10-NEXT:    s_not_b32 s0, s0
749; GFX10-NEXT:    s_waitcnt vmcnt(0)
750; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc_lo
751; GFX10-NEXT:    v_and_or_b32 v4, v2, s0, s2
752; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s1, 0
753; GFX10-NEXT:    v_mov_b32_e32 v2, 0
754; GFX10-NEXT:    v_mov_b32_e32 v3, 0
755; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
756; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
757; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
758; GFX10-NEXT:    s_endpgm
759  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
760  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
761  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
762  ret void
763}
764
; Test: insertelement into <4 x i16> with a runtime index.
;   %ptr - inreg pointer in addrspace(4); the checks fetch the vector with
;          s_load_dwordx2 from s[2:3].
;   %val - i16 that is not marked inreg; the checks read it from v0.
;   %idx - inreg i32 index; the checks read it from s4.
; Expected lowering (all four targets): idx >> 1 selects which of the two
; loaded dwords holds the element, (idx & 1) << 4 is the bit offset inside it.
; The selected dword has the 16-bit lane cleared with a shifted 0xffff mask,
; the shifted value is ORed in, and compares of the dword selector against
; 0 and 1 drive v_cndmask to place the merged dword into the matching half.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
765define amdgpu_ps void @insertelement_s_v4i16_v_s(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
766; GFX9-LABEL: insertelement_s_v4i16_v_s:
767; GFX9:       ; %bb.0:
768; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
769; GFX9-NEXT:    s_lshr_b32 s2, s4, 1
770; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
771; GFX9-NEXT:    s_mov_b32 s5, 0xffff
772; GFX9-NEXT:    v_and_b32_e32 v0, s5, v0
773; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX9-NEXT:    s_cselect_b32 s3, s1, s0
775; GFX9-NEXT:    s_and_b32 s4, s4, 1
776; GFX9-NEXT:    s_lshl_b32 s4, s4, 4
777; GFX9-NEXT:    s_lshl_b32 s5, s5, s4
778; GFX9-NEXT:    s_andn2_b32 s3, s3, s5
779; GFX9-NEXT:    v_mov_b32_e32 v1, s3
780; GFX9-NEXT:    v_lshl_or_b32 v4, v0, s4, v1
781; GFX9-NEXT:    v_mov_b32_e32 v0, s0
782; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
783; GFX9-NEXT:    v_mov_b32_e32 v1, s1
784; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
785; GFX9-NEXT:    v_mov_b32_e32 v2, 0
786; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
787; GFX9-NEXT:    v_mov_b32_e32 v3, 0
788; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
789; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
790; GFX9-NEXT:    s_endpgm
791;
792; GFX8-LABEL: insertelement_s_v4i16_v_s:
793; GFX8:       ; %bb.0:
794; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
795; GFX8-NEXT:    s_lshr_b32 s2, s4, 1
796; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
797; GFX8-NEXT:    s_mov_b32 s5, 0xffff
798; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
799; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
800; GFX8-NEXT:    s_cselect_b32 s3, s1, s0
801; GFX8-NEXT:    s_and_b32 s4, s4, 1
802; GFX8-NEXT:    s_lshl_b32 s4, s4, 4
803; GFX8-NEXT:    v_mov_b32_e32 v1, s4
804; GFX8-NEXT:    s_lshl_b32 s4, s5, s4
805; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
806; GFX8-NEXT:    s_andn2_b32 s3, s3, s4
807; GFX8-NEXT:    v_or_b32_e32 v4, s3, v0
808; GFX8-NEXT:    v_mov_b32_e32 v0, s0
809; GFX8-NEXT:    v_mov_b32_e32 v1, s1
810; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
811; GFX8-NEXT:    v_mov_b32_e32 v2, 0
812; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
813; GFX8-NEXT:    v_mov_b32_e32 v3, 0
814; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
815; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
816; GFX8-NEXT:    s_endpgm
817;
818; GFX7-LABEL: insertelement_s_v4i16_v_s:
819; GFX7:       ; %bb.0:
820; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
821; GFX7-NEXT:    s_lshr_b32 s2, s4, 1
822; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
823; GFX7-NEXT:    s_mov_b32 s5, 0xffff
824; GFX7-NEXT:    v_and_b32_e32 v0, s5, v0
825; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
826; GFX7-NEXT:    s_cselect_b32 s3, s1, s0
827; GFX7-NEXT:    s_and_b32 s4, s4, 1
828; GFX7-NEXT:    s_lshl_b32 s4, s4, 4
829; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
830; GFX7-NEXT:    s_lshl_b32 s4, s5, s4
831; GFX7-NEXT:    s_andn2_b32 s3, s3, s4
832; GFX7-NEXT:    v_or_b32_e32 v4, s3, v0
833; GFX7-NEXT:    v_mov_b32_e32 v0, s0
834; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
835; GFX7-NEXT:    v_mov_b32_e32 v1, s1
836; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
837; GFX7-NEXT:    v_mov_b32_e32 v2, 0
838; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
839; GFX7-NEXT:    v_mov_b32_e32 v3, 0
840; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
841; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
842; GFX7-NEXT:    s_endpgm
843;
844; GFX10-LABEL: insertelement_s_v4i16_v_s:
845; GFX10:       ; %bb.0:
846; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
847; GFX10-NEXT:    s_lshr_b32 s2, s4, 1
848; GFX10-NEXT:    s_mov_b32 s5, 0xffff
849; GFX10-NEXT:    s_cmp_eq_u32 s2, 1
850; GFX10-NEXT:    v_and_b32_e32 v2, s5, v0
851; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
852; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
853; GFX10-NEXT:    s_cselect_b32 s3, s1, s0
854; GFX10-NEXT:    s_and_b32 s4, s4, 1
855; GFX10-NEXT:    v_mov_b32_e32 v0, s0
856; GFX10-NEXT:    s_lshl_b32 s4, s4, 4
857; GFX10-NEXT:    v_mov_b32_e32 v1, s1
858; GFX10-NEXT:    s_lshl_b32 s5, s5, s4
859; GFX10-NEXT:    s_andn2_b32 s3, s3, s5
860; GFX10-NEXT:    v_lshl_or_b32 v4, v2, s4, s3
861; GFX10-NEXT:    v_mov_b32_e32 v2, 0
862; GFX10-NEXT:    v_mov_b32_e32 v3, 0
863; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
864; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
865; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
866; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
867; GFX10-NEXT:    s_endpgm
; IR under test: load the vector through %ptr, insert %val at %idx, and store
; the result through a null addrspace(1) pointer.
868  %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
869  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
870  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
871  ret void
872}
873
; Test: insertelement into <4 x i16> with a runtime index.
;   %ptr - inreg pointer in addrspace(4); the checks fetch the vector with
;          s_load_dwordx2 from s[2:3].
;   %val - inreg i16; the checks read it from s4 and mask it with 0xffff.
;   %idx - i32 that is not marked inreg; the checks read it from v0.
; Expected lowering: because the index lives in a vector register, the dword
; selector (idx >> 1), the bit offset ((idx & 1) << 4), the shifted value and
; the shifted 0xffff mask are all produced with vector ALU instructions. The
; mask is inverted with v_xor -1, the selected dword is ANDed with it and ORed
; with the shifted value, and v_cndmask writes the merged dword into the half
; whose selector compare (against 0 or 1) is true.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
874define amdgpu_ps void @insertelement_s_v4i16_s_v(<4 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
875; GFX9-LABEL: insertelement_s_v4i16_s_v:
876; GFX9:       ; %bb.0:
877; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
878; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
879; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
880; GFX9-NEXT:    s_mov_b32 s2, 0xffff
881; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
882; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX9-NEXT:    v_mov_b32_e32 v1, s0
884; GFX9-NEXT:    v_mov_b32_e32 v3, s1
885; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
886; GFX9-NEXT:    s_and_b32 s3, s4, s2
887; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
888; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
889; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
890; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
891; GFX9-NEXT:    v_and_or_b32 v4, v1, v0, v3
892; GFX9-NEXT:    v_mov_b32_e32 v0, s0
893; GFX9-NEXT:    v_mov_b32_e32 v1, s1
894; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
895; GFX9-NEXT:    v_mov_b32_e32 v2, 0
896; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
897; GFX9-NEXT:    v_mov_b32_e32 v3, 0
898; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
899; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
900; GFX9-NEXT:    s_endpgm
901;
902; GFX8-LABEL: insertelement_s_v4i16_s_v:
903; GFX8:       ; %bb.0:
904; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
905; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
906; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
907; GFX8-NEXT:    s_mov_b32 s2, 0xffff
908; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
909; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
910; GFX8-NEXT:    v_mov_b32_e32 v1, s0
911; GFX8-NEXT:    v_mov_b32_e32 v3, s1
912; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
913; GFX8-NEXT:    s_and_b32 s3, s4, s2
914; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
915; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
916; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
917; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
918; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
919; GFX8-NEXT:    v_or_b32_e32 v4, v0, v3
920; GFX8-NEXT:    v_mov_b32_e32 v0, s0
921; GFX8-NEXT:    v_mov_b32_e32 v1, s1
922; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
923; GFX8-NEXT:    v_mov_b32_e32 v2, 0
924; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
925; GFX8-NEXT:    v_mov_b32_e32 v3, 0
926; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
927; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
928; GFX8-NEXT:    s_endpgm
929;
930; GFX7-LABEL: insertelement_s_v4i16_s_v:
931; GFX7:       ; %bb.0:
932; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
933; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
934; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
935; GFX7-NEXT:    s_mov_b32 s2, 0xffff
936; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
937; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
938; GFX7-NEXT:    v_mov_b32_e32 v1, s0
939; GFX7-NEXT:    v_mov_b32_e32 v3, s1
940; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
941; GFX7-NEXT:    s_and_b32 s3, s4, s2
942; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
943; GFX7-NEXT:    v_lshl_b32_e32 v3, s3, v0
944; GFX7-NEXT:    v_lshl_b32_e32 v0, s2, v0
945; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
946; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
947; GFX7-NEXT:    v_or_b32_e32 v4, v0, v3
948; GFX7-NEXT:    v_mov_b32_e32 v0, s0
949; GFX7-NEXT:    v_mov_b32_e32 v1, s1
950; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
951; GFX7-NEXT:    v_mov_b32_e32 v2, 0
952; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
953; GFX7-NEXT:    v_mov_b32_e32 v3, 0
954; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
955; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
956; GFX7-NEXT:    s_endpgm
957;
958; GFX10-LABEL: insertelement_s_v4i16_s_v:
959; GFX10:       ; %bb.0:
960; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
961; GFX10-NEXT:    v_and_b32_e32 v1, 1, v0
962; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
963; GFX10-NEXT:    s_mov_b32 s2, 0xffff
964; GFX10-NEXT:    s_and_b32 s3, s4, s2
965; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
966; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
967; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s2
968; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s3
969; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v2
970; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
971; GFX10-NEXT:    v_mov_b32_e32 v0, s1
972; GFX10-NEXT:    v_cndmask_b32_e32 v5, s0, v0, vcc_lo
973; GFX10-NEXT:    v_mov_b32_e32 v0, s0
974; GFX10-NEXT:    v_mov_b32_e32 v1, s1
975; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
976; GFX10-NEXT:    v_and_or_b32 v5, v5, v2, v3
977; GFX10-NEXT:    v_mov_b32_e32 v2, 0
978; GFX10-NEXT:    v_mov_b32_e32 v3, 0
979; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
980; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
981; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
982; GFX10-NEXT:    s_endpgm
; IR under test: load the vector through %ptr, insert %val at %idx, and store
; the result through a null addrspace(1) pointer.
983  %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
984  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
985  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
986  ret void
987}
988
; Test: insertelement into <4 x i16> with a runtime index.
;   %ptr - inreg pointer in addrspace(4); the checks fetch the vector with
;          s_load_dwordx2 from s[2:3].
;   %val - i16 that is not marked inreg; the checks read it from v0.
;   %idx - i32 that is not marked inreg; the checks read it from v1.
; Expected lowering: idx >> 1 selects the dword, (idx & 1) << 4 is the bit
; offset. GFX9, GFX8 and GFX10 shift the value with v_lshlrev_b32_sdwa using
; src1_sel:WORD_0, while GFX7 first ANDs the value with 0xffff and then uses a
; plain v_lshlrev_b32. The shifted 0xffff mask is inverted with v_xor -1, the
; selected dword is masked and ORed with the shifted value, and v_cndmask
; writes the merged dword into the half whose selector compare is true.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
989define amdgpu_ps void @insertelement_s_v4i16_v_v(<4 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
990; GFX9-LABEL: insertelement_s_v4i16_v_v:
991; GFX9:       ; %bb.0:
992; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
993; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
994; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
995; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
996; GFX9-NEXT:    s_mov_b32 s2, 0xffff
997; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
998; GFX9-NEXT:    v_mov_b32_e32 v3, s0
999; GFX9-NEXT:    v_mov_b32_e32 v4, s1
1000; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
1001; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1002; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
1003; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1004; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
1005; GFX9-NEXT:    v_and_or_b32 v4, v3, v1, v0
1006; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1007; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1008; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1009; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1010; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1011; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1012; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1013; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1014; GFX9-NEXT:    s_endpgm
1015;
1016; GFX8-LABEL: insertelement_s_v4i16_v_v:
1017; GFX8:       ; %bb.0:
1018; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1019; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
1020; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
1021; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1022; GFX8-NEXT:    s_mov_b32 s2, 0xffff
1023; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX8-NEXT:    v_mov_b32_e32 v3, s0
1025; GFX8-NEXT:    v_mov_b32_e32 v4, s1
1026; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
1027; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1028; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
1029; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1030; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
1031; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
1032; GFX8-NEXT:    v_or_b32_e32 v4, v1, v0
1033; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1034; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1035; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1036; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1037; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1038; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1039; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1040; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1041; GFX8-NEXT:    s_endpgm
1042;
1043; GFX7-LABEL: insertelement_s_v4i16_v_v:
1044; GFX7:       ; %bb.0:
1045; GFX7-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1046; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
1047; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
1048; GFX7-NEXT:    s_mov_b32 s2, 0xffff
1049; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1050; GFX7-NEXT:    v_and_b32_e32 v0, s2, v0
1051; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1052; GFX7-NEXT:    v_mov_b32_e32 v3, s0
1053; GFX7-NEXT:    v_mov_b32_e32 v4, s1
1054; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
1055; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
1056; GFX7-NEXT:    v_lshl_b32_e32 v1, s2, v1
1057; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1058; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
1059; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
1060; GFX7-NEXT:    v_or_b32_e32 v4, v1, v0
1061; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1062; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1063; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
1064; GFX7-NEXT:    v_mov_b32_e32 v2, 0
1065; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1066; GFX7-NEXT:    v_mov_b32_e32 v3, 0
1067; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1068; GFX7-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1069; GFX7-NEXT:    s_endpgm
1070;
1071; GFX10-LABEL: insertelement_s_v4i16_v_v:
1072; GFX10:       ; %bb.0:
1073; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
1074; GFX10-NEXT:    v_and_b32_e32 v2, 1, v1
1075; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
1076; GFX10-NEXT:    s_mov_b32 s2, 0xffff
1077; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1078; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
1079; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, s2
1080; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1081; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v3
1082; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1083; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1084; GFX10-NEXT:    v_cndmask_b32_e32 v5, s0, v1, vcc_lo
1085; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1086; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1087; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v4
1088; GFX10-NEXT:    v_and_or_b32 v5, v5, v3, v2
1089; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1090; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1091; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
1092; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
1093; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1094; GFX10-NEXT:    s_endpgm
; IR under test: load the vector through %ptr, insert %val at %idx, and store
; the result through a null addrspace(1) pointer.
1095  %vec = load <4 x i16>, <4 x i16> addrspace(4)* %ptr
1096  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1097  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
1098  ret void
1099}
1100
; Test: insertelement into <4 x i16> with a runtime index.
;   %ptr - addrspace(1) pointer that is not marked inreg; the checks load the
;          vector from v[0:1] (global_load on GFX9/GFX10, flat_load on
;          GFX8/GFX7) and wait on vmcnt before using it.
;   %val - inreg i16; the checks read it from s2 and mask it with 0xffff.
;   %idx - i32 that is not marked inreg; the checks read it from v2.
; Expected lowering: idx >> 1 selects the dword, (idx & 1) << 4 is the bit
; offset. The value and the 0xffff mask are shifted by that offset with vector
; shifts, the mask is inverted with v_xor -1, the selected dword is masked and
; ORed with the shifted value, and v_cndmask writes the merged dword into the
; half whose selector compare (against 0 or 1) is true.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
1101define amdgpu_ps void @insertelement_v_v4i16_s_v(<4 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
1102; GFX9-LABEL: insertelement_v_v4i16_s_v:
1103; GFX9:       ; %bb.0:
1104; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1105; GFX9-NEXT:    s_mov_b32 s0, 0xffff
1106; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
1107; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
1108; GFX9-NEXT:    s_and_b32 s1, s2, s0
1109; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1110; GFX9-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
1111; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
1112; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
1113; GFX9-NEXT:    v_xor_b32_e32 v2, -1, v2
1114; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1115; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
1116; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1117; GFX9-NEXT:    s_waitcnt vmcnt(0)
1118; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
1119; GFX9-NEXT:    v_and_or_b32 v2, v7, v2, v6
1120; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1121; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1122; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
1123; GFX9-NEXT:    s_endpgm
1124;
1125; GFX8-LABEL: insertelement_v_v4i16_s_v:
1126; GFX8:       ; %bb.0:
1127; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1128; GFX8-NEXT:    s_mov_b32 s0, 0xffff
1129; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
1130; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
1131; GFX8-NEXT:    s_and_b32 s1, s2, s0
1132; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1133; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
1134; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
1135; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
1136; GFX8-NEXT:    v_xor_b32_e32 v2, -1, v2
1137; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1138; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
1139; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1140; GFX8-NEXT:    s_waitcnt vmcnt(0)
1141; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
1142; GFX8-NEXT:    v_and_b32_e32 v2, v7, v2
1143; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
1144; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1145; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1146; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
1147; GFX8-NEXT:    s_endpgm
1148;
1149; GFX7-LABEL: insertelement_v_v4i16_s_v:
1150; GFX7:       ; %bb.0:
1151; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1152; GFX7-NEXT:    s_mov_b32 s0, 0xffff
1153; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
1154; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
1155; GFX7-NEXT:    s_and_b32 s1, s2, s0
1156; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1157; GFX7-NEXT:    v_lshl_b32_e32 v6, s1, v2
1158; GFX7-NEXT:    v_lshl_b32_e32 v2, s0, v2
1159; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
1160; GFX7-NEXT:    v_xor_b32_e32 v2, -1, v2
1161; GFX7-NEXT:    v_mov_b32_e32 v3, 0
1162; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
1163; GFX7-NEXT:    v_mov_b32_e32 v4, 0
1164; GFX7-NEXT:    s_waitcnt vmcnt(0)
1165; GFX7-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
1166; GFX7-NEXT:    v_and_b32_e32 v2, v7, v2
1167; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
1168; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1169; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1170; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
1171; GFX7-NEXT:    s_endpgm
1172;
1173; GFX10-LABEL: insertelement_v_v4i16_s_v:
1174; GFX10:       ; %bb.0:
1175; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1176; GFX10-NEXT:    v_and_b32_e32 v3, 1, v2
1177; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
1178; GFX10-NEXT:    s_mov_b32 s0, 0xffff
1179; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
1180; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
1181; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v3, s0
1182; GFX10-NEXT:    s_and_b32 s0, s2, s0
1183; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v3, s0
1184; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v5
1185; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v4
1186; GFX10-NEXT:    s_waitcnt vmcnt(0)
1187; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
1188; GFX10-NEXT:    v_and_or_b32 v4, v4, v3, v2
1189; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1190; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1191; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
1192; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1193; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1194; GFX10-NEXT:    s_endpgm
; IR under test: load the vector through %ptr, insert %val at %idx, and store
; the result through a null addrspace(1) pointer.
1195  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
1196  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1197  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
1198  ret void
1199}
1200
; Test: insertelement into <4 x i16> with a runtime index.
;   %ptr - addrspace(1) pointer that is not marked inreg; the checks load the
;          vector from v[0:1] (global_load on GFX9/GFX10, flat_load on
;          GFX8/GFX7) and wait on vmcnt before using it.
;   %val - i16 that is not marked inreg; the checks read it from v2.
;   %idx - inreg i32 index; the checks read it from s2.
; Expected lowering: the dword selector (idx >> 1), the bit offset
; ((idx & 1) << 4) and the inverted shifted 0xffff mask (s_lshl + s_not) are
; computed with scalar instructions. The value is shifted with
; v_lshlrev_b32_sdwa (src1_sel:WORD_0) on GFX9/GFX8/GFX10; GFX7 instead ANDs it
; with 0xffff and uses a plain v_lshlrev_b32. v_cndmask then writes the merged
; dword into the half whose selector compare (against 0 or 1) is true.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
1201define amdgpu_ps void @insertelement_v_v4i16_v_s(<4 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
1202; GFX9-LABEL: insertelement_v_v4i16_v_s:
1203; GFX9:       ; %bb.0:
1204; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1205; GFX9-NEXT:    s_lshr_b32 s1, s2, 1
1206; GFX9-NEXT:    s_and_b32 s2, s2, 1
1207; GFX9-NEXT:    s_mov_b32 s0, 0xffff
1208; GFX9-NEXT:    s_lshl_b32 s2, s2, 4
1209; GFX9-NEXT:    s_lshl_b32 s0, s0, s2
1210; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
1211; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1212; GFX9-NEXT:    s_not_b32 s0, s0
1213; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1214; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1215; GFX9-NEXT:    s_waitcnt vmcnt(0)
1216; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1217; GFX9-NEXT:    v_and_or_b32 v2, v5, s0, v2
1218; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
1219; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1220; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1221; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
1222; GFX9-NEXT:    s_endpgm
1223;
1224; GFX8-LABEL: insertelement_v_v4i16_v_s:
1225; GFX8:       ; %bb.0:
1226; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1227; GFX8-NEXT:    s_lshr_b32 s1, s2, 1
1228; GFX8-NEXT:    s_and_b32 s2, s2, 1
1229; GFX8-NEXT:    s_mov_b32 s0, 0xffff
1230; GFX8-NEXT:    s_lshl_b32 s2, s2, 4
1231; GFX8-NEXT:    v_mov_b32_e32 v5, s2
1232; GFX8-NEXT:    s_lshl_b32 s0, s0, s2
1233; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
1234; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1235; GFX8-NEXT:    s_not_b32 s0, s0
1236; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1237; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1238; GFX8-NEXT:    s_waitcnt vmcnt(0)
1239; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1240; GFX8-NEXT:    v_and_b32_e32 v5, s0, v5
1241; GFX8-NEXT:    v_or_b32_e32 v2, v5, v2
1242; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
1243; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1244; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1245; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
1246; GFX8-NEXT:    s_endpgm
1247;
1248; GFX7-LABEL: insertelement_v_v4i16_v_s:
1249; GFX7:       ; %bb.0:
1250; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1251; GFX7-NEXT:    s_lshr_b32 s1, s2, 1
1252; GFX7-NEXT:    s_and_b32 s2, s2, 1
1253; GFX7-NEXT:    s_mov_b32 s0, 0xffff
1254; GFX7-NEXT:    s_lshl_b32 s2, s2, 4
1255; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
1256; GFX7-NEXT:    s_lshl_b32 s0, s0, s2
1257; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s1, 1
1258; GFX7-NEXT:    s_not_b32 s0, s0
1259; GFX7-NEXT:    v_lshlrev_b32_e32 v2, s2, v2
1260; GFX7-NEXT:    v_mov_b32_e32 v3, 0
1261; GFX7-NEXT:    v_mov_b32_e32 v4, 0
1262; GFX7-NEXT:    s_waitcnt vmcnt(0)
1263; GFX7-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
1264; GFX7-NEXT:    v_and_b32_e32 v5, s0, v5
1265; GFX7-NEXT:    v_or_b32_e32 v2, v5, v2
1266; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s1, 0
1267; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1268; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1269; GFX7-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
1270; GFX7-NEXT:    s_endpgm
1271;
1272; GFX10-LABEL: insertelement_v_v4i16_v_s:
1273; GFX10:       ; %bb.0:
1274; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1275; GFX10-NEXT:    s_and_b32 s0, s2, 1
1276; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
1277; GFX10-NEXT:    s_lshl_b32 s0, s0, 4
1278; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
1279; GFX10-NEXT:    s_mov_b32 s1, 0xffff
1280; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1281; GFX10-NEXT:    s_lshl_b32 s0, s1, s0
1282; GFX10-NEXT:    s_not_b32 s0, s0
1283; GFX10-NEXT:    s_waitcnt vmcnt(0)
1284; GFX10-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc_lo
1285; GFX10-NEXT:    v_and_or_b32 v4, v3, s0, v2
1286; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s2, 0
1287; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1288; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1289; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1290; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
1291; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1292; GFX10-NEXT:    s_endpgm
; IR under test: load the vector through %ptr, insert %val at %idx, and store
; the result through a null addrspace(1) pointer.
1293  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
1294  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1295  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
1296  ret void
1297}
1298
; Test: insertelement into <4 x i16> with a runtime index.
;   %ptr - addrspace(1) pointer that is not marked inreg; the checks load the
;          vector from v[0:1] (global_load on GFX9/GFX10, flat_load on
;          GFX8/GFX7) and wait on vmcnt before using it.
;   %val - i16 that is not marked inreg; the checks read it from v2.
;   %idx - i32 that is not marked inreg; the checks read it from v3.
; Expected lowering: idx >> 1 selects the dword, (idx & 1) << 4 is the bit
; offset. The value is shifted with v_lshlrev_b32_sdwa (src1_sel:WORD_0) on
; GFX9/GFX8/GFX10; GFX7 instead ANDs it with 0xffff and uses a plain
; v_lshlrev_b32. The shifted 0xffff mask is inverted with v_xor -1, the
; selected dword is masked and ORed with the shifted value, and v_cndmask
; writes the merged dword into the half whose selector compare is true.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
1299define amdgpu_ps void @insertelement_v_v4i16_v_v(<4 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
1300; GFX9-LABEL: insertelement_v_v4i16_v_v:
1301; GFX9:       ; %bb.0:
1302; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1303; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
1304; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
1305; GFX9-NEXT:    s_mov_b32 s0, 0xffff
1306; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
1307; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1308; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
1309; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
1310; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v3
1311; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1312; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
1313; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1314; GFX9-NEXT:    s_waitcnt vmcnt(0)
1315; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
1316; GFX9-NEXT:    v_and_or_b32 v2, v7, v3, v2
1317; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1318; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1319; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
1320; GFX9-NEXT:    s_endpgm
1321;
1322; GFX8-LABEL: insertelement_v_v4i16_v_v:
1323; GFX8:       ; %bb.0:
1324; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1325; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
1326; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
1327; GFX8-NEXT:    s_mov_b32 s0, 0xffff
1328; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
1329; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1330; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
1331; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
1332; GFX8-NEXT:    v_xor_b32_e32 v3, -1, v3
1333; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1334; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
1335; GFX8-NEXT:    v_mov_b32_e32 v5, 0
1336; GFX8-NEXT:    s_waitcnt vmcnt(0)
1337; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
1338; GFX8-NEXT:    v_and_b32_e32 v3, v7, v3
1339; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
1340; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1341; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1342; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
1343; GFX8-NEXT:    s_endpgm
1344;
1345; GFX7-LABEL: insertelement_v_v4i16_v_v:
1346; GFX7:       ; %bb.0:
1347; GFX7-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1348; GFX7-NEXT:    s_mov_b32 s0, 0xffff
1349; GFX7-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
1350; GFX7-NEXT:    v_and_b32_e32 v3, 1, v3
1351; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
1352; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
1353; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
1354; GFX7-NEXT:    v_lshl_b32_e32 v3, s0, v3
1355; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
1356; GFX7-NEXT:    v_xor_b32_e32 v3, -1, v3
1357; GFX7-NEXT:    v_mov_b32_e32 v4, 0
1358; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
1359; GFX7-NEXT:    v_mov_b32_e32 v5, 0
1360; GFX7-NEXT:    s_waitcnt vmcnt(0)
1361; GFX7-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
1362; GFX7-NEXT:    v_and_b32_e32 v3, v7, v3
1363; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
1364; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
1365; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1366; GFX7-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
1367; GFX7-NEXT:    s_endpgm
1368;
1369; GFX10-LABEL: insertelement_v_v4i16_v_v:
1370; GFX10:       ; %bb.0:
1371; GFX10-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
1372; GFX10-NEXT:    v_and_b32_e32 v4, 1, v3
1373; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
1374; GFX10-NEXT:    s_mov_b32 s0, 0xffff
1375; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1376; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
1377; GFX10-NEXT:    v_lshlrev_b32_e64 v5, v4, s0
1378; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1379; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v6
1380; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v5
1381; GFX10-NEXT:    s_waitcnt vmcnt(0)
1382; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
1383; GFX10-NEXT:    v_and_or_b32 v4, v4, v3, v2
1384; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1385; GFX10-NEXT:    v_mov_b32_e32 v3, 0
1386; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
1387; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
1388; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
1389; GFX10-NEXT:    s_endpgm
; IR under test: load the vector through %ptr, insert %val at %idx, and store
; the result through a null addrspace(1) pointer.
1390  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %ptr
1391  %insert = insertelement <4 x i16> %vec, i16 %val, i32 %idx
1392  store <4 x i16> %insert, <4 x i16> addrspace(1)* null
1393  ret void
1394}
1395
1396define amdgpu_ps void @insertelement_s_v8i16_s_s(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
1397; GFX9-LABEL: insertelement_s_v8i16_s_s:
1398; GFX9:       ; %bb.0:
1399; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1400; GFX9-NEXT:    s_lshr_b32 s6, s5, 1
1401; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
1402; GFX9-NEXT:    s_mov_b32 s8, 0xffff
1403; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1404; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1405; GFX9-NEXT:    s_cselect_b32 s7, s1, s0
1406; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
1407; GFX9-NEXT:    s_cselect_b32 s7, s2, s7
1408; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
1409; GFX9-NEXT:    s_cselect_b32 s7, s3, s7
1410; GFX9-NEXT:    s_and_b32 s5, s5, 1
1411; GFX9-NEXT:    s_lshl_b32 s5, s5, 4
1412; GFX9-NEXT:    s_and_b32 s4, s4, s8
1413; GFX9-NEXT:    s_lshl_b32 s4, s4, s5
1414; GFX9-NEXT:    s_lshl_b32 s5, s8, s5
1415; GFX9-NEXT:    s_andn2_b32 s5, s7, s5
1416; GFX9-NEXT:    s_or_b32 s4, s5, s4
1417; GFX9-NEXT:    s_cmp_eq_u32 s6, 0
1418; GFX9-NEXT:    s_cselect_b32 s0, s4, s0
1419; GFX9-NEXT:    s_cmp_eq_u32 s6, 1
1420; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
1421; GFX9-NEXT:    s_cmp_eq_u32 s6, 2
1422; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
1423; GFX9-NEXT:    s_cmp_eq_u32 s6, 3
1424; GFX9-NEXT:    s_cselect_b32 s3, s4, s3
1425; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1426; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1427; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1428; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1429; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1430; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1431; GFX9-NEXT:    s_endpgm
1432;
1433; GFX8-LABEL: insertelement_s_v8i16_s_s:
1434; GFX8:       ; %bb.0:
1435; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1436; GFX8-NEXT:    s_lshr_b32 s6, s5, 1
1437; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
1438; GFX8-NEXT:    s_mov_b32 s8, 0xffff
1439; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1440; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX8-NEXT:    s_cselect_b32 s7, s1, s0
1442; GFX8-NEXT:    s_cmp_eq_u32 s6, 2
1443; GFX8-NEXT:    s_cselect_b32 s7, s2, s7
1444; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
1445; GFX8-NEXT:    s_cselect_b32 s7, s3, s7
1446; GFX8-NEXT:    s_and_b32 s5, s5, 1
1447; GFX8-NEXT:    s_lshl_b32 s5, s5, 4
1448; GFX8-NEXT:    s_and_b32 s4, s4, s8
1449; GFX8-NEXT:    s_lshl_b32 s4, s4, s5
1450; GFX8-NEXT:    s_lshl_b32 s5, s8, s5
1451; GFX8-NEXT:    s_andn2_b32 s5, s7, s5
1452; GFX8-NEXT:    s_or_b32 s4, s5, s4
1453; GFX8-NEXT:    s_cmp_eq_u32 s6, 0
1454; GFX8-NEXT:    s_cselect_b32 s0, s4, s0
1455; GFX8-NEXT:    s_cmp_eq_u32 s6, 1
1456; GFX8-NEXT:    s_cselect_b32 s1, s4, s1
1457; GFX8-NEXT:    s_cmp_eq_u32 s6, 2
1458; GFX8-NEXT:    s_cselect_b32 s2, s4, s2
1459; GFX8-NEXT:    s_cmp_eq_u32 s6, 3
1460; GFX8-NEXT:    s_cselect_b32 s3, s4, s3
1461; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1462; GFX8-NEXT:    v_mov_b32_e32 v5, 0
1463; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1464; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1465; GFX8-NEXT:    v_mov_b32_e32 v3, s3
1466; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1467; GFX8-NEXT:    s_endpgm
1468;
1469; GFX7-LABEL: insertelement_s_v8i16_s_s:
1470; GFX7:       ; %bb.0:
1471; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1472; GFX7-NEXT:    s_lshr_b32 s6, s5, 1
1473; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
1474; GFX7-NEXT:    s_mov_b32 s8, 0xffff
1475; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1476; GFX7-NEXT:    s_cselect_b32 s7, s1, s0
1477; GFX7-NEXT:    s_cmp_eq_u32 s6, 2
1478; GFX7-NEXT:    s_cselect_b32 s7, s2, s7
1479; GFX7-NEXT:    s_cmp_eq_u32 s6, 3
1480; GFX7-NEXT:    s_cselect_b32 s7, s3, s7
1481; GFX7-NEXT:    s_and_b32 s5, s5, 1
1482; GFX7-NEXT:    s_lshl_b32 s5, s5, 4
1483; GFX7-NEXT:    s_and_b32 s4, s4, s8
1484; GFX7-NEXT:    s_lshl_b32 s4, s4, s5
1485; GFX7-NEXT:    s_lshl_b32 s5, s8, s5
1486; GFX7-NEXT:    s_andn2_b32 s5, s7, s5
1487; GFX7-NEXT:    s_or_b32 s4, s5, s4
1488; GFX7-NEXT:    s_cmp_eq_u32 s6, 0
1489; GFX7-NEXT:    s_cselect_b32 s0, s4, s0
1490; GFX7-NEXT:    s_cmp_eq_u32 s6, 1
1491; GFX7-NEXT:    s_cselect_b32 s1, s4, s1
1492; GFX7-NEXT:    s_cmp_eq_u32 s6, 2
1493; GFX7-NEXT:    s_cselect_b32 s2, s4, s2
1494; GFX7-NEXT:    s_cmp_eq_u32 s6, 3
1495; GFX7-NEXT:    s_cselect_b32 s3, s4, s3
1496; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1497; GFX7-NEXT:    s_mov_b64 s[4:5], 0
1498; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1499; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1500; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1501; GFX7-NEXT:    s_mov_b32 s6, -1
1502; GFX7-NEXT:    s_mov_b32 s7, 0xf000
1503; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1504; GFX7-NEXT:    s_endpgm
1505;
1506; GFX10-LABEL: insertelement_s_v8i16_s_s:
1507; GFX10:       ; %bb.0:
1508; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1509; GFX10-NEXT:    s_lshr_b32 s6, s5, 1
1510; GFX10-NEXT:    s_mov_b32 s8, 0xffff
1511; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
1512; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1513; GFX10-NEXT:    v_mov_b32_e32 v5, 0
1514; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1515; GFX10-NEXT:    s_cselect_b32 s7, s1, s0
1516; GFX10-NEXT:    s_cmp_eq_u32 s6, 2
1517; GFX10-NEXT:    s_cselect_b32 s7, s2, s7
1518; GFX10-NEXT:    s_cmp_eq_u32 s6, 3
1519; GFX10-NEXT:    s_cselect_b32 s7, s3, s7
1520; GFX10-NEXT:    s_and_b32 s5, s5, 1
1521; GFX10-NEXT:    s_and_b32 s4, s4, s8
1522; GFX10-NEXT:    s_lshl_b32 s5, s5, 4
1523; GFX10-NEXT:    s_lshl_b32 s8, s8, s5
1524; GFX10-NEXT:    s_lshl_b32 s4, s4, s5
1525; GFX10-NEXT:    s_andn2_b32 s5, s7, s8
1526; GFX10-NEXT:    s_or_b32 s4, s5, s4
1527; GFX10-NEXT:    s_cmp_eq_u32 s6, 0
1528; GFX10-NEXT:    s_cselect_b32 s0, s4, s0
1529; GFX10-NEXT:    s_cmp_eq_u32 s6, 1
1530; GFX10-NEXT:    s_cselect_b32 s1, s4, s1
1531; GFX10-NEXT:    s_cmp_eq_u32 s6, 2
1532; GFX10-NEXT:    s_cselect_b32 s2, s4, s2
1533; GFX10-NEXT:    s_cmp_eq_u32 s6, 3
1534; GFX10-NEXT:    s_cselect_b32 s3, s4, s3
1535; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1536; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1537; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1538; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1539; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1540; GFX10-NEXT:    s_endpgm
1541  %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
1542  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
1543  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
1544  ret void
1545}
1546
; insertelement_v_v8i16_s_s: dynamic-index insert of an i16 into a <8 x i16>
; loaded from a global (addrspace(1)) pointer.
;   %ptr - not inreg; the expected code below loads through v[0:1].
;   %val - inreg;     the expected code below reads it from s2.
;   %idx - inreg;     the expected code below reads it from s3.
; Shape of the expected code on all four targets (gfx900, fiji, hawaii and
; gfx1010, per the run lines at the top of the file):
;   * dword selector = idx >> 1, bit offset = (idx & 1) << 4, both computed
;     with scalar instructions;
;   * 0xffff and the masked value are shifted left by the bit offset and the
;     shifted mask is inverted with s_not_b32;
;   * the affected dword is picked from the four loaded dwords with a
;     v_cmp_eq_u32 / v_cndmask_b32 chain;
;   * the old halfword is cleared and the new one merged in with a single
;     v_and_or_b32 (gfx900, gfx1010) or a v_and_b32 + v_or_b32 pair (fiji,
;     hawaii);
;   * the merged dword is selected back into each of the four dwords and the
;     vector is stored with a dwordx4 store to address 0.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1547define amdgpu_ps void @insertelement_v_v8i16_s_s(<8 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
1548; GFX9-LABEL: insertelement_v_v8i16_s_s:
1549; GFX9:       ; %bb.0:
1550; GFX9-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
1551; GFX9-NEXT:    s_and_b32 s1, s3, 1
1552; GFX9-NEXT:    s_mov_b32 s0, 0xffff
1553; GFX9-NEXT:    s_lshr_b32 s4, s3, 1
1554; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
1555; GFX9-NEXT:    s_and_b32 s2, s2, s0
1556; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
1557; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
1558; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
1559; GFX9-NEXT:    s_not_b32 s5, s0
1560; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
1561; GFX9-NEXT:    v_mov_b32_e32 v6, s2
1562; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
1563; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1564; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1565; GFX9-NEXT:    s_waitcnt vmcnt(0)
1566; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
1567; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v2, s[0:1]
1568; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[2:3]
1569; GFX9-NEXT:    v_and_or_b32 v6, v7, s5, v6
1570; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
1571; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
1572; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1573; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1574; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
1575; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1576; GFX9-NEXT:    s_endpgm
1577;
1578; GFX8-LABEL: insertelement_v_v8i16_s_s:
1579; GFX8:       ; %bb.0:
1580; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
1581; GFX8-NEXT:    s_and_b32 s1, s3, 1
1582; GFX8-NEXT:    s_mov_b32 s0, 0xffff
1583; GFX8-NEXT:    s_lshr_b32 s4, s3, 1
1584; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
1585; GFX8-NEXT:    s_and_b32 s2, s2, s0
1586; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
1587; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
1588; GFX8-NEXT:    s_lshl_b32 s5, s2, s1
1589; GFX8-NEXT:    s_not_b32 s6, s0
1590; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
1591; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
1592; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1593; GFX8-NEXT:    v_mov_b32_e32 v5, 0
1594; GFX8-NEXT:    s_waitcnt vmcnt(0)
1595; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
1596; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
1597; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v3, s[2:3]
1598; GFX8-NEXT:    v_and_b32_e32 v6, s6, v6
1599; GFX8-NEXT:    v_or_b32_e32 v6, s5, v6
1600; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
1601; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
1602; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1603; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1604; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
1605; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1606; GFX8-NEXT:    s_endpgm
1607;
1608; GFX7-LABEL: insertelement_v_v8i16_s_s:
1609; GFX7:       ; %bb.0:
1610; GFX7-NEXT:    s_mov_b32 s10, 0
1611; GFX7-NEXT:    s_mov_b32 s11, 0xf000
1612; GFX7-NEXT:    s_mov_b64 s[8:9], 0
1613; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
1614; GFX7-NEXT:    s_and_b32 s1, s3, 1
1615; GFX7-NEXT:    s_mov_b32 s0, 0xffff
1616; GFX7-NEXT:    s_lshr_b32 s4, s3, 1
1617; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
1618; GFX7-NEXT:    s_and_b32 s2, s2, s0
1619; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
1620; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
1621; GFX7-NEXT:    s_lshl_b32 s5, s2, s1
1622; GFX7-NEXT:    s_not_b32 s6, s0
1623; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
1624; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
1625; GFX7-NEXT:    s_mov_b32 s10, -1
1626; GFX7-NEXT:    s_waitcnt vmcnt(0)
1627; GFX7-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
1628; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s[0:1]
1629; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s[2:3]
1630; GFX7-NEXT:    v_and_b32_e32 v4, s6, v4
1631; GFX7-NEXT:    v_or_b32_e32 v4, s5, v4
1632; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
1633; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
1634; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1635; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1636; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
1637; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
1638; GFX7-NEXT:    s_endpgm
1639;
1640; GFX10-LABEL: insertelement_v_v8i16_s_s:
1641; GFX10:       ; %bb.0:
1642; GFX10-NEXT:    global_load_dwordx4 v[0:3], v[0:1], off
1643; GFX10-NEXT:    s_lshr_b32 s4, s3, 1
1644; GFX10-NEXT:    s_and_b32 s1, s3, 1
1645; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 1
1646; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s4, 2
1647; GFX10-NEXT:    s_lshl_b32 s3, s1, 4
1648; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s4, 3
1649; GFX10-NEXT:    s_mov_b32 s5, 0xffff
1650; GFX10-NEXT:    s_and_b32 s2, s2, s5
1651; GFX10-NEXT:    s_lshl_b32 s5, s5, s3
1652; GFX10-NEXT:    s_lshl_b32 s2, s2, s3
1653; GFX10-NEXT:    s_not_b32 s3, s5
1654; GFX10-NEXT:    s_waitcnt vmcnt(0)
1655; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc_lo
1656; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v2, s0
1657; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v3, s1
1658; GFX10-NEXT:    v_and_or_b32 v6, v4, s3, s2
1659; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s4, 0
1660; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1661; GFX10-NEXT:    v_mov_b32_e32 v5, 0
1662; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
1663; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s2
1664; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
1665; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s1
1666; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1667; GFX10-NEXT:    s_endpgm
  ; Load all eight halfwords from the global pointer.
1668  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
  ; Replace the element selected by the run-time index.
1669  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
  ; Write the updated vector through a null addrspace(1) pointer.
1670  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
1671  ret void
1672}
1673
; insertelement_s_v8i16_v_s: dynamic-index insert of an i16 into a <8 x i16>
; loaded from a constant (addrspace(4)) pointer.
;   %ptr - inreg;     the expected code below loads through s[2:3] with
;                     s_load_dwordx4.
;   %val - not inreg; the expected code below reads it from v0.
;   %idx - inreg;     the expected code below reads it from s4.
; Shape of the expected code on all four targets (gfx900, fiji, hawaii and
; gfx1010, per the run lines at the top of the file):
;   * dword selector = idx >> 1 and bit offset = (idx & 1) << 4 are scalar;
;   * the affected dword is picked with an s_cmp_eq_u32 / s_cselect_b32 chain
;     and its old halfword is cleared with s_andn2_b32 against the shifted
;     0xffff mask;
;   * the value is shifted and merged on the vector ALU: v_lshl_or_b32 on
;     gfx900 and gfx1010, v_lshlrev_b32_sdwa + v_or_b32 on fiji, and
;     v_and_b32 + v_lshlrev_b32 + v_or_b32 on hawaii;
;   * the merged dword is selected back into each of the four dwords with
;     v_cmp_eq_u32 / v_cndmask_b32 and stored with a dwordx4 store to
;     address 0.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1674define amdgpu_ps void @insertelement_s_v8i16_v_s(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
1675; GFX9-LABEL: insertelement_s_v8i16_v_s:
1676; GFX9:       ; %bb.0:
1677; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1678; GFX9-NEXT:    s_lshr_b32 s5, s4, 1
1679; GFX9-NEXT:    s_cmp_eq_u32 s5, 1
1680; GFX9-NEXT:    s_mov_b32 s7, 0xffff
1681; GFX9-NEXT:    v_and_b32_e32 v0, s7, v0
1682; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1683; GFX9-NEXT:    s_cselect_b32 s6, s1, s0
1684; GFX9-NEXT:    s_cmp_eq_u32 s5, 2
1685; GFX9-NEXT:    s_cselect_b32 s6, s2, s6
1686; GFX9-NEXT:    s_cmp_eq_u32 s5, 3
1687; GFX9-NEXT:    s_cselect_b32 s6, s3, s6
1688; GFX9-NEXT:    s_and_b32 s4, s4, 1
1689; GFX9-NEXT:    s_lshl_b32 s4, s4, 4
1690; GFX9-NEXT:    s_lshl_b32 s7, s7, s4
1691; GFX9-NEXT:    s_andn2_b32 s6, s6, s7
1692; GFX9-NEXT:    v_mov_b32_e32 v1, s6
1693; GFX9-NEXT:    v_lshl_or_b32 v6, v0, s4, v1
1694; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1695; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
1696; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1697; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1698; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
1699; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1700; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1701; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
1702; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1703; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1704; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1705; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
1706; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1707; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1708; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1709; GFX9-NEXT:    s_endpgm
1710;
1711; GFX8-LABEL: insertelement_s_v8i16_v_s:
1712; GFX8:       ; %bb.0:
1713; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1714; GFX8-NEXT:    s_lshr_b32 s5, s4, 1
1715; GFX8-NEXT:    s_cmp_eq_u32 s5, 1
1716; GFX8-NEXT:    s_mov_b32 s7, 0xffff
1717; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
1718; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1719; GFX8-NEXT:    s_cselect_b32 s6, s1, s0
1720; GFX8-NEXT:    s_cmp_eq_u32 s5, 2
1721; GFX8-NEXT:    s_cselect_b32 s6, s2, s6
1722; GFX8-NEXT:    s_cmp_eq_u32 s5, 3
1723; GFX8-NEXT:    s_cselect_b32 s6, s3, s6
1724; GFX8-NEXT:    s_and_b32 s4, s4, 1
1725; GFX8-NEXT:    s_lshl_b32 s4, s4, 4
1726; GFX8-NEXT:    v_mov_b32_e32 v1, s4
1727; GFX8-NEXT:    s_lshl_b32 s4, s7, s4
1728; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1729; GFX8-NEXT:    s_andn2_b32 s4, s6, s4
1730; GFX8-NEXT:    v_or_b32_e32 v6, s4, v0
1731; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1732; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1733; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1734; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
1735; GFX8-NEXT:    v_mov_b32_e32 v2, s2
1736; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1737; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
1738; GFX8-NEXT:    v_mov_b32_e32 v3, s3
1739; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1740; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1741; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
1742; GFX8-NEXT:    v_mov_b32_e32 v5, 0
1743; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1744; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1745; GFX8-NEXT:    s_endpgm
1746;
1747; GFX7-LABEL: insertelement_s_v8i16_v_s:
1748; GFX7:       ; %bb.0:
1749; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1750; GFX7-NEXT:    s_lshr_b32 s5, s4, 1
1751; GFX7-NEXT:    s_cmp_eq_u32 s5, 1
1752; GFX7-NEXT:    s_mov_b32 s7, 0xffff
1753; GFX7-NEXT:    v_and_b32_e32 v0, s7, v0
1754; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX7-NEXT:    s_cselect_b32 s6, s1, s0
1756; GFX7-NEXT:    s_cmp_eq_u32 s5, 2
1757; GFX7-NEXT:    s_cselect_b32 s6, s2, s6
1758; GFX7-NEXT:    s_cmp_eq_u32 s5, 3
1759; GFX7-NEXT:    s_cselect_b32 s6, s3, s6
1760; GFX7-NEXT:    s_and_b32 s4, s4, 1
1761; GFX7-NEXT:    s_lshl_b32 s4, s4, 4
1762; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s4, v0
1763; GFX7-NEXT:    s_lshl_b32 s4, s7, s4
1764; GFX7-NEXT:    s_andn2_b32 s4, s6, s4
1765; GFX7-NEXT:    v_or_b32_e32 v4, s4, v0
1766; GFX7-NEXT:    v_mov_b32_e32 v0, s0
1767; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 0
1768; GFX7-NEXT:    v_mov_b32_e32 v1, s1
1769; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1770; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
1771; GFX7-NEXT:    v_mov_b32_e32 v2, s2
1772; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1773; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 2
1774; GFX7-NEXT:    v_mov_b32_e32 v3, s3
1775; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1776; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 3
1777; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1778; GFX7-NEXT:    s_mov_b64 s[0:1], 0
1779; GFX7-NEXT:    s_mov_b32 s2, -1
1780; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1781; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1782; GFX7-NEXT:    s_endpgm
1783;
1784; GFX10-LABEL: insertelement_s_v8i16_v_s:
1785; GFX10:       ; %bb.0:
1786; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[2:3], 0x0
1787; GFX10-NEXT:    s_lshr_b32 s5, s4, 1
1788; GFX10-NEXT:    s_mov_b32 s7, 0xffff
1789; GFX10-NEXT:    s_cmp_eq_u32 s5, 1
1790; GFX10-NEXT:    v_and_b32_e32 v4, s7, v0
1791; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 0
1792; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1793; GFX10-NEXT:    s_cselect_b32 s6, s1, s0
1794; GFX10-NEXT:    s_cmp_eq_u32 s5, 2
1795; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1796; GFX10-NEXT:    s_cselect_b32 s6, s2, s6
1797; GFX10-NEXT:    s_cmp_eq_u32 s5, 3
1798; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1799; GFX10-NEXT:    s_cselect_b32 s6, s3, s6
1800; GFX10-NEXT:    s_and_b32 s4, s4, 1
1801; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1802; GFX10-NEXT:    s_lshl_b32 s4, s4, 4
1803; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1804; GFX10-NEXT:    s_lshl_b32 s7, s7, s4
1805; GFX10-NEXT:    s_andn2_b32 s6, s6, s7
1806; GFX10-NEXT:    v_lshl_or_b32 v6, v4, s4, s6
1807; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1808; GFX10-NEXT:    v_mov_b32_e32 v5, 0
1809; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc_lo
1810; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 1
1811; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
1812; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 2
1813; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
1814; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s5, 3
1815; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc_lo
1816; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1817; GFX10-NEXT:    s_endpgm
  ; Load all eight halfwords from the constant-address-space pointer.
1818  %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
  ; Replace the element selected by the run-time index.
1819  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
  ; Write the updated vector through a null addrspace(1) pointer.
1820  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
1821  ret void
1822}
1823
; insertelement_s_v8i16_s_v: dynamic-index insert of an i16 into a <8 x i16>
; loaded from a constant (addrspace(4)) pointer.
;   %ptr - inreg;     the expected code below loads through s[2:3] with
;                     s_load_dwordx4.
;   %val - inreg;     the expected code below reads it from s4.
;   %idx - not inreg; the expected code below reads it from v0.
; Shape of the expected code on all four targets (gfx900, fiji, hawaii and
; gfx1010, per the run lines at the top of the file):
;   * the index is split on the vector ALU: v_lshrrev_b32 by 1 gives the
;     dword selector, v_and_b32 with 1 followed by v_lshlrev_b32 by 4 gives
;     the bit offset;
;   * the value is masked with 0xffff by s_and_b32, then the value and the
;     0xffff mask are shifted by the bit offset on the vector ALU
;     (v_lshl_b32_e32 on hawaii, v_lshlrev_b32_e64 on the other targets) and
;     the mask is inverted with v_xor_b32 against -1;
;   * the affected dword is picked with a v_cmp_eq_u32 / v_cndmask_b32 chain;
;   * the merge is a single v_and_or_b32 on gfx900 and gfx1010 and a
;     v_and_b32 + v_or_b32 pair on fiji and hawaii;
;   * the merged dword is selected back into each of the four dwords and the
;     vector is stored with a dwordx4 store to address 0.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1824define amdgpu_ps void @insertelement_s_v8i16_s_v(<8 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
1825; GFX9-LABEL: insertelement_s_v8i16_s_v:
1826; GFX9:       ; %bb.0:
1827; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
1828; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
1829; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
1830; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
1831; GFX9-NEXT:    s_mov_b32 s5, 0xffff
1832; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1833; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1834; GFX9-NEXT:    v_mov_b32_e32 v2, s9
1835; GFX9-NEXT:    v_mov_b32_e32 v3, s10
1836; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1837; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
1838; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
1839; GFX9-NEXT:    s_and_b32 s4, s4, s5
1840; GFX9-NEXT:    v_mov_b32_e32 v5, s11
1841; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
1842; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
1843; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
1844; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
1845; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
1846; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
1847; GFX9-NEXT:    v_and_or_b32 v6, v1, v0, v2
1848; GFX9-NEXT:    v_mov_b32_e32 v0, s8
1849; GFX9-NEXT:    v_mov_b32_e32 v1, s9
1850; GFX9-NEXT:    v_mov_b32_e32 v2, s10
1851; GFX9-NEXT:    v_mov_b32_e32 v3, s11
1852; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
1853; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1854; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
1855; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1856; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1857; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1858; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
1859; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1860; GFX9-NEXT:    s_endpgm
1861;
1862; GFX8-LABEL: insertelement_s_v8i16_s_v:
1863; GFX8:       ; %bb.0:
1864; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
1865; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
1866; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
1867; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
1868; GFX8-NEXT:    s_mov_b32 s5, 0xffff
1869; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1870; GFX8-NEXT:    v_mov_b32_e32 v1, s8
1871; GFX8-NEXT:    v_mov_b32_e32 v2, s9
1872; GFX8-NEXT:    v_mov_b32_e32 v3, s10
1873; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1874; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
1875; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
1876; GFX8-NEXT:    s_and_b32 s4, s4, s5
1877; GFX8-NEXT:    v_mov_b32_e32 v5, s11
1878; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
1879; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
1880; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
1881; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
1882; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
1883; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
1884; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
1885; GFX8-NEXT:    v_or_b32_e32 v6, v0, v2
1886; GFX8-NEXT:    v_mov_b32_e32 v0, s8
1887; GFX8-NEXT:    v_mov_b32_e32 v1, s9
1888; GFX8-NEXT:    v_mov_b32_e32 v2, s10
1889; GFX8-NEXT:    v_mov_b32_e32 v3, s11
1890; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
1891; GFX8-NEXT:    v_mov_b32_e32 v4, 0
1892; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
1893; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1894; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1895; GFX8-NEXT:    v_mov_b32_e32 v5, 0
1896; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
1897; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1898; GFX8-NEXT:    s_endpgm
1899;
1900; GFX7-LABEL: insertelement_s_v8i16_s_v:
1901; GFX7:       ; %bb.0:
1902; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
1903; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
1904; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
1905; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
1906; GFX7-NEXT:    s_mov_b32 s5, 0xffff
1907; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
1908; GFX7-NEXT:    v_mov_b32_e32 v1, s8
1909; GFX7-NEXT:    v_mov_b32_e32 v2, s9
1910; GFX7-NEXT:    v_mov_b32_e32 v3, s10
1911; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
1912; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
1913; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
1914; GFX7-NEXT:    s_and_b32 s4, s4, s5
1915; GFX7-NEXT:    v_mov_b32_e32 v5, s11
1916; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
1917; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
1918; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
1919; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
1920; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
1921; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
1922; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
1923; GFX7-NEXT:    v_or_b32_e32 v5, v0, v2
1924; GFX7-NEXT:    v_mov_b32_e32 v0, s8
1925; GFX7-NEXT:    v_mov_b32_e32 v1, s9
1926; GFX7-NEXT:    v_mov_b32_e32 v2, s10
1927; GFX7-NEXT:    v_mov_b32_e32 v3, s11
1928; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
1929; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
1930; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1931; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1932; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
1933; GFX7-NEXT:    s_mov_b64 s[0:1], 0
1934; GFX7-NEXT:    s_mov_b32 s2, -1
1935; GFX7-NEXT:    s_mov_b32 s3, 0xf000
1936; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1937; GFX7-NEXT:    s_endpgm
1938;
1939; GFX10-LABEL: insertelement_s_v8i16_s_v:
1940; GFX10:       ; %bb.0:
1941; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
1942; GFX10-NEXT:    v_and_b32_e32 v1, 1, v0
1943; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v0
1944; GFX10-NEXT:    s_mov_b32 s0, 0xffff
1945; GFX10-NEXT:    s_and_b32 s1, s4, s0
1946; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1947; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
1948; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
1949; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v1, s0
1950; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
1951; GFX10-NEXT:    v_lshlrev_b32_e64 v4, v1, s1
1952; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
1953; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v2
1954; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1955; GFX10-NEXT:    v_mov_b32_e32 v0, s9
1956; GFX10-NEXT:    v_cndmask_b32_e32 v0, s8, v0, vcc_lo
1957; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, s10, s0
1958; GFX10-NEXT:    v_cndmask_b32_e64 v7, v0, s11, s1
1959; GFX10-NEXT:    v_mov_b32_e32 v0, s8
1960; GFX10-NEXT:    v_mov_b32_e32 v1, s9
1961; GFX10-NEXT:    v_mov_b32_e32 v2, s10
1962; GFX10-NEXT:    v_mov_b32_e32 v3, s11
1963; GFX10-NEXT:    v_and_or_b32 v7, v7, v5, v4
1964; GFX10-NEXT:    v_mov_b32_e32 v4, 0
1965; GFX10-NEXT:    v_mov_b32_e32 v5, 0
1966; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
1967; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
1968; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
1969; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
1970; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
1971; GFX10-NEXT:    s_endpgm
  ; Load all eight halfwords from the constant-address-space pointer.
1972  %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
  ; Replace the element selected by the run-time index.
1973  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
  ; Write the updated vector through a null addrspace(1) pointer.
1974  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
1975  ret void
1976}
1977
; insertelement_s_v8i16_v_v: dynamic-index insert of an i16 into a <8 x i16>
; loaded from a constant (addrspace(4)) pointer.
;   %ptr - inreg;     the expected code below loads through s[2:3] with
;                     s_load_dwordx4.
;   %val - not inreg; the expected code below reads it from v0.
;   %idx - not inreg; the expected code below reads it from v1.
; Shape of the expected code on all four targets (gfx900, fiji, hawaii and
; gfx1010, per the run lines at the top of the file):
;   * the index is split on the vector ALU: v_lshrrev_b32 by 1 gives the
;     dword selector, v_and_b32 with 1 followed by v_lshlrev_b32 by 4 gives
;     the bit offset;
;   * the value is shifted into place with v_lshlrev_b32_sdwa using
;     src1_sel:WORD_0 on gfx900, fiji and gfx1010; hawaii instead masks it
;     with 0xffff (v_and_b32) and then uses a plain v_lshlrev_b32;
;   * the 0xffff mask is shifted by the bit offset and inverted with
;     v_xor_b32 against -1;
;   * the affected dword is picked with a v_cmp_eq_u32 / v_cndmask_b32 chain;
;   * the merge is a single v_and_or_b32 on gfx900 and gfx1010 and a
;     v_and_b32 + v_or_b32 pair on fiji and hawaii;
;   * the merged dword is selected back into each of the four dwords and the
;     vector is stored with a dwordx4 store to address 0.
; The assertions are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1978define amdgpu_ps void @insertelement_s_v8i16_v_v(<8 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
1979; GFX9-LABEL: insertelement_s_v8i16_v_v:
1980; GFX9:       ; %bb.0:
1981; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
1982; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
1983; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
1984; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
1985; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
1986; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1987; GFX9-NEXT:    v_mov_b32_e32 v2, s4
1988; GFX9-NEXT:    v_mov_b32_e32 v3, s5
1989; GFX9-NEXT:    v_mov_b32_e32 v5, s6
1990; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
1991; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1992; GFX9-NEXT:    s_mov_b32 s8, 0xffff
1993; GFX9-NEXT:    v_mov_b32_e32 v6, s7
1994; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1995; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
1996; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1997; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
1998; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
1999; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
2000; GFX9-NEXT:    v_and_or_b32 v6, v2, v1, v0
2001; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2002; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2003; GFX9-NEXT:    v_mov_b32_e32 v2, s6
2004; GFX9-NEXT:    v_mov_b32_e32 v3, s7
2005; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
2006; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2007; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
2008; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2009; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2010; GFX9-NEXT:    v_mov_b32_e32 v5, 0
2011; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
2012; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
2013; GFX9-NEXT:    s_endpgm
2014;
2015; GFX8-LABEL: insertelement_s_v8i16_v_v:
2016; GFX8:       ; %bb.0:
2017; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
2018; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
2019; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
2020; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
2021; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
2022; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2023; GFX8-NEXT:    v_mov_b32_e32 v2, s4
2024; GFX8-NEXT:    v_mov_b32_e32 v3, s5
2025; GFX8-NEXT:    v_mov_b32_e32 v5, s6
2026; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2027; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2028; GFX8-NEXT:    s_mov_b32 s8, 0xffff
2029; GFX8-NEXT:    v_mov_b32_e32 v6, s7
2030; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2031; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
2032; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2033; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
2034; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
2035; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
2036; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
2037; GFX8-NEXT:    v_or_b32_e32 v6, v1, v0
2038; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2039; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2040; GFX8-NEXT:    v_mov_b32_e32 v2, s6
2041; GFX8-NEXT:    v_mov_b32_e32 v3, s7
2042; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
2043; GFX8-NEXT:    v_mov_b32_e32 v4, 0
2044; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
2045; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
2046; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
2047; GFX8-NEXT:    v_mov_b32_e32 v5, 0
2048; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[2:3]
2049; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2050; GFX8-NEXT:    s_endpgm
2051;
2052; GFX7-LABEL: insertelement_s_v8i16_v_v:
2053; GFX7:       ; %bb.0:
2054; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
2055; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
2056; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
2057; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
2058; GFX7-NEXT:    s_mov_b32 s8, 0xffff
2059; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2060; GFX7-NEXT:    v_mov_b32_e32 v2, s4
2061; GFX7-NEXT:    v_mov_b32_e32 v3, s5
2062; GFX7-NEXT:    v_mov_b32_e32 v5, s6
2063; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
2064; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
2065; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2066; GFX7-NEXT:    v_and_b32_e32 v0, s8, v0
2067; GFX7-NEXT:    v_mov_b32_e32 v6, s7
2068; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2069; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
2070; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
2071; GFX7-NEXT:    v_lshl_b32_e32 v1, s8, v1
2072; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
2073; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
2074; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
2075; GFX7-NEXT:    v_or_b32_e32 v5, v1, v0
2076; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2077; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2078; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2079; GFX7-NEXT:    v_mov_b32_e32 v3, s7
2080; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
2081; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[4:5]
2082; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
2083; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
2084; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
2085; GFX7-NEXT:    s_mov_b64 s[0:1], 0
2086; GFX7-NEXT:    s_mov_b32 s2, -1
2087; GFX7-NEXT:    s_mov_b32 s3, 0xf000
2088; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2089; GFX7-NEXT:    s_endpgm
2090;
2091; GFX10-LABEL: insertelement_s_v8i16_v_v:
2092; GFX10:       ; %bb.0:
2093; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
2094; GFX10-NEXT:    v_and_b32_e32 v2, 1, v1
2095; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 1, v1
2096; GFX10-NEXT:    s_mov_b32 s0, 0xffff
2097; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
2098; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
2099; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v6
2100; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v6
2101; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v2, s0
2102; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v6
2103; GFX10-NEXT:    v_lshlrev_b32_sdwa v4, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2104; GFX10-NEXT:    v_xor_b32_e32 v5, -1, v3
2105; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2106; GFX10-NEXT:    v_mov_b32_e32 v1, s5
2107; GFX10-NEXT:    v_cndmask_b32_e32 v1, s4, v1, vcc_lo
2108; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s6, s0
2109; GFX10-NEXT:    v_cndmask_b32_e64 v7, v1, s7, s1
2110; GFX10-NEXT:    v_mov_b32_e32 v0, s4
2111; GFX10-NEXT:    v_mov_b32_e32 v1, s5
2112; GFX10-NEXT:    v_mov_b32_e32 v2, s6
2113; GFX10-NEXT:    v_mov_b32_e32 v3, s7
2114; GFX10-NEXT:    v_and_or_b32 v7, v7, v5, v4
2115; GFX10-NEXT:    v_mov_b32_e32 v4, 0
2116; GFX10-NEXT:    v_mov_b32_e32 v5, 0
2117; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s2
2118; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
2119; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s0
2120; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
2121; GFX10-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
2122; GFX10-NEXT:    s_endpgm
  ; Load all eight halfwords from the constant-address-space pointer.
2123  %vec = load <8 x i16>, <8 x i16> addrspace(4)* %ptr
  ; Replace the element selected by the run-time index.
2124  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
  ; Write the updated vector through a null addrspace(1) pointer.
2125  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
2126  ret void
2127}
2128
2129define amdgpu_ps void @insertelement_v_v8i16_s_v(<8 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
2130; GFX9-LABEL: insertelement_v_v8i16_s_v:
2131; GFX9:       ; %bb.0:
2132; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
2133; GFX9-NEXT:    s_mov_b32 s0, 0xffff
2134; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
2135; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
2136; GFX9-NEXT:    s_and_b32 s1, s2, s0
2137; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2138; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
2139; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
2140; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
2141; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
2142; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
2143; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
2144; GFX9-NEXT:    v_mov_b32_e32 v7, 0
2145; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
2146; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2147; GFX9-NEXT:    s_waitcnt vmcnt(0)
2148; GFX9-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
2149; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
2150; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
2151; GFX9-NEXT:    v_and_or_b32 v9, v9, v1, v2
2152; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
2153; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
2154; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
2155; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
2156; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2157; GFX9-NEXT:    s_endpgm
2158;
2159; GFX8-LABEL: insertelement_v_v8i16_s_v:
2160; GFX8:       ; %bb.0:
2161; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
2162; GFX8-NEXT:    s_mov_b32 s0, 0xffff
2163; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
2164; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
2165; GFX8-NEXT:    s_and_b32 s1, s2, s0
2166; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2167; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
2168; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
2169; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
2170; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
2171; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
2172; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
2173; GFX8-NEXT:    v_mov_b32_e32 v7, 0
2174; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
2175; GFX8-NEXT:    v_mov_b32_e32 v8, 0
2176; GFX8-NEXT:    s_waitcnt vmcnt(0)
2177; GFX8-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
2178; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
2179; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
2180; GFX8-NEXT:    v_and_b32_e32 v1, v9, v1
2181; GFX8-NEXT:    v_or_b32_e32 v9, v1, v2
2182; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
2183; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
2184; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
2185; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
2186; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
2187; GFX8-NEXT:    s_endpgm
2188;
2189; GFX7-LABEL: insertelement_v_v8i16_s_v:
2190; GFX7:       ; %bb.0:
2191; GFX7-NEXT:    s_mov_b32 s10, 0
2192; GFX7-NEXT:    s_mov_b32 s11, 0xf000
2193; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2194; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
2195; GFX7-NEXT:    s_mov_b32 s0, 0xffff
2196; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
2197; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
2198; GFX7-NEXT:    s_and_b32 s1, s2, s0
2199; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2200; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
2201; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
2202; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
2203; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
2204; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
2205; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
2206; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
2207; GFX7-NEXT:    s_mov_b32 s10, -1
2208; GFX7-NEXT:    s_waitcnt vmcnt(0)
2209; GFX7-NEXT:    v_cndmask_b32_e32 v7, v3, v4, vcc
2210; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v5, s[0:1]
2211; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v6, s[2:3]
2212; GFX7-NEXT:    v_and_b32_e32 v1, v7, v1
2213; GFX7-NEXT:    v_or_b32_e32 v7, v1, v2
2214; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
2215; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
2216; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
2217; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
2218; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2219; GFX7-NEXT:    s_endpgm
2220;
2221; GFX10-LABEL: insertelement_v_v8i16_s_v:
2222; GFX10:       ; %bb.0:
2223; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
2224; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
2225; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
2226; GFX10-NEXT:    s_mov_b32 s0, 0xffff
2227; GFX10-NEXT:    s_and_b32 s1, s2, s0
2228; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2229; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
2230; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
2231; GFX10-NEXT:    v_lshlrev_b32_e64 v7, v0, s0
2232; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
2233; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
2234; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v1
2235; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v7
2236; GFX10-NEXT:    s_waitcnt vmcnt(0)
2237; GFX10-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc_lo
2238; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s0
2239; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s1
2240; GFX10-NEXT:    v_and_or_b32 v9, v2, v7, v0
2241; GFX10-NEXT:    v_mov_b32_e32 v7, 0
2242; GFX10-NEXT:    v_mov_b32_e32 v8, 0
2243; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
2244; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
2245; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
2246; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
2247; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2248; GFX10-NEXT:    s_endpgm
2249  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
2250  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
2251  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
2252  ret void
2253}
2254
; Dynamic insertelement into an <8 x i16> loaded from a global (addrspace 1)
; pointer. %val carries no inreg attribute while %idx is marked inreg; in the
; expected output below the value is read from v2 and the index from s2.
;
; Shape of the expected code on every checked target:
;   * idx >> 1 picks which of the four loaded dwords holds the element, and
;     (idx & 1) << 4 is the bit offset of the 16-bit lane inside that dword.
;   * A chain of v_cndmask selects the addressed dword, the 0xffff mask shifted
;     by the bit offset is inverted and used to clear the old lane, and the
;     shifted new value is OR-ed in.
;   * A second set of v_cndmask writes the merged dword back into the matching
;     position while the other three dwords pass through unchanged.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
2255define amdgpu_ps void @insertelement_v_v8i16_v_s(<8 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
2256; GFX9-LABEL: insertelement_v_v8i16_v_s:
2257; GFX9:       ; %bb.0:
2258; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
2259; GFX9-NEXT:    s_and_b32 s1, s2, 1
2260; GFX9-NEXT:    s_mov_b32 s0, 0xffff
2261; GFX9-NEXT:    s_lshr_b32 s4, s2, 1
2262; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
2263; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
2264; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
2265; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2266; GFX9-NEXT:    s_not_b32 s5, s0
2267; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
2268; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
2269; GFX9-NEXT:    v_mov_b32_e32 v7, 0
2270; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2271; GFX9-NEXT:    s_waitcnt vmcnt(0)
2272; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2273; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
2274; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
2275; GFX9-NEXT:    v_and_or_b32 v9, v1, s5, v0
2276; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
2277; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
2278; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
2279; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
2280; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
2281; GFX9-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2282; GFX9-NEXT:    s_endpgm
2283;
2284; GFX8-LABEL: insertelement_v_v8i16_v_s:
2285; GFX8:       ; %bb.0:
2286; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
2287; GFX8-NEXT:    s_and_b32 s1, s2, 1
2288; GFX8-NEXT:    s_mov_b32 s0, 0xffff
2289; GFX8-NEXT:    s_lshr_b32 s4, s2, 1
2290; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
2291; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
2292; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
2293; GFX8-NEXT:    v_mov_b32_e32 v0, s1
2294; GFX8-NEXT:    s_not_b32 s5, s0
2295; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
2296; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
2297; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2298; GFX8-NEXT:    v_mov_b32_e32 v7, 0
2299; GFX8-NEXT:    v_mov_b32_e32 v8, 0
2300; GFX8-NEXT:    s_waitcnt vmcnt(0)
2301; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2302; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
2303; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
2304; GFX8-NEXT:    v_and_b32_e32 v1, s5, v1
2305; GFX8-NEXT:    v_or_b32_e32 v9, v1, v0
2306; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
2307; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
2308; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
2309; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
2310; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s[2:3]
2311; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
2312; GFX8-NEXT:    s_endpgm
2313;
2314; GFX7-LABEL: insertelement_v_v8i16_v_s:
2315; GFX7:       ; %bb.0:
2316; GFX7-NEXT:    s_mov_b32 s10, 0
2317; GFX7-NEXT:    s_mov_b32 s11, 0xf000
2318; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2319; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[8:11], 0 addr64
2320; GFX7-NEXT:    s_and_b32 s1, s2, 1
2321; GFX7-NEXT:    s_mov_b32 s0, 0xffff
2322; GFX7-NEXT:    s_lshr_b32 s4, s2, 1
2323; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
2324; GFX7-NEXT:    v_and_b32_e32 v0, s0, v2
2325; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
2326; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
2327; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
2328; GFX7-NEXT:    s_not_b32 s5, s0
2329; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s4, 2
2330; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s4, 3
2331; GFX7-NEXT:    s_mov_b32 s10, -1
2332; GFX7-NEXT:    s_waitcnt vmcnt(0)
2333; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
2334; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
2335; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
2336; GFX7-NEXT:    v_and_b32_e32 v1, s5, v1
2337; GFX7-NEXT:    v_or_b32_e32 v7, v1, v0
2338; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 0
2339; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v7, s[4:5]
2340; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
2341; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v7, s[0:1]
2342; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v7, s[2:3]
2343; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2344; GFX7-NEXT:    s_endpgm
2345;
2346; GFX10-LABEL: insertelement_v_v8i16_v_s:
2347; GFX10:       ; %bb.0:
2348; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
2349; GFX10-NEXT:    s_lshr_b32 s3, s2, 1
2350; GFX10-NEXT:    s_and_b32 s1, s2, 1
2351; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s3, 1
2352; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, 2
2353; GFX10-NEXT:    s_lshl_b32 s2, s1, 4
2354; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s3, 3
2355; GFX10-NEXT:    s_mov_b32 s4, 0xffff
2356; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2357; GFX10-NEXT:    s_lshl_b32 s2, s4, s2
2358; GFX10-NEXT:    v_mov_b32_e32 v7, 0
2359; GFX10-NEXT:    s_not_b32 s2, s2
2360; GFX10-NEXT:    v_mov_b32_e32 v8, 0
2361; GFX10-NEXT:    s_waitcnt vmcnt(0)
2362; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
2363; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
2364; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s1
2365; GFX10-NEXT:    v_and_or_b32 v9, v0, s2, v1
2366; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s3, 0
2367; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc_lo
2368; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s2
2369; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s0
2370; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v9, s1
2371; GFX10-NEXT:    global_store_dwordx4 v[7:8], v[0:3], off
2372; GFX10-NEXT:    s_endpgm
2373  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
2374  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
; The updated vector is written through a null global pointer; presumably this
; exists only to keep the insert result live (NOTE(review): confirm).
2375  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
2376  ret void
2377}
2378
; Dynamic insertelement into an <8 x i16> loaded from a global (addrspace 1)
; pointer where neither %val nor %idx is marked inreg; in the expected output
; below the value is read from v2 and the index from v3.
;
; Shape of the expected code on every checked target:
;   * v3 >> 1 picks which of the four loaded dwords holds the element, and
;     (v3 & 1) << 4 is the bit offset of the 16-bit lane inside that dword.
;   * Because the index lives in a vector register, the 0xffff lane mask is
;     shifted and inverted with vector instructions (v_lshl*/v_xor with -1).
;   * A chain of v_cndmask selects the addressed dword, the merged dword is
;     formed with and/or, and a second set of v_cndmask writes it back into the
;     matching position while the other three dwords pass through unchanged.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
2379define amdgpu_ps void @insertelement_v_v8i16_v_v(<8 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
2380; GFX9-LABEL: insertelement_v_v8i16_v_v:
2381; GFX9:       ; %bb.0:
2382; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
2383; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
2384; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
2385; GFX9-NEXT:    s_mov_b32 s0, 0xffff
2386; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2387; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
2388; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2389; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
2390; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
2391; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
2392; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
2393; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2394; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
2395; GFX9-NEXT:    v_mov_b32_e32 v9, 0
2396; GFX9-NEXT:    s_waitcnt vmcnt(0)
2397; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
2398; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2399; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
2400; GFX9-NEXT:    v_and_or_b32 v3, v3, v1, v2
2401; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
2402; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
2403; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
2404; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
2405; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
2406; GFX9-NEXT:    s_endpgm
2407;
2408; GFX8-LABEL: insertelement_v_v8i16_v_v:
2409; GFX8:       ; %bb.0:
2410; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
2411; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
2412; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
2413; GFX8-NEXT:    s_mov_b32 s0, 0xffff
2414; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2415; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
2416; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2417; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
2418; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
2419; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
2420; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
2421; GFX8-NEXT:    v_mov_b32_e32 v8, 0
2422; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
2423; GFX8-NEXT:    v_mov_b32_e32 v9, 0
2424; GFX8-NEXT:    s_waitcnt vmcnt(0)
2425; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
2426; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2427; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
2428; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
2429; GFX8-NEXT:    v_or_b32_e32 v3, v1, v2
2430; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
2431; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
2432; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
2433; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
2434; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2435; GFX8-NEXT:    s_endpgm
2436;
2437; GFX7-LABEL: insertelement_v_v8i16_v_v:
2438; GFX7:       ; %bb.0:
2439; GFX7-NEXT:    s_mov_b32 s10, 0
2440; GFX7-NEXT:    s_mov_b32 s11, 0xf000
2441; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2442; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[8:11], 0 addr64
2443; GFX7-NEXT:    s_mov_b32 s0, 0xffff
2444; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
2445; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
2446; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
2447; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
2448; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
2449; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
2450; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
2451; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
2452; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
2453; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
2454; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
2455; GFX7-NEXT:    s_mov_b32 s10, -1
2456; GFX7-NEXT:    s_waitcnt vmcnt(0)
2457; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
2458; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
2459; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
2460; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
2461; GFX7-NEXT:    v_or_b32_e32 v3, v1, v2
2462; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
2463; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
2464; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
2465; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[2:3]
2466; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2467; GFX7-NEXT:    s_endpgm
2468;
2469; GFX10-LABEL: insertelement_v_v8i16_v_v:
2470; GFX10:       ; %bb.0:
2471; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
2472; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
2473; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
2474; GFX10-NEXT:    s_mov_b32 s0, 0xffff
2475; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
2476; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
2477; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v1
2478; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
2479; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, s0
2480; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v1
2481; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2482; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v8
2483; GFX10-NEXT:    v_mov_b32_e32 v8, 0
2484; GFX10-NEXT:    v_mov_b32_e32 v9, 0
2485; GFX10-NEXT:    s_waitcnt vmcnt(0)
2486; GFX10-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
2487; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s0
2488; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s1
2489; GFX10-NEXT:    v_and_or_b32 v3, v3, v2, v0
2490; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s2
2491; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc_lo
2492; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s0
2493; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s1
2494; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
2495; GFX10-NEXT:    s_endpgm
2496  %vec = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
2497  %insert = insertelement <8 x i16> %vec, i16 %val, i32 %idx
; The updated vector is written through a null global pointer; presumably this
; exists only to keep the insert result live (NOTE(review): confirm).
2498  store <8 x i16> %insert, <8 x i16> addrspace(1)* null
2499  ret void
2500}
2501
; Dynamic insertelement into a <16 x i16> loaded from an addrspace(4) pointer
; where %ptr, %val and %idx are all marked inreg; in the expected output below
; the pointer is read from s[2:3], the value from s4 and the index from s5.
;
; Shape of the expected code on every checked target:
;   * The vector is fetched as eight dwords with a single s_load_dwordx8.
;   * idx >> 1 (kept in s7) picks the dword holding the element; it is compared
;     against 1..7 with s_cmp_eq_u32 / s_cselect_b32 to extract that dword.
;   * (idx & 1) << 4 is the bit offset of the 16-bit lane; the 0xffff mask and
;     the masked value are shifted by it, then s_andn2_b32 clears the old lane
;     and s_or_b32 merges the new one into s16.
;   * A second compare/select chain (0..7) places s16 into the matching dword
;     and leaves the other seven unchanged.
;   * The 32-byte result is written as two dwordx4 stores, the second one at
;     byte offset 16.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
2502define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 inreg %idx) {
2503; GFX9-LABEL: insertelement_s_v16i16_s_s:
2504; GFX9:       ; %bb.0:
2505; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
2506; GFX9-NEXT:    s_lshr_b32 s7, s5, 1
2507; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
2508; GFX9-NEXT:    s_mov_b32 s2, 0xffff
2509; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2510; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2511; GFX9-NEXT:    s_cselect_b32 s0, s9, s8
2512; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
2513; GFX9-NEXT:    s_cselect_b32 s0, s10, s0
2514; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
2515; GFX9-NEXT:    s_cselect_b32 s0, s11, s0
2516; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
2517; GFX9-NEXT:    s_cselect_b32 s0, s12, s0
2518; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
2519; GFX9-NEXT:    s_cselect_b32 s0, s13, s0
2520; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
2521; GFX9-NEXT:    s_cselect_b32 s0, s14, s0
2522; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
2523; GFX9-NEXT:    s_cselect_b32 s0, s15, s0
2524; GFX9-NEXT:    s_and_b32 s1, s5, 1
2525; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
2526; GFX9-NEXT:    s_and_b32 s3, s4, s2
2527; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
2528; GFX9-NEXT:    s_lshl_b32 s1, s2, s1
2529; GFX9-NEXT:    s_andn2_b32 s0, s0, s1
2530; GFX9-NEXT:    s_or_b32 s16, s0, s3
2531; GFX9-NEXT:    s_cmp_eq_u32 s7, 0
2532; GFX9-NEXT:    s_cselect_b32 s0, s16, s8
2533; GFX9-NEXT:    s_cmp_eq_u32 s7, 1
2534; GFX9-NEXT:    s_cselect_b32 s1, s16, s9
2535; GFX9-NEXT:    s_cmp_eq_u32 s7, 2
2536; GFX9-NEXT:    s_cselect_b32 s2, s16, s10
2537; GFX9-NEXT:    s_cmp_eq_u32 s7, 3
2538; GFX9-NEXT:    s_cselect_b32 s3, s16, s11
2539; GFX9-NEXT:    s_cmp_eq_u32 s7, 4
2540; GFX9-NEXT:    s_cselect_b32 s4, s16, s12
2541; GFX9-NEXT:    s_cmp_eq_u32 s7, 5
2542; GFX9-NEXT:    s_cselect_b32 s5, s16, s13
2543; GFX9-NEXT:    s_cmp_eq_u32 s7, 6
2544; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2545; GFX9-NEXT:    s_cselect_b32 s6, s16, s14
2546; GFX9-NEXT:    s_cmp_eq_u32 s7, 7
2547; GFX9-NEXT:    v_mov_b32_e32 v5, 0
2548; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2549; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2550; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2551; GFX9-NEXT:    s_cselect_b32 s7, s16, s15
2552; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
2553; GFX9-NEXT:    s_mov_b64 s[0:1], 16
2554; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2555; GFX9-NEXT:    v_mov_b32_e32 v1, s5
2556; GFX9-NEXT:    v_mov_b32_e32 v2, s6
2557; GFX9-NEXT:    v_mov_b32_e32 v3, s7
2558; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2559; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2560; GFX9-NEXT:    s_endpgm
2561;
2562; GFX8-LABEL: insertelement_s_v16i16_s_s:
2563; GFX8:       ; %bb.0:
2564; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
2565; GFX8-NEXT:    s_lshr_b32 s7, s5, 1
2566; GFX8-NEXT:    s_cmp_eq_u32 s7, 1
2567; GFX8-NEXT:    s_mov_b32 s2, 0xffff
2568; GFX8-NEXT:    v_mov_b32_e32 v4, 0
2569; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2570; GFX8-NEXT:    s_cselect_b32 s0, s9, s8
2571; GFX8-NEXT:    s_cmp_eq_u32 s7, 2
2572; GFX8-NEXT:    s_cselect_b32 s0, s10, s0
2573; GFX8-NEXT:    s_cmp_eq_u32 s7, 3
2574; GFX8-NEXT:    s_cselect_b32 s0, s11, s0
2575; GFX8-NEXT:    s_cmp_eq_u32 s7, 4
2576; GFX8-NEXT:    s_cselect_b32 s0, s12, s0
2577; GFX8-NEXT:    s_cmp_eq_u32 s7, 5
2578; GFX8-NEXT:    s_cselect_b32 s0, s13, s0
2579; GFX8-NEXT:    s_cmp_eq_u32 s7, 6
2580; GFX8-NEXT:    s_cselect_b32 s0, s14, s0
2581; GFX8-NEXT:    s_cmp_eq_u32 s7, 7
2582; GFX8-NEXT:    s_cselect_b32 s0, s15, s0
2583; GFX8-NEXT:    s_and_b32 s1, s5, 1
2584; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
2585; GFX8-NEXT:    s_and_b32 s3, s4, s2
2586; GFX8-NEXT:    s_lshl_b32 s3, s3, s1
2587; GFX8-NEXT:    s_lshl_b32 s1, s2, s1
2588; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
2589; GFX8-NEXT:    s_or_b32 s16, s0, s3
2590; GFX8-NEXT:    s_cmp_eq_u32 s7, 0
2591; GFX8-NEXT:    s_cselect_b32 s0, s16, s8
2592; GFX8-NEXT:    s_cmp_eq_u32 s7, 1
2593; GFX8-NEXT:    s_cselect_b32 s1, s16, s9
2594; GFX8-NEXT:    s_cmp_eq_u32 s7, 2
2595; GFX8-NEXT:    s_cselect_b32 s2, s16, s10
2596; GFX8-NEXT:    s_cmp_eq_u32 s7, 3
2597; GFX8-NEXT:    s_cselect_b32 s3, s16, s11
2598; GFX8-NEXT:    s_cmp_eq_u32 s7, 4
2599; GFX8-NEXT:    s_cselect_b32 s4, s16, s12
2600; GFX8-NEXT:    s_cmp_eq_u32 s7, 5
2601; GFX8-NEXT:    s_cselect_b32 s5, s16, s13
2602; GFX8-NEXT:    s_cmp_eq_u32 s7, 6
2603; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2604; GFX8-NEXT:    s_cselect_b32 s6, s16, s14
2605; GFX8-NEXT:    s_cmp_eq_u32 s7, 7
2606; GFX8-NEXT:    v_mov_b32_e32 v5, 0
2607; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2608; GFX8-NEXT:    v_mov_b32_e32 v2, s2
2609; GFX8-NEXT:    v_mov_b32_e32 v3, s3
2610; GFX8-NEXT:    s_cselect_b32 s7, s16, s15
2611; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2612; GFX8-NEXT:    v_mov_b32_e32 v4, 16
2613; GFX8-NEXT:    v_mov_b32_e32 v0, s4
2614; GFX8-NEXT:    v_mov_b32_e32 v5, 0
2615; GFX8-NEXT:    v_mov_b32_e32 v1, s5
2616; GFX8-NEXT:    v_mov_b32_e32 v2, s6
2617; GFX8-NEXT:    v_mov_b32_e32 v3, s7
2618; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2619; GFX8-NEXT:    s_endpgm
2620;
2621; GFX7-LABEL: insertelement_s_v16i16_s_s:
2622; GFX7:       ; %bb.0:
2623; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
2624; GFX7-NEXT:    s_lshr_b32 s7, s5, 1
2625; GFX7-NEXT:    s_cmp_eq_u32 s7, 1
2626; GFX7-NEXT:    s_mov_b32 s2, 0xffff
2627; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
2628; GFX7-NEXT:    s_cselect_b32 s0, s9, s8
2629; GFX7-NEXT:    s_cmp_eq_u32 s7, 2
2630; GFX7-NEXT:    s_cselect_b32 s0, s10, s0
2631; GFX7-NEXT:    s_cmp_eq_u32 s7, 3
2632; GFX7-NEXT:    s_cselect_b32 s0, s11, s0
2633; GFX7-NEXT:    s_cmp_eq_u32 s7, 4
2634; GFX7-NEXT:    s_cselect_b32 s0, s12, s0
2635; GFX7-NEXT:    s_cmp_eq_u32 s7, 5
2636; GFX7-NEXT:    s_cselect_b32 s0, s13, s0
2637; GFX7-NEXT:    s_cmp_eq_u32 s7, 6
2638; GFX7-NEXT:    s_cselect_b32 s0, s14, s0
2639; GFX7-NEXT:    s_cmp_eq_u32 s7, 7
2640; GFX7-NEXT:    s_cselect_b32 s0, s15, s0
2641; GFX7-NEXT:    s_and_b32 s1, s5, 1
2642; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
2643; GFX7-NEXT:    s_and_b32 s3, s4, s2
2644; GFX7-NEXT:    s_lshl_b32 s3, s3, s1
2645; GFX7-NEXT:    s_lshl_b32 s1, s2, s1
2646; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
2647; GFX7-NEXT:    s_or_b32 s16, s0, s3
2648; GFX7-NEXT:    s_cmp_eq_u32 s7, 0
2649; GFX7-NEXT:    s_cselect_b32 s0, s16, s8
2650; GFX7-NEXT:    s_cmp_eq_u32 s7, 1
2651; GFX7-NEXT:    s_cselect_b32 s1, s16, s9
2652; GFX7-NEXT:    s_cmp_eq_u32 s7, 2
2653; GFX7-NEXT:    s_cselect_b32 s2, s16, s10
2654; GFX7-NEXT:    s_cmp_eq_u32 s7, 3
2655; GFX7-NEXT:    s_cselect_b32 s3, s16, s11
2656; GFX7-NEXT:    s_cmp_eq_u32 s7, 4
2657; GFX7-NEXT:    s_cselect_b32 s4, s16, s12
2658; GFX7-NEXT:    s_cmp_eq_u32 s7, 5
2659; GFX7-NEXT:    s_cselect_b32 s5, s16, s13
2660; GFX7-NEXT:    s_cmp_eq_u32 s7, 6
2661; GFX7-NEXT:    v_mov_b32_e32 v0, s0
2662; GFX7-NEXT:    s_cselect_b32 s6, s16, s14
2663; GFX7-NEXT:    s_cmp_eq_u32 s7, 7
2664; GFX7-NEXT:    s_mov_b64 s[8:9], 0
2665; GFX7-NEXT:    v_mov_b32_e32 v1, s1
2666; GFX7-NEXT:    v_mov_b32_e32 v2, s2
2667; GFX7-NEXT:    v_mov_b32_e32 v3, s3
2668; GFX7-NEXT:    s_mov_b32 s10, -1
2669; GFX7-NEXT:    s_mov_b32 s11, 0xf000
2670; GFX7-NEXT:    s_cselect_b32 s7, s16, s15
2671; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2672; GFX7-NEXT:    s_mov_b64 s[8:9], 16
2673; GFX7-NEXT:    v_mov_b32_e32 v0, s4
2674; GFX7-NEXT:    v_mov_b32_e32 v1, s5
2675; GFX7-NEXT:    v_mov_b32_e32 v2, s6
2676; GFX7-NEXT:    v_mov_b32_e32 v3, s7
2677; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
2678; GFX7-NEXT:    s_endpgm
2679;
2680; GFX10-LABEL: insertelement_s_v16i16_s_s:
2681; GFX10:       ; %bb.0:
2682; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
2683; GFX10-NEXT:    s_lshr_b32 s7, s5, 1
2684; GFX10-NEXT:    s_mov_b32 s2, 0xffff
2685; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
2686; GFX10-NEXT:    v_mov_b32_e32 v8, 0
2687; GFX10-NEXT:    v_mov_b32_e32 v9, 0
2688; GFX10-NEXT:    v_mov_b32_e32 v10, 0
2689; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2690; GFX10-NEXT:    s_cselect_b32 s0, s9, s8
2691; GFX10-NEXT:    s_cmp_eq_u32 s7, 2
2692; GFX10-NEXT:    s_cselect_b32 s0, s10, s0
2693; GFX10-NEXT:    s_cmp_eq_u32 s7, 3
2694; GFX10-NEXT:    s_cselect_b32 s0, s11, s0
2695; GFX10-NEXT:    s_cmp_eq_u32 s7, 4
2696; GFX10-NEXT:    s_cselect_b32 s0, s12, s0
2697; GFX10-NEXT:    s_cmp_eq_u32 s7, 5
2698; GFX10-NEXT:    s_cselect_b32 s0, s13, s0
2699; GFX10-NEXT:    s_cmp_eq_u32 s7, 6
2700; GFX10-NEXT:    s_cselect_b32 s0, s14, s0
2701; GFX10-NEXT:    s_cmp_eq_u32 s7, 7
2702; GFX10-NEXT:    s_cselect_b32 s0, s15, s0
2703; GFX10-NEXT:    s_and_b32 s1, s5, 1
2704; GFX10-NEXT:    s_and_b32 s3, s4, s2
2705; GFX10-NEXT:    s_lshl_b32 s1, s1, 4
2706; GFX10-NEXT:    s_lshl_b32 s2, s2, s1
2707; GFX10-NEXT:    s_lshl_b32 s1, s3, s1
2708; GFX10-NEXT:    s_andn2_b32 s0, s0, s2
2709; GFX10-NEXT:    s_or_b32 s16, s0, s1
2710; GFX10-NEXT:    s_cmp_eq_u32 s7, 0
2711; GFX10-NEXT:    s_cselect_b32 s0, s16, s8
2712; GFX10-NEXT:    s_cmp_eq_u32 s7, 1
2713; GFX10-NEXT:    s_cselect_b32 s1, s16, s9
2714; GFX10-NEXT:    s_cmp_eq_u32 s7, 2
2715; GFX10-NEXT:    s_cselect_b32 s2, s16, s10
2716; GFX10-NEXT:    s_cmp_eq_u32 s7, 3
2717; GFX10-NEXT:    s_cselect_b32 s3, s16, s11
2718; GFX10-NEXT:    s_cmp_eq_u32 s7, 4
2719; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2720; GFX10-NEXT:    s_cselect_b32 s4, s16, s12
2721; GFX10-NEXT:    s_cmp_eq_u32 s7, 5
2722; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2723; GFX10-NEXT:    s_cselect_b32 s5, s16, s13
2724; GFX10-NEXT:    s_cmp_eq_u32 s7, 6
2725; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2726; GFX10-NEXT:    s_cselect_b32 s6, s16, s14
2727; GFX10-NEXT:    s_cmp_eq_u32 s7, 7
2728; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2729; GFX10-NEXT:    s_cselect_b32 s7, s16, s15
2730; GFX10-NEXT:    v_mov_b32_e32 v4, s4
2731; GFX10-NEXT:    v_mov_b32_e32 v5, s5
2732; GFX10-NEXT:    v_mov_b32_e32 v6, s6
2733; GFX10-NEXT:    v_mov_b32_e32 v7, s7
2734; GFX10-NEXT:    s_mov_b64 s[0:1], 16
2735; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
2736; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
2737; GFX10-NEXT:    s_endpgm
2738  %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
2739  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
; The updated vector is written through a null global pointer; presumably this
; exists only to keep the insert result live (NOTE(review): confirm).
2740  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
2741  ret void
2742}
2743
2744define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
2745; GFX9-LABEL: insertelement_v_v16i16_s_s:
2746; GFX9:       ; %bb.0:
2747; GFX9-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
2748; GFX9-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
2749; GFX9-NEXT:    s_and_b32 s1, s3, 1
2750; GFX9-NEXT:    s_mov_b32 s0, 0xffff
2751; GFX9-NEXT:    s_lshr_b32 s12, s3, 1
2752; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
2753; GFX9-NEXT:    s_and_b32 s2, s2, s0
2754; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
2755; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
2756; GFX9-NEXT:    s_lshl_b32 s2, s2, s1
2757; GFX9-NEXT:    s_not_b32 s13, s0
2758; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
2759; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2760; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
2761; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
2762; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
2763; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
2764; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
2765; GFX9-NEXT:    s_waitcnt vmcnt(1)
2766; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
2767; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
2768; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
2769; GFX9-NEXT:    s_waitcnt vmcnt(0)
2770; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[4:5]
2771; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
2772; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[8:9]
2773; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
2774; GFX9-NEXT:    v_and_or_b32 v10, v1, s13, v0
2775; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
2776; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v10, s[12:13]
2777; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v10, vcc
2778; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v10, s[0:1]
2779; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v10, s[2:3]
2780; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v10, s[4:5]
2781; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v10, s[6:7]
2782; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v10, s[8:9]
2783; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v10, s[10:11]
2784; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2785; GFX9-NEXT:    v_mov_b32_e32 v9, 0
2786; GFX9-NEXT:    s_mov_b64 s[0:1], 16
2787; GFX9-NEXT:    v_mov_b32_e32 v10, 0
2788; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
2789; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
2790; GFX9-NEXT:    s_endpgm
2791;
2792; GFX8-LABEL: insertelement_v_v16i16_s_s:
2793; GFX8:       ; %bb.0:
2794; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
2795; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
2796; GFX8-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
2797; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
2798; GFX8-NEXT:    s_and_b32 s1, s3, 1
2799; GFX8-NEXT:    s_mov_b32 s0, 0xffff
2800; GFX8-NEXT:    s_lshr_b32 s12, s3, 1
2801; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
2802; GFX8-NEXT:    s_and_b32 s2, s2, s0
2803; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
2804; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
2805; GFX8-NEXT:    s_lshl_b32 s13, s2, s1
2806; GFX8-NEXT:    s_not_b32 s14, s0
2807; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
2808; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
2809; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
2810; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
2811; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
2812; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
2813; GFX8-NEXT:    v_mov_b32_e32 v10, 16
2814; GFX8-NEXT:    v_mov_b32_e32 v11, 0
2815; GFX8-NEXT:    s_waitcnt vmcnt(1)
2816; GFX8-NEXT:    v_cndmask_b32_e32 v8, v0, v1, vcc
2817; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v2, s[0:1]
2818; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v3, s[2:3]
2819; GFX8-NEXT:    s_waitcnt vmcnt(0)
2820; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v4, s[4:5]
2821; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v5, s[6:7]
2822; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v6, s[8:9]
2823; GFX8-NEXT:    v_cndmask_b32_e64 v8, v8, v7, s[10:11]
2824; GFX8-NEXT:    v_and_b32_e32 v8, s14, v8
2825; GFX8-NEXT:    v_or_b32_e32 v8, s13, v8
2826; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
2827; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[12:13]
2828; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
2829; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[0:1]
2830; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[2:3]
2831; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
2832; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[6:7]
2833; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[8:9]
2834; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[10:11]
2835; GFX8-NEXT:    v_mov_b32_e32 v8, 0
2836; GFX8-NEXT:    v_mov_b32_e32 v9, 0
2837; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
2838; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
2839; GFX8-NEXT:    s_endpgm
2840;
2841; GFX7-LABEL: insertelement_v_v16i16_s_s:
2842; GFX7:       ; %bb.0:
2843; GFX7-NEXT:    s_mov_b32 s18, 0
2844; GFX7-NEXT:    s_mov_b32 s19, 0xf000
2845; GFX7-NEXT:    s_mov_b64 s[16:17], 0
2846; GFX7-NEXT:    buffer_load_dwordx4 v[2:5], v[0:1], s[16:19], 0 addr64
2847; GFX7-NEXT:    buffer_load_dwordx4 v[6:9], v[0:1], s[16:19], 0 addr64 offset:16
2848; GFX7-NEXT:    s_and_b32 s1, s3, 1
2849; GFX7-NEXT:    s_mov_b32 s0, 0xffff
2850; GFX7-NEXT:    s_lshr_b32 s12, s3, 1
2851; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
2852; GFX7-NEXT:    s_and_b32 s2, s2, s0
2853; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
2854; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
2855; GFX7-NEXT:    s_lshl_b32 s13, s2, s1
2856; GFX7-NEXT:    s_not_b32 s14, s0
2857; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
2858; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
2859; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
2860; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
2861; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
2862; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
2863; GFX7-NEXT:    s_mov_b32 s18, -1
2864; GFX7-NEXT:    s_waitcnt vmcnt(1)
2865; GFX7-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
2866; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
2867; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[2:3]
2868; GFX7-NEXT:    s_waitcnt vmcnt(0)
2869; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
2870; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[6:7]
2871; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[8:9]
2872; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[10:11]
2873; GFX7-NEXT:    v_and_b32_e32 v0, s14, v0
2874; GFX7-NEXT:    v_or_b32_e32 v10, s13, v0
2875; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
2876; GFX7-NEXT:    v_cndmask_b32_e64 v0, v2, v10, s[12:13]
2877; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v10, vcc
2878; GFX7-NEXT:    v_cndmask_b32_e64 v2, v4, v10, s[0:1]
2879; GFX7-NEXT:    v_cndmask_b32_e64 v3, v5, v10, s[2:3]
2880; GFX7-NEXT:    v_cndmask_b32_e64 v4, v6, v10, s[4:5]
2881; GFX7-NEXT:    v_cndmask_b32_e64 v5, v7, v10, s[6:7]
2882; GFX7-NEXT:    v_cndmask_b32_e64 v6, v8, v10, s[8:9]
2883; GFX7-NEXT:    v_cndmask_b32_e64 v7, v9, v10, s[10:11]
2884; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
2885; GFX7-NEXT:    s_mov_b64 s[16:17], 16
2886; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
2887; GFX7-NEXT:    s_endpgm
2888;
2889; GFX10-LABEL: insertelement_v_v16i16_s_s:
2890; GFX10:       ; %bb.0:
2891; GFX10-NEXT:    s_clause 0x1
2892; GFX10-NEXT:    global_load_dwordx4 v[2:5], v[0:1], off
2893; GFX10-NEXT:    global_load_dwordx4 v[6:9], v[0:1], off offset:16
2894; GFX10-NEXT:    s_lshr_b32 s7, s3, 1
2895; GFX10-NEXT:    s_mov_b32 s8, 0xffff
2896; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s7, 1
2897; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, 2
2898; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s7, 3
2899; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, s7, 4
2900; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, s7, 5
2901; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, s7, 6
2902; GFX10-NEXT:    s_and_b32 s9, s2, s8
2903; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s7, 7
2904; GFX10-NEXT:    s_and_b32 s3, s3, 1
2905; GFX10-NEXT:    v_mov_b32_e32 v10, 0
2906; GFX10-NEXT:    s_lshl_b32 s3, s3, 4
2907; GFX10-NEXT:    v_mov_b32_e32 v11, 0
2908; GFX10-NEXT:    s_lshl_b32 s8, s8, s3
2909; GFX10-NEXT:    s_lshl_b32 s3, s9, s3
2910; GFX10-NEXT:    s_not_b32 s8, s8
2911; GFX10-NEXT:    v_mov_b32_e32 v13, 0
2912; GFX10-NEXT:    s_waitcnt vmcnt(1)
2913; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
2914; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
2915; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s1
2916; GFX10-NEXT:    s_waitcnt vmcnt(0)
2917; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s4
2918; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s5
2919; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s6
2920; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s2
2921; GFX10-NEXT:    v_and_or_b32 v12, v0, s8, s3
2922; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, s7, 0
2923; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v12, vcc_lo
2924; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v12, s3
2925; GFX10-NEXT:    v_cndmask_b32_e64 v2, v4, v12, s0
2926; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v12, s1
2927; GFX10-NEXT:    v_cndmask_b32_e64 v4, v6, v12, s4
2928; GFX10-NEXT:    v_cndmask_b32_e64 v5, v7, v12, s5
2929; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v12, s6
2930; GFX10-NEXT:    v_cndmask_b32_e64 v7, v9, v12, s2
2931; GFX10-NEXT:    s_mov_b64 s[0:1], 16
2932; GFX10-NEXT:    global_store_dwordx4 v[10:11], v[0:3], off
2933; GFX10-NEXT:    global_store_dwordx4 v13, v[4:7], s[0:1]
2934; GFX10-NEXT:    s_endpgm
2935  %vec = load <16 x i16>, <16 x i16> addrspace(1 )* %ptr
2936  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
2937  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
2938  ret void
2939}
2940
; insertelement_s_v16i16_v_s: insert an i16 at a dynamic index into a
; <16 x i16> loaded through an inreg addrspace(4) pointer, then store the
; result to a null addrspace(1) pointer.
;
; Operand placement exercised here: %ptr and %idx are marked inreg and the
; expected code reads them from s[2:3] and s4; %val is not inreg and is read
; from v0.
;
; Shape of the expected code for all four llc configurations: the vector is
; handled as eight dwords. idx >> 1 picks the dword (s_cmp/s_cselect chain),
; (idx & 1) << 4 is the bit offset of the halfword inside that dword, and
; 0xffff shifted by that offset is the mask cleared before the shifted %val is
; OR'd in. The merged dword is then written back into whichever of the eight
; result registers matches idx >> 1 via v_cmp/v_cndmask.
;
; The assertion lines below are autogenerated (see the note at the top of the
; file); regenerate them with the script instead of editing them by hand.
2941define amdgpu_ps void @insertelement_s_v16i16_v_s(<16 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 inreg %idx) {
2942; GFX9-LABEL: insertelement_s_v16i16_v_s:
2943; GFX9:       ; %bb.0:
2944; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
2945; GFX9-NEXT:    s_lshr_b32 s2, s4, 1
2946; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
2947; GFX9-NEXT:    s_mov_b32 s3, 0xffff
2948; GFX9-NEXT:    v_and_b32_e32 v0, s3, v0
2949; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2950; GFX9-NEXT:    s_cselect_b32 s0, s9, s8
2951; GFX9-NEXT:    s_cmp_eq_u32 s2, 2
2952; GFX9-NEXT:    s_cselect_b32 s0, s10, s0
2953; GFX9-NEXT:    s_cmp_eq_u32 s2, 3
2954; GFX9-NEXT:    s_cselect_b32 s0, s11, s0
2955; GFX9-NEXT:    s_cmp_eq_u32 s2, 4
2956; GFX9-NEXT:    s_cselect_b32 s0, s12, s0
2957; GFX9-NEXT:    s_cmp_eq_u32 s2, 5
2958; GFX9-NEXT:    s_cselect_b32 s0, s13, s0
2959; GFX9-NEXT:    s_cmp_eq_u32 s2, 6
2960; GFX9-NEXT:    s_cselect_b32 s0, s14, s0
2961; GFX9-NEXT:    s_cmp_eq_u32 s2, 7
2962; GFX9-NEXT:    s_cselect_b32 s0, s15, s0
2963; GFX9-NEXT:    s_and_b32 s1, s4, 1
2964; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
2965; GFX9-NEXT:    s_lshl_b32 s3, s3, s1
2966; GFX9-NEXT:    s_andn2_b32 s0, s0, s3
2967; GFX9-NEXT:    v_mov_b32_e32 v1, s0
2968; GFX9-NEXT:    v_lshl_or_b32 v8, v0, s1, v1
2969; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2970; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
2971; GFX9-NEXT:    v_mov_b32_e32 v1, s9
2972; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
2973; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
2974; GFX9-NEXT:    v_mov_b32_e32 v2, s10
2975; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
2976; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
2977; GFX9-NEXT:    v_mov_b32_e32 v3, s11
2978; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
2979; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
2980; GFX9-NEXT:    v_mov_b32_e32 v5, s13
2981; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
2982; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
2983; GFX9-NEXT:    v_mov_b32_e32 v6, s14
2984; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
2985; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
2986; GFX9-NEXT:    v_mov_b32_e32 v4, s12
2987; GFX9-NEXT:    v_mov_b32_e32 v7, s15
2988; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
2989; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
2990; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
2991; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
2992; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
2993; GFX9-NEXT:    v_mov_b32_e32 v8, 0
2994; GFX9-NEXT:    v_mov_b32_e32 v9, 0
2995; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
2996; GFX9-NEXT:    s_mov_b64 s[0:1], 16
2997; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2998; GFX9-NEXT:    global_store_dwordx4 v0, v[4:7], s[0:1]
2999; GFX9-NEXT:    s_endpgm
3000;
3001; GFX8-LABEL: insertelement_s_v16i16_v_s:
3002; GFX8:       ; %bb.0:
3003; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3004; GFX8-NEXT:    s_lshr_b32 s2, s4, 1
3005; GFX8-NEXT:    s_cmp_eq_u32 s2, 1
3006; GFX8-NEXT:    s_mov_b32 s3, 0xffff
3007; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
3008; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3009; GFX8-NEXT:    s_cselect_b32 s0, s9, s8
3010; GFX8-NEXT:    s_cmp_eq_u32 s2, 2
3011; GFX8-NEXT:    s_cselect_b32 s0, s10, s0
3012; GFX8-NEXT:    s_cmp_eq_u32 s2, 3
3013; GFX8-NEXT:    s_cselect_b32 s0, s11, s0
3014; GFX8-NEXT:    s_cmp_eq_u32 s2, 4
3015; GFX8-NEXT:    s_cselect_b32 s0, s12, s0
3016; GFX8-NEXT:    s_cmp_eq_u32 s2, 5
3017; GFX8-NEXT:    s_cselect_b32 s0, s13, s0
3018; GFX8-NEXT:    s_cmp_eq_u32 s2, 6
3019; GFX8-NEXT:    s_cselect_b32 s0, s14, s0
3020; GFX8-NEXT:    s_cmp_eq_u32 s2, 7
3021; GFX8-NEXT:    s_cselect_b32 s0, s15, s0
3022; GFX8-NEXT:    s_and_b32 s1, s4, 1
3023; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
3024; GFX8-NEXT:    v_mov_b32_e32 v1, s1
3025; GFX8-NEXT:    s_lshl_b32 s1, s3, s1
3026; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3027; GFX8-NEXT:    s_andn2_b32 s0, s0, s1
3028; GFX8-NEXT:    v_or_b32_e32 v8, s0, v0
3029; GFX8-NEXT:    v_mov_b32_e32 v0, s8
3030; GFX8-NEXT:    v_mov_b32_e32 v1, s9
3031; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
3032; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
3033; GFX8-NEXT:    v_mov_b32_e32 v2, s10
3034; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
3035; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
3036; GFX8-NEXT:    v_mov_b32_e32 v3, s11
3037; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
3038; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
3039; GFX8-NEXT:    v_mov_b32_e32 v5, s13
3040; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
3041; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
3042; GFX8-NEXT:    v_mov_b32_e32 v6, s14
3043; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
3044; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
3045; GFX8-NEXT:    v_mov_b32_e32 v4, s12
3046; GFX8-NEXT:    v_mov_b32_e32 v7, s15
3047; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
3048; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
3049; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
3050; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
3051; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
3052; GFX8-NEXT:    v_mov_b32_e32 v8, 0
3053; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3054; GFX8-NEXT:    v_mov_b32_e32 v10, 16
3055; GFX8-NEXT:    v_mov_b32_e32 v11, 0
3056; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3057; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
3058; GFX8-NEXT:    s_endpgm
3059;
3060; GFX7-LABEL: insertelement_s_v16i16_v_s:
3061; GFX7:       ; %bb.0:
3062; GFX7-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3063; GFX7-NEXT:    s_lshr_b32 s2, s4, 1
3064; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
3065; GFX7-NEXT:    s_mov_b32 s3, 0xffff
3066; GFX7-NEXT:    v_and_b32_e32 v0, s3, v0
3067; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3068; GFX7-NEXT:    s_cselect_b32 s0, s9, s8
3069; GFX7-NEXT:    s_cmp_eq_u32 s2, 2
3070; GFX7-NEXT:    s_cselect_b32 s0, s10, s0
3071; GFX7-NEXT:    s_cmp_eq_u32 s2, 3
3072; GFX7-NEXT:    s_cselect_b32 s0, s11, s0
3073; GFX7-NEXT:    s_cmp_eq_u32 s2, 4
3074; GFX7-NEXT:    s_cselect_b32 s0, s12, s0
3075; GFX7-NEXT:    s_cmp_eq_u32 s2, 5
3076; GFX7-NEXT:    s_cselect_b32 s0, s13, s0
3077; GFX7-NEXT:    s_cmp_eq_u32 s2, 6
3078; GFX7-NEXT:    s_cselect_b32 s0, s14, s0
3079; GFX7-NEXT:    s_cmp_eq_u32 s2, 7
3080; GFX7-NEXT:    s_cselect_b32 s0, s15, s0
3081; GFX7-NEXT:    s_and_b32 s1, s4, 1
3082; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
3083; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
3084; GFX7-NEXT:    s_lshl_b32 s1, s3, s1
3085; GFX7-NEXT:    s_andn2_b32 s0, s0, s1
3086; GFX7-NEXT:    v_or_b32_e32 v8, s0, v0
3087; GFX7-NEXT:    v_mov_b32_e32 v0, s8
3088; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 0
3089; GFX7-NEXT:    v_mov_b32_e32 v1, s9
3090; GFX7-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
3091; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
3092; GFX7-NEXT:    v_mov_b32_e32 v2, s10
3093; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
3094; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 2
3095; GFX7-NEXT:    v_mov_b32_e32 v3, s11
3096; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
3097; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 3
3098; GFX7-NEXT:    v_mov_b32_e32 v5, s13
3099; GFX7-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
3100; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 5
3101; GFX7-NEXT:    v_mov_b32_e32 v4, s12
3102; GFX7-NEXT:    v_mov_b32_e32 v6, s14
3103; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, 4
3104; GFX7-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
3105; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 6
3106; GFX7-NEXT:    v_mov_b32_e32 v7, s15
3107; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[0:1]
3108; GFX7-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
3109; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 7
3110; GFX7-NEXT:    s_mov_b64 s[0:1], 0
3111; GFX7-NEXT:    s_mov_b32 s2, -1
3112; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3113; GFX7-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
3114; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3115; GFX7-NEXT:    s_mov_b64 s[0:1], 16
3116; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
3117; GFX7-NEXT:    s_endpgm
3118;
3119; GFX10-LABEL: insertelement_s_v16i16_v_s:
3120; GFX10:       ; %bb.0:
3121; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3122; GFX10-NEXT:    s_lshr_b32 s0, s4, 1
3123; GFX10-NEXT:    s_mov_b32 s3, 0xffff
3124; GFX10-NEXT:    s_cmp_eq_u32 s0, 1
3125; GFX10-NEXT:    v_and_b32_e32 v8, s3, v0
3126; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 0
3127; GFX10-NEXT:    v_mov_b32_e32 v11, 0
3128; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3129; GFX10-NEXT:    s_cselect_b32 s1, s9, s8
3130; GFX10-NEXT:    s_cmp_eq_u32 s0, 2
3131; GFX10-NEXT:    v_mov_b32_e32 v0, s8
3132; GFX10-NEXT:    s_cselect_b32 s1, s10, s1
3133; GFX10-NEXT:    s_cmp_eq_u32 s0, 3
3134; GFX10-NEXT:    v_mov_b32_e32 v1, s9
3135; GFX10-NEXT:    s_cselect_b32 s1, s11, s1
3136; GFX10-NEXT:    s_cmp_eq_u32 s0, 4
3137; GFX10-NEXT:    v_mov_b32_e32 v2, s10
3138; GFX10-NEXT:    s_cselect_b32 s1, s12, s1
3139; GFX10-NEXT:    s_cmp_eq_u32 s0, 5
3140; GFX10-NEXT:    v_mov_b32_e32 v3, s11
3141; GFX10-NEXT:    s_cselect_b32 s1, s13, s1
3142; GFX10-NEXT:    s_cmp_eq_u32 s0, 6
3143; GFX10-NEXT:    v_mov_b32_e32 v4, s12
3144; GFX10-NEXT:    s_cselect_b32 s1, s14, s1
3145; GFX10-NEXT:    s_cmp_eq_u32 s0, 7
3146; GFX10-NEXT:    v_mov_b32_e32 v5, s13
3147; GFX10-NEXT:    s_cselect_b32 s1, s15, s1
3148; GFX10-NEXT:    s_and_b32 s2, s4, 1
3149; GFX10-NEXT:    v_mov_b32_e32 v6, s14
3150; GFX10-NEXT:    s_lshl_b32 s2, s2, 4
3151; GFX10-NEXT:    v_mov_b32_e32 v7, s15
3152; GFX10-NEXT:    s_lshl_b32 s3, s3, s2
3153; GFX10-NEXT:    s_andn2_b32 s1, s1, s3
3154; GFX10-NEXT:    v_lshl_or_b32 v10, v8, s2, s1
3155; GFX10-NEXT:    v_mov_b32_e32 v8, 0
3156; GFX10-NEXT:    v_mov_b32_e32 v9, 0
3157; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc_lo
3158; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 1
3159; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc_lo
3160; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 2
3161; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v10, vcc_lo
3162; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 3
3163; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc_lo
3164; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 4
3165; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc_lo
3166; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 5
3167; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc_lo
3168; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 6
3169; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
3170; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s0, 7
3171; GFX10-NEXT:    s_mov_b64 s[0:1], 16
3172; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc_lo
3173; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3174; GFX10-NEXT:    global_store_dwordx4 v11, v[4:7], s[0:1]
3175; GFX10-NEXT:    s_endpgm
; Test body: load the vector, insert %val at %idx, store the whole result.
; The store target is a null addrspace(1) pointer; presumably its only purpose
; is to keep the inserted vector live (TODO confirm intent with test author).
3176  %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
3177  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
3178  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
3179  ret void
3180}
3181
; insertelement_s_v16i16_s_v: insert an i16 at a dynamic index into a
; <16 x i16> loaded through an inreg addrspace(4) pointer, then store the
; result to a null addrspace(1) pointer.
;
; Operand placement exercised here: %ptr and %val are marked inreg and the
; expected code reads them from s[2:3] and s4; %idx is not inreg and is read
; from v0.
;
; Shape of the expected code for all four llc configurations: the vector is
; handled as eight dwords. Because the index lives in a vector register, both
; the dword extraction and the write-back use v_cmp/v_cndmask on idx >> 1
; (no scalar compare/select chain appears). (idx & 1) << 4 is the bit offset
; of the halfword, 0xffff shifted by that offset and inverted is the keep-mask,
; and the masked %val shifted by the same offset is OR'd in.
;
; The assertion lines below are autogenerated (see the note at the top of the
; file); regenerate them with the script instead of editing them by hand.
3182define amdgpu_ps void @insertelement_s_v16i16_s_v(<16 x i16> addrspace(4)* inreg %ptr, i16 inreg %val, i32 %idx) {
3183; GFX9-LABEL: insertelement_s_v16i16_s_v:
3184; GFX9:       ; %bb.0:
3185; GFX9-NEXT:    s_load_dwordx8 s[16:23], s[2:3], 0x0
3186; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 1, v0
3187; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3188; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3189; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3190; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3191; GFX9-NEXT:    v_mov_b32_e32 v1, s16
3192; GFX9-NEXT:    v_mov_b32_e32 v2, s17
3193; GFX9-NEXT:    v_mov_b32_e32 v3, s18
3194; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3195; GFX9-NEXT:    v_mov_b32_e32 v4, s19
3196; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
3197; GFX9-NEXT:    v_mov_b32_e32 v5, s20
3198; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
3199; GFX9-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
3200; GFX9-NEXT:    v_mov_b32_e32 v6, s21
3201; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
3202; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3203; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
3204; GFX9-NEXT:    s_mov_b32 s5, 0xffff
3205; GFX9-NEXT:    v_mov_b32_e32 v7, s22
3206; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
3207; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3208; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
3209; GFX9-NEXT:    s_and_b32 s4, s4, s5
3210; GFX9-NEXT:    v_mov_b32_e32 v9, s23
3211; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
3212; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3213; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
3214; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
3215; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
3216; GFX9-NEXT:    v_xor_b32_e32 v0, -1, v0
3217; GFX9-NEXT:    v_and_or_b32 v9, v1, v0, v2
3218; GFX9-NEXT:    v_mov_b32_e32 v0, s16
3219; GFX9-NEXT:    v_mov_b32_e32 v1, s17
3220; GFX9-NEXT:    v_mov_b32_e32 v2, s18
3221; GFX9-NEXT:    v_mov_b32_e32 v3, s19
3222; GFX9-NEXT:    v_mov_b32_e32 v4, s20
3223; GFX9-NEXT:    v_mov_b32_e32 v5, s21
3224; GFX9-NEXT:    v_mov_b32_e32 v6, s22
3225; GFX9-NEXT:    v_mov_b32_e32 v7, s23
3226; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3227; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3228; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3229; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3230; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3231; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
3232; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3233; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3234; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3235; GFX9-NEXT:    v_mov_b32_e32 v8, 0
3236; GFX9-NEXT:    v_mov_b32_e32 v9, 0
3237; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3238; GFX9-NEXT:    s_mov_b64 s[0:1], 16
3239; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3240; GFX9-NEXT:    global_store_dwordx4 v0, v[4:7], s[0:1]
3241; GFX9-NEXT:    s_endpgm
3242;
3243; GFX8-LABEL: insertelement_s_v16i16_s_v:
3244; GFX8:       ; %bb.0:
3245; GFX8-NEXT:    s_load_dwordx8 s[16:23], s[2:3], 0x0
3246; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 1, v0
3247; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3248; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3249; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3250; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3251; GFX8-NEXT:    v_mov_b32_e32 v1, s16
3252; GFX8-NEXT:    v_mov_b32_e32 v2, s17
3253; GFX8-NEXT:    v_mov_b32_e32 v3, s18
3254; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3255; GFX8-NEXT:    v_mov_b32_e32 v4, s19
3256; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
3257; GFX8-NEXT:    v_mov_b32_e32 v5, s20
3258; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
3259; GFX8-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
3260; GFX8-NEXT:    v_mov_b32_e32 v6, s21
3261; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
3262; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3263; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
3264; GFX8-NEXT:    s_mov_b32 s5, 0xffff
3265; GFX8-NEXT:    v_mov_b32_e32 v7, s22
3266; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
3267; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3268; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
3269; GFX8-NEXT:    s_and_b32 s4, s4, s5
3270; GFX8-NEXT:    v_mov_b32_e32 v9, s23
3271; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
3272; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3273; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
3274; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
3275; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
3276; GFX8-NEXT:    v_xor_b32_e32 v0, -1, v0
3277; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
3278; GFX8-NEXT:    v_or_b32_e32 v9, v0, v2
3279; GFX8-NEXT:    v_mov_b32_e32 v0, s16
3280; GFX8-NEXT:    v_mov_b32_e32 v1, s17
3281; GFX8-NEXT:    v_mov_b32_e32 v2, s18
3282; GFX8-NEXT:    v_mov_b32_e32 v3, s19
3283; GFX8-NEXT:    v_mov_b32_e32 v4, s20
3284; GFX8-NEXT:    v_mov_b32_e32 v5, s21
3285; GFX8-NEXT:    v_mov_b32_e32 v6, s22
3286; GFX8-NEXT:    v_mov_b32_e32 v7, s23
3287; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3288; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3289; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3290; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3291; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3292; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
3293; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3294; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3295; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3296; GFX8-NEXT:    v_mov_b32_e32 v8, 0
3297; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3298; GFX8-NEXT:    v_mov_b32_e32 v10, 16
3299; GFX8-NEXT:    v_mov_b32_e32 v11, 0
3300; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3301; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
3302; GFX8-NEXT:    s_endpgm
3303;
3304; GFX7-LABEL: insertelement_s_v16i16_s_v:
3305; GFX7:       ; %bb.0:
3306; GFX7-NEXT:    s_load_dwordx8 s[16:23], s[2:3], 0x0
3307; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 1, v0
3308; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3309; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3310; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3311; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3312; GFX7-NEXT:    v_mov_b32_e32 v1, s16
3313; GFX7-NEXT:    v_mov_b32_e32 v2, s17
3314; GFX7-NEXT:    v_mov_b32_e32 v3, s18
3315; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
3316; GFX7-NEXT:    v_mov_b32_e32 v4, s19
3317; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
3318; GFX7-NEXT:    v_mov_b32_e32 v5, s20
3319; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
3320; GFX7-NEXT:    v_cmp_eq_u32_e64 s[14:15], 4, v8
3321; GFX7-NEXT:    v_mov_b32_e32 v6, s21
3322; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[14:15]
3323; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3324; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
3325; GFX7-NEXT:    s_mov_b32 s5, 0xffff
3326; GFX7-NEXT:    v_mov_b32_e32 v7, s22
3327; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
3328; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3329; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
3330; GFX7-NEXT:    s_and_b32 s4, s4, s5
3331; GFX7-NEXT:    v_mov_b32_e32 v9, s23
3332; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
3333; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3334; GFX7-NEXT:    v_lshl_b32_e32 v2, s4, v0
3335; GFX7-NEXT:    v_lshl_b32_e32 v0, s5, v0
3336; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
3337; GFX7-NEXT:    v_xor_b32_e32 v0, -1, v0
3338; GFX7-NEXT:    v_and_b32_e32 v0, v1, v0
3339; GFX7-NEXT:    v_or_b32_e32 v9, v0, v2
3340; GFX7-NEXT:    v_mov_b32_e32 v0, s16
3341; GFX7-NEXT:    v_mov_b32_e32 v1, s17
3342; GFX7-NEXT:    v_mov_b32_e32 v2, s18
3343; GFX7-NEXT:    v_mov_b32_e32 v3, s19
3344; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3345; GFX7-NEXT:    v_mov_b32_e32 v4, s20
3346; GFX7-NEXT:    v_mov_b32_e32 v5, s21
3347; GFX7-NEXT:    v_mov_b32_e32 v6, s22
3348; GFX7-NEXT:    v_mov_b32_e32 v7, s23
3349; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3350; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3351; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3352; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3353; GFX7-NEXT:    s_mov_b64 s[0:1], 0
3354; GFX7-NEXT:    s_mov_b32 s2, -1
3355; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3356; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[14:15]
3357; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3358; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3359; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3360; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3361; GFX7-NEXT:    s_mov_b64 s[0:1], 16
3362; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
3363; GFX7-NEXT:    s_endpgm
3364;
3365; GFX10-LABEL: insertelement_s_v16i16_s_v:
3366; GFX10:       ; %bb.0:
3367; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3368; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 1, v0
3369; GFX10-NEXT:    v_and_b32_e32 v0, 1, v0
3370; GFX10-NEXT:    s_mov_b32 s5, 0xffff
3371; GFX10-NEXT:    v_mov_b32_e32 v12, 0
3372; GFX10-NEXT:    s_and_b32 s6, s4, s5
3373; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
3374; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v10
3375; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
3376; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v10
3377; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v10
3378; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
3379; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v10
3380; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v0, s5
3381; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v10
3382; GFX10-NEXT:    v_lshlrev_b32_e64 v8, v0, s6
3383; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v10
3384; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3385; GFX10-NEXT:    v_mov_b32_e32 v1, s9
3386; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v2
3387; GFX10-NEXT:    v_cndmask_b32_e32 v1, s8, v1, vcc_lo
3388; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s10, s0
3389; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s11, s1
3390; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s12, s2
3391; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s13, s3
3392; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s14, s4
3393; GFX10-NEXT:    v_cndmask_b32_e64 v11, v1, s15, s5
3394; GFX10-NEXT:    v_mov_b32_e32 v0, s8
3395; GFX10-NEXT:    v_mov_b32_e32 v1, s9
3396; GFX10-NEXT:    v_mov_b32_e32 v2, s10
3397; GFX10-NEXT:    v_mov_b32_e32 v3, s11
3398; GFX10-NEXT:    v_and_or_b32 v11, v11, v9, v8
3399; GFX10-NEXT:    v_mov_b32_e32 v4, s12
3400; GFX10-NEXT:    v_mov_b32_e32 v5, s13
3401; GFX10-NEXT:    v_mov_b32_e32 v6, s14
3402; GFX10-NEXT:    v_mov_b32_e32 v7, s15
3403; GFX10-NEXT:    v_mov_b32_e32 v8, 0
3404; GFX10-NEXT:    v_mov_b32_e32 v9, 0
3405; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s6
3406; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
3407; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s0
3408; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s1
3409; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v11, s2
3410; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s3
3411; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s4
3412; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s5
3413; GFX10-NEXT:    s_mov_b64 s[0:1], 16
3414; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3415; GFX10-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1]
3416; GFX10-NEXT:    s_endpgm
; Test body: load the vector, insert %val at %idx, store the whole result.
; The store target is a null addrspace(1) pointer; presumably its only purpose
; is to keep the inserted vector live (TODO confirm intent with test author).
3417  %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
3418  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
3419  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
3420  ret void
3421}
3422
3423define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg %ptr, i16 %val, i32 %idx) {
3424; GFX9-LABEL: insertelement_s_v16i16_v_v:
3425; GFX9:       ; %bb.0:
3426; GFX9-NEXT:    s_load_dwordx8 s[12:19], s[2:3], 0x0
3427; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 1, v1
3428; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3429; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3430; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3431; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3432; GFX9-NEXT:    v_mov_b32_e32 v2, s12
3433; GFX9-NEXT:    v_mov_b32_e32 v3, s13
3434; GFX9-NEXT:    v_mov_b32_e32 v4, s14
3435; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
3436; GFX9-NEXT:    v_mov_b32_e32 v5, s15
3437; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
3438; GFX9-NEXT:    v_mov_b32_e32 v6, s16
3439; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
3440; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
3441; GFX9-NEXT:    v_mov_b32_e32 v7, s17
3442; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
3443; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3444; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
3445; GFX9-NEXT:    v_mov_b32_e32 v9, s18
3446; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
3447; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3448; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
3449; GFX9-NEXT:    s_mov_b32 s20, 0xffff
3450; GFX9-NEXT:    v_mov_b32_e32 v10, s19
3451; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
3452; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3453; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3454; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
3455; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
3456; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
3457; GFX9-NEXT:    v_and_or_b32 v9, v2, v1, v0
3458; GFX9-NEXT:    v_mov_b32_e32 v0, s12
3459; GFX9-NEXT:    v_mov_b32_e32 v1, s13
3460; GFX9-NEXT:    v_mov_b32_e32 v2, s14
3461; GFX9-NEXT:    v_mov_b32_e32 v3, s15
3462; GFX9-NEXT:    v_mov_b32_e32 v4, s16
3463; GFX9-NEXT:    v_mov_b32_e32 v5, s17
3464; GFX9-NEXT:    v_mov_b32_e32 v6, s18
3465; GFX9-NEXT:    v_mov_b32_e32 v7, s19
3466; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3467; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3468; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3469; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3470; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3471; GFX9-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
3472; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3473; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3474; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3475; GFX9-NEXT:    v_mov_b32_e32 v8, 0
3476; GFX9-NEXT:    v_mov_b32_e32 v9, 0
3477; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3478; GFX9-NEXT:    s_mov_b64 s[0:1], 16
3479; GFX9-NEXT:    v_mov_b32_e32 v0, 0
3480; GFX9-NEXT:    global_store_dwordx4 v0, v[4:7], s[0:1]
3481; GFX9-NEXT:    s_endpgm
3482;
3483; GFX8-LABEL: insertelement_s_v16i16_v_v:
3484; GFX8:       ; %bb.0:
3485; GFX8-NEXT:    s_load_dwordx8 s[12:19], s[2:3], 0x0
3486; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 1, v1
3487; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3488; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3489; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3490; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
3491; GFX8-NEXT:    v_mov_b32_e32 v2, s12
3492; GFX8-NEXT:    v_mov_b32_e32 v3, s13
3493; GFX8-NEXT:    v_mov_b32_e32 v4, s14
3494; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
3495; GFX8-NEXT:    v_mov_b32_e32 v5, s15
3496; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
3497; GFX8-NEXT:    v_mov_b32_e32 v6, s16
3498; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
3499; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
3500; GFX8-NEXT:    v_mov_b32_e32 v7, s17
3501; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
3502; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3503; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
3504; GFX8-NEXT:    v_mov_b32_e32 v9, s18
3505; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
3506; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3507; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
3508; GFX8-NEXT:    s_mov_b32 s20, 0xffff
3509; GFX8-NEXT:    v_mov_b32_e32 v10, s19
3510; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
3511; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3512; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3513; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
3514; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
3515; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
3516; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
3517; GFX8-NEXT:    v_or_b32_e32 v9, v1, v0
3518; GFX8-NEXT:    v_mov_b32_e32 v0, s12
3519; GFX8-NEXT:    v_mov_b32_e32 v1, s13
3520; GFX8-NEXT:    v_mov_b32_e32 v2, s14
3521; GFX8-NEXT:    v_mov_b32_e32 v3, s15
3522; GFX8-NEXT:    v_mov_b32_e32 v4, s16
3523; GFX8-NEXT:    v_mov_b32_e32 v5, s17
3524; GFX8-NEXT:    v_mov_b32_e32 v6, s18
3525; GFX8-NEXT:    v_mov_b32_e32 v7, s19
3526; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3527; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3528; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3529; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3530; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3531; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
3532; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3533; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3534; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3535; GFX8-NEXT:    v_mov_b32_e32 v8, 0
3536; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3537; GFX8-NEXT:    v_mov_b32_e32 v10, 16
3538; GFX8-NEXT:    v_mov_b32_e32 v11, 0
3539; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3540; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
3541; GFX8-NEXT:    s_endpgm
3542;
3543; GFX7-LABEL: insertelement_s_v16i16_v_v:
3544; GFX7:       ; %bb.0:
3545; GFX7-NEXT:    s_load_dwordx8 s[12:19], s[2:3], 0x0
3546; GFX7-NEXT:    v_lshrrev_b32_e32 v8, 1, v1
3547; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
3548; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v8
3549; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v8
3550; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
3551; GFX7-NEXT:    v_mov_b32_e32 v2, s12
3552; GFX7-NEXT:    v_mov_b32_e32 v3, s13
3553; GFX7-NEXT:    v_mov_b32_e32 v4, s14
3554; GFX7-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
3555; GFX7-NEXT:    v_mov_b32_e32 v5, s15
3556; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
3557; GFX7-NEXT:    v_mov_b32_e32 v6, s16
3558; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
3559; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v8
3560; GFX7-NEXT:    v_mov_b32_e32 v7, s17
3561; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
3562; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v8
3563; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
3564; GFX7-NEXT:    s_mov_b32 s20, 0xffff
3565; GFX7-NEXT:    v_mov_b32_e32 v9, s18
3566; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
3567; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
3568; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
3569; GFX7-NEXT:    v_and_b32_e32 v0, s20, v0
3570; GFX7-NEXT:    v_mov_b32_e32 v10, s19
3571; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
3572; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
3573; GFX7-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
3574; GFX7-NEXT:    v_lshl_b32_e32 v1, s20, v1
3575; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
3576; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
3577; GFX7-NEXT:    v_and_b32_e32 v1, v2, v1
3578; GFX7-NEXT:    v_or_b32_e32 v9, v1, v0
3579; GFX7-NEXT:    v_mov_b32_e32 v0, s12
3580; GFX7-NEXT:    v_mov_b32_e32 v1, s13
3581; GFX7-NEXT:    v_mov_b32_e32 v2, s14
3582; GFX7-NEXT:    v_mov_b32_e32 v3, s15
3583; GFX7-NEXT:    v_mov_b32_e32 v4, s16
3584; GFX7-NEXT:    v_mov_b32_e32 v5, s17
3585; GFX7-NEXT:    v_mov_b32_e32 v6, s18
3586; GFX7-NEXT:    v_mov_b32_e32 v7, s19
3587; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v8
3588; GFX7-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[12:13]
3589; GFX7-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc
3590; GFX7-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[0:1]
3591; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[2:3]
3592; GFX7-NEXT:    s_mov_b64 s[0:1], 0
3593; GFX7-NEXT:    s_mov_b32 s2, -1
3594; GFX7-NEXT:    s_mov_b32 s3, 0xf000
3595; GFX7-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
3596; GFX7-NEXT:    v_cndmask_b32_e64 v5, v5, v9, s[6:7]
3597; GFX7-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
3598; GFX7-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[10:11]
3599; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3600; GFX7-NEXT:    s_mov_b64 s[0:1], 16
3601; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
3602; GFX7-NEXT:    s_endpgm
3603;
3604; GFX10-LABEL: insertelement_s_v16i16_v_v:
3605; GFX10:       ; %bb.0:
3606; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
3607; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 1, v1
3608; GFX10-NEXT:    v_and_b32_e32 v1, 1, v1
3609; GFX10-NEXT:    s_mov_b32 s4, 0xffff
3610; GFX10-NEXT:    v_mov_b32_e32 v12, 0
3611; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
3612; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v10
3613; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v10
3614; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v10
3615; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
3616; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v10
3617; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v10
3618; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v10
3619; GFX10-NEXT:    v_lshlrev_b32_e64 v3, v1, s4
3620; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v10
3621; GFX10-NEXT:    v_lshlrev_b32_sdwa v8, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3622; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v3
3623; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3624; GFX10-NEXT:    v_mov_b32_e32 v2, s9
3625; GFX10-NEXT:    v_cndmask_b32_e32 v2, s8, v2, vcc_lo
3626; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s10, s0
3627; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s11, s1
3628; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s12, s2
3629; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s13, s3
3630; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s14, s4
3631; GFX10-NEXT:    v_cndmask_b32_e64 v11, v2, s15, s5
3632; GFX10-NEXT:    v_mov_b32_e32 v0, s8
3633; GFX10-NEXT:    v_mov_b32_e32 v1, s9
3634; GFX10-NEXT:    v_mov_b32_e32 v2, s10
3635; GFX10-NEXT:    v_mov_b32_e32 v3, s11
3636; GFX10-NEXT:    v_and_or_b32 v11, v11, v9, v8
3637; GFX10-NEXT:    v_mov_b32_e32 v4, s12
3638; GFX10-NEXT:    v_mov_b32_e32 v5, s13
3639; GFX10-NEXT:    v_mov_b32_e32 v6, s14
3640; GFX10-NEXT:    v_mov_b32_e32 v7, s15
3641; GFX10-NEXT:    v_mov_b32_e32 v8, 0
3642; GFX10-NEXT:    v_mov_b32_e32 v9, 0
3643; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v11, s6
3644; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc_lo
3645; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v11, s0
3646; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s1
3647; GFX10-NEXT:    v_cndmask_b32_e64 v4, v4, v11, s2
3648; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s3
3649; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s4
3650; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s5
3651; GFX10-NEXT:    s_mov_b64 s[0:1], 16
3652; GFX10-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3653; GFX10-NEXT:    global_store_dwordx4 v12, v[4:7], s[0:1]
3654; GFX10-NEXT:    s_endpgm
3655  %vec = load <16 x i16>, <16 x i16> addrspace(4)* %ptr
3656  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
3657  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
3658  ret void
3659}
3660
; Dynamic-index insertelement into <16 x i16>. The vector is loaded through the
; non-inreg addrspace(1) pointer %ptr, the inserted i16 %val is inreg, and the
; i32 %idx is a non-inreg runtime value. The updated vector is stored to a null
; addrspace(1) pointer.
;
; In the expected output below, the value is read from s2 and the index from v2.
; The dword holding the element is picked with idx >> 1 through a chain of
; v_cndmask selects, and the 0xffff element mask is shifted by (idx & 1) * 16
; before the masked merge.
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
3661define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
3662; GFX9-LABEL: insertelement_v_v16i16_s_v:
3663; GFX9:       ; %bb.0:
3664; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
3665; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
3666; GFX9-NEXT:    s_mov_b32 s0, 0xffff
3667; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
3668; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
3669; GFX9-NEXT:    s_and_b32 s1, s2, s0
3670; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
3671; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
3672; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
3673; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
3674; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
3675; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
3676; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
3677; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
3678; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
3679; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
3680; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
3681; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
3682; GFX9-NEXT:    s_waitcnt vmcnt(1)
3683; GFX9-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
3684; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
3685; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[2:3]
3686; GFX9-NEXT:    s_waitcnt vmcnt(0)
3687; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[4:5]
3688; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v8, s[6:7]
3689; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
3690; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
3691; GFX9-NEXT:    v_and_or_b32 v11, v11, v1, v2
3692; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
3693; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
3694; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
3695; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
3696; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
3697; GFX9-NEXT:    v_mov_b32_e32 v8, 0
3698; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
3699; GFX9-NEXT:    v_mov_b32_e32 v9, 0
3700; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
3701; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
3702; GFX9-NEXT:    s_mov_b64 s[0:1], 16
3703; GFX9-NEXT:    v_mov_b32_e32 v10, 0
3704; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3705; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
3706; GFX9-NEXT:    s_endpgm
3707;
3708; GFX8-LABEL: insertelement_v_v16i16_s_v:
3709; GFX8:       ; %bb.0:
3710; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 16, v0
3711; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
3712; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
3713; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[7:8]
3714; GFX8-NEXT:    s_mov_b32 s0, 0xffff
3715; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
3716; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
3717; GFX8-NEXT:    s_and_b32 s1, s2, s0
3718; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
3719; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
3720; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
3721; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
3722; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
3723; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
3724; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
3725; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
3726; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
3727; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
3728; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
3729; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
3730; GFX8-NEXT:    s_waitcnt vmcnt(1)
3731; GFX8-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
3732; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
3733; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[2:3]
3734; GFX8-NEXT:    s_waitcnt vmcnt(0)
3735; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[4:5]
3736; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v8, s[6:7]
3737; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
3738; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
3739; GFX8-NEXT:    v_and_b32_e32 v1, v11, v1
3740; GFX8-NEXT:    v_or_b32_e32 v11, v1, v2
3741; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
3742; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
3743; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
3744; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
3745; GFX8-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
3746; GFX8-NEXT:    v_mov_b32_e32 v8, 0
3747; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
3748; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
3749; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
3750; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3751; GFX8-NEXT:    v_mov_b32_e32 v10, 16
3752; GFX8-NEXT:    v_mov_b32_e32 v11, 0
3753; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3754; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
3755; GFX8-NEXT:    s_endpgm
3756;
3757; GFX7-LABEL: insertelement_v_v16i16_s_v:
3758; GFX7:       ; %bb.0:
3759; GFX7-NEXT:    s_mov_b32 s18, 0
3760; GFX7-NEXT:    s_mov_b32 s19, 0xf000
3761; GFX7-NEXT:    s_mov_b64 s[16:17], 0
3762; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
3763; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
3764; GFX7-NEXT:    s_mov_b32 s0, 0xffff
3765; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
3766; GFX7-NEXT:    v_and_b32_e32 v1, 1, v2
3767; GFX7-NEXT:    s_and_b32 s1, s2, s0
3768; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
3769; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
3770; GFX7-NEXT:    v_lshl_b32_e32 v2, s1, v1
3771; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
3772; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
3773; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
3774; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
3775; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
3776; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
3777; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
3778; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
3779; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
3780; GFX7-NEXT:    s_mov_b32 s18, -1
3781; GFX7-NEXT:    s_waitcnt vmcnt(1)
3782; GFX7-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
3783; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
3784; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[2:3]
3785; GFX7-NEXT:    s_waitcnt vmcnt(0)
3786; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[4:5]
3787; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v8, s[6:7]
3788; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
3789; GFX7-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
3790; GFX7-NEXT:    v_and_b32_e32 v1, v11, v1
3791; GFX7-NEXT:    v_or_b32_e32 v11, v1, v2
3792; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
3793; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
3794; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
3795; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
3796; GFX7-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
3797; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
3798; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
3799; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
3800; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
3801; GFX7-NEXT:    s_mov_b64 s[16:17], 16
3802; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
3803; GFX7-NEXT:    s_endpgm
3804;
3805; GFX10-LABEL: insertelement_v_v16i16_s_v:
3806; GFX10:       ; %bb.0:
3807; GFX10-NEXT:    s_clause 0x1
3808; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
3809; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
3810; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
3811; GFX10-NEXT:    v_and_b32_e32 v2, 1, v2
3812; GFX10-NEXT:    s_mov_b32 s5, 0xffff
3813; GFX10-NEXT:    v_mov_b32_e32 v14, 0
3814; GFX10-NEXT:    s_and_b32 s6, s2, s5
3815; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
3816; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
3817; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
3818; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 4, v0
3819; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 5, v0
3820; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
3821; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 6, v0
3822; GFX10-NEXT:    v_lshlrev_b32_e64 v11, v2, s5
3823; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
3824; GFX10-NEXT:    v_lshlrev_b32_e64 v2, v2, s6
3825; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
3826; GFX10-NEXT:    v_xor_b32_e32 v11, -1, v11
3827; GFX10-NEXT:    s_waitcnt vmcnt(1)
3828; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
3829; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
3830; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s1
3831; GFX10-NEXT:    s_waitcnt vmcnt(0)
3832; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s3
3833; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s4
3834; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s2
3835; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s5
3836; GFX10-NEXT:    v_and_or_b32 v13, v1, v11, v2
3837; GFX10-NEXT:    v_mov_b32_e32 v11, 0
3838; GFX10-NEXT:    v_mov_b32_e32 v12, 0
3839; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v13, s6
3840; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v13, vcc_lo
3841; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v13, s0
3842; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v13, s1
3843; GFX10-NEXT:    v_cndmask_b32_e64 v4, v7, v13, s3
3844; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v13, s4
3845; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s2
3846; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v13, s5
3847; GFX10-NEXT:    s_mov_b64 s[0:1], 16
3848; GFX10-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off
3849; GFX10-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
3850; GFX10-NEXT:    s_endpgm
; IR under test: load the whole vector, replace element %idx with %val, and
; store the result to the null global pointer.
3851  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
3852  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
3853  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
3854  ret void
3855}
3856
; Dynamic-index insertelement into <16 x i16>. The vector is loaded through the
; non-inreg addrspace(1) pointer %ptr, the inserted i16 %val is non-inreg, and
; the i32 %idx is inreg. The updated vector is stored to a null addrspace(1)
; pointer.
;
; In the expected output below, the index is read from s2 and the value from v2.
; The dword index (idx >> 1), the bit shift ((idx & 1) * 16) and the inverted
; 0xffff mask are all produced by scalar instructions, and the per-dword
; compares take the scalar dword index as an operand.
;
; The check lines are autogenerated (see the note on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
3857define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
3858; GFX9-LABEL: insertelement_v_v16i16_v_s:
3859; GFX9:       ; %bb.0:
3860; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
3861; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
3862; GFX9-NEXT:    s_and_b32 s1, s2, 1
3863; GFX9-NEXT:    s_mov_b32 s0, 0xffff
3864; GFX9-NEXT:    s_lshr_b32 s12, s2, 1
3865; GFX9-NEXT:    s_lshl_b32 s1, s1, 4
3866; GFX9-NEXT:    s_lshl_b32 s0, s0, s1
3867; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
3868; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3869; GFX9-NEXT:    s_not_b32 s13, s0
3870; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
3871; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
3872; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
3873; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
3874; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
3875; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
3876; GFX9-NEXT:    s_waitcnt vmcnt(1)
3877; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
3878; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
3879; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
3880; GFX9-NEXT:    s_waitcnt vmcnt(0)
3881; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
3882; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
3883; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[8:9]
3884; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[10:11]
3885; GFX9-NEXT:    v_and_or_b32 v11, v1, s13, v0
3886; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
3887; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
3888; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
3889; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
3890; GFX9-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
3891; GFX9-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
3892; GFX9-NEXT:    v_mov_b32_e32 v8, 0
3893; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
3894; GFX9-NEXT:    v_mov_b32_e32 v9, 0
3895; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
3896; GFX9-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
3897; GFX9-NEXT:    s_mov_b64 s[0:1], 16
3898; GFX9-NEXT:    v_mov_b32_e32 v10, 0
3899; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
3900; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
3901; GFX9-NEXT:    s_endpgm
3902;
3903; GFX8-LABEL: insertelement_v_v16i16_v_s:
3904; GFX8:       ; %bb.0:
3905; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 16, v0
3906; GFX8-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
3907; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
3908; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[7:8]
3909; GFX8-NEXT:    s_and_b32 s1, s2, 1
3910; GFX8-NEXT:    s_mov_b32 s0, 0xffff
3911; GFX8-NEXT:    s_lshr_b32 s12, s2, 1
3912; GFX8-NEXT:    s_lshl_b32 s1, s1, 4
3913; GFX8-NEXT:    s_lshl_b32 s0, s0, s1
3914; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
3915; GFX8-NEXT:    v_mov_b32_e32 v0, s1
3916; GFX8-NEXT:    s_not_b32 s13, s0
3917; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
3918; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
3919; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
3920; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
3921; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
3922; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
3923; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
3924; GFX8-NEXT:    s_waitcnt vmcnt(1)
3925; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
3926; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
3927; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
3928; GFX8-NEXT:    s_waitcnt vmcnt(0)
3929; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
3930; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
3931; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[8:9]
3932; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[10:11]
3933; GFX8-NEXT:    v_and_b32_e32 v1, s13, v1
3934; GFX8-NEXT:    v_or_b32_e32 v11, v1, v0
3935; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
3936; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
3937; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
3938; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
3939; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
3940; GFX8-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
3941; GFX8-NEXT:    v_mov_b32_e32 v8, 0
3942; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
3943; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
3944; GFX8-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
3945; GFX8-NEXT:    v_mov_b32_e32 v9, 0
3946; GFX8-NEXT:    v_mov_b32_e32 v10, 16
3947; GFX8-NEXT:    v_mov_b32_e32 v11, 0
3948; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
3949; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
3950; GFX8-NEXT:    s_endpgm
3951;
3952; GFX7-LABEL: insertelement_v_v16i16_v_s:
3953; GFX7:       ; %bb.0:
3954; GFX7-NEXT:    s_mov_b32 s18, 0
3955; GFX7-NEXT:    s_mov_b32 s19, 0xf000
3956; GFX7-NEXT:    s_mov_b64 s[16:17], 0
3957; GFX7-NEXT:    buffer_load_dwordx4 v[3:6], v[0:1], s[16:19], 0 addr64
3958; GFX7-NEXT:    buffer_load_dwordx4 v[7:10], v[0:1], s[16:19], 0 addr64 offset:16
3959; GFX7-NEXT:    s_and_b32 s1, s2, 1
3960; GFX7-NEXT:    s_mov_b32 s0, 0xffff
3961; GFX7-NEXT:    s_lshr_b32 s12, s2, 1
3962; GFX7-NEXT:    s_lshl_b32 s1, s1, 4
3963; GFX7-NEXT:    v_and_b32_e32 v0, s0, v2
3964; GFX7-NEXT:    s_lshl_b32 s0, s0, s1
3965; GFX7-NEXT:    v_cmp_eq_u32_e64 vcc, s12, 1
3966; GFX7-NEXT:    v_lshlrev_b32_e32 v0, s1, v0
3967; GFX7-NEXT:    s_not_b32 s13, s0
3968; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], s12, 2
3969; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, 3
3970; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], s12, 4
3971; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], s12, 5
3972; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], s12, 6
3973; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], s12, 7
3974; GFX7-NEXT:    s_mov_b32 s18, -1
3975; GFX7-NEXT:    s_waitcnt vmcnt(1)
3976; GFX7-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
3977; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[0:1]
3978; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
3979; GFX7-NEXT:    s_waitcnt vmcnt(0)
3980; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
3981; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
3982; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[8:9]
3983; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[10:11]
3984; GFX7-NEXT:    v_and_b32_e32 v1, s13, v1
3985; GFX7-NEXT:    v_or_b32_e32 v11, v1, v0
3986; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], s12, 0
3987; GFX7-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
3988; GFX7-NEXT:    v_cndmask_b32_e32 v1, v4, v11, vcc
3989; GFX7-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
3990; GFX7-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
3991; GFX7-NEXT:    v_cndmask_b32_e64 v4, v7, v11, s[4:5]
3992; GFX7-NEXT:    v_cndmask_b32_e64 v5, v8, v11, s[6:7]
3993; GFX7-NEXT:    v_cndmask_b32_e64 v6, v9, v11, s[8:9]
3994; GFX7-NEXT:    v_cndmask_b32_e64 v7, v10, v11, s[10:11]
3995; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
3996; GFX7-NEXT:    s_mov_b64 s[16:17], 16
3997; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
3998; GFX7-NEXT:    s_endpgm
3999;
4000; GFX10-LABEL: insertelement_v_v16i16_v_s:
4001; GFX10:       ; %bb.0:
4002; GFX10-NEXT:    s_clause 0x1
4003; GFX10-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
4004; GFX10-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
4005; GFX10-NEXT:    s_lshr_b32 s6, s2, 1
4006; GFX10-NEXT:    s_and_b32 s5, s2, 1
4007; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s6, 1
4008; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s6, 2
4009; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s6, 3
4010; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, s6, 4
4011; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, s6, 5
4012; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, s6, 6
4013; GFX10-NEXT:    s_lshl_b32 s7, s5, 4
4014; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, s6, 7
4015; GFX10-NEXT:    s_mov_b32 s8, 0xffff
4016; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4017; GFX10-NEXT:    s_lshl_b32 s7, s8, s7
4018; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, s6, 0
4019; GFX10-NEXT:    s_not_b32 s7, s7
4020; GFX10-NEXT:    v_mov_b32_e32 v11, 0
4021; GFX10-NEXT:    v_mov_b32_e32 v12, 0
4022; GFX10-NEXT:    v_mov_b32_e32 v14, 0
4023; GFX10-NEXT:    s_waitcnt vmcnt(1)
4024; GFX10-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc_lo
4025; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s0
4026; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s1
4027; GFX10-NEXT:    s_waitcnt vmcnt(0)
4028; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s3
4029; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s4
4030; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s2
4031; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v10, s5
4032; GFX10-NEXT:    v_and_or_b32 v13, v0, s7, v1
4033; GFX10-NEXT:    v_cndmask_b32_e64 v0, v3, v13, s6
4034; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v13, vcc_lo
4035; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v13, s0
4036; GFX10-NEXT:    v_cndmask_b32_e64 v3, v6, v13, s1
4037; GFX10-NEXT:    v_cndmask_b32_e64 v4, v7, v13, s3
4038; GFX10-NEXT:    v_cndmask_b32_e64 v5, v8, v13, s4
4039; GFX10-NEXT:    v_cndmask_b32_e64 v6, v9, v13, s2
4040; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v13, s5
4041; GFX10-NEXT:    s_mov_b64 s[0:1], 16
4042; GFX10-NEXT:    global_store_dwordx4 v[11:12], v[0:3], off
4043; GFX10-NEXT:    global_store_dwordx4 v14, v[4:7], s[0:1]
4044; GFX10-NEXT:    s_endpgm
; IR under test: load the whole vector, replace element %idx with %val, and
; store the result to the null global pointer.
4045  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
4046  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
4047  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
4048  ret void
4049}
4050
4051define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
4052; GFX9-LABEL: insertelement_v_v16i16_v_v:
4053; GFX9:       ; %bb.0:
4054; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
4055; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
4056; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
4057; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
4058; GFX9-NEXT:    s_mov_b32 s0, 0xffff
4059; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
4060; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
4061; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4062; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
4063; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
4064; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
4065; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
4066; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
4067; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
4068; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
4069; GFX9-NEXT:    v_xor_b32_e32 v1, -1, v1
4070; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
4071; GFX9-NEXT:    s_waitcnt vmcnt(1)
4072; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
4073; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
4074; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
4075; GFX9-NEXT:    s_waitcnt vmcnt(0)
4076; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
4077; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
4078; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
4079; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
4080; GFX9-NEXT:    v_and_or_b32 v12, v3, v1, v2
4081; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v12, s[12:13]
4082; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
4083; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
4084; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v12, s[6:7]
4085; GFX9-NEXT:    v_mov_b32_e32 v8, 0
4086; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
4087; GFX9-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
4088; GFX9-NEXT:    v_mov_b32_e32 v9, 0
4089; GFX9-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
4090; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
4091; GFX9-NEXT:    s_mov_b64 s[0:1], 16
4092; GFX9-NEXT:    v_mov_b32_e32 v10, 0
4093; GFX9-NEXT:    global_store_dwordx4 v[8:9], v[0:3], off
4094; GFX9-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1]
4095; GFX9-NEXT:    s_endpgm
4096;
4097; GFX8-LABEL: insertelement_v_v16i16_v_v:
4098; GFX8:       ; %bb.0:
4099; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 16, v0
4100; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
4101; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
4102; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[8:9]
4103; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
4104; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
4105; GFX8-NEXT:    s_mov_b32 s0, 0xffff
4106; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
4107; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
4108; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4109; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
4110; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
4111; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
4112; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
4113; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
4114; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
4115; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
4116; GFX8-NEXT:    v_xor_b32_e32 v1, -1, v1
4117; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
4118; GFX8-NEXT:    s_waitcnt vmcnt(1)
4119; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
4120; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
4121; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
4122; GFX8-NEXT:    s_waitcnt vmcnt(0)
4123; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
4124; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
4125; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
4126; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
4127; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
4128; GFX8-NEXT:    v_or_b32_e32 v12, v1, v2
4129; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v12, s[12:13]
4130; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
4131; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
4132; GFX8-NEXT:    v_cndmask_b32_e64 v5, v9, v12, s[6:7]
4133; GFX8-NEXT:    v_mov_b32_e32 v8, 0
4134; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
4135; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
4136; GFX8-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
4137; GFX8-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
4138; GFX8-NEXT:    v_mov_b32_e32 v9, 0
4139; GFX8-NEXT:    v_mov_b32_e32 v10, 16
4140; GFX8-NEXT:    v_mov_b32_e32 v11, 0
4141; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
4142; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
4143; GFX8-NEXT:    s_endpgm
4144;
4145; GFX7-LABEL: insertelement_v_v16i16_v_v:
4146; GFX7:       ; %bb.0:
4147; GFX7-NEXT:    s_mov_b32 s18, 0
4148; GFX7-NEXT:    s_mov_b32 s19, 0xf000
4149; GFX7-NEXT:    s_mov_b64 s[16:17], 0
4150; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], v[0:1], s[16:19], 0 addr64
4151; GFX7-NEXT:    buffer_load_dwordx4 v[8:11], v[0:1], s[16:19], 0 addr64 offset:16
4152; GFX7-NEXT:    s_mov_b32 s0, 0xffff
4153; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
4154; GFX7-NEXT:    v_and_b32_e32 v1, 1, v3
4155; GFX7-NEXT:    v_and_b32_e32 v2, s0, v2
4156; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
4157; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
4158; GFX7-NEXT:    v_lshlrev_b32_e32 v2, v1, v2
4159; GFX7-NEXT:    v_lshl_b32_e32 v1, s0, v1
4160; GFX7-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
4161; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
4162; GFX7-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
4163; GFX7-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
4164; GFX7-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
4165; GFX7-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
4166; GFX7-NEXT:    v_xor_b32_e32 v1, -1, v1
4167; GFX7-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
4168; GFX7-NEXT:    s_mov_b32 s18, -1
4169; GFX7-NEXT:    s_waitcnt vmcnt(1)
4170; GFX7-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
4171; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
4172; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
4173; GFX7-NEXT:    s_waitcnt vmcnt(0)
4174; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
4175; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
4176; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
4177; GFX7-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
4178; GFX7-NEXT:    v_and_b32_e32 v1, v3, v1
4179; GFX7-NEXT:    v_or_b32_e32 v12, v1, v2
4180; GFX7-NEXT:    v_cndmask_b32_e64 v0, v4, v12, s[12:13]
4181; GFX7-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
4182; GFX7-NEXT:    v_cndmask_b32_e64 v2, v6, v12, s[0:1]
4183; GFX7-NEXT:    v_cndmask_b32_e64 v3, v7, v12, s[2:3]
4184; GFX7-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
4185; GFX7-NEXT:    v_cndmask_b32_e64 v5, v9, v12, s[6:7]
4186; GFX7-NEXT:    v_cndmask_b32_e64 v6, v10, v12, s[8:9]
4187; GFX7-NEXT:    v_cndmask_b32_e64 v7, v11, v12, s[10:11]
4188; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
4189; GFX7-NEXT:    s_mov_b64 s[16:17], 16
4190; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
4191; GFX7-NEXT:    s_endpgm
4192;
4193; GFX10-LABEL: insertelement_v_v16i16_v_v:
4194; GFX10:       ; %bb.0:
4195; GFX10-NEXT:    s_clause 0x1
4196; GFX10-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
4197; GFX10-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
4198; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
4199; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
4200; GFX10-NEXT:    s_mov_b32 s4, 0xffff
4201; GFX10-NEXT:    v_mov_b32_e32 v15, 0
4202; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
4203; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 2, v0
4204; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 3, v0
4205; GFX10-NEXT:    v_cmp_eq_u32_e64 s2, 4, v0
4206; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
4207; GFX10-NEXT:    v_cmp_eq_u32_e64 s3, 5, v0
4208; GFX10-NEXT:    v_cmp_eq_u32_e64 s5, 7, v0
4209; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v0
4210; GFX10-NEXT:    v_lshlrev_b32_e64 v12, v3, s4
4211; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 6, v0
4212; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
4213; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v12
4214; GFX10-NEXT:    v_mov_b32_e32 v12, 0
4215; GFX10-NEXT:    v_mov_b32_e32 v13, 0
4216; GFX10-NEXT:    s_waitcnt vmcnt(1)
4217; GFX10-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
4218; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
4219; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s1
4220; GFX10-NEXT:    s_waitcnt vmcnt(0)
4221; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s2
4222; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s3
4223; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s4
4224; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s5
4225; GFX10-NEXT:    v_and_or_b32 v14, v1, v3, v2
4226; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v14, s6
4227; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v14, vcc_lo
4228; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v14, s0
4229; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v14, s1
4230; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v14, s2
4231; GFX10-NEXT:    v_cndmask_b32_e64 v5, v9, v14, s3
4232; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v14, s4
4233; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v14, s5
4234; GFX10-NEXT:    s_mov_b64 s[0:1], 16
4235; GFX10-NEXT:    global_store_dwordx4 v[12:13], v[0:3], off
4236; GFX10-NEXT:    global_store_dwordx4 v15, v[4:7], s[0:1]
4237; GFX10-NEXT:    s_endpgm
4238  %vec = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
4239  %insert = insertelement <16 x i16> %vec, i16 %val, i32 %idx
4240  store <16 x i16> %insert, <16 x i16> addrspace(1)* null
4241  ret void
4242}
4243