1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
3; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,VI %s
4; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=CIVI,CI %s
5
; Scalar case: load a <2 x i16> through an addrspace(4) pointer, overwrite
; element 0 with the constant 999 (0x3e7) and store the vector to %out.
; The GFX9 checks expect one s_pack_lh_b32_b16; the shared CI and VI checks
; expect the low half to be masked off (s_and_b32 with 0xffff0000) and the
; constant OR'ed in.
6define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
7; GFX9-LABEL: s_insertelement_v2i16_0:
8; GFX9:       ; %bb.0:
9; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
10; GFX9-NEXT:    v_mov_b32_e32 v0, 0
11; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
12; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    s_pack_lh_b32_b16 s2, 0x3e7, s2
15; GFX9-NEXT:    v_mov_b32_e32 v1, s2
16; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
17; GFX9-NEXT:    s_endpgm
18;
19; CIVI-LABEL: s_insertelement_v2i16_0:
20; CIVI:       ; %bb.0:
21; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
22; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
23; CIVI-NEXT:    v_mov_b32_e32 v0, s0
24; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
25; CIVI-NEXT:    v_mov_b32_e32 v1, s1
26; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
27; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff0000
28; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e7
29; CIVI-NEXT:    v_mov_b32_e32 v2, s0
30; CIVI-NEXT:    flat_store_dword v[0:1], v2
31; CIVI-NEXT:    s_endpgm
32  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
33  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
34  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
35  ret void
36}
37
38
; Scalar case: element 0 of the loaded <2 x i16> is replaced by the i16 kernel
; argument %elt. %elt follows an unnamed [8 x i32] argument; the checks load it
; from kernarg offset 0x30 on GFX9 and VI, and from 0xc on CI.
; On GFX9 the insert is expected to be a single s_pack_lh_b32_b16; CI and VI
; mask %elt to 16 bits and OR it with the preserved high half.
39define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
40; GFX9-LABEL: s_insertelement_v2i16_0_reg:
41; GFX9:       ; %bb.0:
42; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
43; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
44; GFX9-NEXT:    v_mov_b32_e32 v0, 0
45; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
46; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
47; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
48; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s6, s2
49; GFX9-NEXT:    v_mov_b32_e32 v1, s2
50; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
51; GFX9-NEXT:    s_endpgm
52;
53; VI-LABEL: s_insertelement_v2i16_0_reg:
54; VI:       ; %bb.0:
55; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
56; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
57; VI-NEXT:    s_waitcnt lgkmcnt(0)
58; VI-NEXT:    v_mov_b32_e32 v0, s0
59; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
60; VI-NEXT:    v_mov_b32_e32 v1, s1
61; VI-NEXT:    s_and_b32 s1, s4, 0xffff
62; VI-NEXT:    s_waitcnt lgkmcnt(0)
63; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
64; VI-NEXT:    s_or_b32 s0, s1, s0
65; VI-NEXT:    v_mov_b32_e32 v2, s0
66; VI-NEXT:    flat_store_dword v[0:1], v2
67; VI-NEXT:    s_endpgm
68;
69; CI-LABEL: s_insertelement_v2i16_0_reg:
70; CI:       ; %bb.0:
71; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
72; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
73; CI-NEXT:    s_waitcnt lgkmcnt(0)
74; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
75; CI-NEXT:    v_mov_b32_e32 v0, s0
76; CI-NEXT:    v_mov_b32_e32 v1, s1
77; CI-NEXT:    s_and_b32 s1, s4, 0xffff
78; CI-NEXT:    s_waitcnt lgkmcnt(0)
79; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
80; CI-NEXT:    s_or_b32 s0, s1, s0
81; CI-NEXT:    v_mov_b32_e32 v2, s0
82; CI-NEXT:    flat_store_dword v[0:1], v2
83; CI-NEXT:    s_endpgm
84  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
85  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
86  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
87  ret void
88}
89
; Same insert into element 0 as s_insertelement_v2i16_0_reg, but element 1 of
; the loaded vector is additionally extracted, zero-extended and passed to an
; inline asm "s" operand, so the high half has a second use.
; With that extra use the GFX9 checks expect the shifted-down high half
; (s_lshr_b32) to be shared between the asm use and an s_pack_ll_b32_b16,
; rather than the s_pack_lh_b32_b16 used in the single-use test.
90define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
91; GFX9-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
92; GFX9:       ; %bb.0:
93; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
94; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
95; GFX9-NEXT:    v_mov_b32_e32 v0, 0
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
98; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
99; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
100; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s2
101; GFX9-NEXT:    v_mov_b32_e32 v1, s3
102; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
103; GFX9-NEXT:    ;;#ASMSTART
104; GFX9-NEXT:    ; use s2
105; GFX9-NEXT:    ;;#ASMEND
106; GFX9-NEXT:    s_endpgm
107;
108; VI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
109; VI:       ; %bb.0:
110; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
111; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
112; VI-NEXT:    s_waitcnt lgkmcnt(0)
113; VI-NEXT:    v_mov_b32_e32 v0, s0
114; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
115; VI-NEXT:    v_mov_b32_e32 v1, s1
116; VI-NEXT:    s_and_b32 s1, s4, 0xffff
117; VI-NEXT:    s_waitcnt lgkmcnt(0)
118; VI-NEXT:    s_lshr_b32 s2, s0, 16
119; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
120; VI-NEXT:    s_or_b32 s0, s1, s0
121; VI-NEXT:    v_mov_b32_e32 v2, s0
122; VI-NEXT:    flat_store_dword v[0:1], v2
123; VI-NEXT:    ;;#ASMSTART
124; VI-NEXT:    ; use s2
125; VI-NEXT:    ;;#ASMEND
126; VI-NEXT:    s_endpgm
127;
128; CI-LABEL: s_insertelement_v2i16_0_multi_use_hi_reg:
129; CI:       ; %bb.0:
130; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
131; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
132; CI-NEXT:    s_waitcnt lgkmcnt(0)
133; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
134; CI-NEXT:    v_mov_b32_e32 v1, s1
135; CI-NEXT:    v_mov_b32_e32 v0, s0
136; CI-NEXT:    s_and_b32 s0, s4, 0xffff
137; CI-NEXT:    s_waitcnt lgkmcnt(0)
138; CI-NEXT:    s_lshr_b32 s1, s2, 16
139; CI-NEXT:    s_lshl_b32 s2, s1, 16
140; CI-NEXT:    s_or_b32 s0, s0, s2
141; CI-NEXT:    v_mov_b32_e32 v2, s0
142; CI-NEXT:    flat_store_dword v[0:1], v2
143; CI-NEXT:    ;;#ASMSTART
144; CI-NEXT:    ; use s1
145; CI-NEXT:    ;;#ASMEND
146; CI-NEXT:    s_endpgm
147  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
148  %elt1 = extractelement <2 x i16> %vec, i32 1
149  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
150  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
151  %use1 = zext i16 %elt1 to i32
152  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
153  ret void
154}
155
; Scalar case where the inserted value is the high 16 bits of the i32 kernel
; argument %elt.arg (lshr by 16, then trunc to i16), written to element 0.
; The GFX9 checks expect the shift to fold into a single s_pack_hh_b32_b16;
; CI and VI keep an explicit s_lshr_b32 followed by mask-and-OR.
156define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
157; GFX9-LABEL: s_insertelement_v2i16_0_reghi:
158; GFX9:       ; %bb.0:
159; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
160; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
161; GFX9-NEXT:    v_mov_b32_e32 v0, 0
162; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
163; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
164; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
165; GFX9-NEXT:    s_pack_hh_b32_b16 s2, s6, s2
166; GFX9-NEXT:    v_mov_b32_e32 v1, s2
167; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
168; GFX9-NEXT:    s_endpgm
169;
170; VI-LABEL: s_insertelement_v2i16_0_reghi:
171; VI:       ; %bb.0:
172; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
173; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
174; VI-NEXT:    s_waitcnt lgkmcnt(0)
175; VI-NEXT:    v_mov_b32_e32 v0, s0
176; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
177; VI-NEXT:    v_mov_b32_e32 v1, s1
178; VI-NEXT:    s_lshr_b32 s1, s4, 16
179; VI-NEXT:    s_waitcnt lgkmcnt(0)
180; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
181; VI-NEXT:    s_or_b32 s0, s1, s0
182; VI-NEXT:    v_mov_b32_e32 v2, s0
183; VI-NEXT:    flat_store_dword v[0:1], v2
184; VI-NEXT:    s_endpgm
185;
186; CI-LABEL: s_insertelement_v2i16_0_reghi:
187; CI:       ; %bb.0:
188; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
189; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
190; CI-NEXT:    s_waitcnt lgkmcnt(0)
191; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
192; CI-NEXT:    v_mov_b32_e32 v0, s0
193; CI-NEXT:    v_mov_b32_e32 v1, s1
194; CI-NEXT:    s_lshr_b32 s1, s4, 16
195; CI-NEXT:    s_waitcnt lgkmcnt(0)
196; CI-NEXT:    s_and_b32 s0, s2, 0xffff0000
197; CI-NEXT:    s_or_b32 s0, s1, s0
198; CI-NEXT:    v_mov_b32_e32 v2, s0
199; CI-NEXT:    flat_store_dword v[0:1], v2
200; CI-NEXT:    s_endpgm
201  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
202  %elt.hi = lshr i32 %elt.arg, 16
203  %elt = trunc i32 %elt.hi to i16
204  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
205  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
206  ret void
207}
208
; Variant of s_insertelement_v2i16_0_reghi in which the truncated high half of
; %elt.arg is also zero-extended and consumed by an inline asm "s" operand.
; Because the shifted value has a second use, the GFX9 checks expect an
; explicit s_lshr_b32 feeding both the asm and an s_pack_lh_b32_b16 instead of
; a folded s_pack_hh_b32_b16.
209define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
210; GFX9-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
211; GFX9:       ; %bb.0:
212; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
213; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
214; GFX9-NEXT:    v_mov_b32_e32 v0, 0
215; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
217; GFX9-NEXT:    s_lshr_b32 s3, s6, 16
218; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
219; GFX9-NEXT:    s_pack_lh_b32_b16 s2, s3, s2
220; GFX9-NEXT:    v_mov_b32_e32 v1, s2
221; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
222; GFX9-NEXT:    ;;#ASMSTART
223; GFX9-NEXT:    ; use s3
224; GFX9-NEXT:    ;;#ASMEND
225; GFX9-NEXT:    s_endpgm
226;
227; VI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
228; VI:       ; %bb.0:
229; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
230; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
231; VI-NEXT:    s_waitcnt lgkmcnt(0)
232; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
233; VI-NEXT:    v_mov_b32_e32 v0, s0
234; VI-NEXT:    v_mov_b32_e32 v1, s1
235; VI-NEXT:    s_lshr_b32 s0, s4, 16
236; VI-NEXT:    s_waitcnt lgkmcnt(0)
237; VI-NEXT:    s_and_b32 s1, s2, 0xffff0000
238; VI-NEXT:    s_or_b32 s1, s0, s1
239; VI-NEXT:    v_mov_b32_e32 v2, s1
240; VI-NEXT:    flat_store_dword v[0:1], v2
241; VI-NEXT:    ;;#ASMSTART
242; VI-NEXT:    ; use s0
243; VI-NEXT:    ;;#ASMEND
244; VI-NEXT:    s_endpgm
245;
246; CI-LABEL: s_insertelement_v2i16_0_reghi_multi_use_1:
247; CI:       ; %bb.0:
248; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
249; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
250; CI-NEXT:    s_waitcnt lgkmcnt(0)
251; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
252; CI-NEXT:    v_mov_b32_e32 v0, s0
253; CI-NEXT:    v_mov_b32_e32 v1, s1
254; CI-NEXT:    s_lshr_b32 s0, s4, 16
255; CI-NEXT:    s_waitcnt lgkmcnt(0)
256; CI-NEXT:    s_and_b32 s1, s2, 0xffff0000
257; CI-NEXT:    s_or_b32 s1, s0, s1
258; CI-NEXT:    v_mov_b32_e32 v2, s1
259; CI-NEXT:    flat_store_dword v[0:1], v2
260; CI-NEXT:    ;;#ASMSTART
261; CI-NEXT:    ; use s0
262; CI-NEXT:    ;;#ASMEND
263; CI-NEXT:    s_endpgm
264  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
265  %elt.hi = lshr i32 %elt.arg, 16
266  %elt = trunc i32 %elt.hi to i16
267  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
268  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
269  %use1 = zext i16 %elt to i32
270  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
271  ret void
272}
273
; Both operands of the insert have extra uses: the inserted value (high half of
; %elt.arg) and the original element 1 of the loaded vector are each
; zero-extended and fed to their own inline asm "s" operand.
; The GFX9 checks therefore expect two s_lshr_b32 results combined with
; s_pack_ll_b32_b16; the CI checks expect a v_alignbit_b32 to build the stored
; dword.
274define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
275; GFX9-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
276; GFX9:       ; %bb.0:
277; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
278; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
279; GFX9-NEXT:    v_mov_b32_e32 v0, 0
280; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
282; GFX9-NEXT:    s_lshr_b32 s3, s6, 16
283; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
284; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
285; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s3, s2
286; GFX9-NEXT:    v_mov_b32_e32 v1, s4
287; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
288; GFX9-NEXT:    ;;#ASMSTART
289; GFX9-NEXT:    ; use s3
290; GFX9-NEXT:    ;;#ASMEND
291; GFX9-NEXT:    ;;#ASMSTART
292; GFX9-NEXT:    ; use s2
293; GFX9-NEXT:    ;;#ASMEND
294; GFX9-NEXT:    s_endpgm
295;
296; VI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
297; VI:       ; %bb.0:
298; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
299; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
300; VI-NEXT:    s_waitcnt lgkmcnt(0)
301; VI-NEXT:    v_mov_b32_e32 v0, s0
302; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
303; VI-NEXT:    v_mov_b32_e32 v1, s1
304; VI-NEXT:    s_lshr_b32 s1, s4, 16
305; VI-NEXT:    s_waitcnt lgkmcnt(0)
306; VI-NEXT:    s_lshr_b32 s2, s0, 16
307; VI-NEXT:    s_and_b32 s0, s0, 0xffff0000
308; VI-NEXT:    s_or_b32 s0, s1, s0
309; VI-NEXT:    v_mov_b32_e32 v2, s0
310; VI-NEXT:    flat_store_dword v[0:1], v2
311; VI-NEXT:    ;;#ASMSTART
312; VI-NEXT:    ; use s1
313; VI-NEXT:    ;;#ASMEND
314; VI-NEXT:    ;;#ASMSTART
315; VI-NEXT:    ; use s2
316; VI-NEXT:    ;;#ASMEND
317; VI-NEXT:    s_endpgm
318;
319; CI-LABEL: s_insertelement_v2i16_0_reghi_both_multi_use_1:
320; CI:       ; %bb.0:
321; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
322; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
323; CI-NEXT:    s_waitcnt lgkmcnt(0)
324; CI-NEXT:    v_mov_b32_e32 v0, s0
325; CI-NEXT:    s_load_dword s0, s[2:3], 0x0
326; CI-NEXT:    v_mov_b32_e32 v2, s4
327; CI-NEXT:    v_mov_b32_e32 v1, s1
328; CI-NEXT:    s_lshr_b32 s1, s4, 16
329; CI-NEXT:    s_waitcnt lgkmcnt(0)
330; CI-NEXT:    s_lshr_b32 s0, s0, 16
331; CI-NEXT:    v_alignbit_b32 v2, s0, v2, 16
332; CI-NEXT:    flat_store_dword v[0:1], v2
333; CI-NEXT:    ;;#ASMSTART
334; CI-NEXT:    ; use s1
335; CI-NEXT:    ;;#ASMEND
336; CI-NEXT:    ;;#ASMSTART
337; CI-NEXT:    ; use s0
338; CI-NEXT:    ;;#ASMEND
339; CI-NEXT:    s_endpgm
340  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
341  %elt.hi = lshr i32 %elt.arg, 16
342  %elt = trunc i32 %elt.hi to i16
343  %vec.hi = extractelement <2 x i16> %vec, i32 1
344  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
345  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
346  %use1 = zext i16 %elt to i32
347  %vec.hi.use1 = zext i16 %vec.hi to i32
348
349  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
350  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
351  ret void
352}
353
; Scalar case: overwrite element 1 (the high half of the dword) of a
; <2 x i16> loaded through an addrspace(4) pointer with the constant 999.
; The GFX9 checks expect s_pack_ll_b32_b16 with 0x3e7 as the high operand; the
; shared CI and VI checks keep the low 16 bits (s_and_b32 with 0xffff) and OR
; in 0x3e70000, i.e. 999 shifted left by 16.
354define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
355; GFX9-LABEL: s_insertelement_v2i16_1:
356; GFX9:       ; %bb.0:
357; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
358; GFX9-NEXT:    v_mov_b32_e32 v0, 0
359; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
361; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x3e7
363; GFX9-NEXT:    v_mov_b32_e32 v1, s2
364; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
365; GFX9-NEXT:    s_endpgm
366;
367; CIVI-LABEL: s_insertelement_v2i16_1:
368; CIVI:       ; %bb.0:
369; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
370; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
371; CIVI-NEXT:    v_mov_b32_e32 v0, s0
372; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
373; CIVI-NEXT:    v_mov_b32_e32 v1, s1
374; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
375; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff
376; CIVI-NEXT:    s_or_b32 s0, s0, 0x3e70000
377; CIVI-NEXT:    v_mov_b32_e32 v2, s0
378; CIVI-NEXT:    flat_store_dword v[0:1], v2
379; CIVI-NEXT:    s_endpgm
380  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
381  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
382  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
383  ret void
384}
385
; Scalar case: element 1 of the loaded <2 x i16> is replaced by the i16 kernel
; argument %elt (placed after an unnamed [8 x i32] argument).
; On GFX9 the checks expect a single s_pack_ll_b32_b16 with %elt as the high
; operand; CI and VI shift %elt left by 16 and OR it with the masked low half.
386define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
387; GFX9-LABEL: s_insertelement_v2i16_1_reg:
388; GFX9:       ; %bb.0:
389; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
390; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
391; GFX9-NEXT:    v_mov_b32_e32 v0, 0
392; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
393; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
394; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
395; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s6
396; GFX9-NEXT:    v_mov_b32_e32 v1, s2
397; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
398; GFX9-NEXT:    s_endpgm
399;
400; VI-LABEL: s_insertelement_v2i16_1_reg:
401; VI:       ; %bb.0:
402; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
403; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
404; VI-NEXT:    s_waitcnt lgkmcnt(0)
405; VI-NEXT:    v_mov_b32_e32 v0, s0
406; VI-NEXT:    s_load_dword s0, s[2:3], 0x0
407; VI-NEXT:    v_mov_b32_e32 v1, s1
408; VI-NEXT:    s_lshl_b32 s1, s4, 16
409; VI-NEXT:    s_waitcnt lgkmcnt(0)
410; VI-NEXT:    s_and_b32 s0, s0, 0xffff
411; VI-NEXT:    s_or_b32 s0, s0, s1
412; VI-NEXT:    v_mov_b32_e32 v2, s0
413; VI-NEXT:    flat_store_dword v[0:1], v2
414; VI-NEXT:    s_endpgm
415;
416; CI-LABEL: s_insertelement_v2i16_1_reg:
417; CI:       ; %bb.0:
418; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
419; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
420; CI-NEXT:    s_waitcnt lgkmcnt(0)
421; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
422; CI-NEXT:    v_mov_b32_e32 v0, s0
423; CI-NEXT:    v_mov_b32_e32 v1, s1
424; CI-NEXT:    s_lshl_b32 s1, s4, 16
425; CI-NEXT:    s_waitcnt lgkmcnt(0)
426; CI-NEXT:    s_and_b32 s0, s2, 0xffff
427; CI-NEXT:    s_or_b32 s0, s0, s1
428; CI-NEXT:    v_mov_b32_e32 v2, s0
429; CI-NEXT:    flat_store_dword v[0:1], v2
430; CI-NEXT:    s_endpgm
431  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
432  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
433  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
434  ret void
435}
436
; Scalar <2 x half> case: overwrite element 0 with the constant 5.0, which
; appears in the checks as the bit pattern 0x4500.
; Unlike the i16 constant test, the GFX9 checks here expect an s_lshr_b32 of
; the loaded dword followed by s_pack_ll_b32_b16; CI and VI share the
; mask-and-OR sequence.
437define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
438; GFX9-LABEL: s_insertelement_v2f16_0:
439; GFX9:       ; %bb.0:
440; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
441; GFX9-NEXT:    v_mov_b32_e32 v0, 0
442; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
443; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
444; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
445; GFX9-NEXT:    s_lshr_b32 s2, s2, 16
446; GFX9-NEXT:    s_pack_ll_b32_b16 s2, 0x4500, s2
447; GFX9-NEXT:    v_mov_b32_e32 v1, s2
448; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
449; GFX9-NEXT:    s_endpgm
450;
451; CIVI-LABEL: s_insertelement_v2f16_0:
452; CIVI:       ; %bb.0:
453; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
454; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
455; CIVI-NEXT:    v_mov_b32_e32 v0, s0
456; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
457; CIVI-NEXT:    v_mov_b32_e32 v1, s1
458; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
459; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff0000
460; CIVI-NEXT:    s_or_b32 s0, s0, 0x4500
461; CIVI-NEXT:    v_mov_b32_e32 v2, s0
462; CIVI-NEXT:    flat_store_dword v[0:1], v2
463; CIVI-NEXT:    s_endpgm
464  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
465  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
466  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
467  ret void
468}
469
; Scalar <2 x half> case: overwrite element 1 with the constant 5.0 (0x4500 in
; the checks).
; The GFX9 checks expect a single s_pack_ll_b32_b16 with 0x4500 as the high
; operand; CI and VI keep the low 16 bits and OR in 0x45000000.
470define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
471; GFX9-LABEL: s_insertelement_v2f16_1:
472; GFX9:       ; %bb.0:
473; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
474; GFX9-NEXT:    v_mov_b32_e32 v0, 0
475; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
476; GFX9-NEXT:    s_load_dword s2, s[2:3], 0x0
477; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
478; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x4500
479; GFX9-NEXT:    v_mov_b32_e32 v1, s2
480; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
481; GFX9-NEXT:    s_endpgm
482;
483; CIVI-LABEL: s_insertelement_v2f16_1:
484; CIVI:       ; %bb.0:
485; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
486; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
487; CIVI-NEXT:    v_mov_b32_e32 v0, s0
488; CIVI-NEXT:    s_load_dword s0, s[2:3], 0x0
489; CIVI-NEXT:    v_mov_b32_e32 v1, s1
490; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
491; CIVI-NEXT:    s_and_b32 s0, s0, 0xffff
492; CIVI-NEXT:    s_or_b32 s0, s0, 0x45000000
493; CIVI-NEXT:    v_mov_b32_e32 v2, s0
494; CIVI-NEXT:    flat_store_dword v[0:1], v2
495; CIVI-NEXT:    s_endpgm
496  %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
497  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
498  store <2 x half> %vecins, <2 x half> addrspace(1)* %out
499  ret void
500}
501
; Vector (per-workitem) case: %in and %out are indexed by the workitem id x,
; the <2 x i16> is loaded through an addrspace(1) pointer, element 0 is
; replaced with the constant 999 (0x3e7) and the result is stored.
; The GFX9 checks expect a v_bfi_b32 with a 0xffff mask; CI and VI expect
; v_and_b32 with 0xffff0000 followed by v_or_b32 with the constant.
502define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
503; GFX9-LABEL: v_insertelement_v2i16_0:
504; GFX9:       ; %bb.0:
505; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
506; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
507; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
508; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
509; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
510; GFX9-NEXT:    s_movk_i32 s2, 0x3e7
511; GFX9-NEXT:    s_waitcnt vmcnt(0)
512; GFX9-NEXT:    v_bfi_b32 v1, v2, s2, v1
513; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
514; GFX9-NEXT:    s_endpgm
515;
516; VI-LABEL: v_insertelement_v2i16_0:
517; VI:       ; %bb.0:
518; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
519; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
520; VI-NEXT:    s_waitcnt lgkmcnt(0)
521; VI-NEXT:    v_mov_b32_e32 v1, s3
522; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
523; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
524; VI-NEXT:    flat_load_dword v0, v[0:1]
525; VI-NEXT:    v_mov_b32_e32 v3, s1
526; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
527; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
528; VI-NEXT:    s_waitcnt vmcnt(0)
529; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
530; VI-NEXT:    v_or_b32_e32 v0, 0x3e7, v0
531; VI-NEXT:    flat_store_dword v[2:3], v0
532; VI-NEXT:    s_endpgm
533;
534; CI-LABEL: v_insertelement_v2i16_0:
535; CI:       ; %bb.0:
536; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
537; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
538; CI-NEXT:    s_waitcnt lgkmcnt(0)
539; CI-NEXT:    v_mov_b32_e32 v1, s3
540; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
541; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
542; CI-NEXT:    flat_load_dword v0, v[0:1]
543; CI-NEXT:    v_mov_b32_e32 v3, s1
544; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
545; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
546; CI-NEXT:    s_waitcnt vmcnt(0)
547; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
548; CI-NEXT:    v_or_b32_e32 v0, 0x3e7, v0
549; CI-NEXT:    flat_store_dword v[2:3], v0
550; CI-NEXT:    s_endpgm
551  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
552  %tid.ext = sext i32 %tid to i64
553  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
554  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
555  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
556  %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
557  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
558  ret void
559}
560
; Vector (per-workitem) case where the value written to element 0 is the high
; 16 bits of the i32 kernel argument %elt.arg (lshr by 16, then trunc).
; The GFX9 checks expect v_lshrrev_b32 plus a v_and_or_b32 with a 0xffff0000
; mask; CI and VI do the shift on the scalar unit (s_lshr_b32) and combine with
; v_and_b32 / v_or_b32.
561define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
562; GFX9-LABEL: v_insertelement_v2i16_0_reghi:
563; GFX9:       ; %bb.0:
564; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
565; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
566; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
567; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff0000
568; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
569; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
570; GFX9-NEXT:    v_lshrrev_b32_e64 v2, 16, s6
571; GFX9-NEXT:    s_waitcnt vmcnt(0)
572; GFX9-NEXT:    v_and_or_b32 v1, v1, v3, v2
573; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
574; GFX9-NEXT:    s_endpgm
575;
576; VI-LABEL: v_insertelement_v2i16_0_reghi:
577; VI:       ; %bb.0:
578; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
579; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
580; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
581; VI-NEXT:    s_waitcnt lgkmcnt(0)
582; VI-NEXT:    v_mov_b32_e32 v1, s3
583; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
584; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
585; VI-NEXT:    flat_load_dword v0, v[0:1]
586; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
587; VI-NEXT:    v_mov_b32_e32 v3, s1
588; VI-NEXT:    s_lshr_b32 s0, s4, 16
589; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
590; VI-NEXT:    s_waitcnt vmcnt(0)
591; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
592; VI-NEXT:    v_or_b32_e32 v0, s0, v0
593; VI-NEXT:    flat_store_dword v[2:3], v0
594; VI-NEXT:    s_endpgm
595;
596; CI-LABEL: v_insertelement_v2i16_0_reghi:
597; CI:       ; %bb.0:
598; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
599; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
600; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
601; CI-NEXT:    s_waitcnt lgkmcnt(0)
602; CI-NEXT:    v_mov_b32_e32 v1, s3
603; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
604; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
605; CI-NEXT:    flat_load_dword v3, v[0:1]
606; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
607; CI-NEXT:    v_mov_b32_e32 v1, s1
608; CI-NEXT:    s_lshr_b32 s0, s4, 16
609; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
610; CI-NEXT:    s_waitcnt vmcnt(0)
611; CI-NEXT:    v_and_b32_e32 v2, 0xffff0000, v3
612; CI-NEXT:    v_or_b32_e32 v2, s0, v2
613; CI-NEXT:    flat_store_dword v[0:1], v2
614; CI-NEXT:    s_endpgm
615  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
616  %tid.ext = sext i32 %tid to i64
617  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
618  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
619  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
620  %elt.hi = lshr i32 %elt.arg, 16
621  %elt = trunc i32 %elt.hi to i16
622  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
623  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
624  ret void
625}
626
; Same as v_insertelement_v2i16_0 but with the constant 53 written to
; element 0. In the checks 53 appears directly as an instruction operand
; (v_bfi_b32 on GFX9, v_or_b32 on CI and VI) and, unlike 0x3e7 in the GFX9
; checks of that test, is not first moved into a register.
627define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
628; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm:
629; GFX9:       ; %bb.0:
630; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
631; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
632; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
633; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
635; GFX9-NEXT:    s_waitcnt vmcnt(0)
636; GFX9-NEXT:    v_bfi_b32 v1, v2, 53, v1
637; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
638; GFX9-NEXT:    s_endpgm
639;
640; VI-LABEL: v_insertelement_v2i16_0_inlineimm:
641; VI:       ; %bb.0:
642; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
643; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
644; VI-NEXT:    s_waitcnt lgkmcnt(0)
645; VI-NEXT:    v_mov_b32_e32 v1, s3
646; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
647; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
648; VI-NEXT:    flat_load_dword v0, v[0:1]
649; VI-NEXT:    v_mov_b32_e32 v3, s1
650; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
651; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
652; VI-NEXT:    s_waitcnt vmcnt(0)
653; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
654; VI-NEXT:    v_or_b32_e32 v0, 53, v0
655; VI-NEXT:    flat_store_dword v[2:3], v0
656; VI-NEXT:    s_endpgm
657;
658; CI-LABEL: v_insertelement_v2i16_0_inlineimm:
659; CI:       ; %bb.0:
660; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
661; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
662; CI-NEXT:    s_waitcnt lgkmcnt(0)
663; CI-NEXT:    v_mov_b32_e32 v1, s3
664; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
665; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
666; CI-NEXT:    flat_load_dword v0, v[0:1]
667; CI-NEXT:    v_mov_b32_e32 v3, s1
668; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
669; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
670; CI-NEXT:    s_waitcnt vmcnt(0)
671; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
672; CI-NEXT:    v_or_b32_e32 v0, 53, v0
673; CI-NEXT:    flat_store_dword v[2:3], v0
674; CI-NEXT:    s_endpgm
675  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
676  %tid.ext = sext i32 %tid to i64
677  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
678  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
679  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
680  %vecins = insertelement <2 x i16> %vec, i16 53, i32 0
681  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
682  ret void
683}
684
685; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
; Vector (per-workitem) case: overwrite element 1 of the loaded <2 x i16> with
; the constant 999.
; The GFX9 checks currently expect v_and_b32 with 0xffff followed by
; v_lshl_or_b32 shifting 0x3e7 (held in s2) left by 16. The VI checks expect a
; v_or_b32_sdwa that selects WORD_0 of the loaded value and ORs in 0x3e70000,
; and the CI checks expect plain v_and_b32 / v_or_b32 with 0x3e70000.
686define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
687; GFX9-LABEL: v_insertelement_v2i16_1:
688; GFX9:       ; %bb.0:
689; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
690; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
691; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
692; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
693; GFX9-NEXT:    s_movk_i32 s2, 0x3e7
694; GFX9-NEXT:    s_waitcnt vmcnt(0)
695; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
696; GFX9-NEXT:    v_lshl_or_b32 v1, s2, 16, v1
697; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
698; GFX9-NEXT:    s_endpgm
699;
700; VI-LABEL: v_insertelement_v2i16_1:
701; VI:       ; %bb.0:
702; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
703; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
704; VI-NEXT:    s_waitcnt lgkmcnt(0)
705; VI-NEXT:    v_mov_b32_e32 v1, s3
706; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
707; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
708; VI-NEXT:    flat_load_dword v0, v[0:1]
709; VI-NEXT:    v_mov_b32_e32 v1, 0x3e70000
710; VI-NEXT:    v_mov_b32_e32 v3, s1
711; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
712; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
713; VI-NEXT:    s_waitcnt vmcnt(0)
714; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
715; VI-NEXT:    flat_store_dword v[2:3], v0
716; VI-NEXT:    s_endpgm
717;
718; CI-LABEL: v_insertelement_v2i16_1:
719; CI:       ; %bb.0:
720; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
721; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
722; CI-NEXT:    s_waitcnt lgkmcnt(0)
723; CI-NEXT:    v_mov_b32_e32 v1, s3
724; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
725; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
726; CI-NEXT:    flat_load_dword v0, v[0:1]
727; CI-NEXT:    v_mov_b32_e32 v3, s1
728; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
729; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
730; CI-NEXT:    s_waitcnt vmcnt(0)
731; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
732; CI-NEXT:    v_or_b32_e32 v0, 0x3e70000, v0
733; CI-NEXT:    flat_store_dword v[2:3], v0
734; CI-NEXT:    s_endpgm
735  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
736  %tid.ext = sext i32 %tid to i64
737  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
738  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
739  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
740  %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
741  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
742  ret void
743}
744
; Same as v_insertelement_v2i16_1 but with the constant -15 written to
; element 1. On GFX9 the checks expect -15 to appear directly as the first
; operand of v_lshl_or_b32; on CI and VI the value shows up pre-shifted as
; 0xfff10000.
745define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
746; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm:
747; GFX9:       ; %bb.0:
748; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
749; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
750; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
751; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
752; GFX9-NEXT:    s_waitcnt vmcnt(0)
753; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
754; GFX9-NEXT:    v_lshl_or_b32 v1, -15, 16, v1
755; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
756; GFX9-NEXT:    s_endpgm
757;
758; VI-LABEL: v_insertelement_v2i16_1_inlineimm:
759; VI:       ; %bb.0:
760; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
761; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
762; VI-NEXT:    s_waitcnt lgkmcnt(0)
763; VI-NEXT:    v_mov_b32_e32 v1, s3
764; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
765; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
766; VI-NEXT:    flat_load_dword v0, v[0:1]
767; VI-NEXT:    v_mov_b32_e32 v1, 0xfff10000
768; VI-NEXT:    v_mov_b32_e32 v3, s1
769; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
770; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
771; VI-NEXT:    s_waitcnt vmcnt(0)
772; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
773; VI-NEXT:    flat_store_dword v[2:3], v0
774; VI-NEXT:    s_endpgm
775;
776; CI-LABEL: v_insertelement_v2i16_1_inlineimm:
777; CI:       ; %bb.0:
778; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
779; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
780; CI-NEXT:    s_waitcnt lgkmcnt(0)
781; CI-NEXT:    v_mov_b32_e32 v1, s3
782; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
783; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
784; CI-NEXT:    flat_load_dword v0, v[0:1]
785; CI-NEXT:    v_mov_b32_e32 v3, s1
786; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
787; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
788; CI-NEXT:    s_waitcnt vmcnt(0)
789; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
790; CI-NEXT:    v_or_b32_e32 v0, 0xfff10000, v0
791; CI-NEXT:    flat_store_dword v[2:3], v0
792; CI-NEXT:    s_endpgm
793  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
794  %tid.ext = sext i32 %tid to i64
795  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
796  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
797  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
798  %vecins = insertelement <2 x i16> %vec, i16 -15, i32 1
799  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
800  ret void
801}
802
; Inserts the constant half 5.0 into element 0 of a <2 x half> that each
; workitem loads from %in, then stores the updated vector to %out.
; The checks show 5.0 encoded as 0x4500 in the low 16 bits of the dword.
; GFX9 is expected to move 0x4500 into a VGPR, shift the loaded dword right by
; 16 and rebuild it with v_lshl_or_b32; the VI and CI checks instead mask with
; 0xffff0000 and OR in the 0x4500 literal.
803define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
804; GFX9-LABEL: v_insertelement_v2f16_0:
805; GFX9:       ; %bb.0:
806; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
807; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
808; GFX9-NEXT:    v_mov_b32_e32 v2, 0x4500
809; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
810; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
811; GFX9-NEXT:    s_waitcnt vmcnt(0)
812; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
813; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
814; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
815; GFX9-NEXT:    s_endpgm
816;
817; VI-LABEL: v_insertelement_v2f16_0:
818; VI:       ; %bb.0:
819; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
820; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
821; VI-NEXT:    s_waitcnt lgkmcnt(0)
822; VI-NEXT:    v_mov_b32_e32 v1, s3
823; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
824; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
825; VI-NEXT:    flat_load_dword v0, v[0:1]
826; VI-NEXT:    v_mov_b32_e32 v3, s1
827; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
828; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
829; VI-NEXT:    s_waitcnt vmcnt(0)
830; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
831; VI-NEXT:    v_or_b32_e32 v0, 0x4500, v0
832; VI-NEXT:    flat_store_dword v[2:3], v0
833; VI-NEXT:    s_endpgm
834;
835; CI-LABEL: v_insertelement_v2f16_0:
836; CI:       ; %bb.0:
837; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
838; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
839; CI-NEXT:    s_waitcnt lgkmcnt(0)
840; CI-NEXT:    v_mov_b32_e32 v1, s3
841; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
842; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
843; CI-NEXT:    flat_load_dword v0, v[0:1]
844; CI-NEXT:    v_mov_b32_e32 v3, s1
845; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
846; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
847; CI-NEXT:    s_waitcnt vmcnt(0)
848; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
849; CI-NEXT:    v_or_b32_e32 v0, 0x4500, v0
850; CI-NEXT:    flat_store_dword v[2:3], v0
851; CI-NEXT:    s_endpgm
  ; Both pointers are indexed by the sign-extended workitem id, so every
  ; workitem reads and writes its own <2 x half> slot.
852  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
853  %tid.ext = sext i32 %tid to i64
854  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
855  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
856  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
  ; Operation under test - constant 5.0 into lane 0.
857  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
858  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
859  ret void
860}
861
; Inserts the half constant with bit pattern 0xH0035 into element 0 of a
; <2 x half> that each workitem loads from %in, then stores it to %out.
; In the checks the constant appears as the integer 53 used directly as an
; instruction operand, with no separate move to materialize it. GFX9 is
; expected to shift the loaded dword right by 16 and rebuild it with
; v_lshl_or_b32 ..., 16, 53; the VI and CI checks mask with 0xffff0000 and
; then OR with 53.
862define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
863; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm:
864; GFX9:       ; %bb.0:
865; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
866; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
867; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
868; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
869; GFX9-NEXT:    s_waitcnt vmcnt(0)
870; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
871; GFX9-NEXT:    v_lshl_or_b32 v1, v1, 16, 53
872; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
873; GFX9-NEXT:    s_endpgm
874;
875; VI-LABEL: v_insertelement_v2f16_0_inlineimm:
876; VI:       ; %bb.0:
877; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
878; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
879; VI-NEXT:    s_waitcnt lgkmcnt(0)
880; VI-NEXT:    v_mov_b32_e32 v1, s3
881; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
882; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
883; VI-NEXT:    flat_load_dword v0, v[0:1]
884; VI-NEXT:    v_mov_b32_e32 v3, s1
885; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
886; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
887; VI-NEXT:    s_waitcnt vmcnt(0)
888; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
889; VI-NEXT:    v_or_b32_e32 v0, 53, v0
890; VI-NEXT:    flat_store_dword v[2:3], v0
891; VI-NEXT:    s_endpgm
892;
893; CI-LABEL: v_insertelement_v2f16_0_inlineimm:
894; CI:       ; %bb.0:
895; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
896; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
897; CI-NEXT:    s_waitcnt lgkmcnt(0)
898; CI-NEXT:    v_mov_b32_e32 v1, s3
899; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
900; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
901; CI-NEXT:    flat_load_dword v0, v[0:1]
902; CI-NEXT:    v_mov_b32_e32 v3, s1
903; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
904; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
905; CI-NEXT:    s_waitcnt vmcnt(0)
906; CI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
907; CI-NEXT:    v_or_b32_e32 v0, 53, v0
908; CI-NEXT:    flat_store_dword v[2:3], v0
909; CI-NEXT:    s_endpgm
  ; Both pointers are indexed by the sign-extended workitem id, so every
  ; workitem reads and writes its own <2 x half> slot.
910  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
911  %tid.ext = sext i32 %tid to i64
912  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
913  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
914  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
  ; Operation under test - half bit pattern 0x0035 (decimal 53) into lane 0.
915  %vecins = insertelement <2 x half> %vec, half 0xH0035, i32 0
916  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
917  ret void
918}
919
; Inserts the constant half 5.0 into element 1 of a <2 x half> that each
; workitem loads from %in, then stores the updated vector to %out.
; The checks show 5.0 (0x4500) ending up in bits 31..16 of the dword.
; GFX9 is expected to load 0x4500 into an SGPR with s_movk_i32, mask the low
; 16 bits and combine with v_lshl_or_b32; the VI checks materialize 0x45000000
; in a VGPR and OR it with the low word through SDWA; the CI checks mask with
; 0xffff and OR in the 0x45000000 literal.
920define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
921; GFX9-LABEL: v_insertelement_v2f16_1:
922; GFX9:       ; %bb.0:
923; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
924; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
925; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
926; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
927; GFX9-NEXT:    s_movk_i32 s2, 0x4500
928; GFX9-NEXT:    s_waitcnt vmcnt(0)
929; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
930; GFX9-NEXT:    v_lshl_or_b32 v1, s2, 16, v1
931; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
932; GFX9-NEXT:    s_endpgm
933;
934; VI-LABEL: v_insertelement_v2f16_1:
935; VI:       ; %bb.0:
936; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
937; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
938; VI-NEXT:    s_waitcnt lgkmcnt(0)
939; VI-NEXT:    v_mov_b32_e32 v1, s3
940; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
941; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
942; VI-NEXT:    flat_load_dword v0, v[0:1]
943; VI-NEXT:    v_mov_b32_e32 v1, 0x45000000
944; VI-NEXT:    v_mov_b32_e32 v3, s1
945; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
946; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
947; VI-NEXT:    s_waitcnt vmcnt(0)
948; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
949; VI-NEXT:    flat_store_dword v[2:3], v0
950; VI-NEXT:    s_endpgm
951;
952; CI-LABEL: v_insertelement_v2f16_1:
953; CI:       ; %bb.0:
954; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
955; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
956; CI-NEXT:    s_waitcnt lgkmcnt(0)
957; CI-NEXT:    v_mov_b32_e32 v1, s3
958; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
959; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
960; CI-NEXT:    flat_load_dword v0, v[0:1]
961; CI-NEXT:    v_mov_b32_e32 v3, s1
962; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
963; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
964; CI-NEXT:    s_waitcnt vmcnt(0)
965; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
966; CI-NEXT:    v_or_b32_e32 v0, 0x45000000, v0
967; CI-NEXT:    flat_store_dword v[2:3], v0
968; CI-NEXT:    s_endpgm
  ; Both pointers are indexed by the sign-extended workitem id, so every
  ; workitem reads and writes its own <2 x half> slot.
969  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
970  %tid.ext = sext i32 %tid to i64
971  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
972  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
973  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
  ; Operation under test - constant 5.0 into lane 1.
974  %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
975  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
976  ret void
977}
978
; Inserts the half constant with bit pattern 0xH0023 into element 1 of a
; <2 x half> that each workitem loads from %in, then stores it to %out.
; GFX9 is expected to mask the low 16 bits and pass the constant as the
; integer 35 straight into v_lshl_or_b32; the VI checks materialize the
; pre-shifted value 0x230000 in a VGPR and OR it with the low word through
; SDWA; the CI checks mask with 0xffff and OR in the 0x230000 literal.
979define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
980; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm:
981; GFX9:       ; %bb.0:
982; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
983; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
984; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
985; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
986; GFX9-NEXT:    s_waitcnt vmcnt(0)
987; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
988; GFX9-NEXT:    v_lshl_or_b32 v1, 35, 16, v1
989; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
990; GFX9-NEXT:    s_endpgm
991;
992; VI-LABEL: v_insertelement_v2f16_1_inlineimm:
993; VI:       ; %bb.0:
994; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
995; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
996; VI-NEXT:    s_waitcnt lgkmcnt(0)
997; VI-NEXT:    v_mov_b32_e32 v1, s3
998; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
999; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1000; VI-NEXT:    flat_load_dword v0, v[0:1]
1001; VI-NEXT:    v_mov_b32_e32 v1, 0x230000
1002; VI-NEXT:    v_mov_b32_e32 v3, s1
1003; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1004; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1005; VI-NEXT:    s_waitcnt vmcnt(0)
1006; VI-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1007; VI-NEXT:    flat_store_dword v[2:3], v0
1008; VI-NEXT:    s_endpgm
1009;
1010; CI-LABEL: v_insertelement_v2f16_1_inlineimm:
1011; CI:       ; %bb.0:
1012; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1013; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1014; CI-NEXT:    s_waitcnt lgkmcnt(0)
1015; CI-NEXT:    v_mov_b32_e32 v1, s3
1016; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1017; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1018; CI-NEXT:    flat_load_dword v0, v[0:1]
1019; CI-NEXT:    v_mov_b32_e32 v3, s1
1020; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1021; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1022; CI-NEXT:    s_waitcnt vmcnt(0)
1023; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1024; CI-NEXT:    v_or_b32_e32 v0, 0x230000, v0
1025; CI-NEXT:    flat_store_dword v[2:3], v0
1026; CI-NEXT:    s_endpgm
  ; Both pointers are indexed by the sign-extended workitem id, so every
  ; workitem reads and writes its own <2 x half> slot.
1027  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1028  %tid.ext = sext i32 %tid to i64
1029  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1030  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1031  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
  ; Operation under test - half bit pattern 0x0023 (decimal 35) into lane 1.
1032  %vecins = insertelement <2 x half> %vec, half 0xH0023, i32 1
1033  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1034  ret void
1035}
1036
1037; FIXME: Enable for others when argument load not split
; Inserts the constant i16 999 into a <2 x i16> at an index that is only known
; at run time. Both the vector and the index are loaded through addrspace(4)
; pointers, and the checks keep the whole computation in scalar registers.
; Expected sequence on all three targets - shift the index left by 4 (index
; times 16 bits), shift 0xffff by that amount to form a lane mask, clear the
; selected lane of the vector with s_andn2_b32, AND the mask with 0x3e703e7
; (999 replicated in both halves) and OR the two pieces together.
1038define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
1039; GFX9-LABEL: s_insertelement_v2i16_dynamic:
1040; GFX9:       ; %bb.0:
1041; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1042; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
1043; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1044; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX9-NEXT:    s_load_dword s4, s[8:9], 0x0
1046; GFX9-NEXT:    s_load_dword s5, s[2:3], 0x0
1047; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1048; GFX9-NEXT:    s_lshl_b32 s2, s4, 4
1049; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
1050; GFX9-NEXT:    s_andn2_b32 s3, s5, s2
1051; GFX9-NEXT:    s_and_b32 s2, s2, 0x3e703e7
1052; GFX9-NEXT:    s_or_b32 s2, s2, s3
1053; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1054; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1055; GFX9-NEXT:    s_endpgm
1056;
1057; VI-LABEL: s_insertelement_v2i16_dynamic:
1058; VI:       ; %bb.0:
1059; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1060; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
1061; VI-NEXT:    s_waitcnt lgkmcnt(0)
1062; VI-NEXT:    v_mov_b32_e32 v0, s0
1063; VI-NEXT:    s_load_dword s0, s[4:5], 0x0
1064; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1065; VI-NEXT:    v_mov_b32_e32 v1, s1
1066; VI-NEXT:    s_waitcnt lgkmcnt(0)
1067; VI-NEXT:    s_lshl_b32 s0, s0, 4
1068; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1069; VI-NEXT:    s_andn2_b32 s1, s2, s0
1070; VI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
1071; VI-NEXT:    s_or_b32 s0, s0, s1
1072; VI-NEXT:    v_mov_b32_e32 v2, s0
1073; VI-NEXT:    flat_store_dword v[0:1], v2
1074; VI-NEXT:    s_endpgm
1075;
1076; CI-LABEL: s_insertelement_v2i16_dynamic:
1077; CI:       ; %bb.0:
1078; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1079; CI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
1080; CI-NEXT:    s_waitcnt lgkmcnt(0)
1081; CI-NEXT:    v_mov_b32_e32 v0, s0
1082; CI-NEXT:    s_load_dword s0, s[4:5], 0x0
1083; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
1084; CI-NEXT:    v_mov_b32_e32 v1, s1
1085; CI-NEXT:    s_waitcnt lgkmcnt(0)
1086; CI-NEXT:    s_lshl_b32 s0, s0, 4
1087; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1088; CI-NEXT:    s_andn2_b32 s1, s2, s0
1089; CI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
1090; CI-NEXT:    s_or_b32 s0, s0, s1
1091; CI-NEXT:    v_mov_b32_e32 v2, s0
1092; CI-NEXT:    flat_store_dword v[0:1], v2
1093; CI-NEXT:    s_endpgm
  ; The index load is volatile; the vector load that follows is not.
1094  %idx = load volatile i32, i32 addrspace(4)* %idx.ptr
1095  %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
  ; Operation under test - constant 999 into a lane chosen by %idx.
1096  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1097  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
1098  ret void
1099}
1100
; Inserts the constant i16 999 into a per-workitem <2 x i16> at an index that
; arrives as the kernel argument %idx, so the index is in a scalar register
; while the vector data is in vector registers.
; Expected sequence on all three targets - build the lane mask with scalar
; shifts (index shifted left by 4, then 0xffff shifted by that amount), place
; 0x3e703e7 (999 replicated in both halves) in a VGPR, and let v_bfi_b32 take
; the masked bits from that constant and the remaining bits from the loaded
; vector.
1101define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
1102; GFX9-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1103; GFX9:       ; %bb.0:
1104; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1105; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1106; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1107; GFX9-NEXT:    v_mov_b32_e32 v2, 0x3e703e7
1108; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1109; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
1110; GFX9-NEXT:    s_lshl_b32 s2, s6, 4
1111; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
1112; GFX9-NEXT:    s_waitcnt vmcnt(0)
1113; GFX9-NEXT:    v_bfi_b32 v1, s2, v2, v1
1114; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1115; GFX9-NEXT:    s_endpgm
1116;
1117; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1118; VI:       ; %bb.0:
1119; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1120; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1121; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1122; VI-NEXT:    s_waitcnt lgkmcnt(0)
1123; VI-NEXT:    v_mov_b32_e32 v1, s3
1124; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1125; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1126; VI-NEXT:    flat_load_dword v0, v[0:1]
1127; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1128; VI-NEXT:    s_lshl_b32 s0, s4, 4
1129; VI-NEXT:    v_mov_b32_e32 v3, s1
1130; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1131; VI-NEXT:    v_mov_b32_e32 v1, 0x3e703e7
1132; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1133; VI-NEXT:    s_waitcnt vmcnt(0)
1134; VI-NEXT:    v_bfi_b32 v0, s0, v1, v0
1135; VI-NEXT:    flat_store_dword v[2:3], v0
1136; VI-NEXT:    s_endpgm
1137;
1138; CI-LABEL: v_insertelement_v2i16_dynamic_sgpr:
1139; CI:       ; %bb.0:
1140; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1141; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1142; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
1143; CI-NEXT:    s_waitcnt lgkmcnt(0)
1144; CI-NEXT:    v_mov_b32_e32 v1, s3
1145; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1146; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1147; CI-NEXT:    flat_load_dword v0, v[0:1]
1148; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1149; CI-NEXT:    s_lshl_b32 s0, s4, 4
1150; CI-NEXT:    v_mov_b32_e32 v3, s1
1151; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
1152; CI-NEXT:    v_mov_b32_e32 v1, 0x3e703e7
1153; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1154; CI-NEXT:    s_waitcnt vmcnt(0)
1155; CI-NEXT:    v_bfi_b32 v0, s0, v1, v0
1156; CI-NEXT:    flat_store_dword v[2:3], v0
1157; CI-NEXT:    s_endpgm
  ; Both pointers are indexed by the sign-extended workitem id, so every
  ; workitem reads and writes its own <2 x i16> slot.
1158  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1159  %tid.ext = sext i32 %tid to i64
1160  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
1161  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
1162  %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
  ; Operation under test - constant 999 into the lane named by the %idx
  ; kernel argument.
1163  %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
1164  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep
1165  ret void
1166}
1167
; Inserts the half constant 0xH1234 into a per-workitem <2 x half> at an index
; that each workitem loads from %idx.ptr, so both the index and the vector
; data are in vector registers.
; Expected sequence - shift the loaded index left by 4, shift 0xffff by that
; amount with a vector shift to form the lane mask, then v_bfi_b32 takes the
; masked bits from 0x12341234 (the constant replicated in both halves, held
; in an SGPR) and the remaining bits from the loaded vector. The GFX9 and VI
; checks keep 0xffff in an SGPR and use v_lshlrev_b32_e64; the CI checks use
; v_lshl_b32_e32 with 0xffff as a literal operand.
1168define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
1169; GFX9-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1170; GFX9:       ; %bb.0:
1171; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1172; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x10
1173; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1174; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1175; GFX9-NEXT:    global_load_dword v1, v0, s[8:9]
1176; GFX9-NEXT:    global_load_dword v2, v0, s[2:3]
1177; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1178; GFX9-NEXT:    s_waitcnt vmcnt(1)
1179; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1180; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
1181; GFX9-NEXT:    s_mov_b32 s2, 0x12341234
1182; GFX9-NEXT:    s_waitcnt vmcnt(0)
1183; GFX9-NEXT:    v_bfi_b32 v1, v1, s2, v2
1184; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1185; GFX9-NEXT:    s_endpgm
1186;
1187; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1188; VI:       ; %bb.0:
1189; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1190; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x10
1191; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1192; VI-NEXT:    s_waitcnt lgkmcnt(0)
1193; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
1194; VI-NEXT:    v_mov_b32_e32 v1, s3
1195; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1196; VI-NEXT:    v_mov_b32_e32 v3, s5
1197; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
1198; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1199; VI-NEXT:    flat_load_dword v2, v[2:3]
1200; VI-NEXT:    flat_load_dword v3, v[0:1]
1201; VI-NEXT:    s_mov_b32 s2, 0xffff
1202; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v4
1203; VI-NEXT:    v_mov_b32_e32 v1, s1
1204; VI-NEXT:    s_mov_b32 s0, 0x12341234
1205; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1206; VI-NEXT:    s_waitcnt vmcnt(1)
1207; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1208; VI-NEXT:    v_lshlrev_b32_e64 v2, v2, s2
1209; VI-NEXT:    s_waitcnt vmcnt(0)
1210; VI-NEXT:    v_bfi_b32 v2, v2, s0, v3
1211; VI-NEXT:    flat_store_dword v[0:1], v2
1212; VI-NEXT:    s_endpgm
1213;
1214; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
1215; CI:       ; %bb.0:
1216; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1217; CI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x4
1218; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
1219; CI-NEXT:    s_waitcnt lgkmcnt(0)
1220; CI-NEXT:    v_mov_b32_e32 v1, s3
1221; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
1222; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1223; CI-NEXT:    v_mov_b32_e32 v3, s5
1224; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
1225; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1226; CI-NEXT:    flat_load_dword v2, v[2:3]
1227; CI-NEXT:    flat_load_dword v3, v[0:1]
1228; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
1229; CI-NEXT:    v_mov_b32_e32 v1, s1
1230; CI-NEXT:    s_mov_b32 s0, 0x12341234
1231; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1232; CI-NEXT:    s_waitcnt vmcnt(1)
1233; CI-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1234; CI-NEXT:    v_lshl_b32_e32 v2, 0xffff, v2
1235; CI-NEXT:    s_waitcnt vmcnt(0)
1236; CI-NEXT:    v_bfi_b32 v2, v2, s0, v3
1237; CI-NEXT:    flat_store_dword v[0:1], v2
1238; CI-NEXT:    s_endpgm
  ; All three pointers are indexed by the sign-extended workitem id, so every
  ; workitem has its own input vector, index and output slot.
1239  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1240  %tid.ext = sext i32 %tid to i64
1241  %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
1242  %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
1243  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
1244  %idx = load i32, i32 addrspace(1)* %idx.gep
1245  %vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
  ; Operation under test - constant 0xH1234 into the lane named by the loaded
  ; per-workitem index.
1246  %vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
1247  store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
1248  ret void
1249}
1250
; Inserts a run-time half value into element 0 of a per-workitem <4 x half>.
; The value is the low 16 bits of the i32 kernel argument %val, reinterpreted
; as half. An unnamed [8 x i32] argument sits between %in and %val; the checks
; load %val from offset 0x30 on GFX9 and VI and 0xc on CI.
; NOTE(review): the padding presumably exists to keep %val out of the same
; argument load as the pointers - confirm against the test's history.
; Expected codegen - the vector is loaded as two dwords, v_bfi_b32 with a
; 0xffff mask replaces the low half of the first dword (v0) with %val, and
; the second dword (v1) is stored back untouched.
1251define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1252; GFX9-LABEL: v_insertelement_v4f16_0:
1253; GFX9:       ; %bb.0:
1254; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1255; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
1256; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1257; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
1258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1259; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1260; GFX9-NEXT:    s_waitcnt vmcnt(0)
1261; GFX9-NEXT:    v_bfi_b32 v0, v3, s6, v0
1262; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1263; GFX9-NEXT:    s_endpgm
1264;
1265; VI-LABEL: v_insertelement_v4f16_0:
1266; VI:       ; %bb.0:
1267; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1268; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
1269; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1270; VI-NEXT:    s_waitcnt lgkmcnt(0)
1271; VI-NEXT:    v_mov_b32_e32 v1, s3
1272; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1273; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1274; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1275; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1276; VI-NEXT:    v_mov_b32_e32 v3, s1
1277; VI-NEXT:    s_mov_b32 s0, 0xffff
1278; VI-NEXT:    v_mov_b32_e32 v4, s4
1279; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1280; VI-NEXT:    s_waitcnt vmcnt(0)
1281; VI-NEXT:    v_bfi_b32 v0, s0, v4, v0
1282; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1283; VI-NEXT:    s_endpgm
1284;
1285; CI-LABEL: v_insertelement_v4f16_0:
1286; CI:       ; %bb.0:
1287; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1288; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
1289; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1290; CI-NEXT:    s_waitcnt lgkmcnt(0)
1291; CI-NEXT:    v_mov_b32_e32 v1, s3
1292; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1293; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1294; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1295; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1296; CI-NEXT:    v_mov_b32_e32 v3, s1
1297; CI-NEXT:    s_mov_b32 s0, 0xffff
1298; CI-NEXT:    v_mov_b32_e32 v4, s4
1299; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1300; CI-NEXT:    s_waitcnt vmcnt(0)
1301; CI-NEXT:    v_bfi_b32 v0, s0, v4, v0
1302; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1303; CI-NEXT:    s_endpgm
  ; Both pointers are indexed by the sign-extended workitem id, so every
  ; workitem reads and writes its own <4 x half> slot.
1304  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1305  %tid.ext = sext i32 %tid to i64
1306  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1307  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1308  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Turn the i32 argument into a half by keeping its low 16 bits.
1309  %val.trunc = trunc i32 %val to i16
1310  %val.cvt = bitcast i16 %val.trunc to half
  ; Operation under test - run-time value into lane 0.
1311  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
1312  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1313  ret void
1314}
1315
; Inserts a run-time half value into element 1 of a per-workitem <4 x half>.
; The value is the low 16 bits of the i32 kernel argument %val, reinterpreted
; as half. %val directly follows the two pointers here; the checks load it
; from offset 0x10 on GFX9 and VI and 0x4 on CI.
; Expected codegen - only the first dword (v0) is modified. GFX9 masks its
; low 16 bits and merges %val with v_lshl_or_b32; the VI checks shift %val
; left by 16 in an SGPR, copy it to a VGPR and OR it with the low word through
; SDWA; the CI checks shift %val left by 16, mask v0 with 0xffff and OR the
; two. The second dword (v1) is stored back untouched.
1316define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1317; GFX9-LABEL: v_insertelement_v4f16_1:
1318; GFX9:       ; %bb.0:
1319; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1320; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1321; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1322; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1323; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1324; GFX9-NEXT:    s_waitcnt vmcnt(0)
1325; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1326; GFX9-NEXT:    v_lshl_or_b32 v0, s6, 16, v0
1327; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1328; GFX9-NEXT:    s_endpgm
1329;
1330; VI-LABEL: v_insertelement_v4f16_1:
1331; VI:       ; %bb.0:
1332; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1333; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1334; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1335; VI-NEXT:    s_waitcnt lgkmcnt(0)
1336; VI-NEXT:    v_mov_b32_e32 v1, s3
1337; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1338; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1339; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1340; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1341; VI-NEXT:    s_lshl_b32 s0, s4, 16
1342; VI-NEXT:    v_mov_b32_e32 v3, s1
1343; VI-NEXT:    v_mov_b32_e32 v4, s0
1344; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1345; VI-NEXT:    s_waitcnt vmcnt(0)
1346; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1347; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1348; VI-NEXT:    s_endpgm
1349;
1350; CI-LABEL: v_insertelement_v4f16_1:
1351; CI:       ; %bb.0:
1352; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1353; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1354; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1355; CI-NEXT:    s_waitcnt lgkmcnt(0)
1356; CI-NEXT:    v_mov_b32_e32 v1, s3
1357; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1358; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1359; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1360; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1361; CI-NEXT:    v_mov_b32_e32 v3, s1
1362; CI-NEXT:    s_lshl_b32 s0, s4, 16
1363; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1364; CI-NEXT:    s_waitcnt vmcnt(0)
1365; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1366; CI-NEXT:    v_or_b32_e32 v0, s0, v0
1367; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1368; CI-NEXT:    s_endpgm
  ; Both pointers are indexed by the sign-extended workitem id, so every
  ; workitem reads and writes its own <4 x half> slot.
1369  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1370  %tid.ext = sext i32 %tid to i64
1371  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1372  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1373  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Turn the i32 argument into a half by keeping its low 16 bits.
1374  %val.trunc = trunc i32 %val to i16
1375  %val.cvt = bitcast i16 %val.trunc to half
  ; Operation under test - run-time value into lane 1.
1376  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
1377  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1378  ret void
1379}
1380
; Inserts a run-time half value into element 2 of a per-workitem <4 x half>.
; The value is the low 16 bits of the i32 kernel argument %val, reinterpreted
; as half. An unnamed [8 x i32] argument sits between %in and %val; the checks
; load %val from offset 0x30 on GFX9 and VI and 0xc on CI.
; NOTE(review): the padding presumably exists to keep %val out of the same
; argument load as the pointers - confirm against the test's history.
; Expected codegen - the vector is loaded as two dwords, v_bfi_b32 with a
; 0xffff mask replaces the low half of the second dword (v1) with %val, and
; the first dword (v0) is stored back untouched.
1381define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
1382; GFX9-LABEL: v_insertelement_v4f16_2:
1383; GFX9:       ; %bb.0:
1384; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1385; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x30
1386; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1387; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
1388; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1390; GFX9-NEXT:    s_waitcnt vmcnt(0)
1391; GFX9-NEXT:    v_bfi_b32 v1, v3, s6, v1
1392; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1393; GFX9-NEXT:    s_endpgm
1394;
1395; VI-LABEL: v_insertelement_v4f16_2:
1396; VI:       ; %bb.0:
1397; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1398; VI-NEXT:    s_load_dword s4, s[4:5], 0x30
1399; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1400; VI-NEXT:    s_waitcnt lgkmcnt(0)
1401; VI-NEXT:    v_mov_b32_e32 v1, s3
1402; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1403; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1404; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1405; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1406; VI-NEXT:    v_mov_b32_e32 v3, s1
1407; VI-NEXT:    s_mov_b32 s0, 0xffff
1408; VI-NEXT:    v_mov_b32_e32 v4, s4
1409; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1410; VI-NEXT:    s_waitcnt vmcnt(0)
1411; VI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1412; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1413; VI-NEXT:    s_endpgm
1414;
1415; CI-LABEL: v_insertelement_v4f16_2:
1416; CI:       ; %bb.0:
1417; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1418; CI-NEXT:    s_load_dword s4, s[4:5], 0xc
1419; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1420; CI-NEXT:    s_waitcnt lgkmcnt(0)
1421; CI-NEXT:    v_mov_b32_e32 v1, s3
1422; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1423; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1424; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1425; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1426; CI-NEXT:    v_mov_b32_e32 v3, s1
1427; CI-NEXT:    s_mov_b32 s0, 0xffff
1428; CI-NEXT:    v_mov_b32_e32 v4, s4
1429; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1430; CI-NEXT:    s_waitcnt vmcnt(0)
1431; CI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1432; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1433; CI-NEXT:    s_endpgm
  ; Both pointers are indexed by the sign-extended workitem id, so every
  ; workitem reads and writes its own <4 x half> slot.
1434  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1435  %tid.ext = sext i32 %tid to i64
1436  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1437  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1438  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Turn the i32 argument into a half by keeping its low 16 bits.
1439  %val.trunc = trunc i32 %val to i16
1440  %val.cvt = bitcast i16 %val.trunc to half
  ; Operation under test - run-time value into lane 2.
1441  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
1442  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1443  ret void
1444}
1445
; Inserts a run-time half value into element 3 of a per-workitem <4 x half>.
; The value is the low 16 bits of the i32 kernel argument %val, reinterpreted
; as half. %val directly follows the two pointers here; the checks load it
; from offset 0x10 on GFX9 and VI and 0x4 on CI.
; Expected codegen - only the second dword (v1) is modified. GFX9 masks its
; low 16 bits and merges %val with v_lshl_or_b32; the VI checks shift %val
; left by 16 in an SGPR, copy it to a VGPR and OR it with the low word through
; SDWA; the CI checks shift %val left by 16, mask v1 with 0xffff and OR the
; two. The first dword (v0) is stored back untouched.
1446define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
1447; GFX9-LABEL: v_insertelement_v4f16_3:
1448; GFX9:       ; %bb.0:
1449; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1450; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1451; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1452; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1453; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1454; GFX9-NEXT:    s_waitcnt vmcnt(0)
1455; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1456; GFX9-NEXT:    v_lshl_or_b32 v1, s6, 16, v1
1457; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1458; GFX9-NEXT:    s_endpgm
1459;
1460; VI-LABEL: v_insertelement_v4f16_3:
1461; VI:       ; %bb.0:
1462; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1463; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1464; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1465; VI-NEXT:    s_waitcnt lgkmcnt(0)
1466; VI-NEXT:    v_mov_b32_e32 v1, s3
1467; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1468; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1469; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1470; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1471; VI-NEXT:    s_lshl_b32 s0, s4, 16
1472; VI-NEXT:    v_mov_b32_e32 v3, s1
1473; VI-NEXT:    v_mov_b32_e32 v4, s0
1474; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1475; VI-NEXT:    s_waitcnt vmcnt(0)
1476; VI-NEXT:    v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1477; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1478; VI-NEXT:    s_endpgm
1479;
1480; CI-LABEL: v_insertelement_v4f16_3:
1481; CI:       ; %bb.0:
1482; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1483; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1484; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1485; CI-NEXT:    s_waitcnt lgkmcnt(0)
1486; CI-NEXT:    v_mov_b32_e32 v1, s3
1487; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1488; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1489; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1490; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1491; CI-NEXT:    v_mov_b32_e32 v3, s1
1492; CI-NEXT:    s_lshl_b32 s0, s4, 16
1493; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1494; CI-NEXT:    s_waitcnt vmcnt(0)
1495; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1496; CI-NEXT:    v_or_b32_e32 v1, s0, v1
1497; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1498; CI-NEXT:    s_endpgm
  ; Both pointers are indexed by the sign-extended workitem id, so every
  ; workitem reads and writes its own <4 x half> slot.
1499  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1500  %tid.ext = sext i32 %tid to i64
1501  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1502  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1503  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Turn the i32 argument into a half by keeping its low 16 bits.
1504  %val.trunc = trunc i32 %val to i16
1505  %val.cvt = bitcast i16 %val.trunc to half
  ; Operation under test - run-time value into lane 3.
1506  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
1507  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1508  ret void
1509}
1510
; Constant-index insert into element 2 of a <4 x i16>.
; The vector is loaded from %in at the offset given by the workitem.id.x
; intrinsic, the low 16 bits of the i32 kernel argument %val replace element 2,
; and the result is stored to %out at the same offset.
;
; All three sets of checks (gfx900, fiji, hawaii) expect a single v_bfi_b32
; with the mask 0xffff applied to the second dword of the loaded pair (v1);
; v0 is stored back unchanged.
1511define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1512; GFX9-LABEL: v_insertelement_v4i16_2:
1513; GFX9:       ; %bb.0:
1514; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1515; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1516; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1517; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
1518; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1519; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1520; GFX9-NEXT:    s_waitcnt vmcnt(0)
1521; GFX9-NEXT:    v_bfi_b32 v1, v3, s6, v1
1522; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1523; GFX9-NEXT:    s_endpgm
1524;
1525; VI-LABEL: v_insertelement_v4i16_2:
1526; VI:       ; %bb.0:
1527; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1528; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1529; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1530; VI-NEXT:    s_waitcnt lgkmcnt(0)
1531; VI-NEXT:    v_mov_b32_e32 v1, s3
1532; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1533; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1534; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1535; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1536; VI-NEXT:    v_mov_b32_e32 v3, s1
1537; VI-NEXT:    s_mov_b32 s0, 0xffff
1538; VI-NEXT:    v_mov_b32_e32 v4, s4
1539; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1540; VI-NEXT:    s_waitcnt vmcnt(0)
1541; VI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1542; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1543; VI-NEXT:    s_endpgm
1544;
1545; CI-LABEL: v_insertelement_v4i16_2:
1546; CI:       ; %bb.0:
1547; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1548; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1549; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1550; CI-NEXT:    s_waitcnt lgkmcnt(0)
1551; CI-NEXT:    v_mov_b32_e32 v1, s3
1552; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1553; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1554; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1555; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1556; CI-NEXT:    v_mov_b32_e32 v3, s1
1557; CI-NEXT:    s_mov_b32 s0, 0xffff
1558; CI-NEXT:    v_mov_b32_e32 v4, s4
1559; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1560; CI-NEXT:    s_waitcnt vmcnt(0)
1561; CI-NEXT:    v_bfi_b32 v1, s0, v4, v1
1562; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1563; CI-NEXT:    s_endpgm
  ; Address the %in and %out elements selected by the sign-extended work-item id.
1564  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1565  %tid.ext = sext i32 %tid to i64
1566  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1567  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
1568  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
1569  %val.trunc = trunc i32 %val to i16
  ; This bitcast is i16 to i16 and therefore a no-op; it has the same shape as
  ; the i16-to-half bitcast in the v4f16 tests of this file.
1570  %val.cvt = bitcast i16 %val.trunc to i16
1571  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
1572  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
1573  ret void
1574}
1575
1576; FIXME: Better code on CI?
; Variable-index insert into a <4 x i16> where the index is read from memory
; with a volatile load, rather than passed as a kernel argument. The checks
; show the index arriving in a vector register via a glc load.
;
; Expected lowering in all three sets of checks (gfx900, fiji, hawaii):
;   1. The index is shifted left by 4.
;   2. A 64-bit value whose low dword is 0xffff and whose high dword is 0 is
;      shifted left by that amount (v_lshlrev_b64 on gfx900 and fiji,
;      v_lshl_b64 on hawaii), producing a per-dword mask pair.
;   3. The low 16 bits of %val are replicated into both halves of a dword
;      (s_pack_ll_b32_b16 on gfx900; and/shift/or on fiji and hawaii).
;   4. Two v_bfi_b32 instructions, one for each dword of the loaded vector,
;      merge the replicated value under the mask pair.
1577define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
1578; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1579; GFX9:       ; %bb.0:
1580; GFX9-NEXT:    global_load_dword v2, v[0:1], off glc
1581; GFX9-NEXT:    s_waitcnt vmcnt(0)
1582; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1583; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
1584; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
1585; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1586; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3]
1587; GFX9-NEXT:    s_mov_b32 s3, 0
1588; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1589; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
1590; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v2, s[2:3]
1591; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s6, s6
1592; GFX9-NEXT:    s_waitcnt vmcnt(0)
1593; GFX9-NEXT:    v_bfi_b32 v1, v3, s2, v1
1594; GFX9-NEXT:    v_bfi_b32 v0, v2, s2, v0
1595; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
1596; GFX9-NEXT:    s_endpgm
1597;
1598; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1599; VI:       ; %bb.0:
1600; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1601; VI-NEXT:    s_load_dword s4, s[4:5], 0x10
1602; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1603; VI-NEXT:    s_waitcnt lgkmcnt(0)
1604; VI-NEXT:    v_mov_b32_e32 v1, s3
1605; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1606; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1607; VI-NEXT:    flat_load_dword v4, v[0:1] glc
1608; VI-NEXT:    s_waitcnt vmcnt(0)
1609; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1610; VI-NEXT:    s_mov_b32 s2, 0xffff
1611; VI-NEXT:    v_mov_b32_e32 v3, s1
1612; VI-NEXT:    s_mov_b32 s3, 0
1613; VI-NEXT:    s_and_b32 s1, s4, s2
1614; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1615; VI-NEXT:    s_lshl_b32 s0, s1, 16
1616; VI-NEXT:    s_or_b32 s0, s1, s0
1617; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1618; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1619; VI-NEXT:    v_lshlrev_b64 v[4:5], v4, s[2:3]
1620; VI-NEXT:    s_waitcnt vmcnt(0)
1621; VI-NEXT:    v_bfi_b32 v1, v5, s0, v1
1622; VI-NEXT:    v_bfi_b32 v0, v4, s0, v0
1623; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1624; VI-NEXT:    s_endpgm
1625;
1626; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
1627; CI:       ; %bb.0:
1628; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1629; CI-NEXT:    s_load_dword s4, s[4:5], 0x4
1630; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1631; CI-NEXT:    s_waitcnt lgkmcnt(0)
1632; CI-NEXT:    v_mov_b32_e32 v1, s3
1633; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1634; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1635; CI-NEXT:    flat_load_dword v4, v[0:1] glc
1636; CI-NEXT:    s_waitcnt vmcnt(0)
1637; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1638; CI-NEXT:    s_mov_b32 s3, 0
1639; CI-NEXT:    s_mov_b32 s2, 0xffff
1640; CI-NEXT:    v_mov_b32_e32 v3, s1
1641; CI-NEXT:    s_lshl_b32 s1, s4, 16
1642; CI-NEXT:    s_and_b32 s4, s4, s2
1643; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1644; CI-NEXT:    s_or_b32 s0, s4, s1
1645; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1646; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
1647; CI-NEXT:    v_lshl_b64 v[4:5], s[2:3], v4
1648; CI-NEXT:    s_waitcnt vmcnt(0)
1649; CI-NEXT:    v_bfi_b32 v1, v5, s0, v1
1650; CI-NEXT:    v_bfi_b32 v0, v4, s0, v0
1651; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1652; CI-NEXT:    s_endpgm
  ; Address the %in and %out elements selected by the sign-extended work-item id.
1653  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1654  %tid.ext = sext i32 %tid to i64
1655  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
1656  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
  ; The insert index comes from a volatile load through an undef pointer, so it
  ; is not a compile-time constant and not a kernel argument.
1657  %idx.val = load volatile i32, i32 addrspace(1)* undef
1658  %vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
1659  %val.trunc = trunc i32 %val to i16
  ; This bitcast is i16 to i16 and therefore a no-op.
1660  %val.cvt = bitcast i16 %val.trunc to i16
1661  %vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
1662  store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
1663  ret void
1664}
1665
; Variable-index insert into a <4 x half> where the index is the i32 kernel
; argument %idxval. The checks show %val and %idxval being fetched together
; with a single s_load_dwordx2.
;
; Expected lowering in all three sets of checks (gfx900, fiji, hawaii):
;   1. The index is shifted left by 4 with s_lshl_b32.
;   2. A 64-bit value whose low dword is 0xffff and whose high dword is 0 is
;      shifted left by that amount with the scalar s_lshl_b64, producing a
;      per-dword mask pair in scalar registers.
;   3. The low 16 bits of %val are replicated into both halves of a dword
;      (s_pack_ll_b32_b16 on gfx900; and/shift/or on fiji and hawaii) and that
;      dword is copied into two vector registers.
;   4. Two v_bfi_b32 instructions, one for each dword of the loaded vector,
;      merge the replicated value under the mask pair.
1666define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
1667; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1668; GFX9:       ; %bb.0:
1669; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1670; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
1671; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1672; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1673; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1674; GFX9-NEXT:    s_pack_ll_b32_b16 s5, s6, s6
1675; GFX9-NEXT:    s_mov_b32 s3, 0
1676; GFX9-NEXT:    s_mov_b32 s2, 0xffff
1677; GFX9-NEXT:    s_lshl_b32 s4, s7, 4
1678; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
1679; GFX9-NEXT:    v_mov_b32_e32 v3, s5
1680; GFX9-NEXT:    v_mov_b32_e32 v4, s5
1681; GFX9-NEXT:    s_waitcnt vmcnt(0)
1682; GFX9-NEXT:    v_bfi_b32 v1, s3, v3, v1
1683; GFX9-NEXT:    v_bfi_b32 v0, s2, v4, v0
1684; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1685; GFX9-NEXT:    s_endpgm
1686;
1687; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1688; VI:       ; %bb.0:
1689; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1690; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x10
1691; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1692; VI-NEXT:    s_waitcnt lgkmcnt(0)
1693; VI-NEXT:    v_mov_b32_e32 v1, s3
1694; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1695; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1696; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1697; VI-NEXT:    s_mov_b32 s2, 0xffff
1698; VI-NEXT:    v_mov_b32_e32 v3, s1
1699; VI-NEXT:    s_mov_b32 s3, 0
1700; VI-NEXT:    s_lshl_b32 s1, s5, 4
1701; VI-NEXT:    s_and_b32 s4, s4, s2
1702; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
1703; VI-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
1704; VI-NEXT:    s_lshl_b32 s2, s4, 16
1705; VI-NEXT:    s_or_b32 s2, s4, s2
1706; VI-NEXT:    v_mov_b32_e32 v4, s2
1707; VI-NEXT:    v_mov_b32_e32 v5, s2
1708; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1709; VI-NEXT:    s_waitcnt vmcnt(0)
1710; VI-NEXT:    v_bfi_b32 v1, s1, v4, v1
1711; VI-NEXT:    v_bfi_b32 v0, s0, v5, v0
1712; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1713; VI-NEXT:    s_endpgm
1714;
1715; CI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
1716; CI:       ; %bb.0:
1717; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
1718; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x4
1719; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1720; CI-NEXT:    s_waitcnt lgkmcnt(0)
1721; CI-NEXT:    v_mov_b32_e32 v1, s3
1722; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1723; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1724; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1725; CI-NEXT:    s_mov_b32 s2, 0xffff
1726; CI-NEXT:    v_mov_b32_e32 v3, s1
1727; CI-NEXT:    s_and_b32 s6, s4, s2
1728; CI-NEXT:    s_mov_b32 s3, 0
1729; CI-NEXT:    s_lshl_b32 s1, s5, 4
1730; CI-NEXT:    s_lshl_b32 s4, s4, 16
1731; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v2
1732; CI-NEXT:    s_lshl_b64 s[0:1], s[2:3], s1
1733; CI-NEXT:    s_or_b32 s2, s6, s4
1734; CI-NEXT:    v_mov_b32_e32 v4, s2
1735; CI-NEXT:    v_mov_b32_e32 v5, s2
1736; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
1737; CI-NEXT:    s_waitcnt vmcnt(0)
1738; CI-NEXT:    v_bfi_b32 v1, s1, v4, v1
1739; CI-NEXT:    v_bfi_b32 v0, s0, v5, v0
1740; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1741; CI-NEXT:    s_endpgm
  ; Address the %in and %out elements selected by the sign-extended work-item id.
1742  %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
1743  %tid.ext = sext i32 %tid to i64
1744  %in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
1745  %out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
1746  %vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
  ; Take the low 16 bits of %val and reinterpret the bit pattern as a half.
1747  %val.trunc = trunc i32 %val to i16
1748  %val.cvt = bitcast i16 %val.trunc to half
  ; The insert position is the kernel argument %idxval, not a constant.
1749  %vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
1750  store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
1751  ret void
1752}
1753
; Intrinsic called by the kernels above to obtain the index used to address
; their %in and %out buffers.
1754declare i32 @llvm.amdgcn.workitem.id.x() #1
1755
; Attribute group #0 is attached to the kernel definitions; group #1 is
; attached to the intrinsic declaration and to its call sites.
1756attributes #0 = { nounwind }
1757attributes #1 = { nounwind readnone }
1758