; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
3
; Check the lowering of large insertelement operations that go through the
; stack instead of using register indexing.
6
; Inserts %val at the dynamic position %idx into the <64 x i32> vector loaded
; from %ptr and stores the updated vector to %out.ptr.
;
; Expected gfx900 GlobalISel code shape (pinned by the autogenerated checks
; below; regenerate them with utils/update_llc_test_checks.py instead of
; editing by hand):
;   1. The 64 source dwords are fetched with four s_load_dwordx16 and copied
;      to scratch at offsets 256..508.
;   2. The index is masked with 63 and scaled by 4 (s_and_b32 / s_lshl_b32),
;      added to the 0x100 scratch base, and %val is written there with a
;      single "buffer_store_dword ... offen".
;   3. All 64 dwords are reloaded from scratch and written out with sixteen
;      global_store_dwordx4.
define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.ptr, <64 x i32> addrspace(1)* %ptr, i32 %val, i32 %idx) #0 {
; GCN-LABEL: v_insert_v64i32_varidx:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_add_u32 s0, s0, s7
; GCN-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GCN-NEXT:    s_addc_u32 s1, s1, 0
; GCN-NEXT:    v_mov_b32_e32 v16, 0x100
; GCN-NEXT:    v_mov_b32_e32 v64, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0x0
; GCN-NEXT:    s_load_dwordx16 s[52:67], s[10:11], 0x40
; GCN-NEXT:    s_load_dwordx16 s[12:27], s[10:11], 0x80
; GCN-NEXT:    s_and_b32 s4, s7, 63
; GCN-NEXT:    s_lshl_b32 s4, s4, 2
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s36
; GCN-NEXT:    v_mov_b32_e32 v1, s37
; GCN-NEXT:    v_mov_b32_e32 v2, s38
; GCN-NEXT:    v_mov_b32_e32 v3, s39
; GCN-NEXT:    v_mov_b32_e32 v4, s40
; GCN-NEXT:    v_mov_b32_e32 v5, s41
; GCN-NEXT:    v_mov_b32_e32 v6, s42
; GCN-NEXT:    v_mov_b32_e32 v7, s43
; GCN-NEXT:    v_mov_b32_e32 v8, s44
; GCN-NEXT:    v_mov_b32_e32 v9, s45
; GCN-NEXT:    v_mov_b32_e32 v10, s46
; GCN-NEXT:    v_mov_b32_e32 v11, s47
; GCN-NEXT:    v_mov_b32_e32 v12, s48
; GCN-NEXT:    v_mov_b32_e32 v13, s49
; GCN-NEXT:    v_mov_b32_e32 v14, s50
; GCN-NEXT:    v_mov_b32_e32 v15, s51
; GCN-NEXT:    s_load_dwordx16 s[36:51], s[10:11], 0xc0
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:256
; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:260
; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:264
; GCN-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:268
; GCN-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:272
; GCN-NEXT:    buffer_store_dword v5, off, s[0:3], 0 offset:276
; GCN-NEXT:    buffer_store_dword v6, off, s[0:3], 0 offset:280
; GCN-NEXT:    buffer_store_dword v7, off, s[0:3], 0 offset:284
; GCN-NEXT:    buffer_store_dword v8, off, s[0:3], 0 offset:288
; GCN-NEXT:    buffer_store_dword v9, off, s[0:3], 0 offset:292
; GCN-NEXT:    buffer_store_dword v10, off, s[0:3], 0 offset:296
; GCN-NEXT:    buffer_store_dword v11, off, s[0:3], 0 offset:300
; GCN-NEXT:    buffer_store_dword v12, off, s[0:3], 0 offset:304
; GCN-NEXT:    buffer_store_dword v13, off, s[0:3], 0 offset:308
; GCN-NEXT:    buffer_store_dword v14, off, s[0:3], 0 offset:312
; GCN-NEXT:    buffer_store_dword v15, off, s[0:3], 0 offset:316
; GCN-NEXT:    v_mov_b32_e32 v0, s52
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:320
; GCN-NEXT:    v_mov_b32_e32 v0, s53
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:324
; GCN-NEXT:    v_mov_b32_e32 v0, s54
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:328
; GCN-NEXT:    v_mov_b32_e32 v0, s55
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:332
; GCN-NEXT:    v_mov_b32_e32 v0, s56
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:336
; GCN-NEXT:    v_mov_b32_e32 v0, s57
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:340
; GCN-NEXT:    v_mov_b32_e32 v0, s58
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:344
; GCN-NEXT:    v_mov_b32_e32 v0, s59
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:348
; GCN-NEXT:    v_mov_b32_e32 v0, s60
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:352
; GCN-NEXT:    v_mov_b32_e32 v0, s61
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:356
; GCN-NEXT:    v_mov_b32_e32 v0, s62
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:360
; GCN-NEXT:    v_mov_b32_e32 v0, s63
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:364
; GCN-NEXT:    v_mov_b32_e32 v0, s64
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:368
; GCN-NEXT:    v_mov_b32_e32 v0, s65
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:372
; GCN-NEXT:    v_mov_b32_e32 v0, s66
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:376
; GCN-NEXT:    v_mov_b32_e32 v0, s67
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:380
; GCN-NEXT:    v_mov_b32_e32 v0, s12
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:384
; GCN-NEXT:    v_mov_b32_e32 v0, s13
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:388
; GCN-NEXT:    v_mov_b32_e32 v0, s14
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:392
; GCN-NEXT:    v_mov_b32_e32 v0, s15
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:396
; GCN-NEXT:    v_mov_b32_e32 v0, s16
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:400
; GCN-NEXT:    v_mov_b32_e32 v0, s17
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:404
; GCN-NEXT:    v_mov_b32_e32 v0, s18
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:408
; GCN-NEXT:    v_mov_b32_e32 v0, s19
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:412
; GCN-NEXT:    v_mov_b32_e32 v0, s20
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:416
; GCN-NEXT:    v_mov_b32_e32 v0, s21
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:420
; GCN-NEXT:    v_mov_b32_e32 v0, s22
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:424
; GCN-NEXT:    v_mov_b32_e32 v0, s23
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:428
; GCN-NEXT:    v_mov_b32_e32 v0, s24
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:432
; GCN-NEXT:    v_mov_b32_e32 v0, s25
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:436
; GCN-NEXT:    v_mov_b32_e32 v0, s26
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:440
; GCN-NEXT:    v_mov_b32_e32 v0, s27
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:444
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s36
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:448
; GCN-NEXT:    v_mov_b32_e32 v0, s37
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:452
; GCN-NEXT:    v_mov_b32_e32 v0, s38
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:456
; GCN-NEXT:    v_mov_b32_e32 v0, s39
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:460
; GCN-NEXT:    v_mov_b32_e32 v0, s40
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:464
; GCN-NEXT:    v_mov_b32_e32 v0, s41
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:468
; GCN-NEXT:    v_mov_b32_e32 v0, s42
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:472
; GCN-NEXT:    v_mov_b32_e32 v0, s43
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:476
; GCN-NEXT:    v_mov_b32_e32 v0, s44
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:480
; GCN-NEXT:    v_mov_b32_e32 v0, s45
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:484
; GCN-NEXT:    v_mov_b32_e32 v0, s46
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:488
; GCN-NEXT:    v_mov_b32_e32 v0, s47
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:492
; GCN-NEXT:    v_mov_b32_e32 v0, s48
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:496
; GCN-NEXT:    v_mov_b32_e32 v0, s49
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:500
; GCN-NEXT:    v_mov_b32_e32 v0, s50
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:504
; GCN-NEXT:    v_mov_b32_e32 v0, s51
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:508
; GCN-NEXT:    v_add_u32_e32 v0, s4, v16
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:256
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:260
; GCN-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:264
; GCN-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:268
; GCN-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:272
; GCN-NEXT:    buffer_load_dword v5, off, s[0:3], 0 offset:276
; GCN-NEXT:    buffer_load_dword v6, off, s[0:3], 0 offset:280
; GCN-NEXT:    buffer_load_dword v7, off, s[0:3], 0 offset:284
; GCN-NEXT:    buffer_load_dword v8, off, s[0:3], 0 offset:288
; GCN-NEXT:    buffer_load_dword v9, off, s[0:3], 0 offset:292
; GCN-NEXT:    buffer_load_dword v10, off, s[0:3], 0 offset:296
; GCN-NEXT:    buffer_load_dword v11, off, s[0:3], 0 offset:300
; GCN-NEXT:    buffer_load_dword v12, off, s[0:3], 0 offset:304
; GCN-NEXT:    buffer_load_dword v13, off, s[0:3], 0 offset:308
; GCN-NEXT:    buffer_load_dword v14, off, s[0:3], 0 offset:312
; GCN-NEXT:    buffer_load_dword v15, off, s[0:3], 0 offset:316
; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], 0 offset:320
; GCN-NEXT:    buffer_load_dword v17, off, s[0:3], 0 offset:324
; GCN-NEXT:    buffer_load_dword v18, off, s[0:3], 0 offset:328
; GCN-NEXT:    buffer_load_dword v19, off, s[0:3], 0 offset:332
; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], 0 offset:336
; GCN-NEXT:    buffer_load_dword v21, off, s[0:3], 0 offset:340
; GCN-NEXT:    buffer_load_dword v22, off, s[0:3], 0 offset:344
; GCN-NEXT:    buffer_load_dword v23, off, s[0:3], 0 offset:348
; GCN-NEXT:    buffer_load_dword v24, off, s[0:3], 0 offset:352
; GCN-NEXT:    buffer_load_dword v25, off, s[0:3], 0 offset:356
; GCN-NEXT:    buffer_load_dword v26, off, s[0:3], 0 offset:360
; GCN-NEXT:    buffer_load_dword v27, off, s[0:3], 0 offset:364
; GCN-NEXT:    buffer_load_dword v28, off, s[0:3], 0 offset:368
; GCN-NEXT:    buffer_load_dword v29, off, s[0:3], 0 offset:372
; GCN-NEXT:    buffer_load_dword v30, off, s[0:3], 0 offset:376
; GCN-NEXT:    buffer_load_dword v31, off, s[0:3], 0 offset:380
; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], 0 offset:384
; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], 0 offset:388
; GCN-NEXT:    buffer_load_dword v34, off, s[0:3], 0 offset:392
; GCN-NEXT:    buffer_load_dword v35, off, s[0:3], 0 offset:396
; GCN-NEXT:    buffer_load_dword v36, off, s[0:3], 0 offset:400
; GCN-NEXT:    buffer_load_dword v37, off, s[0:3], 0 offset:404
; GCN-NEXT:    buffer_load_dword v38, off, s[0:3], 0 offset:408
; GCN-NEXT:    buffer_load_dword v39, off, s[0:3], 0 offset:412
; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], 0 offset:416
; GCN-NEXT:    buffer_load_dword v41, off, s[0:3], 0 offset:420
; GCN-NEXT:    buffer_load_dword v42, off, s[0:3], 0 offset:424
; GCN-NEXT:    buffer_load_dword v43, off, s[0:3], 0 offset:428
; GCN-NEXT:    buffer_load_dword v44, off, s[0:3], 0 offset:432
; GCN-NEXT:    buffer_load_dword v45, off, s[0:3], 0 offset:436
; GCN-NEXT:    buffer_load_dword v46, off, s[0:3], 0 offset:440
; GCN-NEXT:    buffer_load_dword v47, off, s[0:3], 0 offset:444
; GCN-NEXT:    buffer_load_dword v48, off, s[0:3], 0 offset:448
; GCN-NEXT:    buffer_load_dword v49, off, s[0:3], 0 offset:452
; GCN-NEXT:    buffer_load_dword v50, off, s[0:3], 0 offset:456
; GCN-NEXT:    buffer_load_dword v51, off, s[0:3], 0 offset:460
; GCN-NEXT:    buffer_load_dword v52, off, s[0:3], 0 offset:464
; GCN-NEXT:    buffer_load_dword v53, off, s[0:3], 0 offset:468
; GCN-NEXT:    buffer_load_dword v54, off, s[0:3], 0 offset:472
; GCN-NEXT:    buffer_load_dword v55, off, s[0:3], 0 offset:476
; GCN-NEXT:    buffer_load_dword v56, off, s[0:3], 0 offset:480
; GCN-NEXT:    buffer_load_dword v57, off, s[0:3], 0 offset:484
; GCN-NEXT:    buffer_load_dword v58, off, s[0:3], 0 offset:488
; GCN-NEXT:    buffer_load_dword v59, off, s[0:3], 0 offset:492
; GCN-NEXT:    buffer_load_dword v60, off, s[0:3], 0 offset:496
; GCN-NEXT:    buffer_load_dword v61, off, s[0:3], 0 offset:500
; GCN-NEXT:    buffer_load_dword v62, off, s[0:3], 0 offset:504
; GCN-NEXT:    buffer_load_dword v63, off, s[0:3], 0 offset:508
; GCN-NEXT:    s_waitcnt vmcnt(60)
; GCN-NEXT:    global_store_dwordx4 v64, v[0:3], s[8:9]
; GCN-NEXT:    s_waitcnt vmcnt(57)
; GCN-NEXT:    global_store_dwordx4 v64, v[4:7], s[8:9] offset:16
; GCN-NEXT:    s_waitcnt vmcnt(54)
; GCN-NEXT:    global_store_dwordx4 v64, v[8:11], s[8:9] offset:32
; GCN-NEXT:    s_waitcnt vmcnt(51)
; GCN-NEXT:    global_store_dwordx4 v64, v[12:15], s[8:9] offset:48
; GCN-NEXT:    s_waitcnt vmcnt(48)
; GCN-NEXT:    global_store_dwordx4 v64, v[16:19], s[8:9] offset:64
; GCN-NEXT:    s_waitcnt vmcnt(45)
; GCN-NEXT:    global_store_dwordx4 v64, v[20:23], s[8:9] offset:80
; GCN-NEXT:    s_waitcnt vmcnt(42)
; GCN-NEXT:    global_store_dwordx4 v64, v[24:27], s[8:9] offset:96
; GCN-NEXT:    s_waitcnt vmcnt(39)
; GCN-NEXT:    global_store_dwordx4 v64, v[28:31], s[8:9] offset:112
; GCN-NEXT:    s_waitcnt vmcnt(36)
; GCN-NEXT:    global_store_dwordx4 v64, v[32:35], s[8:9] offset:128
; GCN-NEXT:    s_waitcnt vmcnt(33)
; GCN-NEXT:    global_store_dwordx4 v64, v[36:39], s[8:9] offset:144
; GCN-NEXT:    s_waitcnt vmcnt(30)
; GCN-NEXT:    global_store_dwordx4 v64, v[40:43], s[8:9] offset:160
; GCN-NEXT:    s_waitcnt vmcnt(27)
; GCN-NEXT:    global_store_dwordx4 v64, v[44:47], s[8:9] offset:176
; GCN-NEXT:    s_waitcnt vmcnt(24)
; GCN-NEXT:    global_store_dwordx4 v64, v[48:51], s[8:9] offset:192
; GCN-NEXT:    s_waitcnt vmcnt(21)
; GCN-NEXT:    global_store_dwordx4 v64, v[52:55], s[8:9] offset:208
; GCN-NEXT:    s_waitcnt vmcnt(18)
; GCN-NEXT:    global_store_dwordx4 v64, v[56:59], s[8:9] offset:224
; GCN-NEXT:    s_waitcnt vmcnt(15)
; GCN-NEXT:    global_store_dwordx4 v64, v[60:63], s[8:9] offset:240
; GCN-NEXT:    s_endpgm
  %vec = load <64 x i32>, <64 x i32> addrspace(1)* %ptr
  %insert = insertelement <64 x i32> %vec, i32 %val, i32 %idx
  store <64 x i32> %insert, <64 x i32> addrspace(1)* %out.ptr
  ret void
}
259
; Attribute group #0, referenced by @v_insert_v64i32_varidx: sets the
; "amdgpu-waves-per-eu" function attribute to "1,10".
; NOTE(review): per the AMDGPU backend docs this is a min,max waves-per-EU
; occupancy request; confirm the range is still what this test intends.
attributes #0 = { "amdgpu-waves-per-eu"="1,10" }
261