1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
3
4; The SelectionDAG builder was using the IR value kind to decide how to
5; split the types for copyToRegs/copyFromRegs in all contexts. This
6; was incorrect if an ABI-like value, such as a call result, was used
7; outside of its defining block. The value in that case is not used
8; directly, but through another set of copies to potentially different
9; register types in the parent block.
10
11; This would then end up producing inconsistent pairs of copies with
12; the wrong sizes when the vector type result from the call was split
13; into multiple pieces, but expected to be a single register in the
14; cross-block copy.
15;
16; This isn't exactly ideal for AMDGPU, since in reality the
17; intermediate vector register type is undesirable anyway, but it
18; requires more work to be able to split all vector copies in all
19; contexts.
20;
21; This was only an issue if the value was used directly in another
22; block. If there was an intermediate operation or a phi it was fine,
23; since that didn't look like an ABI copy.
24
25
; Calls @func_v2f32 in %bb0 and reads element 0 of the returned <2 x float>
; in %bb1, so the call result is used directly (no phi, no intermediate
; operation) in a block other than the one that contains the call.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
26define float @call_split_type_used_outside_block_v2f32() #0 {
27; GCN-LABEL: call_split_type_used_outside_block_v2f32:
28; GCN:       ; %bb.0: ; %bb0
29; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
31; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
32; GCN-NEXT:    s_mov_b64 exec, s[4:5]
33; GCN-NEXT:    v_writelane_b32 v40, s33, 2
34; GCN-NEXT:    s_mov_b32 s33, s32
35; GCN-NEXT:    s_addk_i32 s32, 0x400
36; GCN-NEXT:    v_writelane_b32 v40, s30, 0
37; GCN-NEXT:    v_writelane_b32 v40, s31, 1
38; GCN-NEXT:    s_getpc_b64 s[4:5]
39; GCN-NEXT:    s_add_u32 s4, s4, func_v2f32@rel32@lo+4
40; GCN-NEXT:    s_addc_u32 s5, s5, func_v2f32@rel32@hi+12
41; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
42; GCN-NEXT:    v_readlane_b32 s4, v40, 0
43; GCN-NEXT:    v_readlane_b32 s5, v40, 1
44; GCN-NEXT:    s_addk_i32 s32, 0xfc00
45; GCN-NEXT:    v_readlane_b32 s33, v40, 2
46; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
47; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
48; GCN-NEXT:    s_mov_b64 exec, s[6:7]
49; GCN-NEXT:    s_waitcnt vmcnt(0)
50; GCN-NEXT:    s_setpc_b64 s[4:5]
51bb0:
52  %split.ret.type = call <2 x float> @func_v2f32()
53  br label %bb1
54
; The call result is consumed here, outside the block that made the call.
55bb1:
56  %extract = extractelement <2 x float> %split.ret.type, i32 0
57  ret float %extract
58}
59
; Same shape as the v2f32 case, with a <3 x float> return type: the call to
; @func_v3f32 is in %bb0 and element 0 of its result is extracted and
; returned from %bb1, with no phi or other operation in between.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
60define float @call_split_type_used_outside_block_v3f32() #0 {
61; GCN-LABEL: call_split_type_used_outside_block_v3f32:
62; GCN:       ; %bb.0: ; %bb0
63; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
65; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
66; GCN-NEXT:    s_mov_b64 exec, s[4:5]
67; GCN-NEXT:    v_writelane_b32 v40, s33, 2
68; GCN-NEXT:    s_mov_b32 s33, s32
69; GCN-NEXT:    s_addk_i32 s32, 0x400
70; GCN-NEXT:    v_writelane_b32 v40, s30, 0
71; GCN-NEXT:    v_writelane_b32 v40, s31, 1
72; GCN-NEXT:    s_getpc_b64 s[4:5]
73; GCN-NEXT:    s_add_u32 s4, s4, func_v3f32@rel32@lo+4
74; GCN-NEXT:    s_addc_u32 s5, s5, func_v3f32@rel32@hi+12
75; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
76; GCN-NEXT:    v_readlane_b32 s4, v40, 0
77; GCN-NEXT:    v_readlane_b32 s5, v40, 1
78; GCN-NEXT:    s_addk_i32 s32, 0xfc00
79; GCN-NEXT:    v_readlane_b32 s33, v40, 2
80; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
81; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
82; GCN-NEXT:    s_mov_b64 exec, s[6:7]
83; GCN-NEXT:    s_waitcnt vmcnt(0)
84; GCN-NEXT:    s_setpc_b64 s[4:5]
85bb0:
86  %split.ret.type = call <3 x float> @func_v3f32()
87  br label %bb1
88
; The call result is consumed here, outside the block that made the call.
89bb1:
90  %extract = extractelement <3 x float> %split.ret.type, i32 0
91  ret float %extract
92}
93
; Half-precision variant: @func_v4f16 returns <4 x half> in %bb0, and %bb1
; extracts element 0 of that result and returns it as a half. As in the
; float cases, the call result crosses the block boundary directly.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
94define half @call_split_type_used_outside_block_v4f16() #0 {
95; GCN-LABEL: call_split_type_used_outside_block_v4f16:
96; GCN:       ; %bb.0: ; %bb0
97; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
99; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
100; GCN-NEXT:    s_mov_b64 exec, s[4:5]
101; GCN-NEXT:    v_writelane_b32 v40, s33, 2
102; GCN-NEXT:    s_mov_b32 s33, s32
103; GCN-NEXT:    s_addk_i32 s32, 0x400
104; GCN-NEXT:    v_writelane_b32 v40, s30, 0
105; GCN-NEXT:    v_writelane_b32 v40, s31, 1
106; GCN-NEXT:    s_getpc_b64 s[4:5]
107; GCN-NEXT:    s_add_u32 s4, s4, func_v4f16@rel32@lo+4
108; GCN-NEXT:    s_addc_u32 s5, s5, func_v4f16@rel32@hi+12
109; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
110; GCN-NEXT:    v_readlane_b32 s4, v40, 0
111; GCN-NEXT:    v_readlane_b32 s5, v40, 1
112; GCN-NEXT:    s_addk_i32 s32, 0xfc00
113; GCN-NEXT:    v_readlane_b32 s33, v40, 2
114; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
115; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
116; GCN-NEXT:    s_mov_b64 exec, s[6:7]
117; GCN-NEXT:    s_waitcnt vmcnt(0)
118; GCN-NEXT:    s_setpc_b64 s[4:5]
119bb0:
120  %split.ret.type = call <4 x half> @func_v4f16()
121  br label %bb1
122
; The call result is consumed here, outside the block that made the call.
123bb1:
124  %extract = extractelement <4 x half> %split.ret.type, i32 0
125  ret half %extract
126}
127
; Aggregate variant: @func_struct returns { <4 x i32>, <4 x half> } in %bb0.
; %bb1 pulls out both struct members, takes element 0 of each vector, and
; returns the pair as { i32, half }.
; The only difference in the checked output from the vector-only cases is
; the v_mov_b32 that copies v4 into v1 after the call.
; NOTE(review): presumably v0-v3 carry the <4 x i32> member and v4 the first
; register of the <4 x half> member; confirm against the calling convention.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
128define { i32, half } @call_split_type_used_outside_block_struct() #0 {
129; GCN-LABEL: call_split_type_used_outside_block_struct:
130; GCN:       ; %bb.0: ; %bb0
131; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132; GCN-NEXT:    s_or_saveexec_b64 s[4:5], -1
133; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
134; GCN-NEXT:    s_mov_b64 exec, s[4:5]
135; GCN-NEXT:    v_writelane_b32 v40, s33, 2
136; GCN-NEXT:    s_mov_b32 s33, s32
137; GCN-NEXT:    s_addk_i32 s32, 0x400
138; GCN-NEXT:    v_writelane_b32 v40, s30, 0
139; GCN-NEXT:    v_writelane_b32 v40, s31, 1
140; GCN-NEXT:    s_getpc_b64 s[4:5]
141; GCN-NEXT:    s_add_u32 s4, s4, func_struct@rel32@lo+4
142; GCN-NEXT:    s_addc_u32 s5, s5, func_struct@rel32@hi+12
143; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
144; GCN-NEXT:    v_readlane_b32 s4, v40, 0
145; GCN-NEXT:    v_mov_b32_e32 v1, v4
146; GCN-NEXT:    v_readlane_b32 s5, v40, 1
147; GCN-NEXT:    s_addk_i32 s32, 0xfc00
148; GCN-NEXT:    v_readlane_b32 s33, v40, 2
149; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
150; GCN-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
151; GCN-NEXT:    s_mov_b64 exec, s[6:7]
152; GCN-NEXT:    s_waitcnt vmcnt(0)
153; GCN-NEXT:    s_setpc_b64 s[4:5]
154bb0:
155  %split.ret.type = call { <4 x i32>, <4 x half> } @func_struct()
156  br label %bb1
157
; Both struct members of the call result are unpacked here, outside the
; block that made the call, and repacked into the scalar return struct.
158bb1:
159  %val0 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 0
160  %val1 = extractvalue { <4 x i32>, <4 x half> } %split.ret.type, 1
161  %extract0 = extractelement <4 x i32> %val0, i32 0
162  %extract1 = extractelement <4 x half> %val1, i32 0
163  %ins0 = insertvalue { i32, half } undef, i32 %extract0, 0
164  %ins1 = insertvalue { i32, half } %ins0, half %extract1, 1
165  ret { i32, half } %ins1
166}
167
; Kernel that branches on %cond. %if.else tail-calls @func_v3i16, %if.then
; does nothing, and %if.end merges the call result with undef through a phi
; and stores the <3 x i16> to an undef addrspace(1) pointer.
; Unlike the call_split_type_used_outside_block_* functions, the call result
; reaches the other block through a phi rather than being used directly.
; In the checked output, the path that skips the call (BB4_2) writes 0 to v0
; and v1; the store is emitted as one short store of v1 plus one dword store
; of v0.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
168define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
169; GCN-LABEL: v3i16_registers:
170; GCN:       ; %bb.0: ; %entry
171; GCN-NEXT:    s_load_dword s4, s[8:9], 0x0
172; GCN-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
173; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
174; GCN-NEXT:    s_add_u32 s0, s0, s17
175; GCN-NEXT:    s_addc_u32 s1, s1, 0
176; GCN-NEXT:    s_waitcnt lgkmcnt(0)
177; GCN-NEXT:    s_bitcmp1_b32 s4, 0
178; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
179; GCN-NEXT:    s_and_b64 vcc, exec, s[4:5]
180; GCN-NEXT:    s_mov_b32 s32, 0
181; GCN-NEXT:    s_cbranch_vccnz BB4_2
182; GCN-NEXT:  ; %bb.1: ; %if.else
183; GCN-NEXT:    s_getpc_b64 s[4:5]
184; GCN-NEXT:    s_add_u32 s4, s4, func_v3i16@rel32@lo+4
185; GCN-NEXT:    s_addc_u32 s5, s5, func_v3i16@rel32@hi+12
186; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
187; GCN-NEXT:    s_branch BB4_3
188; GCN-NEXT:  BB4_2:
189; GCN-NEXT:    s_mov_b32 s4, 0
190; GCN-NEXT:    s_mov_b32 s5, s4
191; GCN-NEXT:    v_mov_b32_e32 v0, s4
192; GCN-NEXT:    v_mov_b32_e32 v1, s5
193; GCN-NEXT:  BB4_3: ; %if.end
194; GCN-NEXT:    global_store_short v[0:1], v1, off
195; GCN-NEXT:    global_store_dword v[0:1], v0, off
196; GCN-NEXT:    s_endpgm
197entry:
198  br i1 %cond, label %if.then, label %if.else
199
200if.then:                                          ; preds = %entry
201  br label %if.end
202
203if.else:                                          ; preds = %entry
204  %call6 = tail call <3 x i16> @func_v3i16() #0
205  br label %if.end
206
; Join point: the phi takes the call result from %if.else and undef from
; %if.then.
207if.end:                                           ; preds = %if.else, %if.then
208  %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ]
209  store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef
210  ret void
211}
212
; Half-precision counterpart of @v3i16_registers. The kernel branches on
; %cond, %if.else tail-calls @func_v3f16, %if.then does nothing, and %if.end
; merges the call result with undef through a phi and stores the <3 x half>
; to an undef addrspace(1) pointer.
; In the checked output, the path that skips the call (BB5_2) writes 0 to v0
; and v1; the store is emitted as one short store of v1 plus one dword store
; of v0.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
213define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
214; GCN-LABEL: v3f16_registers:
215; GCN:       ; %bb.0: ; %entry
216; GCN-NEXT:    s_load_dword s4, s[8:9], 0x0
217; GCN-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
218; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
219; GCN-NEXT:    s_add_u32 s0, s0, s17
220; GCN-NEXT:    s_addc_u32 s1, s1, 0
221; GCN-NEXT:    s_waitcnt lgkmcnt(0)
222; GCN-NEXT:    s_bitcmp1_b32 s4, 0
223; GCN-NEXT:    s_cselect_b64 s[4:5], -1, 0
224; GCN-NEXT:    s_and_b64 vcc, exec, s[4:5]
225; GCN-NEXT:    s_mov_b32 s32, 0
226; GCN-NEXT:    s_cbranch_vccnz BB5_2
227; GCN-NEXT:  ; %bb.1: ; %if.else
228; GCN-NEXT:    s_getpc_b64 s[4:5]
229; GCN-NEXT:    s_add_u32 s4, s4, func_v3f16@rel32@lo+4
230; GCN-NEXT:    s_addc_u32 s5, s5, func_v3f16@rel32@hi+12
231; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
232; GCN-NEXT:    s_branch BB5_3
233; GCN-NEXT:  BB5_2:
234; GCN-NEXT:    s_mov_b32 s4, 0
235; GCN-NEXT:    s_mov_b32 s5, s4
236; GCN-NEXT:    v_mov_b32_e32 v0, s4
237; GCN-NEXT:    v_mov_b32_e32 v1, s5
238; GCN-NEXT:  BB5_3: ; %if.end
239; GCN-NEXT:    global_store_short v[0:1], v1, off
240; GCN-NEXT:    global_store_dword v[0:1], v0, off
241; GCN-NEXT:    s_endpgm
242entry:
243  br i1 %cond, label %if.then, label %if.else
244
245if.then:                                          ; preds = %entry
246  br label %if.end
247
248if.else:                                          ; preds = %entry
249  %call6 = tail call <3 x half> @func_v3f16() #0
250  br label %if.end
251
; Join point: the phi takes the call result from %if.else and undef from
; %if.then.
252if.end:                                           ; preds = %if.else, %if.then
253  %call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ]
254  store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef
255  ret void
256}
257
; External callees used by the tests in this file. All are declared hidden
; and none has a body here, so each call is emitted as a real call.
; NOTE(review): @func_v4f32 is not called by any function visible in this
; file; confirm whether a v4f32 test was intended or the declaration is
; simply left over.
; @func_v3i16 and @func_v3f16 are declared without #0; their call sites
; carry #0 instead.
258declare hidden <2 x float> @func_v2f32() #0
259declare hidden <3 x float> @func_v3f32() #0
260declare hidden <4 x float> @func_v4f32() #0
261declare hidden <4 x half> @func_v4f16() #0
262declare hidden <3 x i16> @func_v3i16()
263declare hidden <3 x half> @func_v3f16()
264
; Callee for the aggregate-return test; returns two vector members.
265declare hidden { <4 x i32>, <4 x half> } @func_struct() #0
266
; Attribute group shared by every definition in this file; it contains only
; nounwind.
267attributes #0 = { nounwind}
268