1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI
4
5; XXX - Why the packing?
; Splat the high 16-bit half of a loaded i32 across all four lanes of a
; stored <4 x i16>. The SI/VI check lines below are FileCheck assertions
; autogenerated by update_llc_test_checks.py — do not edit them by hand.
define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: scalar_to_vector_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  ; Reinterpret the loaded i32 as <2 x i16> and broadcast element 1 (the
  ; high 16 bits) into every lane of the <4 x i16> result.
  %tmp1 = load i32, i32 addrspace(1)* %in, align 4
  %bc = bitcast i32 %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
  ret void
}
52
; Same pattern as @scalar_to_vector_v2i32 but with the scalar loaded as a
; float: bitcast to <2 x i16> and splat the high half into a <4 x i16>.
; The SI/VI check lines are autogenerated FileCheck assertions.
define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
; SI-LABEL: scalar_to_vector_v2f32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v2f32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_mov_b32 s4, s6
; VI-NEXT:    s_mov_b32 s5, s7
; VI-NEXT:    s_mov_b32 s6, s2
; VI-NEXT:    s_mov_b32 s7, s3
; VI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  ; Bit-pattern of the float is viewed as <2 x i16>; lane 1 (high 16 bits)
  ; is broadcast into all four lanes of the stored vector.
  %tmp1 = load float, float addrspace(1)* %in, align 4
  %bc = bitcast float %tmp1 to <2 x i16>
  %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  store <4 x i16> %tmp2, <4 x i16> addrspace(1)* %out, align 8
  ret void
}
99
; Two-stage byte shuffle through undef pointers: only element 0 of the final
; <8 x i8> is well-defined (it comes from byte 0 of the load); indices 9 in
; the second shuffle select from the undef operand. Exercises i8-vector
; legalization/packing. Check lines are autogenerated FileCheck assertions.
define amdgpu_kernel void @scalar_to_vector_v4i16() {
; SI-LABEL: scalar_to_vector_v4i16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4i16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
; VI-NEXT:    v_or_b32_e32 v1, v1, v2
; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
bb:
  ; %tmp1: widen the 2-byte load to 8 bytes — lane 0 = byte 0, lanes 1-7 =
  ; byte 1 (all indices select from the first operand).
  ; %tmp2: keep lane 0; index 9 selects lane 1 of the second (undef) operand,
  ; so lanes 1-7 of the stored value are undef.
  %tmp = load <2 x i8>, <2 x i8> addrspace(1)* undef, align 1
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, <8 x i8> addrspace(1)* undef, align 8
  ret void
}
141
; Same shape as @scalar_to_vector_v4i16, but the two source bytes come from
; a half load bitcast to <2 x i8>. Only element 0 of the stored <8 x i8> is
; well-defined; indices 9 in the second shuffle select from the undef
; operand. Check lines are autogenerated FileCheck assertions.
define amdgpu_kernel void @scalar_to_vector_v4f16() {
; SI-LABEL: scalar_to_vector_v4f16:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_or_b32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_v4f16:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_or_b32_e32 v0, v1, v0
; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
; VI-NEXT:    v_or_b32_e32 v1, v1, v2
; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
bb:
  ; The half's 16-bit pattern is viewed as two bytes; %tmp1 widens it to
  ; 8 bytes (lane 0 = byte 0, lanes 1-7 = byte 1), then %tmp2 keeps lane 0
  ; and fills lanes 1-7 from the undef second operand (index 9).
  %load = load half, half addrspace(1)* undef, align 1
  %tmp = bitcast half %load to <2 x i8>
  %tmp1 = shufflevector <2 x i8> %tmp, <2 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 0, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  store <8 x i8> %tmp2, <8 x i8> addrspace(1)* undef, align 8
  ret void
}
184
185; Getting a SCALAR_TO_VECTOR seems to be tricky. These cases managed
186; to produce one, but for some reason never made it to selection.
187
188
189; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
190;   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
191;   %bc = bitcast i32 %tmp1 to <4 x i8>
192
193;   %tmp2 = shufflevector <4 x i8> %bc, <4 x i8> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
194;   store <8 x i8> %tmp2, <8 x i8> addrspace(1)* %out, align 4
195;   ret void
196; }
197
198; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
199;   %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
200;   %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
201;   %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
202;   %add = add <4 x i32> %bc, <i32 1, i32 2, i32 3, i32 4>
203;   store <4 x i32> %add, <4 x i32> addrspace(1)* %out, align 16
204;   ret void
205; }
206
207; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
208;   %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
209;   %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
210;   %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
211;   store <8 x i16> %add, <8 x i16> addrspace(1)* %out, align 16
212;   ret void
213; }
214
215; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
216;   %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
217;   %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
218;   %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
219;   store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
220;   ret void
221; }
222
; Insert an i8 into lane 0 of an otherwise-undef <4 x i8> and store the
; bitcast <2 x half>. Codegen collapses this to a single dword store of the
; kernel argument. Check lines are autogenerated FileCheck assertions.
define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind {
; SI-LABEL: scalar_to_vector_test6:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: scalar_to_vector_test6:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  ; Only byte 0 of the stored 32-bit value is defined (%val); the other
  ; three bytes come from the undef vector.
  %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
  %bc = bitcast <4 x i8> %newvec0 to <2 x half>
  store <2 x half> %bc, <2 x half> addrspace(1)* %out
  ret void
}
250