; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2

;
; Partial Vector Loads - PR16739
;

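; Three scalar float loads from a dereferenceable(16) pointer are inserted
; into lanes 0-2; this should fold into a single 16-byte movups, with lane 3
; left undef.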
define <4 x float> @load_float4_float3(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
  %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %ld0 = load float, float* %p0, align 4
  %ld1 = load float, float* %p1, align 4
  %ld2 = load float, float* %p2, align 4
  %r0 = insertelement <4 x float> undef, float %ld0, i32 0
  %r1 = insertelement <4 x float> %r0,   float %ld1, i32 1
  %r2 = insertelement <4 x float> %r1,   float %ld2, i32 2
  ret <4 x float> %r2
}

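; Same pattern, but lane 3 duplicates element 2, so a splat shuffle of the
; third element is expected on top of the wide load.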
define <4 x float> @load_float4_float3_0122(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3_0122:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_0122:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovups (%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT:    retq
  %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
  %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %ld0 = load float, float* %p0, align 4
  %ld1 = load float, float* %p1, align 4
  %ld2 = load float, float* %p2, align 4
  %r0 = insertelement <4 x float> undef, float %ld0, i32 0
  %r1 = insertelement <4 x float> %r0,   float %ld1, i32 1
  %r2 = insertelement <4 x float> %r1,   float %ld2, i32 2
  %r3 = insertelement <4 x float> %r2,   float %ld2, i32 3
  ret <4 x float> %r3
}

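; Widening the result to <8 x float> with only lanes 0-2 defined; the upper
; half stays undef, so a 128-bit load still suffices.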
define <8 x float> @load_float8_float3(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float8_float3:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float8_float3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
  %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %ld0 = load float, float* %p0, align 4
  %ld1 = load float, float* %p1, align 4
  %ld2 = load float, float* %p2, align 4
  %r0 = insertelement <8 x float> undef, float %ld0, i32 0
  %r1 = insertelement <8 x float> %r0,   float %ld1, i32 1
  %r2 = insertelement <8 x float> %r1,   float %ld2, i32 2
  ret <8 x float> %r2
}

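; The 0122 variant of the <8 x float> case: lanes 4-7 stay undef, while
; lane 3 needs the splatted third element.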
define <8 x float> @load_float8_float3_0122(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float8_float3_0122:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float8_float3_0122:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovups (%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT:    retq
  %p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
  %p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
  %p2 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %ld0 = load float, float* %p0, align 4
  %ld1 = load float, float* %p1, align 4
  %ld2 = load float, float* %p2, align 4
  %r0 = insertelement <8 x float> undef, float %ld0, i32 0
  %r1 = insertelement <8 x float> %r0,   float %ld1, i32 1
  %r2 = insertelement <8 x float> %r1,   float %ld2, i32 2
  %r3 = insertelement <8 x float> %r2,   float %ld2, i32 3
  ret <8 x float> %r3
}

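; Here the first two elements arrive via a single <2 x float> load; together
; with the scalar load of element 2 this should still merge into one movups.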
define <4 x float> @load_float4_float3_as_float2_float(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3_as_float2_float:
; SSE:       # %bb.0:
; SSE-NEXT:    movups (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_as_float2_float:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups (%rdi), %xmm0
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to <2 x float>*
  %3 = load <2 x float>, <2 x float>* %2, align 4
  %4 = extractelement <2 x float> %3, i32 0
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = extractelement <2 x float> %3, i32 1
  %7 = insertelement <4 x float> %5, float %6, i32 1
  %8 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %9 = load float, float* %8, align 4
  %10 = insertelement <4 x float> %7, float %9, i32 2
  ret <4 x float> %10
}

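; The 0122 variant of the <2 x float> + scalar pattern; the duplicated third
; element is spliced in with a shufps.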
define <4 x float> @load_float4_float3_as_float2_float_0122(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3_as_float2_float_0122:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_as_float2_float_0122:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to <2 x float>*
  %3 = load <2 x float>, <2 x float>* %2, align 4
  %4 = extractelement <2 x float> %3, i32 0
  %5 = insertelement <4 x float> undef, float %4, i32 0
  %6 = extractelement <2 x float> %3, i32 1
  %7 = insertelement <4 x float> %5, float %6, i32 1
  %8 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %9 = load float, float* %8, align 4
  %10 = insertelement <4 x float> %7, float %9, i32 2
  %11 = insertelement <4 x float> %10, float %9, i32 3
  ret <4 x float> %11
}

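; The same three floats reconstructed from two i64 loads via
; trunc/lshr/bitcast; the 16-byte-aligned dereferenceable region lets this
; fold to a single movaps.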
define <4 x float> @load_float4_float3_trunc(<4 x float>* nocapture readonly dereferenceable(16)) {
; SSE-LABEL: load_float4_float3_trunc:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_trunc:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = bitcast float* %4 to i64*
  %6 = load i64, i64* %5, align 8
  %7 = trunc i64 %3 to i32
  %8 = bitcast i32 %7 to float
  %9 = insertelement <4 x float> undef, float %8, i32 0
  %10 = lshr i64 %3, 32
  %11 = trunc i64 %10 to i32
  %12 = bitcast i32 %11 to float
  %13 = insertelement <4 x float> %9, float %12, i32 1
  %14 = trunc i64 %6 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %13, float %15, i32 2
  ret <4 x float> %16
}

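; The trunc-based pattern with lane 3 duplicating element 2, so the splat
; shuffle survives alongside the aligned wide load.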
define <4 x float> @load_float4_float3_trunc_0122(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE-LABEL: load_float4_float3_trunc_0122:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_trunc_0122:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovaps (%rdi), %xmm1
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = bitcast float* %4 to i64*
  %6 = load i64, i64* %5, align 8
  %7 = trunc i64 %3 to i32
  %8 = bitcast i32 %7 to float
  %9 = insertelement <4 x float> undef, float %8, i32 0
  %10 = lshr i64 %3, 32
  %11 = trunc i64 %10 to i32
  %12 = bitcast i32 %11 to float
  %13 = insertelement <4 x float> %9, float %12, i32 1
  %14 = trunc i64 %6 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %13, float %15, i32 2
  %17 = insertelement <4 x float> %16, float %15, i32 3
  ret <4 x float> %17
}

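; All four lanes are live here, so the high 32 bits of the second i64 load
; are used as element 3.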
define <4 x float> @load_float4_float3_trunc_0123(<4 x float>* nocapture readonly dereferenceable(16)) nofree nosync {
; SSE2-LABEL: load_float4_float3_trunc_0123:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps (%rdi), %xmm0
; SSE2-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_float4_float3_trunc_0123:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movaps (%rdi), %xmm0
; SSSE3-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_float4_float3_trunc_0123:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movaps (%rdi), %xmm0
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_float4_float3_trunc_0123:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; AVX-NEXT:    retq
  %2 = bitcast <4 x float>* %0 to i64*
  %3 = load i64, i64* %2, align 16
  %4 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 2
  %5 = bitcast float* %4 to i64*
  %6 = load i64, i64* %5, align 8
  %7 = trunc i64 %3 to i32
  %8 = bitcast i32 %7 to float
  %9 = insertelement <4 x float> undef, float %8, i32 0
  %10 = lshr i64 %3, 32
  %11 = trunc i64 %10 to i32
  %12 = bitcast i32 %11 to float
  %13 = insertelement <4 x float> %9, float %12, i32 1
  %14 = trunc i64 %6 to i32
  %15 = bitcast i32 %14 to float
  %16 = insertelement <4 x float> %13, float %15, i32 2
  %17 = lshr i64 %6, 32
  %18 = trunc i64 %17 to i32
  %19 = bitcast i32 %18 to float
  %20 = insertelement <4 x float> %16, float %19, i32 3
  ret <4 x float> %20
}

; PR21780
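; Splat-loads elements 0 and 2 of a <4 x double>; with AVX this can be done
; with a single vmovddup of the whole ymm register.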
define <4 x double> @load_double4_0u2u(double* nocapture readonly dereferenceable(32)) nofree nosync {
; SSE2-LABEL: load_double4_0u2u:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_double4_0u2u:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSSE3-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_double4_0u2u:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSE41-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_double4_0u2u:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; AVX-NEXT:    retq
  %2 = load double, double* %0, align 8
  %3 = insertelement <4 x double> undef, double %2, i32 0
  %4 = getelementptr inbounds double, double* %0, i64 2
  %5 = load double, double* %4, align 8
  %6 = insertelement <4 x double> %3, double %5, i32 2
  %7 = shufflevector <4 x double> %6, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %7
}

; Test case identified in rL366501
@h = dso_local local_unnamed_addr global i8 0, align 1
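; Loads a <2 x i8> (an illegal vector type here), widens it and inserts a
; constant byte; a sanity check that the partial-load combine copes with
; illegal types.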
define dso_local i32 @load_partial_illegal_type()  {
; SSE2-LABEL: load_partial_illegal_type:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movzwl h(%rip), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: load_partial_illegal_type:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movzwl h(%rip), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[3,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    movd %xmm0, %eax
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: load_partial_illegal_type:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movzwl h(%rip), %eax
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    movl $2, %eax
; SSE41-NEXT:    pinsrb $2, %eax, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: load_partial_illegal_type:
; AVX:       # %bb.0:
; AVX-NEXT:    movzwl h(%rip), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    movl $2, %eax
; AVX-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
  %1 = load <2 x i8>, <2 x i8>* bitcast (i8* @h to <2 x i8>*), align 1
  %2 = shufflevector <2 x i8> %1, <2 x i8> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %3 = insertelement <4 x i8> %2, i8 2, i32 2
  %4 = bitcast <4 x i8> %3 to i32
  ret i32 %4
}

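; PR43227: elements 1 and 2 of an unaligned <3 x i32> load are blended into
; a mostly-zero <8 x i32> that is stored at a large offset.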
define dso_local void @PR43227(i32* %explicit_0, <8 x i32>* %explicit_1) {
; SSE-LABEL: PR43227:
; SSE:       # %bb.0:
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    movdqa %xmm1, 672(%rsi)
; SSE-NEXT:    movdqa %xmm0, 688(%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR43227:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovaps %ymm0, 672(%rsi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR43227:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, 672(%rsi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
  %1 = getelementptr i32, i32* %explicit_0, i64 63
  %2 = bitcast i32* %1 to <3 x i32>*
  %3 = load <3 x i32>, <3 x i32>* %2, align 1
  %4 = shufflevector <3 x i32> %3, <3 x i32> undef, <2 x i32> <i32 1, i32 2>
  %5 = shufflevector <2 x i32> %4, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %6 = shufflevector <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 0, i32 undef, i32 0>, <8 x i32> %5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 5, i32 9, i32 7>
  %7 = getelementptr inbounds <8 x i32>, <8 x i32>* %explicit_1, i64 21
  store <8 x i32> %6, <8 x i32>* %7, align 32
  ret void
}