1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1
3; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2
4; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
5; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
6; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512
12
13;
14; Subvector Load + Broadcast
15;
16
; Broadcast a 128-bit <2 x double> load into both halves of a 256-bit
; result; expected to fold into a single vbroadcastf128 from memory.
define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X86-LABEL: test_broadcast_2f64_4f64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_2f64_4f64:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}
32
; Broadcast a 128-bit <2 x double> load into a 512-bit result. AVX targets
; broadcast into ymm0 and copy to ymm1; AVX512 uses one vbroadcastf32x4 zmm.
define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x double> %2
}
61
; Broadcast a 256-bit <4 x double> load into a 512-bit result. AVX targets
; load the ymm and duplicate it; AVX512 folds to a single vbroadcastf64x4.
define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f64_8f64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f64_8f64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x double>, <4 x double> *%p
 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x double> %2
}
90
; Broadcast a 128-bit <2 x i64> load into a 256-bit result. AVX1 has no
; integer 256-bit broadcast, so it uses vbroadcastf128; AVX512 uses the
; integer-domain vbroadcasti128.
define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_4i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_4i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}
117
; Broadcast a 128-bit <2 x i64> load into a 512-bit result. AVX broadcasts
; into ymm0 and copies to ymm1; AVX512 uses a single vbroadcasti32x4 zmm.
define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_2i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x i64> %2
}
146
; Broadcast a 256-bit <4 x i64> load into a 512-bit result. AVX loads the
; ymm and duplicates it; AVX512 folds to a single vbroadcasti64x4.
define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i64_8i64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i64_8i64:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i64>, <4 x i64> *%p
 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i64> %2
}
175
; Broadcast a 128-bit <4 x float> load into a 256-bit result; all targets
; lower to a single vbroadcastf128 from memory.
define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X86-LABEL: test_broadcast_4f32_8f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}
191
; Broadcast a 128-bit <4 x float> load into a 512-bit result. AVX broadcasts
; into ymm0 and copies to ymm1; AVX512 uses a single vbroadcastf32x4 zmm.
define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x float> %2
}
220
; Broadcast a 256-bit <8 x float> load into a 512-bit result. AVX loads the
; ymm and duplicates it; AVX512 folds to a single vbroadcastf64x4.
define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8f32_16f32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8f32_16f32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x float>, <8 x float> *%p
 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x float> %2
}
249
; Broadcast a 128-bit <4 x i32> load into a 256-bit result. AVX uses the
; float-domain vbroadcastf128; AVX512 uses the integer vbroadcasti128.
define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_8i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}
276
; Broadcast a 128-bit <4 x i32> load into a 512-bit result. AVX broadcasts
; into ymm0 and copies to ymm1; AVX512 uses a single vbroadcasti32x4 zmm.
define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_4i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x i32> %2
}
305
; Broadcast a 256-bit <8 x i32> load into a 512-bit result. AVX loads the
; ymm and duplicates it; AVX512 folds to a single vbroadcasti64x4.
define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i32_16i32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i32_16i32:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i32>, <8 x i32> *%p
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i32> %2
}
334
; Broadcast a 128-bit <8 x i16> load into a 256-bit result. AVX uses the
; float-domain vbroadcastf128; AVX512 uses the integer vbroadcasti128.
define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_16i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_16i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}
361
; Broadcast a 128-bit <8 x i16> load into a 512-bit result. AVX broadcasts
; into ymm0 and copies to ymm1; AVX512 uses a single vbroadcasti32x4 zmm.
define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %2
}
390
; Broadcast a 256-bit <16 x i16> load into a 512-bit result. AVX loads the
; ymm and duplicates it; AVX512 folds to a single vbroadcasti64x4.
define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i16>, <16 x i16> *%p
 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %2
}
419
; Broadcast a 128-bit <16 x i8> load into a 256-bit result. AVX uses the
; float-domain vbroadcastf128; AVX512 uses the integer vbroadcasti128.
define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_32i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_32i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}
446
; Broadcast a 128-bit <16 x i8> load into a 512-bit result. AVX broadcasts
; into ymm0 and copies to ymm1; AVX512 uses a single vbroadcasti32x4 zmm.
define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <64 x i8> %2
}
475
; Broadcast a 256-bit <32 x i8> load into a 512-bit result. AVX loads the
; ymm and duplicates it; AVX512 folds to a single vbroadcasti64x4.
define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X86-AVX-LABEL: test_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vmovaps (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <32 x i8>, <32 x i8> *%p
 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ret <64 x i8> %2
}
504
505;
506; Subvector Load + Broadcast + Store
507;
508
; Load + broadcast where the loaded <2 x double> is also stored to %p1;
; expected to reuse the broadcast's low xmm for the store instead of
; reloading from memory.
define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X86-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <2 x double>, <2 x double>* %p0
 store <2 x double> %1, <2 x double>* %p1
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}
542
; Load + broadcast where the loaded <2 x i64> is also stored to %p1;
; expected to store the broadcast's low xmm rather than reload.
define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X86-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64>* %p0
 store <2 x i64> %1, <2 x i64>* %p1
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}
576
; Load + broadcast where the loaded <4 x float> is also stored to %p1;
; expected to store the broadcast's low xmm rather than reload.
define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X86-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, <4 x float>* %p0
 store <4 x float> %1, <4 x float>* %p1
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}
610
; Load + broadcast where the loaded <4 x i32> is also stored to %p1;
; expected to store the broadcast's low xmm rather than reload.
define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32>* %p0
 store <4 x i32> %1, <4 x i32>* %p1
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}
644
; Load + broadcast where the loaded <8 x i16> is also stored to %p1;
; expected to store the broadcast's low xmm rather than reload.
define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X86-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p0
 store <8 x i16> %1, <8 x i16>* %p1
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}
678
; Load + broadcast where the loaded <16 x i8> is also stored to %p1;
; expected to store the broadcast's low xmm rather than reload.
define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X86-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p0
 store <16 x i8> %1, <16 x i8>* %p1
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}
712
713;
714; Subvector Load + Broadcast with Separate Store
715;
716
; Load + broadcast with an unrelated zero store to %p1 chained between the
; load and the shuffle; the broadcast must still fold the load from memory
; while the independent store goes through xmm1.
define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}
754
755define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; Same as test_broadcast_4i32_8i32_chain but widened to a 512-bit result:
; AVX512 should fold the load into vbroadcasti32x4 despite the intervening
; zero store to %p1; plain AVX splits into a ymm broadcast plus a ymm copy.
756; X86-AVX-LABEL: test_broadcast_4i32_16i32_chain:
757; X86-AVX:       # %bb.0:
758; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
759; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
760; X86-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
761; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
762; X86-AVX-NEXT:    vmovaps %xmm1, (%eax)
763; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
764; X86-AVX-NEXT:    retl
765;
766; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
767; X86-AVX512:       # %bb.0:
768; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
769; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
770; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
771; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
772; X86-AVX512-NEXT:    vmovaps %xmm1, (%eax)
773; X86-AVX512-NEXT:    retl
774;
775; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
776; X64-AVX:       # %bb.0:
777; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
778; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
779; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
780; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
781; X64-AVX-NEXT:    retq
782;
783; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain:
784; X64-AVX512:       # %bb.0:
785; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
786; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
787; X64-AVX512-NEXT:    vmovaps %xmm1, (%rsi)
788; X64-AVX512-NEXT:    retq
789  %1 = load <4 x i32>, <4 x i32>* %p0
790  store <4 x float> zeroinitializer, <4 x float>* %p1
791  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
792  ret <16 x i32> %2
793}
794
795;
796; Subvector Load with Multiple Uses + Broadcast
797; We should fall back to broadcasting the value rather than refolding the load.
798;
799
800@ga4 = dso_local global <4 x i64> zeroinitializer, align 8
801@gb4 = dso_local global <8 x i64> zeroinitializer, align 8
802
803define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; The constant <1,2,3,4> is used both at v4i64 width (add into %a) and splatted
; to v8i64 width (add/and with %b). AVX512 should materialize it once with a
; vbroadcasti64x4 and reuse the low ymm half; AVX1/AVX2 fall back to plain
; constant-pool loads at each width.
804; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
805; X86-AVX1:       # %bb.0: # %entry
806; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,2,0]
807; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
808; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
809; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [3,0,4,0]
810; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
811; X86-AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0]
812; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
813; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm7
814; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
815; X86-AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm2
816; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
817; X86-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm5
818; X86-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
819; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
820; X86-AVX1-NEXT:    vandps %ymm6, %ymm1, %ymm1
821; X86-AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
822; X86-AVX1-NEXT:    vmovdqu %xmm0, ga4+16
823; X86-AVX1-NEXT:    vmovdqu %xmm4, ga4
824; X86-AVX1-NEXT:    vmovups %ymm2, gb4+32
825; X86-AVX1-NEXT:    vmovups %ymm1, gb4
826; X86-AVX1-NEXT:    vzeroupper
827; X86-AVX1-NEXT:    retl
828;
829; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
830; X86-AVX2:       # %bb.0: # %entry
831; X86-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
832; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
833; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
834; X86-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
835; X86-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
836; X86-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
837; X86-AVX2-NEXT:    vmovdqu %ymm0, ga4
838; X86-AVX2-NEXT:    vmovdqu %ymm2, gb4+32
839; X86-AVX2-NEXT:    vmovdqu %ymm1, gb4
840; X86-AVX2-NEXT:    vzeroupper
841; X86-AVX2-NEXT:    retl
842;
843; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
844; X86-AVX512:       # %bb.0: # %entry
845; X86-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
846; X86-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
847; X86-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
848; X86-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
849; X86-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
850; X86-AVX512-NEXT:    vmovdqu %ymm0, ga4
851; X86-AVX512-NEXT:    vmovdqu64 %zmm1, gb4
852; X86-AVX512-NEXT:    vzeroupper
853; X86-AVX512-NEXT:    retl
854;
855; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
856; X64-AVX1:       # %bb.0: # %entry
857; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,2]
858; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm4
859; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
860; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [3,4]
861; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
862; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm6 = [1,2,3,4]
863; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
864; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm7
865; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
866; X64-AVX1-NEXT:    vinsertf128 $1, %xmm7, %ymm2, %ymm2
867; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
868; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm7, %xmm5
869; X64-AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
870; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
871; X64-AVX1-NEXT:    vandps %ymm6, %ymm1, %ymm1
872; X64-AVX1-NEXT:    vandps %ymm6, %ymm2, %ymm2
873; X64-AVX1-NEXT:    vmovdqu %xmm0, ga4+16(%rip)
874; X64-AVX1-NEXT:    vmovdqu %xmm4, ga4(%rip)
875; X64-AVX1-NEXT:    vmovups %ymm2, gb4+32(%rip)
876; X64-AVX1-NEXT:    vmovups %ymm1, gb4(%rip)
877; X64-AVX1-NEXT:    vzeroupper
878; X64-AVX1-NEXT:    retq
879;
880; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
881; X64-AVX2:       # %bb.0: # %entry
882; X64-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
883; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
884; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
885; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
886; X64-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
887; X64-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
888; X64-AVX2-NEXT:    vmovdqu %ymm0, ga4(%rip)
889; X64-AVX2-NEXT:    vmovdqu %ymm2, gb4+32(%rip)
890; X64-AVX2-NEXT:    vmovdqu %ymm1, gb4(%rip)
891; X64-AVX2-NEXT:    vzeroupper
892; X64-AVX2-NEXT:    retq
893;
894; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
895; X64-AVX512:       # %bb.0: # %entry
896; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4]
897; X64-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
898; X64-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
899; X64-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
900; X64-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
901; X64-AVX512-NEXT:    vmovdqu %ymm0, ga4(%rip)
902; X64-AVX512-NEXT:    vmovdqu64 %zmm1, gb4(%rip)
903; X64-AVX512-NEXT:    vzeroupper
904; X64-AVX512-NEXT:    retq
905entry:
906  %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
907  %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
908  %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
909  store <4 x i64> %0, <4 x i64>* @ga4, align 8
910  store <8 x i64> %2, <8 x i64>* @gb4, align 8
911  ret void
912}
913
914
915@ga2 = dso_local global <4 x double> zeroinitializer, align 8
916@gb2 = dso_local global <8 x double> zeroinitializer, align 8
917
918define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; Floating-point variant of the v4i64->v8i64 fallback test: the constant
; <1.0,2.0,3.0,4.0> is used at v4f64 width (fadd with %a) and splatted to
; v8f64 (fadd/fdiv with %b). AVX512 broadcasts it once via vbroadcastf64x4.
919; X86-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
920; X86-AVX:       # %bb.0: # %entry
921; X86-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
922; X86-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
923; X86-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
924; X86-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
925; X86-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
926; X86-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
927; X86-AVX-NEXT:    vmovupd %ymm0, ga2
928; X86-AVX-NEXT:    vmovupd %ymm2, gb2+32
929; X86-AVX-NEXT:    vmovupd %ymm1, gb2
930; X86-AVX-NEXT:    vzeroupper
931; X86-AVX-NEXT:    retl
932;
933; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
934; X86-AVX512:       # %bb.0: # %entry
935; X86-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
936; X86-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
937; X86-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
938; X86-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
939; X86-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
940; X86-AVX512-NEXT:    vmovupd %ymm0, ga2
941; X86-AVX512-NEXT:    vmovupd %zmm1, gb2
942; X86-AVX512-NEXT:    vzeroupper
943; X86-AVX512-NEXT:    retl
944;
945; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
946; X64-AVX:       # %bb.0: # %entry
947; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
948; X64-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
949; X64-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
950; X64-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
951; X64-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
952; X64-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
953; X64-AVX-NEXT:    vmovupd %ymm0, ga2(%rip)
954; X64-AVX-NEXT:    vmovupd %ymm2, gb2+32(%rip)
955; X64-AVX-NEXT:    vmovupd %ymm1, gb2(%rip)
956; X64-AVX-NEXT:    vzeroupper
957; X64-AVX-NEXT:    retq
958;
959; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
960; X64-AVX512:       # %bb.0: # %entry
961; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0]
962; X64-AVX512-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3]
963; X64-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
964; X64-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
965; X64-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
966; X64-AVX512-NEXT:    vmovupd %ymm0, ga2(%rip)
967; X64-AVX512-NEXT:    vmovupd %zmm1, gb2(%rip)
968; X64-AVX512-NEXT:    vzeroupper
969; X64-AVX512-NEXT:    retq
970entry:
971  %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
972  %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
973  %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
974  store <4 x double> %0, <4 x double>* @ga2, align 8
975  store <8 x double> %2, <8 x double>* @gb2, align 8
976  ret void
977}
978
979@ha4 = dso_local global <4 x i32> zeroinitializer, align 8
980@hb4 = dso_local global <8 x i32> zeroinitializer, align 8
981@hc4 = dso_local global <16 x i32> zeroinitializer, align 8
982
983define dso_local void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <16 x i32> %c) nounwind {
; Three-width variant: the constant <1,2,3,4> is used at v4i32, v8i32, and
; v16i32 widths. Each target materializes it once at its widest register size
; (ymm broadcast for AVX1/AVX2, zmm vbroadcasti32x4 for AVX512) and reuses the
; narrower subregisters for the smaller ops.
984; X86-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
985; X86-AVX1:       # %bb.0: # %entry
986; X86-AVX1-NEXT:    pushl %ebp
987; X86-AVX1-NEXT:    movl %esp, %ebp
988; X86-AVX1-NEXT:    andl $-32, %esp
989; X86-AVX1-NEXT:    subl $32, %esp
990; X86-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
991; X86-AVX1-NEXT:    # ymm3 = mem[0,1,0,1]
992; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
993; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
994; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm4
995; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
996; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
997; X86-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
998; X86-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
999; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm4, %xmm4
1000; X86-AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1001; X86-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm2, %ymm2
1002; X86-AVX1-NEXT:    vpaddd 8(%ebp), %xmm3, %xmm4
1003; X86-AVX1-NEXT:    vpaddd 24(%ebp), %xmm3, %xmm5
1004; X86-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm4, %ymm4
1005; X86-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
1006; X86-AVX1-NEXT:    vandps %ymm3, %ymm4, %ymm3
1007; X86-AVX1-NEXT:    vmovdqu %xmm0, ha4
1008; X86-AVX1-NEXT:    vmovups %ymm1, hb4
1009; X86-AVX1-NEXT:    vmovups %ymm3, hc4+32
1010; X86-AVX1-NEXT:    vmovups %ymm2, hc4
1011; X86-AVX1-NEXT:    movl %ebp, %esp
1012; X86-AVX1-NEXT:    popl %ebp
1013; X86-AVX1-NEXT:    vzeroupper
1014; X86-AVX1-NEXT:    retl
1015;
1016; X86-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1017; X86-AVX2:       # %bb.0: # %entry
1018; X86-AVX2-NEXT:    pushl %ebp
1019; X86-AVX2-NEXT:    movl %esp, %ebp
1020; X86-AVX2-NEXT:    andl $-32, %esp
1021; X86-AVX2-NEXT:    subl $32, %esp
1022; X86-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4]
1023; X86-AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
1024; X86-AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
1025; X86-AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
1026; X86-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
1027; X86-AVX2-NEXT:    vpaddd %ymm3, %ymm2, %ymm2
1028; X86-AVX2-NEXT:    vpaddd 8(%ebp), %ymm3, %ymm4
1029; X86-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
1030; X86-AVX2-NEXT:    vpand %ymm3, %ymm4, %ymm3
1031; X86-AVX2-NEXT:    vmovdqu %xmm0, ha4
1032; X86-AVX2-NEXT:    vmovdqu %ymm1, hb4
1033; X86-AVX2-NEXT:    vmovdqu %ymm3, hc4+32
1034; X86-AVX2-NEXT:    vmovdqu %ymm2, hc4
1035; X86-AVX2-NEXT:    movl %ebp, %esp
1036; X86-AVX2-NEXT:    popl %ebp
1037; X86-AVX2-NEXT:    vzeroupper
1038; X86-AVX2-NEXT:    retl
1039;
1040; X86-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1041; X86-AVX512:       # %bb.0: # %entry
1042; X86-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
1043; X86-AVX512-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1044; X86-AVX512-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
1045; X86-AVX512-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
1046; X86-AVX512-NEXT:    vpand %ymm3, %ymm1, %ymm1
1047; X86-AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
1048; X86-AVX512-NEXT:    vpandd %zmm3, %zmm2, %zmm2
1049; X86-AVX512-NEXT:    vmovdqu %xmm0, ha4
1050; X86-AVX512-NEXT:    vmovdqu %ymm1, hb4
1051; X86-AVX512-NEXT:    vmovdqu64 %zmm2, hc4
1052; X86-AVX512-NEXT:    vzeroupper
1053; X86-AVX512-NEXT:    retl
1054;
1055; X64-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1056; X64-AVX1:       # %bb.0: # %entry
1057; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
1058; X64-AVX1-NEXT:    # ymm4 = mem[0,1,0,1]
1059; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
1060; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
1061; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm5
1062; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
1063; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm1, %ymm1
1064; X64-AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
1065; X64-AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
1066; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm5
1067; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
1068; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm3, %ymm3
1069; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
1070; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm5, %xmm5
1071; X64-AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
1072; X64-AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
1073; X64-AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
1074; X64-AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
1075; X64-AVX1-NEXT:    vmovdqu %xmm0, ha4(%rip)
1076; X64-AVX1-NEXT:    vmovups %ymm1, hb4(%rip)
1077; X64-AVX1-NEXT:    vmovups %ymm3, hc4+32(%rip)
1078; X64-AVX1-NEXT:    vmovups %ymm2, hc4(%rip)
1079; X64-AVX1-NEXT:    vzeroupper
1080; X64-AVX1-NEXT:    retq
1081;
1082; X64-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1083; X64-AVX2:       # %bb.0: # %entry
1084; X64-AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4]
1085; X64-AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
1086; X64-AVX2-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
1087; X64-AVX2-NEXT:    vpaddd %ymm4, %ymm1, %ymm1
1088; X64-AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
1089; X64-AVX2-NEXT:    vpaddd %ymm4, %ymm3, %ymm3
1090; X64-AVX2-NEXT:    vpaddd %ymm4, %ymm2, %ymm2
1091; X64-AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
1092; X64-AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
1093; X64-AVX2-NEXT:    vmovdqu %xmm0, ha4(%rip)
1094; X64-AVX2-NEXT:    vmovdqu %ymm1, hb4(%rip)
1095; X64-AVX2-NEXT:    vmovdqu %ymm3, hc4+32(%rip)
1096; X64-AVX2-NEXT:    vmovdqu %ymm2, hc4(%rip)
1097; X64-AVX2-NEXT:    vzeroupper
1098; X64-AVX2-NEXT:    retq
1099;
1100; X64-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32:
1101; X64-AVX512:       # %bb.0: # %entry
1102; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4]
1103; X64-AVX512-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1104; X64-AVX512-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
1105; X64-AVX512-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
1106; X64-AVX512-NEXT:    vpand %ymm3, %ymm1, %ymm1
1107; X64-AVX512-NEXT:    vpaddd %zmm3, %zmm2, %zmm2
1108; X64-AVX512-NEXT:    vpandd %zmm3, %zmm2, %zmm2
1109; X64-AVX512-NEXT:    vmovdqu %xmm0, ha4(%rip)
1110; X64-AVX512-NEXT:    vmovdqu %ymm1, hb4(%rip)
1111; X64-AVX512-NEXT:    vmovdqu64 %zmm2, hc4(%rip)
1112; X64-AVX512-NEXT:    vzeroupper
1113; X64-AVX512-NEXT:    retq
1114entry:
1115  %0 = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
1116  %1 = add <8 x i32> %b, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1117  %2 = and <8 x i32> %1, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1118  %3 = add <16 x i32> %c, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1119  %4 = and <16 x i32> %3, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
1120  store <4 x i32> %0, <4 x i32>* @ha4, align 8
1121  store <8 x i32> %2, <8 x i32>* @hb4, align 8
1122  store <16 x i32> %4, <16 x i32>* @hc4, align 8
1123  ret void
1124}
1125
1126;
1127; Subvector Broadcast from register
1128;
1129
1130define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind {
; Broadcast a 128-bit subvector held in a register (no load) to 256 bits;
; expected lowering is a single vinsertf128 of xmm0 into its own upper lane.
1131; X86-LABEL: reg_broadcast_2f64_4f64:
1132; X86:       # %bb.0:
1133; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1134; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1135; X86-NEXT:    retl
1136;
1137; X64-LABEL: reg_broadcast_2f64_4f64:
1138; X64:       # %bb.0:
1139; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1140; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1141; X64-NEXT:    retq
1142 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1143 ret <4 x double> %1
1144}
1145
1146define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind {
; Broadcast a 128-bit register subvector to 512 bits: AVX builds a ymm and
; duplicates it across the ymm0/ymm1 return pair; AVX512 widens with
; vinsertf64x4 into a single zmm.
1147; X86-AVX-LABEL: reg_broadcast_2f64_8f64:
1148; X86-AVX:       # %bb.0:
1149; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1150; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1151; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1152; X86-AVX-NEXT:    retl
1153;
1154; X86-AVX512-LABEL: reg_broadcast_2f64_8f64:
1155; X86-AVX512:       # %bb.0:
1156; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1157; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1158; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1159; X86-AVX512-NEXT:    retl
1160;
1161; X64-AVX-LABEL: reg_broadcast_2f64_8f64:
1162; X64-AVX:       # %bb.0:
1163; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1164; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1165; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1166; X64-AVX-NEXT:    retq
1167;
1168; X64-AVX512-LABEL: reg_broadcast_2f64_8f64:
1169; X64-AVX512:       # %bb.0:
1170; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1171; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1172; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1173; X64-AVX512-NEXT:    retq
1174 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1175 ret <8 x double> %1
1176}
1177
1178define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind {
; Broadcast a 256-bit register subvector to 512 bits: AVX just duplicates
; ymm0 into ymm1 for the return pair; AVX512 uses vinsertf64x4 into zmm0.
1179; X86-AVX-LABEL: reg_broadcast_4f64_8f64:
1180; X86-AVX:       # %bb.0:
1181; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1182; X86-AVX-NEXT:    retl
1183;
1184; X86-AVX512-LABEL: reg_broadcast_4f64_8f64:
1185; X86-AVX512:       # %bb.0:
1186; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1187; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1188; X86-AVX512-NEXT:    retl
1189;
1190; X64-AVX-LABEL: reg_broadcast_4f64_8f64:
1191; X64-AVX:       # %bb.0:
1192; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1193; X64-AVX-NEXT:    retq
1194;
1195; X64-AVX512-LABEL: reg_broadcast_4f64_8f64:
1196; X64-AVX512:       # %bb.0:
1197; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1198; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1199; X64-AVX512-NEXT:    retq
1200 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1201 ret <8 x double> %1
1202}
1203
1204define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind {
; Integer counterpart of reg_broadcast_2f64_4f64: the lowering still uses the
; FP-domain vinsertf128 to duplicate xmm0 into both lanes of ymm0.
1205; X86-LABEL: reg_broadcast_2i64_4i64:
1206; X86:       # %bb.0:
1207; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1208; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1209; X86-NEXT:    retl
1210;
1211; X64-LABEL: reg_broadcast_2i64_4i64:
1212; X64:       # %bb.0:
1213; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1214; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1215; X64-NEXT:    retq
1216 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
1217 ret <4 x i64> %1
1218}
1219
1220define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind {
; Broadcast a 128-bit integer register subvector to 512 bits; same shape as
; reg_broadcast_2f64_8f64 (ymm pair on AVX, vinsertf64x4 into zmm on AVX512).
1221; X86-AVX-LABEL: reg_broadcast_2i64_8i64:
1222; X86-AVX:       # %bb.0:
1223; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1224; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1225; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1226; X86-AVX-NEXT:    retl
1227;
1228; X86-AVX512-LABEL: reg_broadcast_2i64_8i64:
1229; X86-AVX512:       # %bb.0:
1230; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1231; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1232; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1233; X86-AVX512-NEXT:    retl
1234;
1235; X64-AVX-LABEL: reg_broadcast_2i64_8i64:
1236; X64-AVX:       # %bb.0:
1237; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1238; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1239; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1240; X64-AVX-NEXT:    retq
1241;
1242; X64-AVX512-LABEL: reg_broadcast_2i64_8i64:
1243; X64-AVX512:       # %bb.0:
1244; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1245; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1246; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1247; X64-AVX512-NEXT:    retq
1248 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
1249 ret <8 x i64> %1
1250}
1251
1252define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind {
; Broadcast a 256-bit integer register subvector to 512 bits; AVX duplicates
; ymm0 into ymm1, AVX512 widens with vinsertf64x4 into zmm0.
1253; X86-AVX-LABEL: reg_broadcast_4i64_8i64:
1254; X86-AVX:       # %bb.0:
1255; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1256; X86-AVX-NEXT:    retl
1257;
1258; X86-AVX512-LABEL: reg_broadcast_4i64_8i64:
1259; X86-AVX512:       # %bb.0:
1260; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1261; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1262; X86-AVX512-NEXT:    retl
1263;
1264; X64-AVX-LABEL: reg_broadcast_4i64_8i64:
1265; X64-AVX:       # %bb.0:
1266; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1267; X64-AVX-NEXT:    retq
1268;
1269; X64-AVX512-LABEL: reg_broadcast_4i64_8i64:
1270; X64-AVX512:       # %bb.0:
1271; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1272; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1273; X64-AVX512-NEXT:    retq
1274 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1275 ret <8 x i64> %1
1276}
1277
1278define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind {
; Broadcast a <4 x float> register subvector into both lanes of a ymm via a
; single vinsertf128.
1279; X86-LABEL: reg_broadcast_4f32_8f32:
1280; X86:       # %bb.0:
1281; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1282; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1283; X86-NEXT:    retl
1284;
1285; X64-LABEL: reg_broadcast_4f32_8f32:
1286; X64:       # %bb.0:
1287; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1288; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1289; X64-NEXT:    retq
1290 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1291 ret <8 x float> %1
1292}
1293
1294define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
; Broadcast a <4 x float> register subvector to 512 bits; AVX returns a
; duplicated ymm pair, AVX512 widens with vinsertf64x4 into zmm0.
1295; X86-AVX-LABEL: reg_broadcast_4f32_16f32:
1296; X86-AVX:       # %bb.0:
1297; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1298; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1299; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1300; X86-AVX-NEXT:    retl
1301;
1302; X86-AVX512-LABEL: reg_broadcast_4f32_16f32:
1303; X86-AVX512:       # %bb.0:
1304; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1305; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1306; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1307; X86-AVX512-NEXT:    retl
1308;
1309; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
1310; X64-AVX:       # %bb.0:
1311; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1312; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1313; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1314; X64-AVX-NEXT:    retq
1315;
1316; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
1317; X64-AVX512:       # %bb.0:
1318; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1319; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1320; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1321; X64-AVX512-NEXT:    retq
1322 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1323 ret <16 x float> %1
1324}
1325
1326define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
; Broadcast a <8 x float> register subvector to 512 bits; AVX duplicates
; ymm0 into ymm1, AVX512 widens with vinsertf64x4 into zmm0.
1327; X86-AVX-LABEL: reg_broadcast_8f32_16f32:
1328; X86-AVX:       # %bb.0:
1329; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1330; X86-AVX-NEXT:    retl
1331;
1332; X86-AVX512-LABEL: reg_broadcast_8f32_16f32:
1333; X86-AVX512:       # %bb.0:
1334; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1335; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1336; X86-AVX512-NEXT:    retl
1337;
1338; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
1339; X64-AVX:       # %bb.0:
1340; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1341; X64-AVX-NEXT:    retq
1342;
1343; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
1344; X64-AVX512:       # %bb.0:
1345; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1346; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1347; X64-AVX512-NEXT:    retq
1348 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1349 ret <16 x float> %1
1350}
1351
1352define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind {
; Broadcast a <4 x i32> register subvector into both lanes of a ymm via a
; single (FP-domain) vinsertf128.
1353; X86-LABEL: reg_broadcast_4i32_8i32:
1354; X86:       # %bb.0:
1355; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1356; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1357; X86-NEXT:    retl
1358;
1359; X64-LABEL: reg_broadcast_4i32_8i32:
1360; X64:       # %bb.0:
1361; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1362; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1363; X64-NEXT:    retq
1364 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1365 ret <8 x i32> %1
1366}
1367
1368define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
; Broadcast a <4 x i32> register subvector to 512 bits; AVX returns a
; duplicated ymm pair, AVX512 widens with vinsertf64x4 into zmm0.
1369; X86-AVX-LABEL: reg_broadcast_4i32_16i32:
1370; X86-AVX:       # %bb.0:
1371; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1372; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1373; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1374; X86-AVX-NEXT:    retl
1375;
1376; X86-AVX512-LABEL: reg_broadcast_4i32_16i32:
1377; X86-AVX512:       # %bb.0:
1378; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1379; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1380; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1381; X86-AVX512-NEXT:    retl
1382;
1383; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
1384; X64-AVX:       # %bb.0:
1385; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1386; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1387; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1388; X64-AVX-NEXT:    retq
1389;
1390; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
1391; X64-AVX512:       # %bb.0:
1392; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1393; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1394; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1395; X64-AVX512-NEXT:    retq
1396 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
1397 ret <16 x i32> %1
1398}
1399
1400define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
; Broadcast a <8 x i32> register subvector to 512 bits; AVX duplicates
; ymm0 into ymm1, AVX512 widens with vinsertf64x4 into zmm0.
1401; X86-AVX-LABEL: reg_broadcast_8i32_16i32:
1402; X86-AVX:       # %bb.0:
1403; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
1404; X86-AVX-NEXT:    retl
1405;
1406; X86-AVX512-LABEL: reg_broadcast_8i32_16i32:
1407; X86-AVX512:       # %bb.0:
1408; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1409; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1410; X86-AVX512-NEXT:    retl
1411;
1412; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
1413; X64-AVX:       # %bb.0:
1414; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
1415; X64-AVX-NEXT:    retq
1416;
1417; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
1418; X64-AVX512:       # %bb.0:
1419; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1420; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
1421; X64-AVX512-NEXT:    retq
1422 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1423 ret <16 x i32> %1
1424}
1425
1426define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind {
; Broadcast a <8 x i16> register subvector into both lanes of a ymm via a
; single vinsertf128.
1427; X86-LABEL: reg_broadcast_8i16_16i16:
1428; X86:       # %bb.0:
1429; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1430; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1431; X86-NEXT:    retl
1432;
1433; X64-LABEL: reg_broadcast_8i16_16i16:
1434; X64:       # %bb.0:
1435; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
1436; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1437; X64-NEXT:    retq
1438 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1439 ret <16 x i16> %1
1440}
1441
; Repeat a v8i16 register four times to form a v32i16.  AVX builds one ymm via
; vinsertf128 and copies it into the second result register; AVX512VL widens
; the ymm into a zmm with an additional vinsertf64x4.
define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_8i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %1
}
1473
; Concatenate a v16i16 register with itself to form a v32i16.  AVX only needs
; a ymm0->ymm1 copy (the result is split across two ymm registers); AVX512VL
; builds the zmm with a single vinsertf64x4.
define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i16_32i16:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %1
}
1499
; Concatenate a v16i8 register with itself to form a v32i8: the xmm input is
; duplicated into the upper 128-bit lane with a single vinsertf128.
define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind {
; X86-LABEL: reg_broadcast_16i8_32i8:
; X86:       # %bb.0:
; X86-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: reg_broadcast_16i8_32i8:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %1
}
1515
; Repeat a v16i8 register four times to form a v64i8.  AVX builds one ymm via
; vinsertf128 and copies it into the second result register; AVX512VL widens
; the ymm into a zmm with an additional vinsertf64x4.
define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X86-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_16i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; X64-AVX512-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <64 x i8> %1
}
1547
; Concatenate a v32i8 register with itself to form a v64i8.  AVX only needs a
; ymm0->ymm1 copy (the result is split across two ymm registers); AVX512VL
; builds the zmm with a single vinsertf64x4.
define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
 %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ret <64 x i8> %1
}
1573
1574;
1575; PR34394
1576;
1577
; PR34394: load a <2 x i32> and repeat it to fill a v4i32.  The repeated
; 64-bit pair is recognised and lowered as a vmovddup of the memory operand.
define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
; X86-LABEL: test_2xi32_to_4xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_4xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i32> %res
}
1593
; PR34394: load a <2 x i32> and repeat it four times to fill a v8i32.  The
; repeated 64-bit pair folds into a single vbroadcastsd from memory.
define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
; X86-LABEL: test_2xi32_to_8xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_8xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i32> %res
}
1609
; PR34394: load a <2 x i32> and repeat it eight times to fill a v16i32.  AVX
; broadcasts the 64-bit pair into one ymm and copies it into the second result
; register; AVX512 uses a single 64-bit broadcast into a zmm.
define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
; X86-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastsd (%eax), %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastsd (%rdi), %zmm0
; X64-AVX512-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i32> %res
}
1638
1639;
1640; PR34041
1641;
1642
; PR34041: result lane 0 is undef (the shuffle reads element 1 of %vec, which
; was never inserted) and lanes 1-3 are the loaded scalar, so the whole thing
; lowers to a plain vbroadcastsd from memory.
define <4 x double> @broadcast_v4f64_f64_u000(double* %p) {
; X86-LABEL: broadcast_v4f64_f64_u000:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_f64_u000:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %s = load double, double* %p
  %vec = insertelement <2 x double> undef, double %s, i32 0
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}
1659
; PR34041: the select only keeps shuffled lanes 1 and 3, and of those only
; lane 3 demands a loaded element (%vec element 1), so the loaded subvector is
; inserted into the upper half and blended into %default (f32 blend lanes 6,7
; cover f64 lane 3).
define <4 x double> @broadcast_v4f64_v2f64_4u61(<2 x double>* %vp, <4 x double> %default) {
; X86-LABEL: broadcast_v4f64_v2f64_4u61:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vinsertf128 $1, (%eax), %ymm0, %ymm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_v2f64_4u61:
; X64:       # %bb.0:
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X64-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
  ret <4 x double> %res
}
1678
; PR34041: partial broadcast of a loaded <2 x float> selected against
; %default.  The 64-bit float pair is broadcast with vbroadcastsd and the
; demanded lanes are merged with %default via vshufpd.
; NOTE(review): in the name, 'E' appears to mark the lane taken from %default
; (select bit 0) — confirm against the original PR34041 naming.
define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm1
; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
; X64-NEXT:    retq
  %vec = load <2 x float>, <2 x float>* %vp
  %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
  ret <8 x float> %res
}
1697
; PR34041: a loaded <2 x double> repeated across a v8f64, with some lanes
; undef (including lane 0, which reads out-of-range index 3).  The demanded
; lanes still form a whole-subvector repeat, so AVX uses vbroadcastf128 into
; two ymm registers and AVX512 a single vbroadcastf32x4.
define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}
1726
; PR34041: same as above but with lanes 1-3 undef.  The demanded lanes are
; still a repeat of the loaded <2 x double>, so the same subvector-broadcast
; lowering applies (vbroadcastf128 on AVX, vbroadcastf32x4 on AVX512).
define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(<2 x double>* %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}
1755