1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl %s -o - | FileCheck %s
3
; Unmasked, register source: the <0,0> shufflevector duplicates element 0 of
; %vec into both lanes. The assertions expect a single xmm vmovddup.
4define <2 x double> @test_2xdouble_dup_low(<2 x double> %vec) {
5; CHECK-LABEL: test_2xdouble_dup_low:
6; CHECK:       # %bb.0:
7; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
8; CHECK-NEXT:    retq
9  %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
10  ret <2 x double> %res
11}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0> shuffle of %vec; the remaining lanes take %vec2. The assertions expect
; the select folded into vmovddup with a {%k1} writemask, then a copy to xmm0.
12define <2 x double> @test_masked_2xdouble_dup_low_mask0(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
13; CHECK-LABEL: test_masked_2xdouble_dup_low_mask0:
14; CHECK:       # %bb.0:
15; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
16; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
17; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
18; CHECK-NEXT:    vmovapd %xmm1, %xmm0
19; CHECK-NEXT:    retq
20  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
21  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
22  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
23  ret <2 x double> %res
24}
25
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0> shuffle of %vec; the remaining lanes are zero. The assertions expect the
; select folded into vmovddup with {%k1} {z}.
26define <2 x double> @test_masked_z_2xdouble_dup_low_mask0(<2 x double> %vec, <2 x double> %mask) {
27; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask0:
28; CHECK:       # %bb.0:
29; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
30; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
31; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
32; CHECK-NEXT:    retq
33  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
34  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
35  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
36  ret <2 x double> %res
37}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0> shuffle of %vec; the remaining lanes take %vec2. The assertions expect
; vmovddup with a {%k1} writemask, then a copy to xmm0.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
38define <2 x double> @test_masked_2xdouble_dup_low_mask1(<2 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
39; CHECK-LABEL: test_masked_2xdouble_dup_low_mask1:
40; CHECK:       # %bb.0:
41; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
42; CHECK-NEXT:    vcmpeqpd %xmm3, %xmm2, %k1
43; CHECK-NEXT:    vmovddup {{.*#+}} xmm1 {%k1} = xmm0[0,0]
44; CHECK-NEXT:    vmovapd %xmm1, %xmm0
45; CHECK-NEXT:    retq
46  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
47  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
48  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
49  ret <2 x double> %res
50}
51
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0> shuffle of %vec; the remaining lanes are zero. The assertions expect
; vmovddup with {%k1} {z}.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
52define <2 x double> @test_masked_z_2xdouble_dup_low_mask1(<2 x double> %vec, <2 x double> %mask) {
53; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mask1:
54; CHECK:       # %bb.0:
55; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
56; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
57; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
58; CHECK-NEXT:    retq
59  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
60  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
61  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
62  ret <2 x double> %res
63}
; Unmasked, memory source: the vector is loaded from %vp and shuffled with mask
; <0,0>. The assertions expect the load folded into vmovddup (mem operand).
64define <2 x double> @test_2xdouble_dup_low_mem(<2 x double>* %vp) {
65; CHECK-LABEL: test_2xdouble_dup_low_mem:
66; CHECK:       # %bb.0:
67; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
68; CHECK-NEXT:    retq
69  %vec = load <2 x double>, <2 x double>* %vp
70  %res = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
71  ret <2 x double> %res
72}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0>; lanes where %mask compares oeq to zero take the shuffle, the remaining
; lanes take %vec2. The assertions expect vmovddup {%k1} with a mem operand
; writing xmm0 directly, with no trailing register copy.
73define <2 x double> @test_masked_2xdouble_dup_low_mem_mask0(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
74; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask0:
75; CHECK:       # %bb.0:
76; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
77; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
78; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
79; CHECK-NEXT:    retq
80  %vec = load <2 x double>, <2 x double>* %vp
81  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
82  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
83  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
84  ret <2 x double> %res
85}
86
; Zero-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0>; lanes where %mask compares oeq to zero take the shuffle, the remaining
; lanes are zero. The assertions expect vmovddup {%k1} {z} with a mem operand.
87define <2 x double> @test_masked_z_2xdouble_dup_low_mem_mask0(<2 x double>* %vp, <2 x double> %mask) {
88; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask0:
89; CHECK:       # %bb.0:
90; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
91; CHECK-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
92; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
93; CHECK-NEXT:    retq
94  %vec = load <2 x double>, <2 x double>* %vp
95  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
96  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
97  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
98  ret <2 x double> %res
99}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0>; lanes where %mask compares oeq to zero take the shuffle, the remaining
; lanes take %vec2. The assertions expect vmovddup {%k1} with a mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
100define <2 x double> @test_masked_2xdouble_dup_low_mem_mask1(<2 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
101; CHECK-LABEL: test_masked_2xdouble_dup_low_mem_mask1:
102; CHECK:       # %bb.0:
103; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
104; CHECK-NEXT:    vcmpeqpd %xmm2, %xmm1, %k1
105; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
106; CHECK-NEXT:    retq
107  %vec = load <2 x double>, <2 x double>* %vp
108  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
109  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
110  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
111  ret <2 x double> %res
112}
113
; Zero-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0>; lanes where %mask compares oeq to zero take the shuffle, the remaining
; lanes are zero. The assertions expect vmovddup {%k1} {z} with a mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
114define <2 x double> @test_masked_z_2xdouble_dup_low_mem_mask1(<2 x double>* %vp, <2 x double> %mask) {
115; CHECK-LABEL: test_masked_z_2xdouble_dup_low_mem_mask1:
116; CHECK:       # %bb.0:
117; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
118; CHECK-NEXT:    vcmpeqpd %xmm1, %xmm0, %k1
119; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
120; CHECK-NEXT:    retq
121  %vec = load <2 x double>, <2 x double>* %vp
122  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <2 x i32> <i32 0, i32 0>
123  %cmp = fcmp oeq <2 x double> %mask, zeroinitializer
124  %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
125  ret <2 x double> %res
126}
; Unmasked, register source: the <0,0,2,2> shufflevector duplicates the even
; elements of %vec. The assertions expect a single ymm vmovddup.
127define <4 x double> @test_4xdouble_dup_low(<4 x double> %vec) {
128; CHECK-LABEL: test_4xdouble_dup_low:
129; CHECK:       # %bb.0:
130; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
131; CHECK-NEXT:    retq
132  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
133  ret <4 x double> %res
134}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes take %vec2. The assertions
; expect ymm vmovddup with a {%k1} writemask, then a copy to ymm0.
135define <4 x double> @test_masked_4xdouble_dup_low_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
136; CHECK-LABEL: test_masked_4xdouble_dup_low_mask0:
137; CHECK:       # %bb.0:
138; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
139; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
140; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
141; CHECK-NEXT:    vmovapd %ymm1, %ymm0
142; CHECK-NEXT:    retq
143  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
144  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
145  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
146  ret <4 x double> %res
147}
148
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes are zero. The assertions
; expect ymm vmovddup with {%k1} {z}.
149define <4 x double> @test_masked_z_4xdouble_dup_low_mask0(<4 x double> %vec, <4 x double> %mask) {
150; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask0:
151; CHECK:       # %bb.0:
152; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
153; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
154; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
155; CHECK-NEXT:    retq
156  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
157  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
158  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
159  ret <4 x double> %res
160}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes take %vec2. The assertions
; expect ymm vmovddup with a {%k1} writemask, then a copy to ymm0.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
161define <4 x double> @test_masked_4xdouble_dup_low_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
162; CHECK-LABEL: test_masked_4xdouble_dup_low_mask1:
163; CHECK:       # %bb.0:
164; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
165; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
166; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
167; CHECK-NEXT:    vmovapd %ymm1, %ymm0
168; CHECK-NEXT:    retq
169  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
170  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
171  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
172  ret <4 x double> %res
173}
174
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes are zero. The assertions
; expect ymm vmovddup with {%k1} {z}.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
175define <4 x double> @test_masked_z_4xdouble_dup_low_mask1(<4 x double> %vec, <4 x double> %mask) {
176; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask1:
177; CHECK:       # %bb.0:
178; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
179; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
180; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
181; CHECK-NEXT:    retq
182  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
183  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
184  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
185  ret <4 x double> %res
186}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes take %vec2. The assertions
; expect ymm vmovddup with a {%k1} writemask, then a copy to ymm0.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
187define <4 x double> @test_masked_4xdouble_dup_low_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
188; CHECK-LABEL: test_masked_4xdouble_dup_low_mask2:
189; CHECK:       # %bb.0:
190; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
191; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
192; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
193; CHECK-NEXT:    vmovapd %ymm1, %ymm0
194; CHECK-NEXT:    retq
195  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
196  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
197  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
198  ret <4 x double> %res
199}
200
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes are zero. The assertions
; expect ymm vmovddup with {%k1} {z}.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
201define <4 x double> @test_masked_z_4xdouble_dup_low_mask2(<4 x double> %vec, <4 x double> %mask) {
202; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask2:
203; CHECK:       # %bb.0:
204; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
205; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
206; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
207; CHECK-NEXT:    retq
208  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
209  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
210  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
211  ret <4 x double> %res
212}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes take %vec2. The assertions
; expect ymm vmovddup with a {%k1} writemask, then a copy to ymm0.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
213define <4 x double> @test_masked_4xdouble_dup_low_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
214; CHECK-LABEL: test_masked_4xdouble_dup_low_mask3:
215; CHECK:       # %bb.0:
216; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
217; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
218; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
219; CHECK-NEXT:    vmovapd %ymm1, %ymm0
220; CHECK-NEXT:    retq
221  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
222  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
223  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
224  ret <4 x double> %res
225}
226
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes are zero. The assertions
; expect ymm vmovddup with {%k1} {z}.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
227define <4 x double> @test_masked_z_4xdouble_dup_low_mask3(<4 x double> %vec, <4 x double> %mask) {
228; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask3:
229; CHECK:       # %bb.0:
230; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
231; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
232; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
233; CHECK-NEXT:    retq
234  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
235  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
236  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
237  ret <4 x double> %res
238}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes take %vec2. The assertions
; expect ymm vmovddup with a {%k1} writemask, then a copy to ymm0.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
239define <4 x double> @test_masked_4xdouble_dup_low_mask4(<4 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
240; CHECK-LABEL: test_masked_4xdouble_dup_low_mask4:
241; CHECK:       # %bb.0:
242; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
243; CHECK-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
244; CHECK-NEXT:    vmovddup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2]
245; CHECK-NEXT:    vmovapd %ymm1, %ymm0
246; CHECK-NEXT:    retq
247  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
248  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
249  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
250  ret <4 x double> %res
251}
252
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2> shuffle of %vec; the remaining lanes are zero. The assertions
; expect ymm vmovddup with {%k1} {z}.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
253define <4 x double> @test_masked_z_4xdouble_dup_low_mask4(<4 x double> %vec, <4 x double> %mask) {
254; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mask4:
255; CHECK:       # %bb.0:
256; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
257; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
258; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
259; CHECK-NEXT:    retq
260  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
261  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
262  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
263  ret <4 x double> %res
264}
; Unmasked, memory source: the vector is loaded from %vp and shuffled with mask
; <0,0,2,2>. The assertions expect the load folded into ymm vmovddup (mem operand).
265define <4 x double> @test_4xdouble_dup_low_mem(<4 x double>* %vp) {
266; CHECK-LABEL: test_4xdouble_dup_low_mem:
267; CHECK:       # %bb.0:
268; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
269; CHECK-NEXT:    retq
270  %vec = load <4 x double>, <4 x double>* %vp
271  %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
272  ret <4 x double> %res
273}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes take %vec2. The assertions expect ymm vmovddup {%k1} with a
; mem operand writing ymm0 directly, with no trailing register copy.
274define <4 x double> @test_masked_4xdouble_dup_low_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
275; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask0:
276; CHECK:       # %bb.0:
277; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
278; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
279; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
280; CHECK-NEXT:    retq
281  %vec = load <4 x double>, <4 x double>* %vp
282  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
283  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
284  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
285  ret <4 x double> %res
286}
287
; Zero-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes are zero. The assertions expect ymm vmovddup {%k1} {z} with a
; mem operand.
288define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask0(<4 x double>* %vp, <4 x double> %mask) {
289; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask0:
290; CHECK:       # %bb.0:
291; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
292; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
293; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
294; CHECK-NEXT:    retq
295  %vec = load <4 x double>, <4 x double>* %vp
296  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
297  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
298  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
299  ret <4 x double> %res
300}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes take %vec2. The assertions expect ymm vmovddup {%k1} with a
; mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
301define <4 x double> @test_masked_4xdouble_dup_low_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
302; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask1:
303; CHECK:       # %bb.0:
304; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
305; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
306; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
307; CHECK-NEXT:    retq
308  %vec = load <4 x double>, <4 x double>* %vp
309  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
310  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
311  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
312  ret <4 x double> %res
313}
314
; Zero-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes are zero. The assertions expect ymm vmovddup {%k1} {z} with a
; mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
315define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask1(<4 x double>* %vp, <4 x double> %mask) {
316; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask1:
317; CHECK:       # %bb.0:
318; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
319; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
320; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
321; CHECK-NEXT:    retq
322  %vec = load <4 x double>, <4 x double>* %vp
323  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
324  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
325  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
326  ret <4 x double> %res
327}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes take %vec2. The assertions expect ymm vmovddup {%k1} with a
; mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
328define <4 x double> @test_masked_4xdouble_dup_low_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
329; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask2:
330; CHECK:       # %bb.0:
331; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
332; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
333; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
334; CHECK-NEXT:    retq
335  %vec = load <4 x double>, <4 x double>* %vp
336  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
337  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
338  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
339  ret <4 x double> %res
340}
341
; Zero-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes are zero. The assertions expect ymm vmovddup {%k1} {z} with a
; mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
342define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask2(<4 x double>* %vp, <4 x double> %mask) {
343; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask2:
344; CHECK:       # %bb.0:
345; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
346; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
347; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
348; CHECK-NEXT:    retq
349  %vec = load <4 x double>, <4 x double>* %vp
350  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
351  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
352  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
353  ret <4 x double> %res
354}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes take %vec2. The assertions expect ymm vmovddup {%k1} with a
; mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
355define <4 x double> @test_masked_4xdouble_dup_low_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
356; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask3:
357; CHECK:       # %bb.0:
358; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
359; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
360; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
361; CHECK-NEXT:    retq
362  %vec = load <4 x double>, <4 x double>* %vp
363  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
364  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
365  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
366  ret <4 x double> %res
367}
368
; Zero-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes are zero. The assertions expect ymm vmovddup {%k1} {z} with a
; mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
369define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask3(<4 x double>* %vp, <4 x double> %mask) {
370; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask3:
371; CHECK:       # %bb.0:
372; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
373; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
374; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
375; CHECK-NEXT:    retq
376  %vec = load <4 x double>, <4 x double>* %vp
377  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
378  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
379  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
380  ret <4 x double> %res
381}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes take %vec2. The assertions expect ymm vmovddup {%k1} with a
; mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
382define <4 x double> @test_masked_4xdouble_dup_low_mem_mask4(<4 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) {
383; CHECK-LABEL: test_masked_4xdouble_dup_low_mem_mask4:
384; CHECK:       # %bb.0:
385; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
386; CHECK-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
387; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2]
388; CHECK-NEXT:    retq
389  %vec = load <4 x double>, <4 x double>* %vp
390  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
391  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
392  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
393  ret <4 x double> %res
394}
395
; Zero-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2>; lanes where %mask compares oeq to zero take the shuffle, the
; remaining lanes are zero. The assertions expect ymm vmovddup {%k1} {z} with a
; mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
396define <4 x double> @test_masked_z_4xdouble_dup_low_mem_mask4(<4 x double>* %vp, <4 x double> %mask) {
397; CHECK-LABEL: test_masked_z_4xdouble_dup_low_mem_mask4:
398; CHECK:       # %bb.0:
399; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
400; CHECK-NEXT:    vcmpeqpd %ymm1, %ymm0, %k1
401; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2]
402; CHECK-NEXT:    retq
403  %vec = load <4 x double>, <4 x double>* %vp
404  %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
405  %cmp = fcmp oeq <4 x double> %mask, zeroinitializer
406  %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
407  ret <4 x double> %res
408}
; Unmasked, register source: the <0,0,2,2,4,4,6,6> shufflevector duplicates the
; even elements of %vec. The assertions expect a single zmm vmovddup.
409define <8 x double> @test_8xdouble_dup_low(<8 x double> %vec) {
410; CHECK-LABEL: test_8xdouble_dup_low:
411; CHECK:       # %bb.0:
412; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
413; CHECK-NEXT:    retq
414  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
415  ret <8 x double> %res
416}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes take %vec2. The
; assertions expect zmm vmovddup with a {%k1} writemask, then a copy to zmm0.
417define <8 x double> @test_masked_8xdouble_dup_low_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
418; CHECK-LABEL: test_masked_8xdouble_dup_low_mask0:
419; CHECK:       # %bb.0:
420; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
421; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
422; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
423; CHECK-NEXT:    vmovapd %zmm1, %zmm0
424; CHECK-NEXT:    retq
425  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
426  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
427  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
428  ret <8 x double> %res
429}
430
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes are zero. The
; assertions expect zmm vmovddup with {%k1} {z}.
431define <8 x double> @test_masked_z_8xdouble_dup_low_mask0(<8 x double> %vec, <8 x double> %mask) {
432; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask0:
433; CHECK:       # %bb.0:
434; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
435; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
436; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
437; CHECK-NEXT:    retq
438  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
439  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
440  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
441  ret <8 x double> %res
442}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes take %vec2. The
; assertions expect zmm vmovddup with a {%k1} writemask, then a copy to zmm0.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
443define <8 x double> @test_masked_8xdouble_dup_low_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
444; CHECK-LABEL: test_masked_8xdouble_dup_low_mask1:
445; CHECK:       # %bb.0:
446; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
447; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
448; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
449; CHECK-NEXT:    vmovapd %zmm1, %zmm0
450; CHECK-NEXT:    retq
451  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
452  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
453  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
454  ret <8 x double> %res
455}
456
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes are zero. The
; assertions expect zmm vmovddup with {%k1} {z}.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
457define <8 x double> @test_masked_z_8xdouble_dup_low_mask1(<8 x double> %vec, <8 x double> %mask) {
458; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask1:
459; CHECK:       # %bb.0:
460; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
461; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
462; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
463; CHECK-NEXT:    retq
464  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
465  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
466  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
467  ret <8 x double> %res
468}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes take %vec2. The
; assertions expect zmm vmovddup with a {%k1} writemask, then a copy to zmm0.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
469define <8 x double> @test_masked_8xdouble_dup_low_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
470; CHECK-LABEL: test_masked_8xdouble_dup_low_mask2:
471; CHECK:       # %bb.0:
472; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
473; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
474; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
475; CHECK-NEXT:    vmovapd %zmm1, %zmm0
476; CHECK-NEXT:    retq
477  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
478  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
479  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
480  ret <8 x double> %res
481}
482
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes are zero. The
; assertions expect zmm vmovddup with {%k1} {z}.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
483define <8 x double> @test_masked_z_8xdouble_dup_low_mask2(<8 x double> %vec, <8 x double> %mask) {
484; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask2:
485; CHECK:       # %bb.0:
486; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
487; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
488; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
489; CHECK-NEXT:    retq
490  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
491  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
492  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
493  ret <8 x double> %res
494}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes take %vec2. The
; assertions expect zmm vmovddup with a {%k1} writemask, then a copy to zmm0.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
495define <8 x double> @test_masked_8xdouble_dup_low_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
496; CHECK-LABEL: test_masked_8xdouble_dup_low_mask3:
497; CHECK:       # %bb.0:
498; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
499; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
500; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
501; CHECK-NEXT:    vmovapd %zmm1, %zmm0
502; CHECK-NEXT:    retq
503  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
504  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
505  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
506  ret <8 x double> %res
507}
508
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes are zero. The
; assertions expect zmm vmovddup with {%k1} {z}.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
509define <8 x double> @test_masked_z_8xdouble_dup_low_mask3(<8 x double> %vec, <8 x double> %mask) {
510; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask3:
511; CHECK:       # %bb.0:
512; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
513; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
514; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
515; CHECK-NEXT:    retq
516  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
517  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
518  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
519  ret <8 x double> %res
520}
; Merge-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes take %vec2. The
; assertions expect zmm vmovddup with a {%k1} writemask, then a copy to zmm0.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
521define <8 x double> @test_masked_8xdouble_dup_low_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x double> %mask) {
522; CHECK-LABEL: test_masked_8xdouble_dup_low_mask4:
523; CHECK:       # %bb.0:
524; CHECK-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
525; CHECK-NEXT:    vcmpeqpd %zmm3, %zmm2, %k1
526; CHECK-NEXT:    vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6]
527; CHECK-NEXT:    vmovapd %zmm1, %zmm0
528; CHECK-NEXT:    retq
529  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
530  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
531  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
532  ret <8 x double> %res
533}
534
; Zero-masked, register source: lanes where %mask compares oeq to zero take the
; <0,0,2,2,4,4,6,6> shuffle of %vec; the remaining lanes are zero. The
; assertions expect zmm vmovddup with {%k1} {z}.
; NOTE(review): the IR body matches the _mask0 variant; only the name differs.
535define <8 x double> @test_masked_z_8xdouble_dup_low_mask4(<8 x double> %vec, <8 x double> %mask) {
536; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mask4:
537; CHECK:       # %bb.0:
538; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
539; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
540; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
541; CHECK-NEXT:    retq
542  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
543  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
544  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
545  ret <8 x double> %res
546}
; Unmasked, memory source: the vector is loaded from %vp and shuffled with mask
; <0,0,2,2,4,4,6,6>. The assertions expect the load folded into zmm vmovddup
; (mem operand).
547define <8 x double> @test_8xdouble_dup_low_mem(<8 x double>* %vp) {
548; CHECK-LABEL: test_8xdouble_dup_low_mem:
549; CHECK:       # %bb.0:
550; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6]
551; CHECK-NEXT:    retq
552  %vec = load <8 x double>, <8 x double>* %vp
553  %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
554  ret <8 x double> %res
555}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2,4,4,6,6>; lanes where %mask compares oeq to zero take the shuffle,
; the remaining lanes take %vec2. The assertions expect zmm vmovddup {%k1} with
; a mem operand writing zmm0 directly, with no trailing register copy.
556define <8 x double> @test_masked_8xdouble_dup_low_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
557; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask0:
558; CHECK:       # %bb.0:
559; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
560; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
561; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
562; CHECK-NEXT:    retq
563  %vec = load <8 x double>, <8 x double>* %vp
564  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
565  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
566  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
567  ret <8 x double> %res
568}
569
; Zero-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2,4,4,6,6>; lanes where %mask compares oeq to zero take the shuffle,
; the remaining lanes are zero. The assertions expect zmm vmovddup {%k1} {z}
; with a mem operand.
570define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask0(<8 x double>* %vp, <8 x double> %mask) {
571; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask0:
572; CHECK:       # %bb.0:
573; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
574; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
575; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
576; CHECK-NEXT:    retq
577  %vec = load <8 x double>, <8 x double>* %vp
578  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
579  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
580  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
581  ret <8 x double> %res
582}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2,4,4,6,6>; lanes where %mask compares oeq to zero take the shuffle,
; the remaining lanes take %vec2. The assertions expect zmm vmovddup {%k1} with
; a mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
583define <8 x double> @test_masked_8xdouble_dup_low_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
584; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask1:
585; CHECK:       # %bb.0:
586; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
587; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
588; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
589; CHECK-NEXT:    retq
590  %vec = load <8 x double>, <8 x double>* %vp
591  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
592  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
593  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
594  ret <8 x double> %res
595}
596
; Zero-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2,4,4,6,6>; lanes where %mask compares oeq to zero take the shuffle,
; the remaining lanes are zero. The assertions expect zmm vmovddup {%k1} {z}
; with a mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
597define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask1(<8 x double>* %vp, <8 x double> %mask) {
598; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask1:
599; CHECK:       # %bb.0:
600; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
601; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
602; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
603; CHECK-NEXT:    retq
604  %vec = load <8 x double>, <8 x double>* %vp
605  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
606  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
607  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
608  ret <8 x double> %res
609}
; Merge-masked, memory source: the vector loaded from %vp is shuffled with mask
; <0,0,2,2,4,4,6,6>; lanes where %mask compares oeq to zero take the shuffle,
; the remaining lanes take %vec2. The assertions expect zmm vmovddup {%k1} with
; a mem operand.
; NOTE(review): the IR body matches the _mem_mask0 variant; only the name differs.
610define <8 x double> @test_masked_8xdouble_dup_low_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
611; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask2:
612; CHECK:       # %bb.0:
613; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
614; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
615; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
616; CHECK-NEXT:    retq
617  %vec = load <8 x double>, <8 x double>* %vp
618  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
619  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
620  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
621  ret <8 x double> %res
622}
623
; Zero-masked duplication of the even elements of a loaded <8 x double>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovddup.
; Same IR as test_masked_z_8xdouble_dup_low_mem_mask1 apart from the name.
624define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask2(<8 x double>* %vp, <8 x double> %mask) {
625; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask2:
626; CHECK:       # %bb.0:
627; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
628; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
629; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
630; CHECK-NEXT:    retq
631  %vec = load <8 x double>, <8 x double>* %vp
632  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
633  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
634  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
635  ret <8 x double> %res
636}
; Merge-masked duplication of the even elements of a loaded <8 x double>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovddup.
; Same IR as test_masked_8xdouble_dup_low_mem_mask1 apart from the name.
637define <8 x double> @test_masked_8xdouble_dup_low_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
638; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask3:
639; CHECK:       # %bb.0:
640; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
641; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
642; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
643; CHECK-NEXT:    retq
644  %vec = load <8 x double>, <8 x double>* %vp
645  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
646  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
647  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
648  ret <8 x double> %res
649}
650
; Zero-masked duplication of the even elements of a loaded <8 x double>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovddup.
; Same IR as test_masked_z_8xdouble_dup_low_mem_mask1 apart from the name.
651define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask3(<8 x double>* %vp, <8 x double> %mask) {
652; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask3:
653; CHECK:       # %bb.0:
654; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
655; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
656; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
657; CHECK-NEXT:    retq
658  %vec = load <8 x double>, <8 x double>* %vp
659  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
660  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
661  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
662  ret <8 x double> %res
663}
; Merge-masked duplication of the even elements of a loaded <8 x double>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovddup.
; Same IR as test_masked_8xdouble_dup_low_mem_mask1 apart from the name.
664define <8 x double> @test_masked_8xdouble_dup_low_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x double> %mask) {
665; CHECK-LABEL: test_masked_8xdouble_dup_low_mem_mask4:
666; CHECK:       # %bb.0:
667; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
668; CHECK-NEXT:    vcmpeqpd %zmm2, %zmm1, %k1
669; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6]
670; CHECK-NEXT:    retq
671  %vec = load <8 x double>, <8 x double>* %vp
672  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
673  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
674  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2
675  ret <8 x double> %res
676}
677
; Zero-masked duplication of the even elements of a loaded <8 x double>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovddup.
; Same IR as test_masked_z_8xdouble_dup_low_mem_mask1 apart from the name.
678define <8 x double> @test_masked_z_8xdouble_dup_low_mem_mask4(<8 x double>* %vp, <8 x double> %mask) {
679; CHECK-LABEL: test_masked_z_8xdouble_dup_low_mem_mask4:
680; CHECK:       # %bb.0:
681; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
682; CHECK-NEXT:    vcmpeqpd %zmm1, %zmm0, %k1
683; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
684; CHECK-NEXT:    retq
685  %vec = load <8 x double>, <8 x double>* %vp
686  %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
687  %cmp = fcmp oeq <8 x double> %mask, zeroinitializer
688  %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer
689  ret <8 x double> %res
690}
; Unmasked duplication of the even elements (indices 0,0,2,2) of a
; <4 x float> register argument. The assertions expect a single vmovsldup.
691define <4 x float> @test_4xfloat_dup_low(<4 x float> %vec) {
692; CHECK-LABEL: test_4xfloat_dup_low:
693; CHECK:       # %bb.0:
694; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
695; CHECK-NEXT:    retq
696  %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
697  ret <4 x float> %res
698}
; Merge-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into xmm1 followed by
; a vmovaps copy of xmm1 into xmm0.
699define <4 x float> @test_masked_4xfloat_dup_low_mask0(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
700; CHECK-LABEL: test_masked_4xfloat_dup_low_mask0:
701; CHECK:       # %bb.0:
702; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
703; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
704; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
705; CHECK-NEXT:    vmovaps %xmm1, %xmm0
706; CHECK-NEXT:    retq
707  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
708  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
709  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
710  ret <4 x float> %res
711}
712
; Zero-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on xmm0.
713define <4 x float> @test_masked_z_4xfloat_dup_low_mask0(<4 x float> %vec, <4 x float> %mask) {
714; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask0:
715; CHECK:       # %bb.0:
716; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
717; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
718; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
719; CHECK-NEXT:    retq
720  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
721  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
722  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
723  ret <4 x float> %res
724}
; Merge-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into xmm1 followed by
; a vmovaps copy of xmm1 into xmm0.
; Same IR as test_masked_4xfloat_dup_low_mask0 apart from the name.
725define <4 x float> @test_masked_4xfloat_dup_low_mask1(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
726; CHECK-LABEL: test_masked_4xfloat_dup_low_mask1:
727; CHECK:       # %bb.0:
728; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
729; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
730; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
731; CHECK-NEXT:    vmovaps %xmm1, %xmm0
732; CHECK-NEXT:    retq
733  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
734  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
735  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
736  ret <4 x float> %res
737}
738
; Zero-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on xmm0.
; Same IR as test_masked_z_4xfloat_dup_low_mask0 apart from the name.
739define <4 x float> @test_masked_z_4xfloat_dup_low_mask1(<4 x float> %vec, <4 x float> %mask) {
740; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask1:
741; CHECK:       # %bb.0:
742; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
743; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
744; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
745; CHECK-NEXT:    retq
746  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
747  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
748  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
749  ret <4 x float> %res
750}
; Merge-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into xmm1 followed by
; a vmovaps copy of xmm1 into xmm0.
; Same IR as test_masked_4xfloat_dup_low_mask0 apart from the name.
751define <4 x float> @test_masked_4xfloat_dup_low_mask2(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
752; CHECK-LABEL: test_masked_4xfloat_dup_low_mask2:
753; CHECK:       # %bb.0:
754; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
755; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
756; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
757; CHECK-NEXT:    vmovaps %xmm1, %xmm0
758; CHECK-NEXT:    retq
759  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
760  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
761  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
762  ret <4 x float> %res
763}
764
; Zero-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on xmm0.
; Same IR as test_masked_z_4xfloat_dup_low_mask0 apart from the name.
765define <4 x float> @test_masked_z_4xfloat_dup_low_mask2(<4 x float> %vec, <4 x float> %mask) {
766; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask2:
767; CHECK:       # %bb.0:
768; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
769; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
770; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
771; CHECK-NEXT:    retq
772  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
773  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
774  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
775  ret <4 x float> %res
776}
; Merge-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into xmm1 followed by
; a vmovaps copy of xmm1 into xmm0.
; Same IR as test_masked_4xfloat_dup_low_mask0 apart from the name.
777define <4 x float> @test_masked_4xfloat_dup_low_mask3(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
778; CHECK-LABEL: test_masked_4xfloat_dup_low_mask3:
779; CHECK:       # %bb.0:
780; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
781; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
782; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
783; CHECK-NEXT:    vmovaps %xmm1, %xmm0
784; CHECK-NEXT:    retq
785  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
786  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
787  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
788  ret <4 x float> %res
789}
790
; Zero-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on xmm0.
; Same IR as test_masked_z_4xfloat_dup_low_mask0 apart from the name.
791define <4 x float> @test_masked_z_4xfloat_dup_low_mask3(<4 x float> %vec, <4 x float> %mask) {
792; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask3:
793; CHECK:       # %bb.0:
794; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
795; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
796; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
797; CHECK-NEXT:    retq
798  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
799  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
800  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
801  ret <4 x float> %res
802}
; Merge-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into xmm1 followed by
; a vmovaps copy of xmm1 into xmm0.
; Same IR as test_masked_4xfloat_dup_low_mask0 apart from the name.
803define <4 x float> @test_masked_4xfloat_dup_low_mask4(<4 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
804; CHECK-LABEL: test_masked_4xfloat_dup_low_mask4:
805; CHECK:       # %bb.0:
806; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
807; CHECK-NEXT:    vcmpeqps %xmm3, %xmm2, %k1
808; CHECK-NEXT:    vmovsldup {{.*#+}} xmm1 {%k1} = xmm0[0,0,2,2]
809; CHECK-NEXT:    vmovaps %xmm1, %xmm0
810; CHECK-NEXT:    retq
811  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
812  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
813  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
814  ret <4 x float> %res
815}
816
; Zero-masked duplication of the even elements of a <4 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on xmm0.
; Same IR as test_masked_z_4xfloat_dup_low_mask0 apart from the name.
817define <4 x float> @test_masked_z_4xfloat_dup_low_mask4(<4 x float> %vec, <4 x float> %mask) {
818; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mask4:
819; CHECK:       # %bb.0:
820; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
821; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
822; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2]
823; CHECK-NEXT:    retq
824  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
825  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
826  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
827  ret <4 x float> %res
828}
; Unmasked duplication of the even elements (indices 0,0,2,2) of a
; <4 x float> loaded through %vp. The assertions expect the load folded into
; a single vmovsldup with a memory operand.
829define <4 x float> @test_4xfloat_dup_low_mem(<4 x float>* %vp) {
830; CHECK-LABEL: test_4xfloat_dup_low_mem:
831; CHECK:       # %bb.0:
832; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 = mem[0,0,2,2]
833; CHECK-NEXT:    retq
834  %vec = load <4 x float>, <4 x float>* %vp
835  %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
836  ret <4 x float> %res
837}
; Merge-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovsldup.
838define <4 x float> @test_masked_4xfloat_dup_low_mem_mask0(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
839; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask0:
840; CHECK:       # %bb.0:
841; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
842; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
843; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
844; CHECK-NEXT:    retq
845  %vec = load <4 x float>, <4 x float>* %vp
846  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
847  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
848  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
849  ret <4 x float> %res
850}
851
; Zero-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovsldup.
852define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask0(<4 x float>* %vp, <4 x float> %mask) {
853; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask0:
854; CHECK:       # %bb.0:
855; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
856; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
857; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
858; CHECK-NEXT:    retq
859  %vec = load <4 x float>, <4 x float>* %vp
860  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
861  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
862  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
863  ret <4 x float> %res
864}
; Merge-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovsldup.
; Same IR as test_masked_4xfloat_dup_low_mem_mask0 apart from the name.
865define <4 x float> @test_masked_4xfloat_dup_low_mem_mask1(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
866; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask1:
867; CHECK:       # %bb.0:
868; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
869; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
870; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
871; CHECK-NEXT:    retq
872  %vec = load <4 x float>, <4 x float>* %vp
873  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
874  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
875  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
876  ret <4 x float> %res
877}
878
; Zero-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovsldup.
; Same IR as test_masked_z_4xfloat_dup_low_mem_mask0 apart from the name.
879define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask1(<4 x float>* %vp, <4 x float> %mask) {
880; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask1:
881; CHECK:       # %bb.0:
882; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
883; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
884; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
885; CHECK-NEXT:    retq
886  %vec = load <4 x float>, <4 x float>* %vp
887  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
888  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
889  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
890  ret <4 x float> %res
891}
; Merge-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovsldup.
; Same IR as test_masked_4xfloat_dup_low_mem_mask0 apart from the name.
892define <4 x float> @test_masked_4xfloat_dup_low_mem_mask2(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
893; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask2:
894; CHECK:       # %bb.0:
895; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
896; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
897; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
898; CHECK-NEXT:    retq
899  %vec = load <4 x float>, <4 x float>* %vp
900  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
901  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
902  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
903  ret <4 x float> %res
904}
905
; Zero-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovsldup.
; Same IR as test_masked_z_4xfloat_dup_low_mem_mask0 apart from the name.
906define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask2(<4 x float>* %vp, <4 x float> %mask) {
907; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask2:
908; CHECK:       # %bb.0:
909; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
910; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
911; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
912; CHECK-NEXT:    retq
913  %vec = load <4 x float>, <4 x float>* %vp
914  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
915  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
916  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
917  ret <4 x float> %res
918}
; Merge-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovsldup.
; Same IR as test_masked_4xfloat_dup_low_mem_mask0 apart from the name.
919define <4 x float> @test_masked_4xfloat_dup_low_mem_mask3(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
920; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask3:
921; CHECK:       # %bb.0:
922; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
923; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
924; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
925; CHECK-NEXT:    retq
926  %vec = load <4 x float>, <4 x float>* %vp
927  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
928  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
929  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
930  ret <4 x float> %res
931}
932
; Zero-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovsldup.
; Same IR as test_masked_z_4xfloat_dup_low_mem_mask0 apart from the name.
933define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask3(<4 x float>* %vp, <4 x float> %mask) {
934; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask3:
935; CHECK:       # %bb.0:
936; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
937; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
938; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
939; CHECK-NEXT:    retq
940  %vec = load <4 x float>, <4 x float>* %vp
941  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
942  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
943  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
944  ret <4 x float> %res
945}
; Merge-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovsldup.
; Same IR as test_masked_4xfloat_dup_low_mem_mask0 apart from the name.
946define <4 x float> @test_masked_4xfloat_dup_low_mem_mask4(<4 x float>* %vp, <4 x float> %vec2, <4 x float> %mask) {
947; CHECK-LABEL: test_masked_4xfloat_dup_low_mem_mask4:
948; CHECK:       # %bb.0:
949; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
950; CHECK-NEXT:    vcmpeqps %xmm2, %xmm1, %k1
951; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} = mem[0,0,2,2]
952; CHECK-NEXT:    retq
953  %vec = load <4 x float>, <4 x float>* %vp
954  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
955  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
956  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
957  ret <4 x float> %res
958}
959
; Zero-masked duplication of the even elements of a loaded <4 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovsldup.
; Same IR as test_masked_z_4xfloat_dup_low_mem_mask0 apart from the name.
960define <4 x float> @test_masked_z_4xfloat_dup_low_mem_mask4(<4 x float>* %vp, <4 x float> %mask) {
961; CHECK-LABEL: test_masked_z_4xfloat_dup_low_mem_mask4:
962; CHECK:       # %bb.0:
963; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
964; CHECK-NEXT:    vcmpeqps %xmm1, %xmm0, %k1
965; CHECK-NEXT:    vmovsldup {{.*#+}} xmm0 {%k1} {z} = mem[0,0,2,2]
966; CHECK-NEXT:    retq
967  %vec = load <4 x float>, <4 x float>* %vp
968  %shuf = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
969  %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
970  %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
971  ret <4 x float> %res
972}
; Unmasked duplication of the even elements (indices 0,0,2,2,4,4,6,6) of an
; <8 x float> register argument. The assertions expect a single ymm vmovsldup.
973define <8 x float> @test_8xfloat_dup_low(<8 x float> %vec) {
974; CHECK-LABEL: test_8xfloat_dup_low:
975; CHECK:       # %bb.0:
976; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
977; CHECK-NEXT:    retq
978  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
979  ret <8 x float> %res
980}
; Merge-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into ymm1 followed by
; a vmovaps copy of ymm1 into ymm0.
981define <8 x float> @test_masked_8xfloat_dup_low_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
982; CHECK-LABEL: test_masked_8xfloat_dup_low_mask0:
983; CHECK:       # %bb.0:
984; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
985; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
986; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
987; CHECK-NEXT:    vmovaps %ymm1, %ymm0
988; CHECK-NEXT:    retq
989  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
990  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
991  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
992  ret <8 x float> %res
993}
994
; Zero-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on ymm0.
995define <8 x float> @test_masked_z_8xfloat_dup_low_mask0(<8 x float> %vec, <8 x float> %mask) {
996; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask0:
997; CHECK:       # %bb.0:
998; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
999; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1000; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
1001; CHECK-NEXT:    retq
1002  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1003  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1004  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1005  ret <8 x float> %res
1006}
; Merge-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into ymm1 followed by
; a vmovaps copy of ymm1 into ymm0.
; Same IR as test_masked_8xfloat_dup_low_mask0 apart from the name.
1007define <8 x float> @test_masked_8xfloat_dup_low_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
1008; CHECK-LABEL: test_masked_8xfloat_dup_low_mask1:
1009; CHECK:       # %bb.0:
1010; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1011; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
1012; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
1013; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1014; CHECK-NEXT:    retq
1015  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1016  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1017  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1018  ret <8 x float> %res
1019}
1020
; Zero-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on ymm0.
; Same IR as test_masked_z_8xfloat_dup_low_mask0 apart from the name.
1021define <8 x float> @test_masked_z_8xfloat_dup_low_mask1(<8 x float> %vec, <8 x float> %mask) {
1022; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask1:
1023; CHECK:       # %bb.0:
1024; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1025; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1026; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
1027; CHECK-NEXT:    retq
1028  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1029  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1030  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1031  ret <8 x float> %res
1032}
; Merge-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into ymm1 followed by
; a vmovaps copy of ymm1 into ymm0.
; Same IR as test_masked_8xfloat_dup_low_mask0 apart from the name.
1033define <8 x float> @test_masked_8xfloat_dup_low_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
1034; CHECK-LABEL: test_masked_8xfloat_dup_low_mask2:
1035; CHECK:       # %bb.0:
1036; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1037; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
1038; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
1039; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1040; CHECK-NEXT:    retq
1041  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1042  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1043  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1044  ret <8 x float> %res
1045}
1046
; Zero-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on ymm0.
; Same IR as test_masked_z_8xfloat_dup_low_mask0 apart from the name.
1047define <8 x float> @test_masked_z_8xfloat_dup_low_mask2(<8 x float> %vec, <8 x float> %mask) {
1048; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask2:
1049; CHECK:       # %bb.0:
1050; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1051; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1052; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
1053; CHECK-NEXT:    retq
1054  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1055  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1056  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1057  ret <8 x float> %res
1058}
; Merge-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into ymm1 followed by
; a vmovaps copy of ymm1 into ymm0.
; Same IR as test_masked_8xfloat_dup_low_mask0 apart from the name.
1059define <8 x float> @test_masked_8xfloat_dup_low_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
1060; CHECK-LABEL: test_masked_8xfloat_dup_low_mask3:
1061; CHECK:       # %bb.0:
1062; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1063; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
1064; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
1065; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1066; CHECK-NEXT:    retq
1067  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1068  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1069  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1070  ret <8 x float> %res
1071}
1072
; Zero-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on ymm0.
; Same IR as test_masked_z_8xfloat_dup_low_mask0 apart from the name.
1073define <8 x float> @test_masked_z_8xfloat_dup_low_mask3(<8 x float> %vec, <8 x float> %mask) {
1074; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask3:
1075; CHECK:       # %bb.0:
1076; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1077; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1078; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
1079; CHECK-NEXT:    retq
1080  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1081  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1082  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1083  ret <8 x float> %res
1084}
; Merge-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest keep
; %vec2. The assertions expect a {%k1}-masked vmovsldup into ymm1 followed by
; a vmovaps copy of ymm1 into ymm0.
; Same IR as test_masked_8xfloat_dup_low_mask0 apart from the name.
1085define <8 x float> @test_masked_8xfloat_dup_low_mask4(<8 x float> %vec, <8 x float> %vec2, <8 x float> %mask) {
1086; CHECK-LABEL: test_masked_8xfloat_dup_low_mask4:
1087; CHECK:       # %bb.0:
1088; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1089; CHECK-NEXT:    vcmpeqps %ymm3, %ymm2, %k1
1090; CHECK-NEXT:    vmovsldup {{.*#+}} ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6]
1091; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1092; CHECK-NEXT:    retq
1093  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1094  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1095  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1096  ret <8 x float> %res
1097}
1098
; Zero-masked duplication of the even elements of an <8 x float> register.
; Lanes where %mask compares oeq to zero take the shuffled %vec; the rest are
; zero. The assertions expect an in-place {%k1} {z} vmovsldup on ymm0.
; Same IR as test_masked_z_8xfloat_dup_low_mask0 apart from the name.
1099define <8 x float> @test_masked_z_8xfloat_dup_low_mask4(<8 x float> %vec, <8 x float> %mask) {
1100; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mask4:
1101; CHECK:       # %bb.0:
1102; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1103; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1104; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6]
1105; CHECK-NEXT:    retq
1106  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1107  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1108  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1109  ret <8 x float> %res
1110}
; Unmasked duplication of the even elements (indices 0,0,2,2,4,4,6,6) of an
; <8 x float> loaded through %vp. The assertions expect the load folded into
; a single ymm vmovsldup with a memory operand.
1111define <8 x float> @test_8xfloat_dup_low_mem(<8 x float>* %vp) {
1112; CHECK-LABEL: test_8xfloat_dup_low_mem:
1113; CHECK:       # %bb.0:
1114; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = mem[0,0,2,2,4,4,6,6]
1115; CHECK-NEXT:    retq
1116  %vec = load <8 x float>, <8 x float>* %vp
1117  %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1118  ret <8 x float> %res
1119}
; Merge-masked duplication of the even elements of a loaded <8 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovsldup.
1120define <8 x float> @test_masked_8xfloat_dup_low_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
1121; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask0:
1122; CHECK:       # %bb.0:
1123; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1124; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1125; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
1126; CHECK-NEXT:    retq
1127  %vec = load <8 x float>, <8 x float>* %vp
1128  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1129  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1130  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1131  ret <8 x float> %res
1132}
1133
; Zero-masked duplication of the even elements of a loaded <8 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovsldup.
1134define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask0(<8 x float>* %vp, <8 x float> %mask) {
1135; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask0:
1136; CHECK:       # %bb.0:
1137; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1138; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
1139; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
1140; CHECK-NEXT:    retq
1141  %vec = load <8 x float>, <8 x float>* %vp
1142  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1143  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1144  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1145  ret <8 x float> %res
1146}
; Merge-masked duplication of the even elements of a loaded <8 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovsldup.
; Same IR as test_masked_8xfloat_dup_low_mem_mask0 apart from the name.
1147define <8 x float> @test_masked_8xfloat_dup_low_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
1148; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask1:
1149; CHECK:       # %bb.0:
1150; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1151; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1152; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
1153; CHECK-NEXT:    retq
1154  %vec = load <8 x float>, <8 x float>* %vp
1155  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1156  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1157  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1158  ret <8 x float> %res
1159}
1160
; Zero-masked duplication of the even elements of a loaded <8 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovsldup.
; Same IR as test_masked_z_8xfloat_dup_low_mem_mask0 apart from the name.
1161define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask1(<8 x float>* %vp, <8 x float> %mask) {
1162; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask1:
1163; CHECK:       # %bb.0:
1164; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1165; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
1166; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
1167; CHECK-NEXT:    retq
1168  %vec = load <8 x float>, <8 x float>* %vp
1169  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1170  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1171  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1172  ret <8 x float> %res
1173}
; Merge-masked duplication of the even elements of a loaded <8 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovsldup.
; Same IR as test_masked_8xfloat_dup_low_mem_mask0 apart from the name.
1174define <8 x float> @test_masked_8xfloat_dup_low_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
1175; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask2:
1176; CHECK:       # %bb.0:
1177; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1178; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1179; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
1180; CHECK-NEXT:    retq
1181  %vec = load <8 x float>, <8 x float>* %vp
1182  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1183  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1184  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1185  ret <8 x float> %res
1186}
1187
; Zero-masked duplication of the even elements of a loaded <8 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovsldup.
; Same IR as test_masked_z_8xfloat_dup_low_mem_mask0 apart from the name.
1188define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask2(<8 x float>* %vp, <8 x float> %mask) {
1189; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask2:
1190; CHECK:       # %bb.0:
1191; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1192; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
1193; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
1194; CHECK-NEXT:    retq
1195  %vec = load <8 x float>, <8 x float>* %vp
1196  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1197  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1198  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1199  ret <8 x float> %res
1200}
; Merge-masked duplication of the even elements of a loaded <8 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest keep
; %vec2. The assertions expect the load folded into a {%k1}-masked vmovsldup.
; Same IR as test_masked_8xfloat_dup_low_mem_mask0 apart from the name.
1201define <8 x float> @test_masked_8xfloat_dup_low_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
1202; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask3:
1203; CHECK:       # %bb.0:
1204; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1205; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1206; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
1207; CHECK-NEXT:    retq
1208  %vec = load <8 x float>, <8 x float>* %vp
1209  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1210  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1211  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1212  ret <8 x float> %res
1213}
1214
; Zero-masked duplication of the even elements of a loaded <8 x float>.
; Lanes where %mask compares oeq to zero take the shuffled value; the rest are
; zero. The assertions expect the load folded into a {%k1} {z} vmovsldup.
; Same IR as test_masked_z_8xfloat_dup_low_mem_mask0 apart from the name.
1215define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask3(<8 x float>* %vp, <8 x float> %mask) {
1216; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask3:
1217; CHECK:       # %bb.0:
1218; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1219; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
1220; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
1221; CHECK-NEXT:    retq
1222  %vec = load <8 x float>, <8 x float>* %vp
1223  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1224  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1225  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1226  ret <8 x float> %res
1227}
; Merge-masked 256-bit vmovsldup from memory: the vector loaded from %vp
; has its even-indexed elements duplicated (0,0,2,2,4,4,6,6); lanes where
; %mask is ordered-equal to 0.0 take the shuffled value, the rest take
; %vec2.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} reading from memory.
; NOTE(review): the IR body matches the _mem_mask3 merge-masked variant
; apart from the function name -- presumably intentional; confirm before
; deduplicating.
1228define <8 x float> @test_masked_8xfloat_dup_low_mem_mask4(<8 x float>* %vp, <8 x float> %vec2, <8 x float> %mask) {
1229; CHECK-LABEL: test_masked_8xfloat_dup_low_mem_mask4:
1230; CHECK:       # %bb.0:
1231; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1232; CHECK-NEXT:    vcmpeqps %ymm2, %ymm1, %k1
1233; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} = mem[0,0,2,2,4,4,6,6]
1234; CHECK-NEXT:    retq
1235  %vec = load <8 x float>, <8 x float>* %vp
1236  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1237  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1238  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2
1239  ret <8 x float> %res
1240}
1241
; Zero-masked 256-bit vmovsldup from memory: load from %vp, duplicate the
; even-indexed elements (0,0,2,2,4,4,6,6), and zero every lane whose
; %mask element is not ordered-equal to 0.0.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} reading from memory.
; NOTE(review): the IR body matches the _mem_mask3 zero-masked variant
; apart from the function name -- presumably intentional; confirm before
; deduplicating.
1242define <8 x float> @test_masked_z_8xfloat_dup_low_mem_mask4(<8 x float>* %vp, <8 x float> %mask) {
1243; CHECK-LABEL: test_masked_z_8xfloat_dup_low_mem_mask4:
1244; CHECK:       # %bb.0:
1245; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1246; CHECK-NEXT:    vcmpeqps %ymm1, %ymm0, %k1
1247; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6]
1248; CHECK-NEXT:    retq
1249  %vec = load <8 x float>, <8 x float>* %vp
1250  %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1251  %cmp = fcmp oeq <8 x float> %mask, zeroinitializer
1252  %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer
1253  ret <8 x float> %res
1254}
; Unmasked 512-bit baseline (<16 x float>, register source).
; The shuffle duplicates every even-indexed element into the following
; odd lane (0,0,2,2,...,14,14); the expected lowering is a single
; vmovsldup on zmm0 followed by the return.
1255define <16 x float> @test_16xfloat_dup_low(<16 x float> %vec) {
1256; CHECK-LABEL: test_16xfloat_dup_low:
1257; CHECK:       # %bb.0:
1258; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1259; CHECK-NEXT:    retq
1260  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1261  ret <16 x float> %res
1262}
; Merge-masked 512-bit vmovsldup, register source (<16 x float>).
; The IR duplicates the even-indexed elements of %vec (0,0,2,2,...,14,14)
; and selects the shuffled lane where %mask is ordered-equal to 0.0,
; otherwise the matching lane of %vec2.
; Expected code: zero a register, vcmpeqps into %k1, a vmovsldup {%k1}
; that merges into zmm1, then a vmovaps copying zmm1 to zmm0 for the
; return value.
1263define <16 x float> @test_masked_16xfloat_dup_low_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
1264; CHECK-LABEL: test_masked_16xfloat_dup_low_mask0:
1265; CHECK:       # %bb.0:
1266; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1267; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
1268; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1269; CHECK-NEXT:    vmovaps %zmm1, %zmm0
1270; CHECK-NEXT:    retq
1271  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1272  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1273  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1274  ret <16 x float> %res
1275}
1276
; Zero-masked 512-bit vmovsldup, register source (<16 x float>).
; The IR duplicates the even-indexed elements of %vec (0,0,2,2,...,14,14)
; and keeps a lane only where %mask is ordered-equal to 0.0; every other
; lane is zero.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} writing zmm0 in place (no extra register copy).
1277define <16 x float> @test_masked_z_16xfloat_dup_low_mask0(<16 x float> %vec, <16 x float> %mask) {
1278; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask0:
1279; CHECK:       # %bb.0:
1280; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1281; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1282; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1283; CHECK-NEXT:    retq
1284  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1285  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1286  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1287  ret <16 x float> %res
1288}
; Merge-masked 512-bit vmovsldup, register source: even-indexed elements
; of %vec are duplicated (0,0,2,2,...,14,14); lanes where %mask is
; ordered-equal to 0.0 take the shuffled value, the rest come from %vec2.
; Expected code: zero a register, vcmpeqps into %k1, vmovsldup {%k1}
; merging into zmm1, then vmovaps zmm1 -> zmm0.
; NOTE(review): the IR body matches test_masked_16xfloat_dup_low_mask0
; apart from the function name -- presumably intentional; confirm before
; deduplicating.
1289define <16 x float> @test_masked_16xfloat_dup_low_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
1290; CHECK-LABEL: test_masked_16xfloat_dup_low_mask1:
1291; CHECK:       # %bb.0:
1292; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1293; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
1294; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1295; CHECK-NEXT:    vmovaps %zmm1, %zmm0
1296; CHECK-NEXT:    retq
1297  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1298  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1299  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1300  ret <16 x float> %res
1301}
1302
; Zero-masked 512-bit vmovsldup, register source: even-indexed elements
; of %vec are duplicated (0,0,2,2,...,14,14) and any lane whose %mask
; element is not ordered-equal to 0.0 is zeroed.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} on zmm0.
; NOTE(review): the IR body matches test_masked_z_16xfloat_dup_low_mask0
; apart from the function name -- presumably intentional; confirm before
; deduplicating.
1303define <16 x float> @test_masked_z_16xfloat_dup_low_mask1(<16 x float> %vec, <16 x float> %mask) {
1304; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask1:
1305; CHECK:       # %bb.0:
1306; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1307; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1308; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1309; CHECK-NEXT:    retq
1310  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1311  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1312  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1313  ret <16 x float> %res
1314}
; Merge-masked 512-bit vmovsldup, register source.
; IR: shuffle %vec with indices 0,0,2,2,...,14,14, compare %mask against
; zero with fcmp oeq, and select shuffled lanes on true / %vec2 lanes on
; false.
; Expected code: zero a register, vcmpeqps into %k1, vmovsldup {%k1}
; merging into zmm1, then vmovaps zmm1 -> zmm0.
; NOTE(review): the IR body matches the _mask0 and _mask1 merge-masked
; variants apart from the function name -- confirm this is intended.
1315define <16 x float> @test_masked_16xfloat_dup_low_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
1316; CHECK-LABEL: test_masked_16xfloat_dup_low_mask2:
1317; CHECK:       # %bb.0:
1318; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1319; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
1320; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1321; CHECK-NEXT:    vmovaps %zmm1, %zmm0
1322; CHECK-NEXT:    retq
1323  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1324  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1325  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1326  ret <16 x float> %res
1327}
1328
; Zero-masked 512-bit vmovsldup, register source.
; IR: shuffle %vec with indices 0,0,2,2,...,14,14, compare %mask against
; zero with fcmp oeq, and select shuffled lanes on true / zero on false.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} on zmm0.
; NOTE(review): the IR body matches the _mask0 and _mask1 zero-masked
; variants apart from the function name -- confirm this is intended.
1329define <16 x float> @test_masked_z_16xfloat_dup_low_mask2(<16 x float> %vec, <16 x float> %mask) {
1330; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask2:
1331; CHECK:       # %bb.0:
1332; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1333; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1334; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1335; CHECK-NEXT:    retq
1336  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1337  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1338  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1339  ret <16 x float> %res
1340}
; Merge-masked 512-bit vmovsldup, register source.
; The even-indexed elements of %vec are duplicated (0,0,2,2,...,14,14);
; the result is blended with %vec2 under the predicate
; "%mask element is ordered-equal to 0.0".
; Expected code: zero a register, vcmpeqps into %k1, vmovsldup {%k1}
; merging into zmm1, then vmovaps zmm1 -> zmm0.
; NOTE(review): the IR body matches the _mask0.._mask2 merge-masked
; variants apart from the function name -- confirm this is intended.
1341define <16 x float> @test_masked_16xfloat_dup_low_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
1342; CHECK-LABEL: test_masked_16xfloat_dup_low_mask3:
1343; CHECK:       # %bb.0:
1344; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1345; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
1346; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1347; CHECK-NEXT:    vmovaps %zmm1, %zmm0
1348; CHECK-NEXT:    retq
1349  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1350  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1351  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1352  ret <16 x float> %res
1353}
1354
; Zero-masked 512-bit vmovsldup, register source.
; The even-indexed elements of %vec are duplicated (0,0,2,2,...,14,14);
; lanes whose %mask element is not ordered-equal to 0.0 become zero.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} on zmm0.
; NOTE(review): the IR body matches the _mask0.._mask2 zero-masked
; variants apart from the function name -- confirm this is intended.
1355define <16 x float> @test_masked_z_16xfloat_dup_low_mask3(<16 x float> %vec, <16 x float> %mask) {
1356; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask3:
1357; CHECK:       # %bb.0:
1358; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1359; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1360; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1361; CHECK-NEXT:    retq
1362  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1363  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1364  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1365  ret <16 x float> %res
1366}
; Merge-masked 512-bit vmovsldup, register source (last of the _maskN
; register variants visible here).
; IR: duplicate even-indexed elements of %vec (0,0,2,2,...,14,14), then
; pick the shuffled lane where %mask is ordered-equal to 0.0 and the
; %vec2 lane elsewhere.
; Expected code: zero a register, vcmpeqps into %k1, vmovsldup {%k1}
; merging into zmm1, then vmovaps zmm1 -> zmm0.
; NOTE(review): the IR body matches the _mask0.._mask3 merge-masked
; variants apart from the function name -- confirm this is intended.
1367define <16 x float> @test_masked_16xfloat_dup_low_mask4(<16 x float> %vec, <16 x float> %vec2, <16 x float> %mask) {
1368; CHECK-LABEL: test_masked_16xfloat_dup_low_mask4:
1369; CHECK:       # %bb.0:
1370; CHECK-NEXT:    vxorps %xmm3, %xmm3, %xmm3
1371; CHECK-NEXT:    vcmpeqps %zmm3, %zmm2, %k1
1372; CHECK-NEXT:    vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1373; CHECK-NEXT:    vmovaps %zmm1, %zmm0
1374; CHECK-NEXT:    retq
1375  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1376  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1377  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1378  ret <16 x float> %res
1379}
1380
; Zero-masked 512-bit vmovsldup, register source (last of the _maskN
; register variants visible here).
; IR: duplicate even-indexed elements of %vec (0,0,2,2,...,14,14), then
; keep the shuffled lane where %mask is ordered-equal to 0.0 and write
; zero elsewhere.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} on zmm0.
; NOTE(review): the IR body matches the _mask0.._mask3 zero-masked
; variants apart from the function name -- confirm this is intended.
1381define <16 x float> @test_masked_z_16xfloat_dup_low_mask4(<16 x float> %vec, <16 x float> %mask) {
1382; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mask4:
1383; CHECK:       # %bb.0:
1384; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1385; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1386; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1387; CHECK-NEXT:    retq
1388  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1389  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1390  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1391  ret <16 x float> %res
1392}
; Unmasked 512-bit baseline with a memory source.
; The IR loads a <16 x float> from %vp and duplicates its even-indexed
; elements (0,0,2,2,...,14,14); the expected lowering is a single
; vmovsldup whose source operand is the memory location, i.e. the load
; is folded into the shuffle instruction.
1393define <16 x float> @test_16xfloat_dup_low_mem(<16 x float>* %vp) {
1394; CHECK-LABEL: test_16xfloat_dup_low_mem:
1395; CHECK:       # %bb.0:
1396; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1397; CHECK-NEXT:    retq
1398  %vec = load <16 x float>, <16 x float>* %vp
1399  %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1400  ret <16 x float> %res
1401}
; Merge-masked 512-bit vmovsldup with a memory source.
; The IR loads a <16 x float> from %vp, duplicates its even-indexed
; elements (0,0,2,2,...,14,14) and selects the shuffled lane where %mask
; is ordered-equal to 0.0, otherwise the matching lane of %vec2.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} that reads from memory and merges into zmm0; no
; separate load, blend, or register copy.
1402define <16 x float> @test_masked_16xfloat_dup_low_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
1403; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask0:
1404; CHECK:       # %bb.0:
1405; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1406; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1407; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1408; CHECK-NEXT:    retq
1409  %vec = load <16 x float>, <16 x float>* %vp
1410  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1411  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1412  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1413  ret <16 x float> %res
1414}
1415
; Zero-masked 512-bit vmovsldup with a memory source.
; The IR loads a <16 x float> from %vp, duplicates its even-indexed
; elements (0,0,2,2,...,14,14) and keeps a lane only where %mask is
; ordered-equal to 0.0; every other lane is zero.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} that reads its source directly from memory.
1416define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask0(<16 x float>* %vp, <16 x float> %mask) {
1417; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask0:
1418; CHECK:       # %bb.0:
1419; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1420; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
1421; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1422; CHECK-NEXT:    retq
1423  %vec = load <16 x float>, <16 x float>* %vp
1424  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1425  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1426  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1427  ret <16 x float> %res
1428}
; Merge-masked 512-bit vmovsldup from memory: the vector loaded from %vp
; has its even-indexed elements duplicated (0,0,2,2,...,14,14); lanes
; where %mask is ordered-equal to 0.0 take the shuffled value, the rest
; take %vec2.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} reading from memory into zmm0.
; NOTE(review): the IR body matches
; test_masked_16xfloat_dup_low_mem_mask0 apart from the function name --
; presumably intentional; confirm before deduplicating.
1429define <16 x float> @test_masked_16xfloat_dup_low_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
1430; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask1:
1431; CHECK:       # %bb.0:
1432; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1433; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1434; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1435; CHECK-NEXT:    retq
1436  %vec = load <16 x float>, <16 x float>* %vp
1437  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1438  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1439  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1440  ret <16 x float> %res
1441}
1442
; Zero-masked 512-bit vmovsldup from memory: load from %vp, duplicate the
; even-indexed elements (0,0,2,2,...,14,14), and zero every lane whose
; %mask element is not ordered-equal to 0.0.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} reading from memory.
; NOTE(review): the IR body matches
; test_masked_z_16xfloat_dup_low_mem_mask0 apart from the function name
; -- presumably intentional; confirm before deduplicating.
1443define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask1(<16 x float>* %vp, <16 x float> %mask) {
1444; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask1:
1445; CHECK:       # %bb.0:
1446; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1447; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
1448; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1449; CHECK-NEXT:    retq
1450  %vec = load <16 x float>, <16 x float>* %vp
1451  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1452  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1453  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1454  ret <16 x float> %res
1455}
; Merge-masked 512-bit vmovsldup with a memory source.
; IR: load from %vp, shuffle with indices 0,0,2,2,...,14,14, compare
; %mask against zero with fcmp oeq, and select shuffled lanes on true /
; %vec2 lanes on false.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} reading from memory into zmm0.
; NOTE(review): the IR body matches the _mem_mask0 and _mem_mask1
; merge-masked variants apart from the function name -- confirm this is
; intended.
1456define <16 x float> @test_masked_16xfloat_dup_low_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
1457; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask2:
1458; CHECK:       # %bb.0:
1459; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1460; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1461; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1462; CHECK-NEXT:    retq
1463  %vec = load <16 x float>, <16 x float>* %vp
1464  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1465  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1466  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1467  ret <16 x float> %res
1468}
1469
; Zero-masked 512-bit vmovsldup with a memory source.
; IR: load from %vp, shuffle with indices 0,0,2,2,...,14,14, compare
; %mask against zero with fcmp oeq, and select shuffled lanes on true /
; zero on false.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} reading from memory.
; NOTE(review): the IR body matches the _mem_mask0 and _mem_mask1
; zero-masked variants apart from the function name -- confirm this is
; intended.
1470define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask2(<16 x float>* %vp, <16 x float> %mask) {
1471; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask2:
1472; CHECK:       # %bb.0:
1473; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1474; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
1475; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1476; CHECK-NEXT:    retq
1477  %vec = load <16 x float>, <16 x float>* %vp
1478  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1479  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1480  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1481  ret <16 x float> %res
1482}
; Merge-masked 512-bit vmovsldup with a memory source.
; The vector loaded from %vp has its even-indexed elements duplicated
; (0,0,2,2,...,14,14); the result is blended with %vec2 under the
; predicate "%mask element is ordered-equal to 0.0".
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} reading from memory into zmm0.
; NOTE(review): the IR body matches the _mem_mask0.._mem_mask2
; merge-masked variants apart from the function name -- confirm this is
; intended.
1483define <16 x float> @test_masked_16xfloat_dup_low_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
1484; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask3:
1485; CHECK:       # %bb.0:
1486; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1487; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1488; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1489; CHECK-NEXT:    retq
1490  %vec = load <16 x float>, <16 x float>* %vp
1491  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1492  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1493  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1494  ret <16 x float> %res
1495}
1496
; Zero-masked 512-bit vmovsldup with a memory source.
; The vector loaded from %vp has its even-indexed elements duplicated
; (0,0,2,2,...,14,14); lanes whose %mask element is not ordered-equal to
; 0.0 become zero.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} reading from memory.
; NOTE(review): the IR body matches the _mem_mask0.._mem_mask2
; zero-masked variants apart from the function name -- confirm this is
; intended.
1497define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask3(<16 x float>* %vp, <16 x float> %mask) {
1498; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask3:
1499; CHECK:       # %bb.0:
1500; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1501; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
1502; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1503; CHECK-NEXT:    retq
1504  %vec = load <16 x float>, <16 x float>* %vp
1505  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1506  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1507  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1508  ret <16 x float> %res
1509}
; Merge-masked 512-bit vmovsldup with a memory source (last of the
; _mem_maskN merge-masked variants visible here).
; IR: load from %vp, duplicate even-indexed elements (0,0,2,2,...,14,14),
; then pick the shuffled lane where %mask is ordered-equal to 0.0 and
; the %vec2 lane elsewhere.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} reading from memory into zmm0.
; NOTE(review): the IR body matches the _mem_mask0.._mem_mask3
; merge-masked variants apart from the function name -- confirm this is
; intended.
1510define <16 x float> @test_masked_16xfloat_dup_low_mem_mask4(<16 x float>* %vp, <16 x float> %vec2, <16 x float> %mask) {
1511; CHECK-LABEL: test_masked_16xfloat_dup_low_mem_mask4:
1512; CHECK:       # %bb.0:
1513; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1514; CHECK-NEXT:    vcmpeqps %zmm2, %zmm1, %k1
1515; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1516; CHECK-NEXT:    retq
1517  %vec = load <16 x float>, <16 x float>* %vp
1518  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1519  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1520  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2
1521  ret <16 x float> %res
1522}
1523
; Zero-masked 512-bit vmovsldup with a memory source (last of the
; _mem_maskN zero-masked variants visible here).
; IR: load from %vp, duplicate even-indexed elements (0,0,2,2,...,14,14),
; then keep the shuffled lane where %mask is ordered-equal to 0.0 and
; write zero elsewhere.
; Expected code: zero a register, vcmpeqps into %k1, then one
; vmovsldup {%k1} {z} reading from memory.
; NOTE(review): the IR body matches the _mem_mask0.._mem_mask3
; zero-masked variants apart from the function name -- confirm this is
; intended.
1524define <16 x float> @test_masked_z_16xfloat_dup_low_mem_mask4(<16 x float>* %vp, <16 x float> %mask) {
1525; CHECK-LABEL: test_masked_z_16xfloat_dup_low_mem_mask4:
1526; CHECK:       # %bb.0:
1527; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1528; CHECK-NEXT:    vcmpeqps %zmm1, %zmm0, %k1
1529; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
1530; CHECK-NEXT:    retq
1531  %vec = load <16 x float>, <16 x float>* %vp
1532  %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
1533  %cmp = fcmp oeq <16 x float> %mask, zeroinitializer
1534  %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer
1535  ret <16 x float> %res
1536}
1537