; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O2 -mattr=avx512f -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64
; RUN: llc < %s -O2 -mattr=avx512f -mtriple=i386-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32
; RUN: llc < %s -O2 -mattr=avx512vl -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64
; RUN: llc < %s -O2 -mattr=avx512vl -mtriple=i386-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32

define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
; CHECK64-LABEL: test_mm_mask_move_ss:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_move_ss:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
; CHECK32-NEXT:    kmovw %eax, %k1
; CHECK32-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK32-NEXT:    retl
entry:
  %0 = and i8 %__U, 1
  %tobool.i = icmp ne i8 %0, 0
  %__B.elt.i = extractelement <4 x float> %__B, i32 0
  %__W.elt.i = extractelement <4 x float> %__W, i32 0
  %vecext1.i = select i1 %tobool.i, float %__B.elt.i, float %__W.elt.i
  %vecins.i = insertelement <4 x float> %__A, float %vecext1.i, i32 0
  ret <4 x float> %vecins.i
}

define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
; CHECK64-LABEL: test_mm_maskz_move_ss:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_maskz_move_ss:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
; CHECK32-NEXT:    kmovw %eax, %k1
; CHECK32-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK32-NEXT:    retl
entry:
  %0 = and i8 %__U, 1
  %tobool.i = icmp ne i8 %0, 0
  %vecext.i = extractelement <4 x float> %__B, i32 0
  %cond.i = select i1 %tobool.i, float %vecext.i, float 0.000000e+00
  %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0
  ret <4 x float> %vecins.i
}

define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
; CHECK64-LABEL: test_mm_mask_move_sd:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_move_sd:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
; CHECK32-NEXT:    kmovw %eax, %k1
; CHECK32-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1}
; CHECK32-NEXT:    retl
entry:
  %0 = and i8 %__U, 1
  %tobool.i = icmp ne i8 %0, 0
  %__B.elt.i = extractelement <2 x double> %__B, i32 0
  %__W.elt.i = extractelement <2 x double> %__W, i32 0
  %vecext1.i = select i1 %tobool.i, double %__B.elt.i, double %__W.elt.i
  %vecins.i = insertelement <2 x double> %__A, double %vecext1.i, i32 0
  ret <2 x double> %vecins.i
}

define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
; CHECK64-LABEL: test_mm_maskz_move_sd:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_maskz_move_sd:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
; CHECK32-NEXT:    kmovw %eax, %k1
; CHECK32-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK32-NEXT:    retl
entry:
  %0 = and i8 %__U, 1
  %tobool.i = icmp ne i8 %0, 0
  %vecext.i = extractelement <2 x double> %__B, i32 0
  %cond.i = select i1 %tobool.i, double %vecext.i, double 0.000000e+00
  %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0
  ret <2 x double> %vecins.i
}

define void @test_mm_mask_store_ss(float* %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #1 {
; CHECK64-LABEL: test_mm_mask_store_ss:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %esi, %k1
; CHECK64-NEXT:    vmovss %xmm0, (%rdi) {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_store_ss:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovss %xmm0, (%eax) {%k1}
; CHECK32-NEXT:    retl
entry:
  %0 = bitcast float* %__W to <16 x float>*
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %1 = and i8 %__U, 1
  %conv2.i = zext i8 %1 to i16
  %2 = bitcast i16 %conv2.i to <16 x i1>
  tail call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %shuffle.i.i, <16 x float>* %0, i32 16, <16 x i1> %2) #5
  ret void
}

define void @test_mm_mask_store_sd(double* %__W, i8 zeroext %__U, <2 x double> %__A) local_unnamed_addr #1 {
; CHECK64-LABEL: test_mm_mask_store_sd:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %esi, %k1
; CHECK64-NEXT:    vmovsd %xmm0, (%rdi) {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_store_sd:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovsd %xmm0, (%eax) {%k1}
; CHECK32-NEXT:    retl
entry:
  %0 = bitcast double* %__W to <8 x double>*
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %1 = and i8 %__U, 1
  %2 = bitcast i8 %1 to <8 x i1>
  tail call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %shuffle.i.i, <8 x double>* %0, i32 16, <8 x i1> %2) #5
  ret void
}

define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
; CHECK64-LABEL: test_mm_mask_load_ss:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_load_ss:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1}
; CHECK32-NEXT:    retl
entry:
  %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
  %0 = bitcast float* %__W to <16 x float>*
  %shuffle.i.i = shufflevector <4 x float> %shuffle.i, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %1 = and i8 %__U, 1
  %conv2.i = zext i8 %1 to i16
  %2 = bitcast i16 %conv2.i to <16 x i1>
  %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %0, i32 16, <16 x i1> %2, <16 x float> %shuffle.i.i) #5
  %shuffle4.i = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuffle4.i
}

define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
; CHECK64-LABEL: test_mm_mask_load_sd:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_load_sd:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1}
; CHECK32-NEXT:    retl
entry:
  %shuffle5.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
  %0 = bitcast double* %__W to <8 x double>*
  %shuffle.i.i = shufflevector <2 x double> %shuffle5.i, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %1 = and i8 %__U, 1
  %2 = bitcast i8 %1 to <8 x i1>
  %3 = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %0, i32 16, <8 x i1> %2, <8 x double> %shuffle.i.i) #5
  %shuffle3.i = shufflevector <8 x double> %3, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuffle3.i
}

define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
; CHECK64-LABEL: test_mm_maskz_load_ss:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_maskz_load_ss:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1} {z}
; CHECK32-NEXT:    retl
entry:
  %0 = bitcast float* %__W to <16 x float>*
  %1 = and i8 %__U, 1
  %conv2.i = zext i8 %1 to i16
  %2 = bitcast i16 %conv2.i to <16 x i1>
  %3 = tail call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %0, i32 16, <16 x i1> %2, <16 x float> zeroinitializer) #5
  %shuffle.i = shufflevector <16 x float> %3, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %shuffle.i
}

define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
; CHECK64-LABEL: test_mm_maskz_load_sd:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_maskz_load_sd:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1} {z}
; CHECK32-NEXT:    retl
entry:
  %0 = bitcast double* %__W to <8 x double>*
  %1 = and i8 %__U, 1
  %2 = bitcast i8 %1 to <8 x i1>
  %3 = tail call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %0, i32 16, <8 x i1> %2, <8 x double> zeroinitializer) #5
  %shuffle.i = shufflevector <8 x double> %3, <8 x double> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %shuffle.i
}

; The tests below match clang's newer codegen that uses 128-bit masked load/stores;
; see the intrinsic sketch that follows.

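; For reference, a minimal C-intrinsic sketch (an illustration added here, not part
; of the generated checks) of the kind of source clang lowers into this 128-bit
; masked load/store pattern. The wrapper names are hypothetical; the intrinsics are
; the AVX-512F scalar mask load/store intrinsics from <immintrin.h>:
;
;   #include <immintrin.h>
;   void store_ss_2(float *p, __mmask8 m, __m128 v) {
;     _mm_mask_store_ss(p, m, v);          // becomes llvm.masked.store.v4f32 with a <4 x i1> mask
;   }
;   __m128 load_ss_2(__m128 src, __mmask8 m, const float *p) {
;     return _mm_mask_load_ss(src, m, p);  // becomes llvm.masked.load.v4f32 with a <4 x i1> mask
;   }
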
define void @test_mm_mask_store_ss_2(float* %__P, i8 zeroext %__U, <4 x float> %__A) {
; CHECK64-LABEL: test_mm_mask_store_ss_2:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %esi, %k1
; CHECK64-NEXT:    vmovss %xmm0, (%rdi) {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_store_ss_2:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovss %xmm0, (%eax) {%k1}
; CHECK32-NEXT:    retl
entry:
  %0 = bitcast float* %__P to <4 x float>*
  %1 = and i8 %__U, 1
  %2 = bitcast i8 %1 to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %__A, <4 x float>* %0, i32 1, <4 x i1> %extract.i)
  ret void
}

define void @test_mm_mask_store_sd_2(double* %__P, i8 zeroext %__U, <2 x double> %__A) {
; CHECK64-LABEL: test_mm_mask_store_sd_2:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %esi, %k1
; CHECK64-NEXT:    vmovsd %xmm0, (%rdi) {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_store_sd_2:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovsd %xmm0, (%eax) {%k1}
; CHECK32-NEXT:    retl
entry:
  %0 = bitcast double* %__P to <2 x double>*
  %1 = and i8 %__U, 1
  %2 = bitcast i8 %1 to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  tail call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %__A, <2 x double>* %0, i32 1, <2 x i1> %extract.i)
  ret void
}

define <4 x float> @test_mm_mask_load_ss_2(<4 x float> %__A, i8 zeroext %__U, float* readonly %__W) {
; CHECK64-LABEL: test_mm_mask_load_ss_2:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_load_ss_2:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1}
; CHECK32-NEXT:    retl
entry:
  %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
  %0 = bitcast float* %__W to <4 x float>*
  %1 = and i8 %__U, 1
  %2 = bitcast i8 %1 to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 1, <4 x i1> %extract.i, <4 x float> %shuffle.i)
  ret <4 x float> %3
}

define <4 x float> @test_mm_maskz_load_ss_2(i8 zeroext %__U, float* readonly %__W) {
; CHECK64-LABEL: test_mm_maskz_load_ss_2:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_maskz_load_ss_2:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1} {z}
; CHECK32-NEXT:    retl
entry:
  %0 = bitcast float* %__W to <4 x float>*
  %1 = and i8 %__U, 1
  %2 = bitcast i8 %1 to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 1, <4 x i1> %extract.i, <4 x float> zeroinitializer)
  ret <4 x float> %3
}

define <2 x double> @test_mm_mask_load_sd_2(<2 x double> %__A, i8 zeroext %__U, double* readonly %__W) {
; CHECK64-LABEL: test_mm_mask_load_sd_2:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_mask_load_sd_2:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1}
; CHECK32-NEXT:    retl
entry:
  %shuffle3.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
  %0 = bitcast double* %__W to <2 x double>*
  %1 = and i8 %__U, 1
  %2 = bitcast i8 %1 to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %3 = tail call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %0, i32 1, <2 x i1> %extract.i, <2 x double> %shuffle3.i)
  ret <2 x double> %3
}

define <2 x double> @test_mm_maskz_load_sd_2(i8 zeroext %__U, double* readonly %__W) {
; CHECK64-LABEL: test_mm_maskz_load_sd_2:
; CHECK64:       # %bb.0: # %entry
; CHECK64-NEXT:    kmovw %edi, %k1
; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
; CHECK64-NEXT:    retq
;
; CHECK32-LABEL: test_mm_maskz_load_sd_2:
; CHECK32:       # %bb.0: # %entry
; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
; CHECK32-NEXT:    kmovw %ecx, %k1
; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1} {z}
; CHECK32-NEXT:    retl
entry:
  %0 = bitcast double* %__W to <2 x double>*
  %1 = and i8 %__U, 1
  %2 = bitcast i8 %1 to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %3 = tail call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %0, i32 1, <2 x i1> %extract.i, <2 x double> zeroinitializer)
  ret <2 x double> %3
}

declare void @llvm.masked.store.v16f32.p0v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>) #3

declare void @llvm.masked.store.v8f64.p0v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>) #3

declare <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>) #4

declare <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>) #4

declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)

declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)

declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)

declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)