1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck --check-prefixes=CHECK,KNL %s
3; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck --check-prefixes=CHECK,SKX %s
4; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+avx512vbmi | FileCheck --check-prefixes=CHECK,SKX %s
5
6define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
7; CHECK-LABEL: test1:
8; CHECK:       ## %bb.0:
9; CHECK-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
10; CHECK-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
11; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
12; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
13; CHECK-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
14; CHECK-NEXT:    retq
15  %rrr = load float, float* %br
16  %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
17  %rrr3 = insertelement <16 x float> %rrr2, float %y, i32 14
18  ret <16 x float> %rrr3
19}
20
21define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
22; CHECK-LABEL: test2:
23; CHECK:       ## %bb.0:
24; CHECK-NEXT:    vmovhps {{.*#+}} xmm2 = xmm0[0,1],mem[0,1]
25; CHECK-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
26; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
27; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
28; CHECK-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
29; CHECK-NEXT:    retq
30  %rrr = load double, double* %br
31  %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
32  %rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
33  ret <8 x double> %rrr3
34}
35
36define <16 x float> @test3(<16 x float> %x) nounwind {
37; CHECK-LABEL: test3:
38; CHECK:       ## %bb.0:
39; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
40; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
41; CHECK-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
42; CHECK-NEXT:    retq
43  %eee = extractelement <16 x float> %x, i32 4
44  %rrr2 = insertelement <16 x float> %x, float %eee, i32 1
45  ret <16 x float> %rrr2
46}
47
48define <8 x i64> @test4(<8 x i64> %x) nounwind {
49; CHECK-LABEL: test4:
50; CHECK:       ## %bb.0:
51; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
52; CHECK-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
53; CHECK-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
54; CHECK-NEXT:    retq
55  %eee = extractelement <8 x i64> %x, i32 4
56  %rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
57  ret <8 x i64> %rrr2
58}
59
60define i32 @test5(<4 x float> %x) nounwind {
61; CHECK-LABEL: test5:
62; CHECK:       ## %bb.0:
63; CHECK-NEXT:    vextractps $3, %xmm0, %eax
64; CHECK-NEXT:    retq
65  %ef = extractelement <4 x float> %x, i32 3
66  %ei = bitcast float %ef to i32
67  ret i32 %ei
68}
69
70define void @test6(<4 x float> %x, float* %out) nounwind {
71; CHECK-LABEL: test6:
72; CHECK:       ## %bb.0:
73; CHECK-NEXT:    vextractps $3, %xmm0, (%rdi)
74; CHECK-NEXT:    retq
75  %ef = extractelement <4 x float> %x, i32 3
76  store float %ef, float* %out, align 4
77  ret void
78}
79
80define float @test7(<16 x float> %x, i32 %ind) nounwind {
81; CHECK-LABEL: test7:
82; CHECK:       ## %bb.0:
83; CHECK-NEXT:    pushq %rbp
84; CHECK-NEXT:    movq %rsp, %rbp
85; CHECK-NEXT:    andq $-64, %rsp
86; CHECK-NEXT:    subq $128, %rsp
87; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
88; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
89; CHECK-NEXT:    andl $15, %edi
90; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
91; CHECK-NEXT:    movq %rbp, %rsp
92; CHECK-NEXT:    popq %rbp
93; CHECK-NEXT:    vzeroupper
94; CHECK-NEXT:    retq
95  %e = extractelement <16 x float> %x, i32 %ind
96  ret float %e
97}
98
99define double @test8(<8 x double> %x, i32 %ind) nounwind {
100; CHECK-LABEL: test8:
101; CHECK:       ## %bb.0:
102; CHECK-NEXT:    pushq %rbp
103; CHECK-NEXT:    movq %rsp, %rbp
104; CHECK-NEXT:    andq $-64, %rsp
105; CHECK-NEXT:    subq $128, %rsp
106; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
107; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
108; CHECK-NEXT:    andl $7, %edi
109; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
110; CHECK-NEXT:    movq %rbp, %rsp
111; CHECK-NEXT:    popq %rbp
112; CHECK-NEXT:    vzeroupper
113; CHECK-NEXT:    retq
114  %e = extractelement <8 x double> %x, i32 %ind
115  ret double %e
116}
117
118define float @test9(<8 x float> %x, i32 %ind) nounwind {
119; CHECK-LABEL: test9:
120; CHECK:       ## %bb.0:
121; CHECK-NEXT:    pushq %rbp
122; CHECK-NEXT:    movq %rsp, %rbp
123; CHECK-NEXT:    andq $-32, %rsp
124; CHECK-NEXT:    subq $64, %rsp
125; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
126; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
127; CHECK-NEXT:    andl $7, %edi
128; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
129; CHECK-NEXT:    movq %rbp, %rsp
130; CHECK-NEXT:    popq %rbp
131; CHECK-NEXT:    vzeroupper
132; CHECK-NEXT:    retq
133  %e = extractelement <8 x float> %x, i32 %ind
134  ret float %e
135}
136
137define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
138; CHECK-LABEL: test10:
139; CHECK:       ## %bb.0:
140; CHECK-NEXT:    pushq %rbp
141; CHECK-NEXT:    movq %rsp, %rbp
142; CHECK-NEXT:    andq $-64, %rsp
143; CHECK-NEXT:    subq $128, %rsp
144; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
145; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
146; CHECK-NEXT:    andl $15, %edi
147; CHECK-NEXT:    movl (%rsp,%rdi,4), %eax
148; CHECK-NEXT:    movq %rbp, %rsp
149; CHECK-NEXT:    popq %rbp
150; CHECK-NEXT:    vzeroupper
151; CHECK-NEXT:    retq
152  %e = extractelement <16 x i32> %x, i32 %ind
153  ret i32 %e
154}
155
156define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
157; KNL-LABEL: test11:
158; KNL:       ## %bb.0:
159; KNL-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
160; KNL-NEXT:    kshiftrw $4, %k0, %k0
161; KNL-NEXT:    kmovw %k0, %eax
162; KNL-NEXT:    testb $1, %al
163; KNL-NEXT:    je LBB10_2
164; KNL-NEXT:  ## %bb.1: ## %A
165; KNL-NEXT:    vmovdqa64 %zmm1, %zmm0
166; KNL-NEXT:    retq
167; KNL-NEXT:  LBB10_2: ## %B
168; KNL-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
169; KNL-NEXT:    retq
170;
171; SKX-LABEL: test11:
172; SKX:       ## %bb.0:
173; SKX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
174; SKX-NEXT:    kshiftrw $4, %k0, %k0
175; SKX-NEXT:    kmovd %k0, %eax
176; SKX-NEXT:    testb $1, %al
177; SKX-NEXT:    je LBB10_2
178; SKX-NEXT:  ## %bb.1: ## %A
179; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
180; SKX-NEXT:    retq
181; SKX-NEXT:  LBB10_2: ## %B
182; SKX-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
183; SKX-NEXT:    retq
184  %cmp_res = icmp ult <16 x i32> %a, %b
185  %ia = extractelement <16 x i1> %cmp_res, i32 4
186  br i1 %ia, label %A, label %B
187  A:
188    ret <16 x i32>%b
189  B:
190   %c = add <16 x i32>%b, %a
191   ret <16 x i32>%c
192}
193
194define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
195; KNL-LABEL: test12:
196; KNL:       ## %bb.0:
197; KNL-NEXT:    movq %rdi, %rax
198; KNL-NEXT:    vpcmpgtq %zmm0, %zmm2, %k0
199; KNL-NEXT:    kmovw %k0, %ecx
200; KNL-NEXT:    testb $1, %cl
201; KNL-NEXT:    cmoveq %rsi, %rax
202; KNL-NEXT:    vzeroupper
203; KNL-NEXT:    retq
204;
205; SKX-LABEL: test12:
206; SKX:       ## %bb.0:
207; SKX-NEXT:    movq %rdi, %rax
208; SKX-NEXT:    vpcmpgtq %zmm0, %zmm2, %k0
209; SKX-NEXT:    kmovd %k0, %ecx
210; SKX-NEXT:    testb $1, %cl
211; SKX-NEXT:    cmoveq %rsi, %rax
212; SKX-NEXT:    vzeroupper
213; SKX-NEXT:    retq
214  %cmpvector_func.i = icmp slt <16 x i64> %a, %b
215  %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
216  %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
217  ret i64 %res
218}
219
220define i16 @test13(i32 %a, i32 %b) {
221; KNL-LABEL: test13:
222; KNL:       ## %bb.0:
223; KNL-NEXT:    cmpl %esi, %edi
224; KNL-NEXT:    setb %al
225; KNL-NEXT:    movw $-4, %cx
226; KNL-NEXT:    kmovw %ecx, %k0
227; KNL-NEXT:    kshiftrw $1, %k0, %k0
228; KNL-NEXT:    kshiftlw $1, %k0, %k0
229; KNL-NEXT:    andl $1, %eax
230; KNL-NEXT:    kmovw %eax, %k1
231; KNL-NEXT:    korw %k1, %k0, %k0
232; KNL-NEXT:    kmovw %k0, %eax
233; KNL-NEXT:    ## kill: def $ax killed $ax killed $eax
234; KNL-NEXT:    retq
235;
236; SKX-LABEL: test13:
237; SKX:       ## %bb.0:
238; SKX-NEXT:    cmpl %esi, %edi
239; SKX-NEXT:    setb %al
240; SKX-NEXT:    movw $-4, %cx
241; SKX-NEXT:    kmovd %ecx, %k0
242; SKX-NEXT:    kshiftrw $1, %k0, %k0
243; SKX-NEXT:    kshiftlw $1, %k0, %k0
244; SKX-NEXT:    andl $1, %eax
245; SKX-NEXT:    kmovw %eax, %k1
246; SKX-NEXT:    korw %k1, %k0, %k0
247; SKX-NEXT:    kmovd %k0, %eax
248; SKX-NEXT:    ## kill: def $ax killed $ax killed $eax
249; SKX-NEXT:    retq
250  %cmp_res = icmp ult i32 %a, %b
251  %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
252  %res = bitcast <16 x i1> %maskv to i16
253  ret i16 %res
254}
255
256define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
257; KNL-LABEL: test14:
258; KNL:       ## %bb.0:
259; KNL-NEXT:    movq %rdi, %rax
260; KNL-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
261; KNL-NEXT:    kshiftrw $4, %k0, %k0
262; KNL-NEXT:    kmovw %k0, %ecx
263; KNL-NEXT:    testb $1, %cl
264; KNL-NEXT:    cmoveq %rsi, %rax
265; KNL-NEXT:    vzeroupper
266; KNL-NEXT:    retq
267;
268; SKX-LABEL: test14:
269; SKX:       ## %bb.0:
270; SKX-NEXT:    movq %rdi, %rax
271; SKX-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
272; SKX-NEXT:    kshiftrb $4, %k0, %k0
273; SKX-NEXT:    kmovd %k0, %ecx
274; SKX-NEXT:    testb $1, %cl
275; SKX-NEXT:    cmoveq %rsi, %rax
276; SKX-NEXT:    vzeroupper
277; SKX-NEXT:    retq
278  %cmpvector_func.i = icmp slt <8 x i64> %a, %b
279  %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
280  %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
281  ret i64 %res
282}
283
284define i16 @test15(i1 *%addr) {
285; CHECK-LABEL: test15:
286; CHECK:       ## %bb.0:
287; CHECK-NEXT:    xorl %ecx, %ecx
288; CHECK-NEXT:    cmpb $0, (%rdi)
289; CHECK-NEXT:    movl $65535, %eax ## imm = 0xFFFF
290; CHECK-NEXT:    cmovel %ecx, %eax
291; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
292; CHECK-NEXT:    retq
293  %x = load i1 , i1 * %addr, align 1
294  %x1 = insertelement <16 x i1> undef, i1 %x, i32 10
295  %x2 = bitcast <16 x i1>%x1 to i16
296  ret i16 %x2
297}
298
299define i16 @test16(i1 *%addr, i16 %a) {
300; KNL-LABEL: test16:
301; KNL:       ## %bb.0:
302; KNL-NEXT:    movb (%rdi), %al
303; KNL-NEXT:    kmovw %esi, %k0
304; KNL-NEXT:    movw $-1025, %cx ## imm = 0xFBFF
305; KNL-NEXT:    kmovw %ecx, %k1
306; KNL-NEXT:    kandw %k1, %k0, %k0
307; KNL-NEXT:    kmovw %eax, %k1
308; KNL-NEXT:    kshiftlw $15, %k1, %k1
309; KNL-NEXT:    kshiftrw $5, %k1, %k1
310; KNL-NEXT:    korw %k1, %k0, %k0
311; KNL-NEXT:    kmovw %k0, %eax
312; KNL-NEXT:    ## kill: def $ax killed $ax killed $eax
313; KNL-NEXT:    retq
314;
315; SKX-LABEL: test16:
316; SKX:       ## %bb.0:
317; SKX-NEXT:    kmovb (%rdi), %k0
318; SKX-NEXT:    kmovd %esi, %k1
319; SKX-NEXT:    movw $-1025, %ax ## imm = 0xFBFF
320; SKX-NEXT:    kmovd %eax, %k2
321; SKX-NEXT:    kandw %k2, %k1, %k1
322; SKX-NEXT:    kshiftlw $15, %k0, %k0
323; SKX-NEXT:    kshiftrw $5, %k0, %k0
324; SKX-NEXT:    korw %k0, %k1, %k0
325; SKX-NEXT:    kmovd %k0, %eax
326; SKX-NEXT:    ## kill: def $ax killed $ax killed $eax
327; SKX-NEXT:    retq
328  %x = load i1 , i1 * %addr, align 128
329  %a1 = bitcast i16 %a to <16 x i1>
330  %x1 = insertelement <16 x i1> %a1, i1 %x, i32 10
331  %x2 = bitcast <16 x i1>%x1 to i16
332  ret i16 %x2
333}
334
335define i8 @test17(i1 *%addr, i8 %a) {
336; KNL-LABEL: test17:
337; KNL:       ## %bb.0:
338; KNL-NEXT:    movb (%rdi), %al
339; KNL-NEXT:    kmovw %esi, %k0
340; KNL-NEXT:    movw $-17, %cx
341; KNL-NEXT:    kmovw %ecx, %k1
342; KNL-NEXT:    kandw %k1, %k0, %k0
343; KNL-NEXT:    kmovw %eax, %k1
344; KNL-NEXT:    kshiftlw $15, %k1, %k1
345; KNL-NEXT:    kshiftrw $11, %k1, %k1
346; KNL-NEXT:    korw %k1, %k0, %k0
347; KNL-NEXT:    kmovw %k0, %eax
348; KNL-NEXT:    ## kill: def $al killed $al killed $eax
349; KNL-NEXT:    retq
350;
351; SKX-LABEL: test17:
352; SKX:       ## %bb.0:
353; SKX-NEXT:    kmovb (%rdi), %k0
354; SKX-NEXT:    kmovd %esi, %k1
355; SKX-NEXT:    movb $-17, %al
356; SKX-NEXT:    kmovd %eax, %k2
357; SKX-NEXT:    kandb %k2, %k1, %k1
358; SKX-NEXT:    kshiftlb $7, %k0, %k0
359; SKX-NEXT:    kshiftrb $3, %k0, %k0
360; SKX-NEXT:    korb %k0, %k1, %k0
361; SKX-NEXT:    kmovd %k0, %eax
362; SKX-NEXT:    ## kill: def $al killed $al killed $eax
363; SKX-NEXT:    retq
364  %x = load i1 , i1 * %addr, align 128
365  %a1 = bitcast i8 %a to <8 x i1>
366  %x1 = insertelement <8 x i1> %a1, i1 %x, i32 4
367  %x2 = bitcast <8 x i1>%x1 to i8
368  ret i8 %x2
369}
370
371define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
372; CHECK-LABEL: extract_v8i64:
373; CHECK:       ## %bb.0:
374; CHECK-NEXT:    vpextrq $1, %xmm0, %rax
375; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
376; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
377; CHECK-NEXT:    vzeroupper
378; CHECK-NEXT:    retq
379  %r1 = extractelement <8 x i64> %x, i32 1
380  %r2 = extractelement <8 x i64> %x, i32 3
381  store i64 %r2, i64* %dst, align 1
382  ret i64 %r1
383}
384
385define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
386; CHECK-LABEL: extract_v4i64:
387; CHECK:       ## %bb.0:
388; CHECK-NEXT:    vpextrq $1, %xmm0, %rax
389; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
390; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
391; CHECK-NEXT:    vzeroupper
392; CHECK-NEXT:    retq
393  %r1 = extractelement <4 x i64> %x, i32 1
394  %r2 = extractelement <4 x i64> %x, i32 3
395  store i64 %r2, i64* %dst, align 1
396  ret i64 %r1
397}
398
399define i64 @extract_v2i64(<2 x i64> %x, i64* %dst) {
400; CHECK-LABEL: extract_v2i64:
401; CHECK:       ## %bb.0:
402; CHECK-NEXT:    vmovq %xmm0, %rax
403; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
404; CHECK-NEXT:    retq
405  %r1 = extractelement <2 x i64> %x, i32 0
406  %r2 = extractelement <2 x i64> %x, i32 1
407  store i64 %r2, i64* %dst, align 1
408  ret i64 %r1
409}
410
411define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
412; CHECK-LABEL: extract_v16i32:
413; CHECK:       ## %bb.0:
414; CHECK-NEXT:    vextractps $1, %xmm0, %eax
415; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
416; CHECK-NEXT:    vextractps $1, %xmm0, (%rdi)
417; CHECK-NEXT:    vzeroupper
418; CHECK-NEXT:    retq
419  %r1 = extractelement <16 x i32> %x, i32 1
420  %r2 = extractelement <16 x i32> %x, i32 5
421  store i32 %r2, i32* %dst, align 1
422  ret i32 %r1
423}
424
425define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
426; CHECK-LABEL: extract_v8i32:
427; CHECK:       ## %bb.0:
428; CHECK-NEXT:    vextractps $1, %xmm0, %eax
429; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
430; CHECK-NEXT:    vextractps $1, %xmm0, (%rdi)
431; CHECK-NEXT:    vzeroupper
432; CHECK-NEXT:    retq
433  %r1 = extractelement <8 x i32> %x, i32 1
434  %r2 = extractelement <8 x i32> %x, i32 5
435  store i32 %r2, i32* %dst, align 1
436  ret i32 %r1
437}
438
439define i32 @extract_v4i32(<4 x i32> %x, i32* %dst) {
440; CHECK-LABEL: extract_v4i32:
441; CHECK:       ## %bb.0:
442; CHECK-NEXT:    vextractps $1, %xmm0, %eax
443; CHECK-NEXT:    vextractps $3, %xmm0, (%rdi)
444; CHECK-NEXT:    retq
445  %r1 = extractelement <4 x i32> %x, i32 1
446  %r2 = extractelement <4 x i32> %x, i32 3
447  store i32 %r2, i32* %dst, align 1
448  ret i32 %r1
449}
450
451define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
452; CHECK-LABEL: extract_v32i16:
453; CHECK:       ## %bb.0:
454; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
455; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
456; CHECK-NEXT:    vpextrw $1, %xmm0, (%rdi)
457; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
458; CHECK-NEXT:    vzeroupper
459; CHECK-NEXT:    retq
460  %r1 = extractelement <32 x i16> %x, i32 1
461  %r2 = extractelement <32 x i16> %x, i32 9
462  store i16 %r2, i16* %dst, align 1
463  ret i16 %r1
464}
465
466define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
467; CHECK-LABEL: extract_v16i16:
468; CHECK:       ## %bb.0:
469; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
470; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
471; CHECK-NEXT:    vpextrw $1, %xmm0, (%rdi)
472; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
473; CHECK-NEXT:    vzeroupper
474; CHECK-NEXT:    retq
475  %r1 = extractelement <16 x i16> %x, i32 1
476  %r2 = extractelement <16 x i16> %x, i32 9
477  store i16 %r2, i16* %dst, align 1
478  ret i16 %r1
479}
480
481define i16 @extract_v8i16(<8 x i16> %x, i16* %dst) {
482; CHECK-LABEL: extract_v8i16:
483; CHECK:       ## %bb.0:
484; CHECK-NEXT:    vpextrw $1, %xmm0, %eax
485; CHECK-NEXT:    vpextrw $3, %xmm0, (%rdi)
486; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
487; CHECK-NEXT:    retq
488  %r1 = extractelement <8 x i16> %x, i32 1
489  %r2 = extractelement <8 x i16> %x, i32 3
490  store i16 %r2, i16* %dst, align 1
491  ret i16 %r1
492}
493
494define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
495; CHECK-LABEL: extract_v64i8:
496; CHECK:       ## %bb.0:
497; CHECK-NEXT:    vpextrb $1, %xmm0, %eax
498; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
499; CHECK-NEXT:    vpextrb $1, %xmm0, (%rdi)
500; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
501; CHECK-NEXT:    vzeroupper
502; CHECK-NEXT:    retq
503  %r1 = extractelement <64 x i8> %x, i32 1
504  %r2 = extractelement <64 x i8> %x, i32 17
505  store i8 %r2, i8* %dst, align 1
506  ret i8 %r1
507}
508
509define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
510; CHECK-LABEL: extract_v32i8:
511; CHECK:       ## %bb.0:
512; CHECK-NEXT:    vpextrb $1, %xmm0, %eax
513; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
514; CHECK-NEXT:    vpextrb $1, %xmm0, (%rdi)
515; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
516; CHECK-NEXT:    vzeroupper
517; CHECK-NEXT:    retq
518  %r1 = extractelement <32 x i8> %x, i32 1
519  %r2 = extractelement <32 x i8> %x, i32 17
520  store i8 %r2, i8* %dst, align 1
521  ret i8 %r1
522}
523
524define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
525; CHECK-LABEL: extract_v16i8:
526; CHECK:       ## %bb.0:
527; CHECK-NEXT:    vpextrb $1, %xmm0, %eax
528; CHECK-NEXT:    vpextrb $3, %xmm0, (%rdi)
529; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
530; CHECK-NEXT:    retq
531  %r1 = extractelement <16 x i8> %x, i32 1
532  %r2 = extractelement <16 x i8> %x, i32 3
533  store i8 %r2, i8* %dst, align 1
534  ret i8 %r1
535}
536
537define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
538; CHECK-LABEL: insert_v8i64:
539; CHECK:       ## %bb.0:
540; CHECK-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
541; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
542; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
543; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
544; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
545; CHECK-NEXT:    retq
546  %val = load i64, i64* %ptr
547  %r1 = insertelement <8 x i64> %x, i64 %val, i32 1
548  %r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
549  ret <8 x i64> %r2
550}
551
552define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
553; CHECK-LABEL: insert_v4i64:
554; CHECK:       ## %bb.0:
555; CHECK-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
556; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
557; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
558; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
559; CHECK-NEXT:    retq
560  %val = load i64, i64* %ptr
561  %r1 = insertelement <4 x i64> %x, i64 %val, i32 1
562  %r2 = insertelement <4 x i64> %r1, i64 %y, i32 3
563  ret <4 x i64> %r2
564}
565
566define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
567; CHECK-LABEL: insert_v2i64:
568; CHECK:       ## %bb.0:
569; CHECK-NEXT:    vpinsrq $0, %rdi, %xmm0, %xmm0
570; CHECK-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm0
571; CHECK-NEXT:    retq
572  %val = load i64, i64* %ptr
573  %r1 = insertelement <2 x i64> %x, i64 %val, i32 1
574  %r2 = insertelement <2 x i64> %r1, i64 %y, i32 0
575  ret <2 x i64> %r2
576}
577
578define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
579; CHECK-LABEL: insert_v16i32:
580; CHECK:       ## %bb.0:
581; CHECK-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
582; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
583; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
584; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
585; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
586; CHECK-NEXT:    retq
587  %val = load i32, i32* %ptr
588  %r1 = insertelement <16 x i32> %x, i32 %val, i32 1
589  %r2 = insertelement <16 x i32> %r1, i32 %y, i32 5
590  ret <16 x i32> %r2
591}
592
593define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
594; CHECK-LABEL: insert_v8i32:
595; CHECK:       ## %bb.0:
596; CHECK-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
597; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
598; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
599; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
600; CHECK-NEXT:    retq
601  %val = load i32, i32* %ptr
602  %r1 = insertelement <8 x i32> %x, i32 %val, i32 1
603  %r2 = insertelement <8 x i32> %r1, i32 %y, i32 5
604  ret <8 x i32> %r2
605}
606
607define <4 x i32> @insert_v4i32(<4 x i32> %x, i32 %y, i32* %ptr) {
608; CHECK-LABEL: insert_v4i32:
609; CHECK:       ## %bb.0:
610; CHECK-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm0
611; CHECK-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
612; CHECK-NEXT:    retq
613  %val = load i32, i32* %ptr
614  %r1 = insertelement <4 x i32> %x, i32 %val, i32 1
615  %r2 = insertelement <4 x i32> %r1, i32 %y, i32 3
616  ret <4 x i32> %r2
617}
618
619define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
620; CHECK-LABEL: insert_v32i16:
621; CHECK:       ## %bb.0:
622; CHECK-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
623; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
624; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
625; CHECK-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
626; CHECK-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
627; CHECK-NEXT:    retq
628  %val = load i16, i16* %ptr
629  %r1 = insertelement <32 x i16> %x, i16 %val, i32 1
630  %r2 = insertelement <32 x i16> %r1, i16 %y, i32 9
631  ret <32 x i16> %r2
632}
633
634define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
635; CHECK-LABEL: insert_v16i16:
636; CHECK:       ## %bb.0:
637; CHECK-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
638; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
639; CHECK-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
640; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
641; CHECK-NEXT:    retq
642  %val = load i16, i16* %ptr
643  %r1 = insertelement <16 x i16> %x, i16 %val, i32 1
644  %r2 = insertelement <16 x i16> %r1, i16 %y, i32 9
645  ret <16 x i16> %r2
646}
647
648define <8 x i16> @insert_v8i16(<8 x i16> %x, i16 %y, i16* %ptr) {
649; CHECK-LABEL: insert_v8i16:
650; CHECK:       ## %bb.0:
651; CHECK-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm0
652; CHECK-NEXT:    vpinsrw $5, %edi, %xmm0, %xmm0
653; CHECK-NEXT:    retq
654  %val = load i16, i16* %ptr
655  %r1 = insertelement <8 x i16> %x, i16 %val, i32 1
656  %r2 = insertelement <8 x i16> %r1, i16 %y, i32 5
657  ret <8 x i16> %r2
658}
659
660define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
661; CHECK-LABEL: insert_v64i8:
662; CHECK:       ## %bb.0:
663; CHECK-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
664; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
665; CHECK-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
666; CHECK-NEXT:    vpinsrb $2, %edi, %xmm0, %xmm0
667; CHECK-NEXT:    vinserti32x4 $3, %xmm0, %zmm1, %zmm0
668; CHECK-NEXT:    retq
669  %val = load i8, i8* %ptr
670  %r1 = insertelement <64 x i8> %x, i8 %val, i32 1
671  %r2 = insertelement <64 x i8> %r1, i8 %y, i32 50
672  ret <64 x i8> %r2
673}
674
675define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
676; CHECK-LABEL: insert_v32i8:
677; CHECK:       ## %bb.0:
678; CHECK-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
679; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm0
680; CHECK-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0
681; CHECK-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
682; CHECK-NEXT:    retq
683  %val = load i8, i8* %ptr
684  %r1 = insertelement <32 x i8> %x, i8 %val, i32 1
685  %r2 = insertelement <32 x i8> %r1, i8 %y, i32 17
686  ret <32 x i8> %r2
687}
688
689define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) {
690; CHECK-LABEL: insert_v16i8:
691; CHECK:       ## %bb.0:
692; CHECK-NEXT:    vpinsrb $3, (%rsi), %xmm0, %xmm0
693; CHECK-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
694; CHECK-NEXT:    retq
695  %val = load i8, i8* %ptr
696  %r1 = insertelement <16 x i8> %x, i8 %val, i32 3
697  %r2 = insertelement <16 x i8> %r1, i8 %y, i32 10
698  ret <16 x i8> %r2
699}
700
701define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) {
702; CHECK-LABEL: test_insert_128_v8i64:
703; CHECK:       ## %bb.0:
704; CHECK-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
705; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
706; CHECK-NEXT:    retq
707  %r = insertelement <8 x i64> %x, i64 %y, i32 1
708  ret <8 x i64> %r
709}
710
711define <16 x i32> @test_insert_128_v16i32(<16 x i32> %x, i32 %y) {
712; CHECK-LABEL: test_insert_128_v16i32:
713; CHECK:       ## %bb.0:
714; CHECK-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm1
715; CHECK-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
716; CHECK-NEXT:    retq
717  %r = insertelement <16 x i32> %x, i32 %y, i32 1
718  ret <16 x i32> %r
719}
720
721define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
722; CHECK-LABEL: test_insert_128_v8f64:
723; CHECK:       ## %bb.0:
724; CHECK-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm0[0],xmm1[0]
725; CHECK-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
726; CHECK-NEXT:    retq
727  %r = insertelement <8 x double> %x, double %y, i32 1
728  ret <8 x double> %r
729}
730
731define <16 x float> @test_insert_128_v16f32(<16 x float> %x, float %y) {
732; CHECK-LABEL: test_insert_128_v16f32:
733; CHECK:       ## %bb.0:
734; CHECK-NEXT:    vinsertps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[2,3]
735; CHECK-NEXT:    vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
736; CHECK-NEXT:    retq
737  %r = insertelement <16 x float> %x, float %y, i32 1
738  ret <16 x float> %r
739}
740
741define <16 x i16> @test_insert_128_v16i16(<16 x i16> %x, i16 %y) {
742; CHECK-LABEL: test_insert_128_v16i16:
743; CHECK:       ## %bb.0:
744; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
745; CHECK-NEXT:    vpinsrw $2, %edi, %xmm1, %xmm1
746; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
747; CHECK-NEXT:    retq
748  %r = insertelement <16 x i16> %x, i16 %y, i32 10
749  ret <16 x i16> %r
750}
751
752define <32 x i8> @test_insert_128_v32i8(<32 x i8> %x, i8 %y) {
753; CHECK-LABEL: test_insert_128_v32i8:
754; CHECK:       ## %bb.0:
755; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
756; CHECK-NEXT:    vpinsrb $4, %edi, %xmm1, %xmm1
757; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
758; CHECK-NEXT:    retq
759  %r = insertelement <32 x i8> %x, i8 %y, i32 20
760  ret <32 x i8> %r
761}
762
763define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> %y) {
764; KNL-LABEL: test_insertelement_v32i1:
765; KNL:       ## %bb.0:
766; KNL-NEXT:    cmpl %esi, %edi
767; KNL-NEXT:    setb %al
768; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
769; KNL-NEXT:    kmovw %k0, %ecx
770; KNL-NEXT:    shll $16, %ecx
771; KNL-NEXT:    movw $-17, %dx
772; KNL-NEXT:    kmovw %edx, %k1
773; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0 {%k1}
774; KNL-NEXT:    kmovw %eax, %k1
775; KNL-NEXT:    kshiftlw $15, %k1, %k1
776; KNL-NEXT:    kshiftrw $11, %k1, %k1
777; KNL-NEXT:    korw %k1, %k0, %k0
778; KNL-NEXT:    kmovw %k0, %eax
779; KNL-NEXT:    orl %ecx, %eax
780; KNL-NEXT:    vzeroupper
781; KNL-NEXT:    retq
782;
783; SKX-LABEL: test_insertelement_v32i1:
784; SKX:       ## %bb.0:
785; SKX-NEXT:    cmpl %esi, %edi
786; SKX-NEXT:    setb %al
787; SKX-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
788; SKX-NEXT:    vpcmpltud %zmm3, %zmm1, %k1
789; SKX-NEXT:    kunpckwd %k0, %k1, %k0
790; SKX-NEXT:    movl $-17, %ecx
791; SKX-NEXT:    kmovd %ecx, %k1
792; SKX-NEXT:    kandd %k1, %k0, %k0
793; SKX-NEXT:    kmovd %eax, %k1
794; SKX-NEXT:    kshiftld $31, %k1, %k1
795; SKX-NEXT:    kshiftrd $27, %k1, %k1
796; SKX-NEXT:    kord %k1, %k0, %k0
797; SKX-NEXT:    kmovd %k0, %eax
798; SKX-NEXT:    vzeroupper
799; SKX-NEXT:    retq
800  %cmp_res_i1 = icmp ult i32 %a, %b
801  %cmp_cmp_vec = icmp ult <32 x i32> %x, %y
802  %maskv = insertelement <32 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 4
803  %res = bitcast <32 x i1> %maskv to i32
804  ret i32 %res
805}
806
807define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) {
808; KNL-LABEL: test_iinsertelement_v4i1:
809; KNL:       ## %bb.0:
810; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
811; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
812; KNL-NEXT:    cmpl %esi, %edi
813; KNL-NEXT:    setb %al
814; KNL-NEXT:    movw $-5, %cx
815; KNL-NEXT:    kmovw %ecx, %k1
816; KNL-NEXT:    vpcmpltud %zmm1, %zmm0, %k0 {%k1}
817; KNL-NEXT:    kmovw %eax, %k1
818; KNL-NEXT:    kshiftlw $15, %k1, %k1
819; KNL-NEXT:    kshiftrw $13, %k1, %k1
820; KNL-NEXT:    korw %k1, %k0, %k0
821; KNL-NEXT:    kmovw %k0, %eax
822; KNL-NEXT:    ## kill: def $al killed $al killed $eax
823; KNL-NEXT:    vzeroupper
824; KNL-NEXT:    retq
825;
826; SKX-LABEL: test_iinsertelement_v4i1:
827; SKX:       ## %bb.0:
828; SKX-NEXT:    cmpl %esi, %edi
829; SKX-NEXT:    setb %al
830; SKX-NEXT:    movb $-5, %cl
831; SKX-NEXT:    kmovd %ecx, %k1
832; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k0 {%k1}
833; SKX-NEXT:    kmovd %eax, %k1
834; SKX-NEXT:    kshiftlb $7, %k1, %k1
835; SKX-NEXT:    kshiftrb $5, %k1, %k1
836; SKX-NEXT:    korw %k1, %k0, %k0
837; SKX-NEXT:    kmovd %k0, %eax
838; SKX-NEXT:    ## kill: def $al killed $al killed $eax
839; SKX-NEXT:    retq
840  %cmp_res_i1 = icmp ult i32 %a, %b
841  %cmp_cmp_vec = icmp ult <4 x i32> %x, %y
842  %maskv = insertelement <4 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 2
843  %res0 = shufflevector <4 x i1> %maskv, <4 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
844  %res = bitcast <8 x i1> %res0 to i8
845  ret i8 %res
846}
847
848define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) {
849; KNL-LABEL: test_iinsertelement_v2i1:
850; KNL:       ## %bb.0:
851; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
852; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
853; KNL-NEXT:    cmpl %esi, %edi
854; KNL-NEXT:    setb %al
855; KNL-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
856; KNL-NEXT:    kshiftlw $15, %k0, %k0
857; KNL-NEXT:    kshiftrw $15, %k0, %k0
858; KNL-NEXT:    kmovw %eax, %k1
859; KNL-NEXT:    kshiftlw $1, %k1, %k1
860; KNL-NEXT:    korw %k1, %k0, %k0
861; KNL-NEXT:    kmovw %k0, %eax
862; KNL-NEXT:    ## kill: def $al killed $al killed $eax
863; KNL-NEXT:    vzeroupper
864; KNL-NEXT:    retq
865;
866; SKX-LABEL: test_iinsertelement_v2i1:
867; SKX:       ## %bb.0:
868; SKX-NEXT:    cmpl %esi, %edi
869; SKX-NEXT:    setb %al
870; SKX-NEXT:    vpcmpltuq %xmm1, %xmm0, %k0
871; SKX-NEXT:    kshiftlb $7, %k0, %k0
872; SKX-NEXT:    kshiftrb $7, %k0, %k0
873; SKX-NEXT:    kmovd %eax, %k1
874; SKX-NEXT:    kshiftlb $1, %k1, %k1
875; SKX-NEXT:    korw %k1, %k0, %k0
876; SKX-NEXT:    kmovd %k0, %eax
877; SKX-NEXT:    ## kill: def $al killed $al killed $eax
878; SKX-NEXT:    retq
879  %cmp_res_i1 = icmp ult i32 %a, %b
880  %cmp_cmp_vec = icmp ult <2 x i64> %x, %y
881  %maskv = insertelement <2 x i1> %cmp_cmp_vec, i1 %cmp_res_i1, i32 1
882  %res0 = shufflevector <2 x i1> %maskv, <2 x i1> undef , <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
883  %res = bitcast <8 x i1> %res0 to i8
884  ret i8 %res
885}
886
887define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
888; KNL-LABEL: test_extractelement_v2i1:
889; KNL:       ## %bb.0:
890; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
891; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
892; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0
893; KNL-NEXT:    kmovw %k0, %ecx
894; KNL-NEXT:    andl $1, %ecx
895; KNL-NEXT:    movl $4, %eax
896; KNL-NEXT:    subl %ecx, %eax
897; KNL-NEXT:    vzeroupper
898; KNL-NEXT:    retq
899;
900; SKX-LABEL: test_extractelement_v2i1:
901; SKX:       ## %bb.0:
902; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
903; SKX-NEXT:    kmovd %k0, %ecx
904; SKX-NEXT:    andl $1, %ecx
905; SKX-NEXT:    movl $4, %eax
906; SKX-NEXT:    subl %ecx, %eax
907; SKX-NEXT:    retq
908  %t1 = icmp ugt <2 x i64> %a, %b
909  %t2 = extractelement <2 x i1> %t1, i32 0
910  %res = select i1 %t2, i8 3, i8 4
911  ret i8 %res
912}
913
914define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
915; KNL-LABEL: extractelement_v2i1_alt:
916; KNL:       ## %bb.0:
917; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
918; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
919; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k0
920; KNL-NEXT:    kmovw %k0, %eax
921; KNL-NEXT:    andb $1, %al
922; KNL-NEXT:    movb $4, %cl
923; KNL-NEXT:    subb %al, %cl
924; KNL-NEXT:    movzbl %cl, %eax
925; KNL-NEXT:    vzeroupper
926; KNL-NEXT:    retq
927;
928; SKX-LABEL: extractelement_v2i1_alt:
929; SKX:       ## %bb.0:
930; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
931; SKX-NEXT:    kmovd %k0, %eax
932; SKX-NEXT:    andb $1, %al
933; SKX-NEXT:    movb $4, %cl
934; SKX-NEXT:    subb %al, %cl
935; SKX-NEXT:    movzbl %cl, %eax
936; SKX-NEXT:    retq
937  %t1 = icmp ugt <2 x i64> %a, %b
938  %t2 = extractelement <2 x i1> %t1, i32 0
939  %sext = sext i1 %t2 to i8
940  %res = add i8 %sext, 4
941  ret i8 %res
942}
943
944define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
945; KNL-LABEL: test_extractelement_v4i1:
946; KNL:       ## %bb.0:
947; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
948; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
949; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
950; KNL-NEXT:    kshiftrw $3, %k0, %k0
951; KNL-NEXT:    kmovw %k0, %eax
952; KNL-NEXT:    andl $1, %eax
953; KNL-NEXT:    vzeroupper
954; KNL-NEXT:    retq
955;
956; SKX-LABEL: test_extractelement_v4i1:
957; SKX:       ## %bb.0:
958; SKX-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0
959; SKX-NEXT:    kshiftrb $3, %k0, %k0
960; SKX-NEXT:    kmovd %k0, %eax
961; SKX-NEXT:    andl $1, %eax
962; SKX-NEXT:    retq
963  %t1 = icmp ugt <4 x i32> %a, %b
964  %t2 = extractelement <4 x i1> %t1, i32 3
965  %res = zext i1 %t2 to i8
966  ret i8 %res
967}
968
969define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
970; KNL-LABEL: test_extractelement_v32i1:
971; KNL:       ## %bb.0:
972; KNL-NEXT:    vpminub %xmm1, %xmm0, %xmm1
973; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
974; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
975; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
976; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
977; KNL-NEXT:    kshiftrw $2, %k0, %k0
978; KNL-NEXT:    kmovw %k0, %eax
979; KNL-NEXT:    andl $1, %eax
980; KNL-NEXT:    vzeroupper
981; KNL-NEXT:    retq
982;
983; SKX-LABEL: test_extractelement_v32i1:
984; SKX:       ## %bb.0:
985; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0
986; SKX-NEXT:    kshiftrd $2, %k0, %k0
987; SKX-NEXT:    kmovd %k0, %eax
988; SKX-NEXT:    andl $1, %eax
989; SKX-NEXT:    vzeroupper
990; SKX-NEXT:    retq
991  %t1 = icmp ugt <32 x i8> %a, %b
992  %t2 = extractelement <32 x i1> %t1, i32 2
993  %res = zext i1 %t2 to i8
994  ret i8 %res
995}
996
997define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
998; KNL-LABEL: test_extractelement_v64i1:
999; KNL:       ## %bb.0:
1000; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1001; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm1
1002; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1003; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
1004; KNL-NEXT:    vpminub %xmm1, %xmm0, %xmm1
1005; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
1006; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1007; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
1008; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1009; KNL-NEXT:    kshiftrw $15, %k0, %k0
1010; KNL-NEXT:    kmovw %k0, %ecx
1011; KNL-NEXT:    andl $1, %ecx
1012; KNL-NEXT:    movl $4, %eax
1013; KNL-NEXT:    subl %ecx, %eax
1014; KNL-NEXT:    vzeroupper
1015; KNL-NEXT:    retq
1016;
1017; SKX-LABEL: test_extractelement_v64i1:
1018; SKX:       ## %bb.0:
1019; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
1020; SKX-NEXT:    kshiftrq $63, %k0, %k0
1021; SKX-NEXT:    kmovd %k0, %ecx
1022; SKX-NEXT:    andl $1, %ecx
1023; SKX-NEXT:    movl $4, %eax
1024; SKX-NEXT:    subl %ecx, %eax
1025; SKX-NEXT:    vzeroupper
1026; SKX-NEXT:    retq
1027  %t1 = icmp ugt <64 x i8> %a, %b
1028  %t2 = extractelement <64 x i1> %t1, i32 63
1029  %res = select i1 %t2, i8 3, i8 4
1030  ret i8 %res
1031}
1032
1033define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
1034; KNL-LABEL: extractelement_v64i1_alt:
1035; KNL:       ## %bb.0:
1036; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
1037; KNL-NEXT:    vextracti128 $1, %ymm1, %xmm1
1038; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
1039; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
1040; KNL-NEXT:    vpminub %xmm1, %xmm0, %xmm1
1041; KNL-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
1042; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1043; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
1044; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1045; KNL-NEXT:    kshiftrw $15, %k0, %k0
1046; KNL-NEXT:    kmovw %k0, %eax
1047; KNL-NEXT:    andb $1, %al
1048; KNL-NEXT:    movb $4, %cl
1049; KNL-NEXT:    subb %al, %cl
1050; KNL-NEXT:    movzbl %cl, %eax
1051; KNL-NEXT:    vzeroupper
1052; KNL-NEXT:    retq
1053;
1054; SKX-LABEL: extractelement_v64i1_alt:
1055; SKX:       ## %bb.0:
1056; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
1057; SKX-NEXT:    kshiftrq $63, %k0, %k0
1058; SKX-NEXT:    kmovd %k0, %eax
1059; SKX-NEXT:    andb $1, %al
1060; SKX-NEXT:    movb $4, %cl
1061; SKX-NEXT:    subb %al, %cl
1062; SKX-NEXT:    movzbl %cl, %eax
1063; SKX-NEXT:    vzeroupper
1064; SKX-NEXT:    retq
1065  %t1 = icmp ugt <64 x i8> %a, %b
1066  %t2 = extractelement <64 x i1> %t1, i32 63
1067  %sext = sext i1 %t2 to i8
1068  %res = add i8 %sext, 4
1069  ret i8 %res
1070}
1071
1072define i64 @test_extractelement_variable_v2i64(<2 x i64> %t1, i32 %index) {
1073; CHECK-LABEL: test_extractelement_variable_v2i64:
1074; CHECK:       ## %bb.0:
1075; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1076; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1077; CHECK-NEXT:    andl $1, %edi
1078; CHECK-NEXT:    movq -24(%rsp,%rdi,8), %rax
1079; CHECK-NEXT:    retq
1080  %t2 = extractelement <2 x i64> %t1, i32 %index
1081  ret i64 %t2
1082}
1083
1084define i64 @test_extractelement_variable_v4i64(<4 x i64> %t1, i32 %index) {
1085; CHECK-LABEL: test_extractelement_variable_v4i64:
1086; CHECK:       ## %bb.0:
1087; CHECK-NEXT:    pushq %rbp
1088; CHECK-NEXT:    .cfi_def_cfa_offset 16
1089; CHECK-NEXT:    .cfi_offset %rbp, -16
1090; CHECK-NEXT:    movq %rsp, %rbp
1091; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1092; CHECK-NEXT:    andq $-32, %rsp
1093; CHECK-NEXT:    subq $64, %rsp
1094; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1095; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1096; CHECK-NEXT:    andl $3, %edi
1097; CHECK-NEXT:    movq (%rsp,%rdi,8), %rax
1098; CHECK-NEXT:    movq %rbp, %rsp
1099; CHECK-NEXT:    popq %rbp
1100; CHECK-NEXT:    vzeroupper
1101; CHECK-NEXT:    retq
1102  %t2 = extractelement <4 x i64> %t1, i32 %index
1103  ret i64 %t2
1104}
1105
1106define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
1107; CHECK-LABEL: test_extractelement_variable_v8i64:
1108; CHECK:       ## %bb.0:
1109; CHECK-NEXT:    pushq %rbp
1110; CHECK-NEXT:    .cfi_def_cfa_offset 16
1111; CHECK-NEXT:    .cfi_offset %rbp, -16
1112; CHECK-NEXT:    movq %rsp, %rbp
1113; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1114; CHECK-NEXT:    andq $-64, %rsp
1115; CHECK-NEXT:    subq $128, %rsp
1116; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1117; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1118; CHECK-NEXT:    andl $7, %edi
1119; CHECK-NEXT:    movq (%rsp,%rdi,8), %rax
1120; CHECK-NEXT:    movq %rbp, %rsp
1121; CHECK-NEXT:    popq %rbp
1122; CHECK-NEXT:    vzeroupper
1123; CHECK-NEXT:    retq
1124  %t2 = extractelement <8 x i64> %t1, i32 %index
1125  ret i64 %t2
1126}
1127
1128define double @test_extractelement_variable_v2f64(<2 x double> %t1, i32 %index) {
1129; CHECK-LABEL: test_extractelement_variable_v2f64:
1130; CHECK:       ## %bb.0:
1131; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1132; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1133; CHECK-NEXT:    andl $1, %edi
1134; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1135; CHECK-NEXT:    retq
1136  %t2 = extractelement <2 x double> %t1, i32 %index
1137  ret double %t2
1138}
1139
1140define double @test_extractelement_variable_v4f64(<4 x double> %t1, i32 %index) {
1141; CHECK-LABEL: test_extractelement_variable_v4f64:
1142; CHECK:       ## %bb.0:
1143; CHECK-NEXT:    pushq %rbp
1144; CHECK-NEXT:    .cfi_def_cfa_offset 16
1145; CHECK-NEXT:    .cfi_offset %rbp, -16
1146; CHECK-NEXT:    movq %rsp, %rbp
1147; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1148; CHECK-NEXT:    andq $-32, %rsp
1149; CHECK-NEXT:    subq $64, %rsp
1150; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1151; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1152; CHECK-NEXT:    andl $3, %edi
1153; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1154; CHECK-NEXT:    movq %rbp, %rsp
1155; CHECK-NEXT:    popq %rbp
1156; CHECK-NEXT:    vzeroupper
1157; CHECK-NEXT:    retq
1158  %t2 = extractelement <4 x double> %t1, i32 %index
1159  ret double %t2
1160}
1161
1162define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
1163; CHECK-LABEL: test_extractelement_variable_v8f64:
1164; CHECK:       ## %bb.0:
1165; CHECK-NEXT:    pushq %rbp
1166; CHECK-NEXT:    .cfi_def_cfa_offset 16
1167; CHECK-NEXT:    .cfi_offset %rbp, -16
1168; CHECK-NEXT:    movq %rsp, %rbp
1169; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1170; CHECK-NEXT:    andq $-64, %rsp
1171; CHECK-NEXT:    subq $128, %rsp
1172; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1173; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1174; CHECK-NEXT:    andl $7, %edi
1175; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1176; CHECK-NEXT:    movq %rbp, %rsp
1177; CHECK-NEXT:    popq %rbp
1178; CHECK-NEXT:    vzeroupper
1179; CHECK-NEXT:    retq
1180  %t2 = extractelement <8 x double> %t1, i32 %index
1181  ret double %t2
1182}
1183
1184define i32 @test_extractelement_variable_v4i32(<4 x i32> %t1, i32 %index) {
1185; CHECK-LABEL: test_extractelement_variable_v4i32:
1186; CHECK:       ## %bb.0:
1187; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1188; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1189; CHECK-NEXT:    andl $3, %edi
1190; CHECK-NEXT:    movl -24(%rsp,%rdi,4), %eax
1191; CHECK-NEXT:    retq
1192  %t2 = extractelement <4 x i32> %t1, i32 %index
1193  ret i32 %t2
1194}
1195
1196define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
1197; CHECK-LABEL: test_extractelement_variable_v8i32:
1198; CHECK:       ## %bb.0:
1199; CHECK-NEXT:    pushq %rbp
1200; CHECK-NEXT:    .cfi_def_cfa_offset 16
1201; CHECK-NEXT:    .cfi_offset %rbp, -16
1202; CHECK-NEXT:    movq %rsp, %rbp
1203; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1204; CHECK-NEXT:    andq $-32, %rsp
1205; CHECK-NEXT:    subq $64, %rsp
1206; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1207; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1208; CHECK-NEXT:    andl $7, %edi
1209; CHECK-NEXT:    movl (%rsp,%rdi,4), %eax
1210; CHECK-NEXT:    movq %rbp, %rsp
1211; CHECK-NEXT:    popq %rbp
1212; CHECK-NEXT:    vzeroupper
1213; CHECK-NEXT:    retq
1214  %t2 = extractelement <8 x i32> %t1, i32 %index
1215  ret i32 %t2
1216}
1217
1218define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
1219; CHECK-LABEL: test_extractelement_variable_v16i32:
1220; CHECK:       ## %bb.0:
1221; CHECK-NEXT:    pushq %rbp
1222; CHECK-NEXT:    .cfi_def_cfa_offset 16
1223; CHECK-NEXT:    .cfi_offset %rbp, -16
1224; CHECK-NEXT:    movq %rsp, %rbp
1225; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1226; CHECK-NEXT:    andq $-64, %rsp
1227; CHECK-NEXT:    subq $128, %rsp
1228; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1229; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1230; CHECK-NEXT:    andl $15, %edi
1231; CHECK-NEXT:    movl (%rsp,%rdi,4), %eax
1232; CHECK-NEXT:    movq %rbp, %rsp
1233; CHECK-NEXT:    popq %rbp
1234; CHECK-NEXT:    vzeroupper
1235; CHECK-NEXT:    retq
1236  %t2 = extractelement <16 x i32> %t1, i32 %index
1237  ret i32 %t2
1238}
1239
1240define float @test_extractelement_variable_v4f32(<4 x float> %t1, i32 %index) {
1241; CHECK-LABEL: test_extractelement_variable_v4f32:
1242; CHECK:       ## %bb.0:
1243; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1244; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1245; CHECK-NEXT:    andl $3, %edi
1246; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1247; CHECK-NEXT:    retq
1248  %t2 = extractelement <4 x float> %t1, i32 %index
1249  ret float %t2
1250}
1251
1252define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
1253; CHECK-LABEL: test_extractelement_variable_v8f32:
1254; CHECK:       ## %bb.0:
1255; CHECK-NEXT:    pushq %rbp
1256; CHECK-NEXT:    .cfi_def_cfa_offset 16
1257; CHECK-NEXT:    .cfi_offset %rbp, -16
1258; CHECK-NEXT:    movq %rsp, %rbp
1259; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1260; CHECK-NEXT:    andq $-32, %rsp
1261; CHECK-NEXT:    subq $64, %rsp
1262; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1263; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1264; CHECK-NEXT:    andl $7, %edi
1265; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1266; CHECK-NEXT:    movq %rbp, %rsp
1267; CHECK-NEXT:    popq %rbp
1268; CHECK-NEXT:    vzeroupper
1269; CHECK-NEXT:    retq
1270  %t2 = extractelement <8 x float> %t1, i32 %index
1271  ret float %t2
1272}
1273
1274define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
1275; CHECK-LABEL: test_extractelement_variable_v16f32:
1276; CHECK:       ## %bb.0:
1277; CHECK-NEXT:    pushq %rbp
1278; CHECK-NEXT:    .cfi_def_cfa_offset 16
1279; CHECK-NEXT:    .cfi_offset %rbp, -16
1280; CHECK-NEXT:    movq %rsp, %rbp
1281; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1282; CHECK-NEXT:    andq $-64, %rsp
1283; CHECK-NEXT:    subq $128, %rsp
1284; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1285; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1286; CHECK-NEXT:    andl $15, %edi
1287; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1288; CHECK-NEXT:    movq %rbp, %rsp
1289; CHECK-NEXT:    popq %rbp
1290; CHECK-NEXT:    vzeroupper
1291; CHECK-NEXT:    retq
1292  %t2 = extractelement <16 x float> %t1, i32 %index
1293  ret float %t2
1294}
1295
1296define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
1297; CHECK-LABEL: test_extractelement_variable_v8i16:
1298; CHECK:       ## %bb.0:
1299; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1300; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1301; CHECK-NEXT:    andl $7, %edi
1302; CHECK-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
1303; CHECK-NEXT:    retq
1304  %t2 = extractelement <8 x i16> %t1, i32 %index
1305  ret i16 %t2
1306}
1307
1308define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {
1309; CHECK-LABEL: test_extractelement_variable_v16i16:
1310; CHECK:       ## %bb.0:
1311; CHECK-NEXT:    pushq %rbp
1312; CHECK-NEXT:    .cfi_def_cfa_offset 16
1313; CHECK-NEXT:    .cfi_offset %rbp, -16
1314; CHECK-NEXT:    movq %rsp, %rbp
1315; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1316; CHECK-NEXT:    andq $-32, %rsp
1317; CHECK-NEXT:    subq $64, %rsp
1318; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1319; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1320; CHECK-NEXT:    andl $15, %edi
1321; CHECK-NEXT:    movzwl (%rsp,%rdi,2), %eax
1322; CHECK-NEXT:    movq %rbp, %rsp
1323; CHECK-NEXT:    popq %rbp
1324; CHECK-NEXT:    vzeroupper
1325; CHECK-NEXT:    retq
1326  %t2 = extractelement <16 x i16> %t1, i32 %index
1327  ret i16 %t2
1328}
1329
1330define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
1331; CHECK-LABEL: test_extractelement_variable_v32i16:
1332; CHECK:       ## %bb.0:
1333; CHECK-NEXT:    pushq %rbp
1334; CHECK-NEXT:    .cfi_def_cfa_offset 16
1335; CHECK-NEXT:    .cfi_offset %rbp, -16
1336; CHECK-NEXT:    movq %rsp, %rbp
1337; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1338; CHECK-NEXT:    andq $-64, %rsp
1339; CHECK-NEXT:    subq $128, %rsp
1340; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1341; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1342; CHECK-NEXT:    andl $31, %edi
1343; CHECK-NEXT:    movzwl (%rsp,%rdi,2), %eax
1344; CHECK-NEXT:    movq %rbp, %rsp
1345; CHECK-NEXT:    popq %rbp
1346; CHECK-NEXT:    vzeroupper
1347; CHECK-NEXT:    retq
1348  %t2 = extractelement <32 x i16> %t1, i32 %index
1349  ret i16 %t2
1350}
1351
1352define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
1353; CHECK-LABEL: test_extractelement_variable_v16i8:
1354; CHECK:       ## %bb.0:
1355; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1356; CHECK-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
1357; CHECK-NEXT:    andl $15, %edi
1358; CHECK-NEXT:    movb -24(%rsp,%rdi), %al
1359; CHECK-NEXT:    retq
1360  %t2 = extractelement <16 x i8> %t1, i32 %index
1361  ret i8 %t2
1362}
1363
1364define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
1365; CHECK-LABEL: test_extractelement_variable_v32i8:
1366; CHECK:       ## %bb.0:
1367; CHECK-NEXT:    pushq %rbp
1368; CHECK-NEXT:    .cfi_def_cfa_offset 16
1369; CHECK-NEXT:    .cfi_offset %rbp, -16
1370; CHECK-NEXT:    movq %rsp, %rbp
1371; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1372; CHECK-NEXT:    andq $-32, %rsp
1373; CHECK-NEXT:    subq $64, %rsp
1374; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1375; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
1376; CHECK-NEXT:    andl $31, %edi
1377; CHECK-NEXT:    movb (%rsp,%rdi), %al
1378; CHECK-NEXT:    movq %rbp, %rsp
1379; CHECK-NEXT:    popq %rbp
1380; CHECK-NEXT:    vzeroupper
1381; CHECK-NEXT:    retq
1382
1383  %t2 = extractelement <32 x i8> %t1, i32 %index
1384  ret i8 %t2
1385}
1386
1387define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
1388; CHECK-LABEL: test_extractelement_variable_v64i8:
1389; CHECK:       ## %bb.0:
1390; CHECK-NEXT:    pushq %rbp
1391; CHECK-NEXT:    .cfi_def_cfa_offset 16
1392; CHECK-NEXT:    .cfi_offset %rbp, -16
1393; CHECK-NEXT:    movq %rsp, %rbp
1394; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1395; CHECK-NEXT:    andq $-64, %rsp
1396; CHECK-NEXT:    subq $128, %rsp
1397; CHECK-NEXT:    ## kill: def $edi killed $edi def $rdi
1398; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1399; CHECK-NEXT:    andl $63, %edi
1400; CHECK-NEXT:    movb (%rsp,%rdi), %al
1401; CHECK-NEXT:    movq %rbp, %rsp
1402; CHECK-NEXT:    popq %rbp
1403; CHECK-NEXT:    vzeroupper
1404; CHECK-NEXT:    retq
1405
1406  %t2 = extractelement <64 x i8> %t1, i32 %index
1407  ret i8 %t2
1408}
1409
1410define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
1411; CHECK-LABEL: test_extractelement_variable_v64i8_indexi8:
1412; CHECK:       ## %bb.0:
1413; CHECK-NEXT:    pushq %rbp
1414; CHECK-NEXT:    .cfi_def_cfa_offset 16
1415; CHECK-NEXT:    .cfi_offset %rbp, -16
1416; CHECK-NEXT:    movq %rsp, %rbp
1417; CHECK-NEXT:    .cfi_def_cfa_register %rbp
1418; CHECK-NEXT:    andq $-64, %rsp
1419; CHECK-NEXT:    subq $128, %rsp
1420; CHECK-NEXT:    addb %dil, %dil
1421; CHECK-NEXT:    vmovaps %zmm0, (%rsp)
1422; CHECK-NEXT:    movzbl %dil, %eax
1423; CHECK-NEXT:    andl $63, %eax
1424; CHECK-NEXT:    movb (%rsp,%rax), %al
1425; CHECK-NEXT:    movq %rbp, %rsp
1426; CHECK-NEXT:    popq %rbp
1427; CHECK-NEXT:    vzeroupper
1428; CHECK-NEXT:    retq
1429
1430  %i  = add i8 %index, %index
1431  %t2 = extractelement <64 x i8> %t1, i8 %i
1432  ret i8 %t2
1433}
1434
1435define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
1436; KNL-LABEL: test_extractelement_varible_v2i1:
1437; KNL:       ## %bb.0:
1438; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1439; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
1440; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
1441; KNL-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
1442; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1443; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1444; KNL-NEXT:    andl $1, %edi
1445; KNL-NEXT:    movzbl -24(%rsp,%rdi,8), %eax
1446; KNL-NEXT:    andl $1, %eax
1447; KNL-NEXT:    vzeroupper
1448; KNL-NEXT:    retq
1449;
1450; SKX-LABEL: test_extractelement_varible_v2i1:
1451; SKX:       ## %bb.0:
1452; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1453; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
1454; SKX-NEXT:    vpmovm2q %k0, %xmm0
1455; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1456; SKX-NEXT:    andl $1, %edi
1457; SKX-NEXT:    movzbl -24(%rsp,%rdi,8), %eax
1458; SKX-NEXT:    andl $1, %eax
1459; SKX-NEXT:    retq
1460  %t1 = icmp ugt <2 x i64> %a, %b
1461  %t2 = extractelement <2 x i1> %t1, i32 %index
1462  %res = zext i1 %t2 to i8
1463  ret i8 %res
1464}
1465
1466define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
1467; KNL-LABEL: test_extractelement_varible_v4i1:
1468; KNL:       ## %bb.0:
1469; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1470; KNL-NEXT:    ## kill: def $xmm1 killed $xmm1 def $zmm1
1471; KNL-NEXT:    ## kill: def $xmm0 killed $xmm0 def $zmm0
1472; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
1473; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1474; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1475; KNL-NEXT:    andl $3, %edi
1476; KNL-NEXT:    movzbl -24(%rsp,%rdi,4), %eax
1477; KNL-NEXT:    andl $1, %eax
1478; KNL-NEXT:    vzeroupper
1479; KNL-NEXT:    retq
1480;
1481; SKX-LABEL: test_extractelement_varible_v4i1:
1482; SKX:       ## %bb.0:
1483; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1484; SKX-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0
1485; SKX-NEXT:    vpmovm2d %k0, %xmm0
1486; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1487; SKX-NEXT:    andl $3, %edi
1488; SKX-NEXT:    movzbl -24(%rsp,%rdi,4), %eax
1489; SKX-NEXT:    andl $1, %eax
1490; SKX-NEXT:    retq
1491  %t1 = icmp ugt <4 x i32> %a, %b
1492  %t2 = extractelement <4 x i1> %t1, i32 %index
1493  %res = zext i1 %t2 to i8
1494  ret i8 %res
1495}
1496
1497define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
1498; KNL-LABEL: test_extractelement_varible_v8i1:
1499; KNL:       ## %bb.0:
1500; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1501; KNL-NEXT:    ## kill: def $ymm1 killed $ymm1 def $zmm1
1502; KNL-NEXT:    ## kill: def $ymm0 killed $ymm0 def $zmm0
1503; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
1504; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1505; KNL-NEXT:    vpmovdw %zmm0, %ymm0
1506; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1507; KNL-NEXT:    andl $7, %edi
1508; KNL-NEXT:    movzbl -24(%rsp,%rdi,2), %eax
1509; KNL-NEXT:    andl $1, %eax
1510; KNL-NEXT:    vzeroupper
1511; KNL-NEXT:    retq
1512;
1513; SKX-LABEL: test_extractelement_varible_v8i1:
1514; SKX:       ## %bb.0:
1515; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1516; SKX-NEXT:    vpcmpnleud %ymm1, %ymm0, %k0
1517; SKX-NEXT:    vpmovm2w %k0, %xmm0
1518; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1519; SKX-NEXT:    andl $7, %edi
1520; SKX-NEXT:    movzbl -24(%rsp,%rdi,2), %eax
1521; SKX-NEXT:    andl $1, %eax
1522; SKX-NEXT:    vzeroupper
1523; SKX-NEXT:    retq
1524  %t1 = icmp ugt <8 x i32> %a, %b
1525  %t2 = extractelement <8 x i1> %t1, i32 %index
1526  %res = zext i1 %t2 to i8
1527  ret i8 %res
1528}
1529
1530define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
1531; KNL-LABEL: test_extractelement_varible_v16i1:
1532; KNL:       ## %bb.0:
1533; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1534; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
1535; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1536; KNL-NEXT:    vpmovdb %zmm0, -{{[0-9]+}}(%rsp)
1537; KNL-NEXT:    andl $15, %edi
1538; KNL-NEXT:    movzbl -24(%rsp,%rdi), %eax
1539; KNL-NEXT:    andl $1, %eax
1540; KNL-NEXT:    vzeroupper
1541; KNL-NEXT:    retq
1542;
1543; SKX-LABEL: test_extractelement_varible_v16i1:
1544; SKX:       ## %bb.0:
1545; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1546; SKX-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
1547; SKX-NEXT:    vpmovm2b %k0, %xmm0
1548; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
1549; SKX-NEXT:    andl $15, %edi
1550; SKX-NEXT:    movzbl -24(%rsp,%rdi), %eax
1551; SKX-NEXT:    andl $1, %eax
1552; SKX-NEXT:    vzeroupper
1553; SKX-NEXT:    retq
1554  %t1 = icmp ugt <16 x i32> %a, %b
1555  %t2 = extractelement <16 x i1> %t1, i32 %index
1556  %res = zext i1 %t2 to i8
1557  ret i8 %res
1558}
1559
1560define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
1561; KNL-LABEL: test_extractelement_varible_v32i1:
1562; KNL:       ## %bb.0:
1563; KNL-NEXT:    pushq %rbp
1564; KNL-NEXT:    .cfi_def_cfa_offset 16
1565; KNL-NEXT:    .cfi_offset %rbp, -16
1566; KNL-NEXT:    movq %rsp, %rbp
1567; KNL-NEXT:    .cfi_def_cfa_register %rbp
1568; KNL-NEXT:    andq $-32, %rsp
1569; KNL-NEXT:    subq $64, %rsp
1570; KNL-NEXT:    ## kill: def $edi killed $edi def $rdi
1571; KNL-NEXT:    vpminub %ymm1, %ymm0, %ymm1
1572; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
1573; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1574; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
1575; KNL-NEXT:    andl $31, %edi
1576; KNL-NEXT:    movzbl (%rsp,%rdi), %eax
1577; KNL-NEXT:    andl $1, %eax
1578; KNL-NEXT:    movq %rbp, %rsp
1579; KNL-NEXT:    popq %rbp
1580; KNL-NEXT:    vzeroupper
1581; KNL-NEXT:    retq
1582;
1583; SKX-LABEL: test_extractelement_varible_v32i1:
1584; SKX:       ## %bb.0:
1585; SKX-NEXT:    pushq %rbp
1586; SKX-NEXT:    .cfi_def_cfa_offset 16
1587; SKX-NEXT:    .cfi_offset %rbp, -16
1588; SKX-NEXT:    movq %rsp, %rbp
1589; SKX-NEXT:    .cfi_def_cfa_register %rbp
1590; SKX-NEXT:    andq $-32, %rsp
1591; SKX-NEXT:    subq $64, %rsp
1592; SKX-NEXT:    ## kill: def $edi killed $edi def $rdi
1593; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0
1594; SKX-NEXT:    vpmovm2b %k0, %ymm0
1595; SKX-NEXT:    vmovdqa %ymm0, (%rsp)
1596; SKX-NEXT:    andl $31, %edi
1597; SKX-NEXT:    movzbl (%rsp,%rdi), %eax
1598; SKX-NEXT:    andl $1, %eax
1599; SKX-NEXT:    movq %rbp, %rsp
1600; SKX-NEXT:    popq %rbp
1601; SKX-NEXT:    vzeroupper
1602; SKX-NEXT:    retq
1603  %t1 = icmp ugt <32 x i8> %a, %b
1604  %t2 = extractelement <32 x i1> %t1, i32 %index
1605  %res = zext i1 %t2 to i8
1606  ret i8 %res
1607}
1608
1609define <8 x i64> @insert_double_zero(<2 x i64> %a) nounwind {
1610; CHECK-LABEL: insert_double_zero:
1611; CHECK:       ## %bb.0:
1612; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1613; CHECK-NEXT:    vinsertf32x4 $2, %xmm0, %zmm1, %zmm0
1614; CHECK-NEXT:    retq
1615  %b = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1616  %d = shufflevector <4 x i64> %b, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1617  %e = shufflevector <8 x i64> %d, <8 x i64> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
1618  ret <8 x i64> %e
1619}
1620
1621define i32 @test_insertelement_variable_v32i1(<32 x i8> %a, i8 %b, i32 %index) {
1622; KNL-LABEL: test_insertelement_variable_v32i1:
1623; KNL:       ## %bb.0:
1624; KNL-NEXT:    pushq %rbp
1625; KNL-NEXT:    .cfi_def_cfa_offset 16
1626; KNL-NEXT:    .cfi_offset %rbp, -16
1627; KNL-NEXT:    movq %rsp, %rbp
1628; KNL-NEXT:    .cfi_def_cfa_register %rbp
1629; KNL-NEXT:    andq $-32, %rsp
1630; KNL-NEXT:    subq $64, %rsp
1631; KNL-NEXT:    ## kill: def $esi killed $esi def $rsi
1632; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1633; KNL-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
1634; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1635; KNL-NEXT:    andl $31, %esi
1636; KNL-NEXT:    testb %dil, %dil
1637; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
1638; KNL-NEXT:    setne (%rsp,%rsi)
1639; KNL-NEXT:    vpmovsxbd (%rsp), %zmm0
1640; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1641; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1642; KNL-NEXT:    kmovw %k0, %ecx
1643; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1644; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1645; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1646; KNL-NEXT:    kmovw %k0, %eax
1647; KNL-NEXT:    shll $16, %eax
1648; KNL-NEXT:    orl %ecx, %eax
1649; KNL-NEXT:    movq %rbp, %rsp
1650; KNL-NEXT:    popq %rbp
1651; KNL-NEXT:    vzeroupper
1652; KNL-NEXT:    retq
1653;
1654; SKX-LABEL: test_insertelement_variable_v32i1:
1655; SKX:       ## %bb.0:
1656; SKX-NEXT:    vptestmb %ymm0, %ymm0, %k0
1657; SKX-NEXT:    testb %dil, %dil
1658; SKX-NEXT:    setne %al
1659; SKX-NEXT:    vpbroadcastb %esi, %ymm0
1660; SKX-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k1
1661; SKX-NEXT:    vpmovm2b %k0, %ymm0
1662; SKX-NEXT:    vpbroadcastb %eax, %ymm0 {%k1}
1663; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0
1664; SKX-NEXT:    vpmovb2m %ymm0, %k0
1665; SKX-NEXT:    kmovd %k0, %eax
1666; SKX-NEXT:    vzeroupper
1667; SKX-NEXT:    retq
1668  %t1 = icmp ugt <32 x i8> %a, zeroinitializer
1669  %t2 = icmp ugt i8 %b, 0
1670  %t3 = insertelement <32 x i1> %t1, i1 %t2, i32 %index
1671  %t4 = bitcast <32 x i1> %t3 to i32
1672  ret i32 %t4
1673}
1674
1675define i64 @test_insertelement_variable_v64i1(<64 x i8> %a, i8 %b, i32 %index) {
1676; KNL-LABEL: test_insertelement_variable_v64i1:
1677; KNL:       ## %bb.0:
1678; KNL-NEXT:    pushq %rbp
1679; KNL-NEXT:    .cfi_def_cfa_offset 16
1680; KNL-NEXT:    .cfi_offset %rbp, -16
1681; KNL-NEXT:    movq %rsp, %rbp
1682; KNL-NEXT:    .cfi_def_cfa_register %rbp
1683; KNL-NEXT:    andq $-64, %rsp
1684; KNL-NEXT:    subq $128, %rsp
1685; KNL-NEXT:    ## kill: def $esi killed $esi def $rsi
1686; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1687; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1688; KNL-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
1689; KNL-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
1690; KNL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1691; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1692; KNL-NEXT:    andl $63, %esi
1693; KNL-NEXT:    testb %dil, %dil
1694; KNL-NEXT:    vmovdqa64 %zmm0, (%rsp)
1695; KNL-NEXT:    setne (%rsp,%rsi)
1696; KNL-NEXT:    vpmovsxbd (%rsp), %zmm0
1697; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1698; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1699; KNL-NEXT:    kmovw %k0, %eax
1700; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1701; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1702; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1703; KNL-NEXT:    kmovw %k0, %ecx
1704; KNL-NEXT:    shll $16, %ecx
1705; KNL-NEXT:    orl %eax, %ecx
1706; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1707; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1708; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1709; KNL-NEXT:    kmovw %k0, %edx
1710; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1711; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1712; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1713; KNL-NEXT:    kmovw %k0, %eax
1714; KNL-NEXT:    shll $16, %eax
1715; KNL-NEXT:    orl %edx, %eax
1716; KNL-NEXT:    shlq $32, %rax
1717; KNL-NEXT:    orq %rcx, %rax
1718; KNL-NEXT:    movq %rbp, %rsp
1719; KNL-NEXT:    popq %rbp
1720; KNL-NEXT:    vzeroupper
1721; KNL-NEXT:    retq
1722;
1723; SKX-LABEL: test_insertelement_variable_v64i1:
1724; SKX:       ## %bb.0:
1725; SKX-NEXT:    vptestmb %zmm0, %zmm0, %k0
1726; SKX-NEXT:    testb %dil, %dil
1727; SKX-NEXT:    setne %al
1728; SKX-NEXT:    vpbroadcastb %esi, %zmm0
1729; SKX-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %k1
1730; SKX-NEXT:    vpmovm2b %k0, %zmm0
1731; SKX-NEXT:    vpbroadcastb %eax, %zmm0 {%k1}
1732; SKX-NEXT:    vpsllw $7, %zmm0, %zmm0
1733; SKX-NEXT:    vpmovb2m %zmm0, %k0
1734; SKX-NEXT:    kmovq %k0, %rax
1735; SKX-NEXT:    vzeroupper
1736; SKX-NEXT:    retq
1737  %t1 = icmp ugt <64 x i8> %a, zeroinitializer
1738  %t2 = icmp ugt i8 %b, 0
1739  %t3 = insertelement <64 x i1> %t1, i1 %t2, i32 %index
1740  %t4 = bitcast <64 x i1> %t3 to i64
1741  ret i64 %t4
1742}
1743
1744define i96 @test_insertelement_variable_v96i1(<96 x i8> %a, i8 %b, i32 %index) {
1745; KNL-LABEL: test_insertelement_variable_v96i1:
1746; KNL:       ## %bb.0:
1747; KNL-NEXT:    pushq %rbp
1748; KNL-NEXT:    .cfi_def_cfa_offset 16
1749; KNL-NEXT:    .cfi_offset %rbp, -16
1750; KNL-NEXT:    movq %rsp, %rbp
1751; KNL-NEXT:    .cfi_def_cfa_register %rbp
1752; KNL-NEXT:    andq $-64, %rsp
1753; KNL-NEXT:    subq $192, %rsp
1754; KNL-NEXT:    movl 744(%rbp), %eax
1755; KNL-NEXT:    andl $127, %eax
1756; KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1757; KNL-NEXT:    vpinsrb $1, 232(%rbp), %xmm0, %xmm0
1758; KNL-NEXT:    vpinsrb $2, 240(%rbp), %xmm0, %xmm0
1759; KNL-NEXT:    vpinsrb $3, 248(%rbp), %xmm0, %xmm0
1760; KNL-NEXT:    vpinsrb $4, 256(%rbp), %xmm0, %xmm0
1761; KNL-NEXT:    vpinsrb $5, 264(%rbp), %xmm0, %xmm0
1762; KNL-NEXT:    vpinsrb $6, 272(%rbp), %xmm0, %xmm0
1763; KNL-NEXT:    vpinsrb $7, 280(%rbp), %xmm0, %xmm0
1764; KNL-NEXT:    vpinsrb $8, 288(%rbp), %xmm0, %xmm0
1765; KNL-NEXT:    vpinsrb $9, 296(%rbp), %xmm0, %xmm0
1766; KNL-NEXT:    vpinsrb $10, 304(%rbp), %xmm0, %xmm0
1767; KNL-NEXT:    vpinsrb $11, 312(%rbp), %xmm0, %xmm0
1768; KNL-NEXT:    vpinsrb $12, 320(%rbp), %xmm0, %xmm0
1769; KNL-NEXT:    vpinsrb $13, 328(%rbp), %xmm0, %xmm0
1770; KNL-NEXT:    vpinsrb $14, 336(%rbp), %xmm0, %xmm0
1771; KNL-NEXT:    vpinsrb $15, 344(%rbp), %xmm0, %xmm0
1772; KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1773; KNL-NEXT:    vpinsrb $1, 360(%rbp), %xmm1, %xmm1
1774; KNL-NEXT:    vpinsrb $2, 368(%rbp), %xmm1, %xmm1
1775; KNL-NEXT:    vpinsrb $3, 376(%rbp), %xmm1, %xmm1
1776; KNL-NEXT:    vpinsrb $4, 384(%rbp), %xmm1, %xmm1
1777; KNL-NEXT:    vpinsrb $5, 392(%rbp), %xmm1, %xmm1
1778; KNL-NEXT:    vpinsrb $6, 400(%rbp), %xmm1, %xmm1
1779; KNL-NEXT:    vpinsrb $7, 408(%rbp), %xmm1, %xmm1
1780; KNL-NEXT:    vpinsrb $8, 416(%rbp), %xmm1, %xmm1
1781; KNL-NEXT:    vpinsrb $9, 424(%rbp), %xmm1, %xmm1
1782; KNL-NEXT:    vpinsrb $10, 432(%rbp), %xmm1, %xmm1
1783; KNL-NEXT:    vpinsrb $11, 440(%rbp), %xmm1, %xmm1
1784; KNL-NEXT:    vpinsrb $12, 448(%rbp), %xmm1, %xmm1
1785; KNL-NEXT:    vpinsrb $13, 456(%rbp), %xmm1, %xmm1
1786; KNL-NEXT:    vpinsrb $14, 464(%rbp), %xmm1, %xmm1
1787; KNL-NEXT:    vpinsrb $15, 472(%rbp), %xmm1, %xmm1
1788; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
1789; KNL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
1790; KNL-NEXT:    vpcmpeqb %ymm0, %ymm1, %ymm1
1791; KNL-NEXT:    vmovd %edi, %xmm2
1792; KNL-NEXT:    vpinsrb $1, %esi, %xmm2, %xmm2
1793; KNL-NEXT:    vpinsrb $2, %edx, %xmm2, %xmm2
1794; KNL-NEXT:    vpinsrb $3, %ecx, %xmm2, %xmm2
1795; KNL-NEXT:    vpinsrb $4, %r8d, %xmm2, %xmm2
1796; KNL-NEXT:    vpinsrb $5, %r9d, %xmm2, %xmm2
1797; KNL-NEXT:    vpinsrb $6, 16(%rbp), %xmm2, %xmm2
1798; KNL-NEXT:    vpinsrb $7, 24(%rbp), %xmm2, %xmm2
1799; KNL-NEXT:    vpinsrb $8, 32(%rbp), %xmm2, %xmm2
1800; KNL-NEXT:    vpinsrb $9, 40(%rbp), %xmm2, %xmm2
1801; KNL-NEXT:    vpinsrb $10, 48(%rbp), %xmm2, %xmm2
1802; KNL-NEXT:    vpinsrb $11, 56(%rbp), %xmm2, %xmm2
1803; KNL-NEXT:    vpinsrb $12, 64(%rbp), %xmm2, %xmm2
1804; KNL-NEXT:    vpinsrb $13, 72(%rbp), %xmm2, %xmm2
1805; KNL-NEXT:    vpinsrb $14, 80(%rbp), %xmm2, %xmm2
1806; KNL-NEXT:    vpinsrb $15, 88(%rbp), %xmm2, %xmm2
1807; KNL-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
1808; KNL-NEXT:    vpinsrb $1, 104(%rbp), %xmm3, %xmm3
1809; KNL-NEXT:    vpinsrb $2, 112(%rbp), %xmm3, %xmm3
1810; KNL-NEXT:    vpinsrb $3, 120(%rbp), %xmm3, %xmm3
1811; KNL-NEXT:    vpinsrb $4, 128(%rbp), %xmm3, %xmm3
1812; KNL-NEXT:    vpinsrb $5, 136(%rbp), %xmm3, %xmm3
1813; KNL-NEXT:    vpinsrb $6, 144(%rbp), %xmm3, %xmm3
1814; KNL-NEXT:    vpinsrb $7, 152(%rbp), %xmm3, %xmm3
1815; KNL-NEXT:    vpinsrb $8, 160(%rbp), %xmm3, %xmm3
1816; KNL-NEXT:    vpinsrb $9, 168(%rbp), %xmm3, %xmm3
1817; KNL-NEXT:    vpinsrb $10, 176(%rbp), %xmm3, %xmm3
1818; KNL-NEXT:    vpinsrb $11, 184(%rbp), %xmm3, %xmm3
1819; KNL-NEXT:    vpinsrb $12, 192(%rbp), %xmm3, %xmm3
1820; KNL-NEXT:    vpinsrb $13, 200(%rbp), %xmm3, %xmm3
1821; KNL-NEXT:    vpinsrb $14, 208(%rbp), %xmm3, %xmm3
1822; KNL-NEXT:    vpinsrb $15, 216(%rbp), %xmm3, %xmm3
1823; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1824; KNL-NEXT:    vpcmpeqb %ymm0, %ymm2, %ymm2
1825; KNL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
1826; KNL-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
1827; KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1828; KNL-NEXT:    vpinsrb $1, 488(%rbp), %xmm2, %xmm2
1829; KNL-NEXT:    vpinsrb $2, 496(%rbp), %xmm2, %xmm2
1830; KNL-NEXT:    vpinsrb $3, 504(%rbp), %xmm2, %xmm2
1831; KNL-NEXT:    vpinsrb $4, 512(%rbp), %xmm2, %xmm2
1832; KNL-NEXT:    vpinsrb $5, 520(%rbp), %xmm2, %xmm2
1833; KNL-NEXT:    vpinsrb $6, 528(%rbp), %xmm2, %xmm2
1834; KNL-NEXT:    vpinsrb $7, 536(%rbp), %xmm2, %xmm2
1835; KNL-NEXT:    vpinsrb $8, 544(%rbp), %xmm2, %xmm2
1836; KNL-NEXT:    vpinsrb $9, 552(%rbp), %xmm2, %xmm2
1837; KNL-NEXT:    vpinsrb $10, 560(%rbp), %xmm2, %xmm2
1838; KNL-NEXT:    vpinsrb $11, 568(%rbp), %xmm2, %xmm2
1839; KNL-NEXT:    vpinsrb $12, 576(%rbp), %xmm2, %xmm2
1840; KNL-NEXT:    vpinsrb $13, 584(%rbp), %xmm2, %xmm2
1841; KNL-NEXT:    vpinsrb $14, 592(%rbp), %xmm2, %xmm2
1842; KNL-NEXT:    vpinsrb $15, 600(%rbp), %xmm2, %xmm2
1843; KNL-NEXT:    vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
1844; KNL-NEXT:    vpinsrb $1, 616(%rbp), %xmm3, %xmm3
1845; KNL-NEXT:    vpinsrb $2, 624(%rbp), %xmm3, %xmm3
1846; KNL-NEXT:    vpinsrb $3, 632(%rbp), %xmm3, %xmm3
1847; KNL-NEXT:    vpinsrb $4, 640(%rbp), %xmm3, %xmm3
1848; KNL-NEXT:    vpinsrb $5, 648(%rbp), %xmm3, %xmm3
1849; KNL-NEXT:    vpinsrb $6, 656(%rbp), %xmm3, %xmm3
1850; KNL-NEXT:    vpinsrb $7, 664(%rbp), %xmm3, %xmm3
1851; KNL-NEXT:    vpinsrb $8, 672(%rbp), %xmm3, %xmm3
1852; KNL-NEXT:    vpinsrb $9, 680(%rbp), %xmm3, %xmm3
1853; KNL-NEXT:    vpinsrb $10, 688(%rbp), %xmm3, %xmm3
1854; KNL-NEXT:    vpinsrb $11, 696(%rbp), %xmm3, %xmm3
1855; KNL-NEXT:    vpinsrb $12, 704(%rbp), %xmm3, %xmm3
1856; KNL-NEXT:    vpinsrb $13, 712(%rbp), %xmm3, %xmm3
1857; KNL-NEXT:    vpinsrb $14, 720(%rbp), %xmm3, %xmm3
1858; KNL-NEXT:    vpinsrb $15, 728(%rbp), %xmm3, %xmm3
1859; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
1860; KNL-NEXT:    vpcmpeqb %ymm0, %ymm2, %ymm0
1861; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
1862; KNL-NEXT:    cmpb $0, 736(%rbp)
1863; KNL-NEXT:    vmovdqa %ymm0, {{[0-9]+}}(%rsp)
1864; KNL-NEXT:    vmovdqa64 %zmm1, (%rsp)
1865; KNL-NEXT:    setne (%rsp,%rax)
1866; KNL-NEXT:    vpmovsxbd (%rsp), %zmm0
1867; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1868; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1869; KNL-NEXT:    kmovw %k0, %eax
1870; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1871; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1872; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1873; KNL-NEXT:    kmovw %k0, %ecx
1874; KNL-NEXT:    shll $16, %ecx
1875; KNL-NEXT:    orl %eax, %ecx
1876; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1877; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1878; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1879; KNL-NEXT:    kmovw %k0, %edx
1880; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1881; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1882; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1883; KNL-NEXT:    kmovw %k0, %eax
1884; KNL-NEXT:    shll $16, %eax
1885; KNL-NEXT:    orl %edx, %eax
1886; KNL-NEXT:    shlq $32, %rax
1887; KNL-NEXT:    orq %rcx, %rax
1888; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1889; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1890; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1891; KNL-NEXT:    kmovw %k0, %ecx
1892; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1893; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1894; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1895; KNL-NEXT:    kmovw %k0, %esi
1896; KNL-NEXT:    shll $16, %esi
1897; KNL-NEXT:    orl %ecx, %esi
1898; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1899; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1900; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1901; KNL-NEXT:    kmovw %k0, %ecx
1902; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
1903; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
1904; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
1905; KNL-NEXT:    kmovw %k0, %edx
1906; KNL-NEXT:    shll $16, %edx
1907; KNL-NEXT:    orl %ecx, %edx
1908; KNL-NEXT:    shlq $32, %rdx
1909; KNL-NEXT:    orq %rsi, %rdx
1910; KNL-NEXT:    movq %rbp, %rsp
1911; KNL-NEXT:    popq %rbp
1912; KNL-NEXT:    vzeroupper
1913; KNL-NEXT:    retq
1914;
1915; SKX-LABEL: test_insertelement_variable_v96i1:
1916; SKX:       ## %bb.0:
1917; SKX-NEXT:    pushq %rbp
1918; SKX-NEXT:    .cfi_def_cfa_offset 16
1919; SKX-NEXT:    .cfi_offset %rbp, -16
1920; SKX-NEXT:    movq %rsp, %rbp
1921; SKX-NEXT:    .cfi_def_cfa_register %rbp
1922; SKX-NEXT:    andq $-64, %rsp
1923; SKX-NEXT:    subq $192, %rsp
1924; SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1925; SKX-NEXT:    vpinsrb $1, 232(%rbp), %xmm0, %xmm0
1926; SKX-NEXT:    vpinsrb $2, 240(%rbp), %xmm0, %xmm0
1927; SKX-NEXT:    vpinsrb $3, 248(%rbp), %xmm0, %xmm0
1928; SKX-NEXT:    vpinsrb $4, 256(%rbp), %xmm0, %xmm0
1929; SKX-NEXT:    vpinsrb $5, 264(%rbp), %xmm0, %xmm0
1930; SKX-NEXT:    vpinsrb $6, 272(%rbp), %xmm0, %xmm0
1931; SKX-NEXT:    vpinsrb $7, 280(%rbp), %xmm0, %xmm0
1932; SKX-NEXT:    vpinsrb $8, 288(%rbp), %xmm0, %xmm0
1933; SKX-NEXT:    vpinsrb $9, 296(%rbp), %xmm0, %xmm0
1934; SKX-NEXT:    vpinsrb $10, 304(%rbp), %xmm0, %xmm0
1935; SKX-NEXT:    vpinsrb $11, 312(%rbp), %xmm0, %xmm0
1936; SKX-NEXT:    vpinsrb $12, 320(%rbp), %xmm0, %xmm0
1937; SKX-NEXT:    vpinsrb $13, 328(%rbp), %xmm0, %xmm0
1938; SKX-NEXT:    vpinsrb $14, 336(%rbp), %xmm0, %xmm0
1939; SKX-NEXT:    vpinsrb $15, 344(%rbp), %xmm0, %xmm0
1940; SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1941; SKX-NEXT:    vpinsrb $1, 360(%rbp), %xmm1, %xmm1
1942; SKX-NEXT:    vpinsrb $2, 368(%rbp), %xmm1, %xmm1
1943; SKX-NEXT:    vpinsrb $3, 376(%rbp), %xmm1, %xmm1
1944; SKX-NEXT:    vpinsrb $4, 384(%rbp), %xmm1, %xmm1
1945; SKX-NEXT:    vpinsrb $5, 392(%rbp), %xmm1, %xmm1
1946; SKX-NEXT:    vpinsrb $6, 400(%rbp), %xmm1, %xmm1
1947; SKX-NEXT:    vpinsrb $7, 408(%rbp), %xmm1, %xmm1
1948; SKX-NEXT:    vpinsrb $8, 416(%rbp), %xmm1, %xmm1
1949; SKX-NEXT:    vpinsrb $9, 424(%rbp), %xmm1, %xmm1
1950; SKX-NEXT:    vpinsrb $10, 432(%rbp), %xmm1, %xmm1
1951; SKX-NEXT:    vpinsrb $11, 440(%rbp), %xmm1, %xmm1
1952; SKX-NEXT:    vpinsrb $12, 448(%rbp), %xmm1, %xmm1
1953; SKX-NEXT:    vpinsrb $13, 456(%rbp), %xmm1, %xmm1
1954; SKX-NEXT:    vpinsrb $14, 464(%rbp), %xmm1, %xmm1
1955; SKX-NEXT:    vpinsrb $15, 472(%rbp), %xmm1, %xmm1
1956; SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1957; SKX-NEXT:    vmovd %edi, %xmm1
1958; SKX-NEXT:    vpinsrb $1, %esi, %xmm1, %xmm1
1959; SKX-NEXT:    vpinsrb $2, %edx, %xmm1, %xmm1
1960; SKX-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
1961; SKX-NEXT:    vpinsrb $4, %r8d, %xmm1, %xmm1
1962; SKX-NEXT:    vpinsrb $5, %r9d, %xmm1, %xmm1
1963; SKX-NEXT:    vpinsrb $6, 16(%rbp), %xmm1, %xmm1
1964; SKX-NEXT:    vpinsrb $7, 24(%rbp), %xmm1, %xmm1
1965; SKX-NEXT:    vpinsrb $8, 32(%rbp), %xmm1, %xmm1
1966; SKX-NEXT:    vpinsrb $9, 40(%rbp), %xmm1, %xmm1
1967; SKX-NEXT:    vpinsrb $10, 48(%rbp), %xmm1, %xmm1
1968; SKX-NEXT:    vpinsrb $11, 56(%rbp), %xmm1, %xmm1
1969; SKX-NEXT:    vpinsrb $12, 64(%rbp), %xmm1, %xmm1
1970; SKX-NEXT:    vpinsrb $13, 72(%rbp), %xmm1, %xmm1
1971; SKX-NEXT:    vpinsrb $14, 80(%rbp), %xmm1, %xmm1
1972; SKX-NEXT:    vpinsrb $15, 88(%rbp), %xmm1, %xmm1
1973; SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
1974; SKX-NEXT:    vpinsrb $1, 104(%rbp), %xmm2, %xmm2
1975; SKX-NEXT:    vpinsrb $2, 112(%rbp), %xmm2, %xmm2
1976; SKX-NEXT:    vpinsrb $3, 120(%rbp), %xmm2, %xmm2
1977; SKX-NEXT:    vpinsrb $4, 128(%rbp), %xmm2, %xmm2
1978; SKX-NEXT:    vpinsrb $5, 136(%rbp), %xmm2, %xmm2
1979; SKX-NEXT:    vpinsrb $6, 144(%rbp), %xmm2, %xmm2
1980; SKX-NEXT:    vpinsrb $7, 152(%rbp), %xmm2, %xmm2
1981; SKX-NEXT:    vpinsrb $8, 160(%rbp), %xmm2, %xmm2
1982; SKX-NEXT:    vpinsrb $9, 168(%rbp), %xmm2, %xmm2
1983; SKX-NEXT:    vpinsrb $10, 176(%rbp), %xmm2, %xmm2
1984; SKX-NEXT:    vpinsrb $11, 184(%rbp), %xmm2, %xmm2
1985; SKX-NEXT:    vpinsrb $12, 192(%rbp), %xmm2, %xmm2
1986; SKX-NEXT:    vpinsrb $13, 200(%rbp), %xmm2, %xmm2
1987; SKX-NEXT:    vpinsrb $14, 208(%rbp), %xmm2, %xmm2
1988; SKX-NEXT:    vpinsrb $15, 216(%rbp), %xmm2, %xmm2
1989; SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1990; SKX-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
1991; SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1992; SKX-NEXT:    vpinsrb $1, 488(%rbp), %xmm1, %xmm1
1993; SKX-NEXT:    vpinsrb $2, 496(%rbp), %xmm1, %xmm1
1994; SKX-NEXT:    vpinsrb $3, 504(%rbp), %xmm1, %xmm1
1995; SKX-NEXT:    vpinsrb $4, 512(%rbp), %xmm1, %xmm1
1996; SKX-NEXT:    vpinsrb $5, 520(%rbp), %xmm1, %xmm1
1997; SKX-NEXT:    vpinsrb $6, 528(%rbp), %xmm1, %xmm1
1998; SKX-NEXT:    vpinsrb $7, 536(%rbp), %xmm1, %xmm1
1999; SKX-NEXT:    vpinsrb $8, 544(%rbp), %xmm1, %xmm1
2000; SKX-NEXT:    vpinsrb $9, 552(%rbp), %xmm1, %xmm1
2001; SKX-NEXT:    vpinsrb $10, 560(%rbp), %xmm1, %xmm1
2002; SKX-NEXT:    vpinsrb $11, 568(%rbp), %xmm1, %xmm1
2003; SKX-NEXT:    vpinsrb $12, 576(%rbp), %xmm1, %xmm1
2004; SKX-NEXT:    vpinsrb $13, 584(%rbp), %xmm1, %xmm1
2005; SKX-NEXT:    vpinsrb $14, 592(%rbp), %xmm1, %xmm1
2006; SKX-NEXT:    vpinsrb $15, 600(%rbp), %xmm1, %xmm1
2007; SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2008; SKX-NEXT:    vpinsrb $1, 616(%rbp), %xmm2, %xmm2
2009; SKX-NEXT:    vpinsrb $2, 624(%rbp), %xmm2, %xmm2
2010; SKX-NEXT:    vpinsrb $3, 632(%rbp), %xmm2, %xmm2
2011; SKX-NEXT:    vpinsrb $4, 640(%rbp), %xmm2, %xmm2
2012; SKX-NEXT:    vpinsrb $5, 648(%rbp), %xmm2, %xmm2
2013; SKX-NEXT:    vpinsrb $6, 656(%rbp), %xmm2, %xmm2
2014; SKX-NEXT:    vpinsrb $7, 664(%rbp), %xmm2, %xmm2
2015; SKX-NEXT:    vpinsrb $8, 672(%rbp), %xmm2, %xmm2
2016; SKX-NEXT:    vpinsrb $9, 680(%rbp), %xmm2, %xmm2
2017; SKX-NEXT:    vpinsrb $10, 688(%rbp), %xmm2, %xmm2
2018; SKX-NEXT:    vpinsrb $11, 696(%rbp), %xmm2, %xmm2
2019; SKX-NEXT:    vpinsrb $12, 704(%rbp), %xmm2, %xmm2
2020; SKX-NEXT:    vpinsrb $13, 712(%rbp), %xmm2, %xmm2
2021; SKX-NEXT:    vpinsrb $14, 720(%rbp), %xmm2, %xmm2
2022; SKX-NEXT:    vpinsrb $15, 728(%rbp), %xmm2, %xmm2
2023; SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2024; SKX-NEXT:    movl 744(%rbp), %eax
2025; SKX-NEXT:    andl $127, %eax
2026; SKX-NEXT:    vptestmb %zmm0, %zmm0, %k0
2027; SKX-NEXT:    vptestmb %zmm1, %zmm1, %k1
2028; SKX-NEXT:    cmpb $0, 736(%rbp)
2029; SKX-NEXT:    vpmovm2b %k1, %zmm0
2030; SKX-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
2031; SKX-NEXT:    vpmovm2b %k0, %zmm0
2032; SKX-NEXT:    vmovdqa64 %zmm0, (%rsp)
2033; SKX-NEXT:    setne (%rsp,%rax)
2034; SKX-NEXT:    vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
2035; SKX-NEXT:    vpmovb2m %zmm0, %k0
2036; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0
2037; SKX-NEXT:    vpmovb2m %zmm0, %k1
2038; SKX-NEXT:    kmovq %k1, %rax
2039; SKX-NEXT:    kmovq %k0, %rdx
2040; SKX-NEXT:    movq %rbp, %rsp
2041; SKX-NEXT:    popq %rbp
2042; SKX-NEXT:    vzeroupper
2043; SKX-NEXT:    retq
2044  %t1 = icmp ugt <96 x i8> %a, zeroinitializer
2045  %t2 = icmp ugt i8 %b, 0
2046  %t3 = insertelement <96 x i1> %t1, i1 %t2, i32 %index
2047  %t4 = bitcast <96 x i1> %t3 to i96
2048  ret i96 %t4
2049}
2050
2051define i128 @test_insertelement_variable_v128i1(<128 x i8> %a, i8 %b, i32 %index) {
2052; KNL-LABEL: test_insertelement_variable_v128i1:
2053; KNL:       ## %bb.0:
2054; KNL-NEXT:    pushq %rbp
2055; KNL-NEXT:    .cfi_def_cfa_offset 16
2056; KNL-NEXT:    .cfi_offset %rbp, -16
2057; KNL-NEXT:    movq %rsp, %rbp
2058; KNL-NEXT:    .cfi_def_cfa_register %rbp
2059; KNL-NEXT:    andq $-64, %rsp
2060; KNL-NEXT:    subq $192, %rsp
2061; KNL-NEXT:    ## kill: def $esi killed $esi def $rsi
2062; KNL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
2063; KNL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
2064; KNL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
2065; KNL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
2066; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
2067; KNL-NEXT:    vpternlogq $15, %zmm0, %zmm0, %zmm0
2068; KNL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
2069; KNL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
2070; KNL-NEXT:    vpcmpeqb %ymm3, %ymm1, %ymm1
2071; KNL-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
2072; KNL-NEXT:    vpternlogq $15, %zmm1, %zmm1, %zmm1
2073; KNL-NEXT:    andl $127, %esi
2074; KNL-NEXT:    testb %dil, %dil
2075; KNL-NEXT:    vmovdqa64 %zmm1, {{[0-9]+}}(%rsp)
2076; KNL-NEXT:    vmovdqa64 %zmm0, (%rsp)
2077; KNL-NEXT:    setne (%rsp,%rsi)
2078; KNL-NEXT:    vpmovsxbd (%rsp), %zmm0
2079; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2080; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2081; KNL-NEXT:    kmovw %k0, %eax
2082; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2083; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2084; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2085; KNL-NEXT:    kmovw %k0, %ecx
2086; KNL-NEXT:    shll $16, %ecx
2087; KNL-NEXT:    orl %eax, %ecx
2088; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2089; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2090; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2091; KNL-NEXT:    kmovw %k0, %edx
2092; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2093; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2094; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2095; KNL-NEXT:    kmovw %k0, %eax
2096; KNL-NEXT:    shll $16, %eax
2097; KNL-NEXT:    orl %edx, %eax
2098; KNL-NEXT:    shlq $32, %rax
2099; KNL-NEXT:    orq %rcx, %rax
2100; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2101; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2102; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2103; KNL-NEXT:    kmovw %k0, %ecx
2104; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2105; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2106; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2107; KNL-NEXT:    kmovw %k0, %esi
2108; KNL-NEXT:    shll $16, %esi
2109; KNL-NEXT:    orl %ecx, %esi
2110; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2111; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2112; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2113; KNL-NEXT:    kmovw %k0, %ecx
2114; KNL-NEXT:    vpmovsxbd {{[0-9]+}}(%rsp), %zmm0
2115; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
2116; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
2117; KNL-NEXT:    kmovw %k0, %edx
2118; KNL-NEXT:    shll $16, %edx
2119; KNL-NEXT:    orl %ecx, %edx
2120; KNL-NEXT:    shlq $32, %rdx
2121; KNL-NEXT:    orq %rsi, %rdx
2122; KNL-NEXT:    movq %rbp, %rsp
2123; KNL-NEXT:    popq %rbp
2124; KNL-NEXT:    vzeroupper
2125; KNL-NEXT:    retq
2126;
2127; SKX-LABEL: test_insertelement_variable_v128i1:
2128; SKX:       ## %bb.0:
2129; SKX-NEXT:    pushq %rbp
2130; SKX-NEXT:    .cfi_def_cfa_offset 16
2131; SKX-NEXT:    .cfi_offset %rbp, -16
2132; SKX-NEXT:    movq %rsp, %rbp
2133; SKX-NEXT:    .cfi_def_cfa_register %rbp
2134; SKX-NEXT:    andq $-64, %rsp
2135; SKX-NEXT:    subq $192, %rsp
2136; SKX-NEXT:    ## kill: def $esi killed $esi def $rsi
2137; SKX-NEXT:    vptestmb %zmm0, %zmm0, %k0
2138; SKX-NEXT:    vptestmb %zmm1, %zmm1, %k1
2139; SKX-NEXT:    andl $127, %esi
2140; SKX-NEXT:    testb %dil, %dil
2141; SKX-NEXT:    vpmovm2b %k1, %zmm0
2142; SKX-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
2143; SKX-NEXT:    vpmovm2b %k0, %zmm0
2144; SKX-NEXT:    vmovdqa64 %zmm0, (%rsp)
2145; SKX-NEXT:    setne (%rsp,%rsi)
2146; SKX-NEXT:    vpsllw $7, {{[0-9]+}}(%rsp), %zmm0
2147; SKX-NEXT:    vpmovb2m %zmm0, %k0
2148; SKX-NEXT:    vpsllw $7, (%rsp), %zmm0
2149; SKX-NEXT:    vpmovb2m %zmm0, %k1
2150; SKX-NEXT:    kmovq %k1, %rax
2151; SKX-NEXT:    kmovq %k0, %rdx
2152; SKX-NEXT:    movq %rbp, %rsp
2153; SKX-NEXT:    popq %rbp
2154; SKX-NEXT:    vzeroupper
2155; SKX-NEXT:    retq
2156  %t1 = icmp ugt <128 x i8> %a, zeroinitializer
2157  %t2 = icmp ugt i8 %b, 0
2158  %t3 = insertelement <128 x i1> %t1, i1 %t2, i32 %index
2159  %t4 = bitcast <128 x i1> %t3 to i128
2160  ret i128 %t4
2161}
2162
2163define void @test_concat_v2i1(<2 x half>* %arg, <2 x half>* %arg1, <2 x half>* %arg2) {
2164; KNL-LABEL: test_concat_v2i1:
2165; KNL:       ## %bb.0:
2166; KNL-NEXT:    movzwl 2(%rdi), %eax
2167; KNL-NEXT:    movzwl (%rdi), %ecx
2168; KNL-NEXT:    vmovd %ecx, %xmm0
2169; KNL-NEXT:    vcvtph2ps %xmm0, %xmm0
2170; KNL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2171; KNL-NEXT:    vucomiss %xmm1, %xmm0
2172; KNL-NEXT:    setb %cl
2173; KNL-NEXT:    andl $1, %ecx
2174; KNL-NEXT:    kmovw %ecx, %k0
2175; KNL-NEXT:    vmovd %eax, %xmm2
2176; KNL-NEXT:    vcvtph2ps %xmm2, %xmm2
2177; KNL-NEXT:    vucomiss %xmm1, %xmm2
2178; KNL-NEXT:    setb %al
2179; KNL-NEXT:    kmovw %eax, %k1
2180; KNL-NEXT:    kshiftlw $1, %k1, %k1
2181; KNL-NEXT:    korw %k1, %k0, %k0
2182; KNL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2183; KNL-NEXT:    vucomiss %xmm1, %xmm0
2184; KNL-NEXT:    seta %al
2185; KNL-NEXT:    andl $1, %eax
2186; KNL-NEXT:    kmovw %eax, %k1
2187; KNL-NEXT:    vucomiss %xmm1, %xmm2
2188; KNL-NEXT:    seta %al
2189; KNL-NEXT:    kmovw %eax, %k2
2190; KNL-NEXT:    kshiftlw $1, %k2, %k2
2191; KNL-NEXT:    korw %k2, %k1, %k1
2192; KNL-NEXT:    kandw %k1, %k0, %k0
2193; KNL-NEXT:    kshiftrw $1, %k0, %k1
2194; KNL-NEXT:    kmovw %k1, %ecx
2195; KNL-NEXT:    xorl %eax, %eax
2196; KNL-NEXT:    testb $1, %cl
2197; KNL-NEXT:    movl $0, %ecx
2198; KNL-NEXT:    je LBB85_2
2199; KNL-NEXT:  ## %bb.1:
2200; KNL-NEXT:    movzwl 2(%rsi), %ecx
2201; KNL-NEXT:  LBB85_2:
2202; KNL-NEXT:    kmovw %k0, %edi
2203; KNL-NEXT:    testb $1, %dil
2204; KNL-NEXT:    je LBB85_4
2205; KNL-NEXT:  ## %bb.3:
2206; KNL-NEXT:    movzwl (%rsi), %eax
2207; KNL-NEXT:  LBB85_4:
2208; KNL-NEXT:    movw %ax, (%rdx)
2209; KNL-NEXT:    movw %cx, 2(%rdx)
2210; KNL-NEXT:    retq
2211;
2212; SKX-LABEL: test_concat_v2i1:
2213; SKX:       ## %bb.0:
2214; SKX-NEXT:    movzwl (%rdi), %eax
2215; SKX-NEXT:    movzwl 2(%rdi), %ecx
2216; SKX-NEXT:    vmovd %ecx, %xmm0
2217; SKX-NEXT:    vcvtph2ps %xmm0, %xmm0
2218; SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2219; SKX-NEXT:    vucomiss %xmm1, %xmm0
2220; SKX-NEXT:    setb %cl
2221; SKX-NEXT:    kmovd %ecx, %k0
2222; SKX-NEXT:    kshiftlb $1, %k0, %k0
2223; SKX-NEXT:    vmovd %eax, %xmm2
2224; SKX-NEXT:    vcvtph2ps %xmm2, %xmm2
2225; SKX-NEXT:    vucomiss %xmm1, %xmm2
2226; SKX-NEXT:    setb %al
2227; SKX-NEXT:    kmovd %eax, %k1
2228; SKX-NEXT:    kshiftlb $7, %k1, %k1
2229; SKX-NEXT:    kshiftrb $7, %k1, %k1
2230; SKX-NEXT:    korw %k0, %k1, %k0
2231; SKX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
2232; SKX-NEXT:    vucomiss %xmm1, %xmm0
2233; SKX-NEXT:    seta %al
2234; SKX-NEXT:    kmovd %eax, %k1
2235; SKX-NEXT:    kshiftlb $1, %k1, %k1
2236; SKX-NEXT:    vucomiss %xmm1, %xmm2
2237; SKX-NEXT:    seta %al
2238; SKX-NEXT:    kmovd %eax, %k2
2239; SKX-NEXT:    kshiftlb $7, %k2, %k2
2240; SKX-NEXT:    kshiftrb $7, %k2, %k2
2241; SKX-NEXT:    korw %k1, %k2, %k1
2242; SKX-NEXT:    kandw %k1, %k0, %k0
2243; SKX-NEXT:    kshiftrb $1, %k0, %k1
2244; SKX-NEXT:    kmovd %k1, %ecx
2245; SKX-NEXT:    xorl %eax, %eax
2246; SKX-NEXT:    testb $1, %cl
2247; SKX-NEXT:    movl $0, %ecx
2248; SKX-NEXT:    je LBB85_2
2249; SKX-NEXT:  ## %bb.1:
2250; SKX-NEXT:    movzwl 2(%rsi), %ecx
2251; SKX-NEXT:  LBB85_2:
2252; SKX-NEXT:    kmovd %k0, %edi
2253; SKX-NEXT:    testb $1, %dil
2254; SKX-NEXT:    je LBB85_4
2255; SKX-NEXT:  ## %bb.3:
2256; SKX-NEXT:    movzwl (%rsi), %eax
2257; SKX-NEXT:  LBB85_4:
2258; SKX-NEXT:    movw %ax, (%rdx)
2259; SKX-NEXT:    movw %cx, 2(%rdx)
2260; SKX-NEXT:    retq
2261  %tmp = load <2 x half>, <2 x half>* %arg, align 8
2262  %tmp3 = fcmp fast olt <2 x half> %tmp, <half 0xH4600, half 0xH4600>
2263  %tmp4 = fcmp fast ogt <2 x half> %tmp, zeroinitializer
2264  %tmp5 = and <2 x i1> %tmp3, %tmp4
2265  %tmp6 = load <2 x half>, <2 x half>* %arg1, align 8
2266  %tmp7 = select <2 x i1> %tmp5, <2 x half> %tmp6, <2 x half> zeroinitializer
2267  store <2 x half> %tmp7, <2 x half>* %arg2, align 8
2268  ret void
2269}
2270