; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=X32 --check-prefix=CHECK
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=X64 --check-prefix=CHECK
3
; Select between two <4 x i32> vectors under a <4 x i1> mask.
; On both targets the select is expected to be lowered to a variable blend
; (vblendvps). The label is matched by symbol name, like the other
; functions in this file, instead of relying on the verbose-asm comment.
define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: blendvb_fallback_v4i32:
; CHECK: vblendvps
; CHECK: ret
  %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
  ret <4 x i32> %ret
}
11
; Select between two <8 x i32> vectors under an <8 x i1> mask.
; On both targets the select is expected to be lowered to a variable blend
; (vblendvps). The label is matched by symbol name, like the other
; functions in this file, instead of relying on the verbose-asm comment.
define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: blendvb_fallback_v8i32:
; CHECK: vblendvps
; CHECK: ret
  %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
  ret <8 x i32> %ret
}
19
; Select between two <8 x float> vectors under an <8 x i1> mask.
; On both targets the select is expected to be lowered to a variable blend
; (vblendvps). The label is matched by symbol name, like the other
; functions in this file, instead of relying on the verbose-asm comment.
define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
; CHECK-LABEL: blendvb_fallback_v8f32:
; CHECK: vblendvps
; CHECK: ret
  %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
  ret <8 x float> %ret
}
27
; SSE4.1 insertps intrinsic. Operands: destination vector, source vector,
; and the 8-bit control immediate. The tests below pass 48, 96 and 192 to
; cover different source/destination element selections.
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
29
; A whole-vector load that only feeds insertps should be folded into the
; instruction: apart from the 32-bit target fetching %pb from the stack,
; no mov of any kind may precede the insertps.
define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; CHECK-LABEL: insertps_from_vector_load:
; On X32, account for the argument's move to registers
; X32: movl    4(%esp), %eax
; CHECK-NOT: mov
; CHECK: insertps    $48
; CHECK-NEXT: ret
  %src = load <4 x float>* %pb, align 16
  %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %src, i32 48)
  ret <4 x float> %res
}
41
;; Use a non-zero CountS for insertps
; The immediate 96 has CountS = 1 (bits 7:6), i.e. source element 1, so the
; folded load has to address 4 bytes past %pb. The pattern therefore also
; matches the memory operand, not just the mnemonic and immediate.
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
; CHECK-LABEL: insertps_from_vector_load_offset:
; On X32, account for the argument's move to registers
; X32: movl    4(%esp), %eax
; CHECK-NOT: mov
;; Try to match a bit more of the instr, since we need the load's offset.
; CHECK: insertps    $96, 4(%{{...}}), %
; CHECK-NEXT: ret
  %src = load <4 x float>* %pb, align 16
  %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %src, i32 96)
  ret <4 x float> %res
}
55
; Same folding as the offset test, but the vector is reached through a
; variable index. The immediate 192 has CountS = 3 (bits 7:6), so the folded
; load is expected at displacement 12 on top of a base+index address.
; The expected output spells the immediate as the signed byte -64 (192 - 256).
define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
; CHECK-LABEL: insertps_from_vector_load_offset_2:
; On X32, account for the argument's move to registers
; X32: movl    4(%esp), %eax
; X32: movl    8(%esp), %ecx
; CHECK-NOT: mov
;; Try to match a bit more of the instr, since we need the load's offset.
; CHECK: vinsertps    $-64, 12(%{{...}},%{{...}}), %
; CHECK-NEXT: ret
  %addr = getelementptr inbounds <4 x float>* %pb, i64 %index
  %src = load <4 x float>* %addr, align 16
  %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %src, i32 192)
  ret <4 x float> %res
}
70
; A scalar float is loaded, splatted into all four lanes, and the splat is
; used only by insertps. The scalar load is expected to be folded into the
; insertps: once the 32-bit target has fetched its stack arguments, no other
; mov may appear before the instruction.
define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
; CHECK-LABEL: insertps_from_broadcast_loadf32:
; On X32, account for the arguments' move to registers
; X32: movl    8(%esp), %eax
; X32: movl    4(%esp), %ecx
; CHECK-NOT: mov
; CHECK: insertps    $48
; CHECK-NEXT: ret
  %addr = getelementptr inbounds float* %fb, i64 %index
  %val = load float* %addr, align 4
  ; Build the splat <val, val, val, val> one lane at a time.
  %splat0 = insertelement <4 x float> undef, float %val, i32 0
  %splat1 = insertelement <4 x float> %splat0, float %val, i32 1
  %splat2 = insertelement <4 x float> %splat1, float %val, i32 2
  %splat3 = insertelement <4 x float> %splat2, float %val, i32 3
  %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %splat3, i32 48)
  ret <4 x float> %res
}
88
; Element 0 of a loaded vector is splatted into all four lanes and the splat
; is used only by insertps. As in the scalar-load variant, the load is
; expected to be folded into the insertps with no separate mov.
define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
; On X32, account for the argument's move to registers
; X32: movl    4(%esp), %{{...}}
; CHECK-NOT: mov
; CHECK: insertps    $48
; CHECK-NEXT: ret
  %vec = load <4 x float>* %b, align 4
  %val = extractelement <4 x float> %vec, i32 0
  ; Build the splat <val, val, val, val> one lane at a time.
  %splat0 = insertelement <4 x float> undef, float %val, i32 0
  %splat1 = insertelement <4 x float> %splat0, float %val, i32 1
  %splat2 = insertelement <4 x float> %splat1, float %val, i32 2
  %splat3 = insertelement <4 x float> %splat2, float %val, i32 3
  %res = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %splat3, i32 48)
  ret <4 x float> %res
}
105
;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
; The splat of a loaded scalar is consumed by four insertps calls whose
; results are summed. The current expected output is one vbroadcastss
; followed by four insertps and three vaddps, with no mov between the
; broadcast and the first insertps.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
; CHECK-LABEL: insertps_from_broadcast_multiple_use:
; On X32, account for the arguments' move to registers
; X32: movl    8(%esp), %eax
; X32: movl    4(%esp), %ecx
; CHECK: vbroadcastss
; CHECK-NOT: mov
; CHECK: insertps    $48
; CHECK: insertps    $48
; CHECK: insertps    $48
; CHECK: insertps    $48
; CHECK: vaddps
; CHECK: vaddps
; CHECK: vaddps
; CHECK-NEXT: ret
  %addr = getelementptr inbounds float* %fb, i64 %index
  %val = load float* %addr, align 4
  ; Build the splat <val, val, val, val> one lane at a time.
  %splat0 = insertelement <4 x float> undef, float %val, i32 0
  %splat1 = insertelement <4 x float> %splat0, float %val, i32 1
  %splat2 = insertelement <4 x float> %splat1, float %val, i32 2
  %splat3 = insertelement <4 x float> %splat2, float %val, i32 3
  ; Insert the same splat into each of the four destination vectors.
  %ins.a = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %splat3, i32 48)
  %ins.b = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %splat3, i32 48)
  %ins.c = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %splat3, i32 48)
  %ins.d = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %splat3, i32 48)
  ; Sum the four results so that every insertps stays live.
  %sum.ab = fadd <4 x float> %ins.a, %ins.b
  %sum.cd = fadd <4 x float> %ins.c, %ins.d
  %sum = fadd <4 x float> %sum.ab, %sum.cd
  ret <4 x float> %sum
}
137