; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2            | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3           | FileCheck %s --check-prefixes=SSSE3,SSSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx             | FileCheck %s --check-prefixes=AVX1,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops   | FileCheck %s --check-prefixes=AVX1,AVX1-FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2            | FileCheck %s --check-prefix=AVX2

; PR26491: partial horizontal reduction of a <4 x float>; fast-hops targets
; should select haddps/vhaddps, slow-hops targets use shuffle+add sequences.
define float @pr26491(<4 x float> %a0) {
; SSE2-LABEL: pr26491:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
; SSE2-NEXT:    addps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE2-NEXT:    addss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: pr26491:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-SLOW-NEXT:    addps %xmm0, %xmm1
; SSSE3-SLOW-NEXT:    movaps %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSSE3-SLOW-NEXT:    addss %xmm1, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: pr26491:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSSE3-FAST-NEXT:    addss %xmm1, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: pr26491:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-SLOW-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX1-SLOW-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: pr26491:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhaddps %xmm0, %xmm0, %xmm0
; AVX1-FAST-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-FAST-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: pr26491:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %2 = fadd <4 x float> %1, %a0
  %3 = extractelement <4 x float> %2, i32 2
  %4 = extractelement <4 x float> %2, i32 0
  %5 = fadd float %3, %4
  ret float %5
}

; When simplifying away a splat (broadcast), the hop type must match the shuffle type.

; PR41414: the uitofp-splat feeding the fdiv must keep hop and shuffle types in
; agreement when the broadcast is simplified away.
define <4 x double> @PR41414(i64 %x, <4 x double> %y) {
; SSE2-LABEL: PR41414:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %xmm2
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSE2-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    movapd %xmm2, %xmm3
; SSE2-NEXT:    unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE2-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE2-NEXT:    addpd %xmm3, %xmm2
; SSE2-NEXT:    divpd %xmm2, %xmm1
; SSE2-NEXT:    divpd %xmm2, %xmm0
; SSE2-NEXT:    xorpd %xmm2, %xmm2
; SSE2-NEXT:    addpd %xmm2, %xmm0
; SSE2-NEXT:    addpd %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR41414:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %xmm2
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
; SSSE3-NEXT:    subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSSE3-NEXT:    haddpd %xmm2, %xmm2
; SSSE3-NEXT:    divpd %xmm2, %xmm1
; SSSE3-NEXT:    divpd %xmm2, %xmm0
; SSSE3-NEXT:    xorpd %xmm2, %xmm2
; SSSE3-NEXT:    addpd %xmm2, %xmm0
; SSSE3-NEXT:    addpd %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: PR41414:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm1
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX1-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR41414:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovq %rdi, %xmm1
; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-NEXT:    vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT:    vaddsd %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
; AVX2-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %conv = uitofp i64 %x to double
  %t0 = insertelement <4 x double> undef, double %conv, i32 0
  %t1 = shufflevector <4 x double> %t0, <4 x double> undef, <4 x i32> zeroinitializer
  %t2 = fdiv <4 x double> %y, %t1
  %t3 = fadd <4 x double> zeroinitializer, %t2
  ret <4 x double> %t3
}
; PR48823: partial hsub pattern across two sources; fast-hops targets should
; select hsubps/vhsubps, slow-hops targets fall back to shuffles plus subps.
define <4 x float> @PR48823(<4 x float> %0, <4 x float> %1) {
; SSE2-LABEL: PR48823:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
; SSE2-NEXT:    subps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-SLOW-LABEL: PR48823:
; SSSE3-SLOW:       # %bb.0:
; SSSE3-SLOW-NEXT:    movaps %xmm0, %xmm2
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
; SSSE3-SLOW-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
; SSSE3-SLOW-NEXT:    subps %xmm2, %xmm0
; SSSE3-SLOW-NEXT:    retq
;
; SSSE3-FAST-LABEL: PR48823:
; SSSE3-FAST:       # %bb.0:
; SSSE3-FAST-NEXT:    hsubps %xmm1, %xmm0
; SSSE3-FAST-NEXT:    retq
;
; AVX1-SLOW-LABEL: PR48823:
; AVX1-SLOW:       # %bb.0:
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm1[2,3]
; AVX1-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
; AVX1-SLOW-NEXT:    vsubps %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT:    retq
;
; AVX1-FAST-LABEL: PR48823:
; AVX1-FAST:       # %bb.0:
; AVX1-FAST-NEXT:    vhsubps %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT:    retq
;
; AVX2-LABEL: PR48823:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,1],xmm1[2,3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2]
; AVX2-NEXT:    vsubps %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %3 = shufflevector <4 x float> %0, <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %4 = fsub <4 x float> %0, %3
  %5 = shufflevector <4 x float> %1, <4 x float> poison, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 2>
  %6 = fsub <4 x float> %5, %1
  %7 = shufflevector <4 x float> %4, <4 x float> %6, <4 x i32> <i32 0, i32 undef, i32 undef, i32 7>
  ret <4 x float> %7
}