1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
3; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
4
; Test that vector shifts whose per-element shift amounts are identical
; (produced by a shuffle splat) are converted into the corresponding
; SSE2 packed-shift instructions.
7
define void @shift1a(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
; X86-LABEL: shift1a:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    psllq %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shift1a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    psllq %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  ; Splat of lane 0 of %sh: both lanes shift by the same amount, so this
  ; lowers to a single PSLLQ taking its amount from the low qword of %sh.
  %shamt = shufflevector <2 x i64> %sh, <2 x i64> undef, <2 x i32> <i32 0, i32 0>
  %shl = shl <2 x i64> %val, %shamt
  store <2 x i64> %shl, <2 x i64>* %dst
  ret void
}
27
; shift1b cannot use a single packed shift because its lane amounts differ;
; each lane is shifted separately and the results are blended back together.
define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, <2 x i64> %sh) nounwind {
; X86-LABEL: shift1b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movdqa %xmm0, %xmm2
; X86-NEXT:    psllq %xmm1, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X86-NEXT:    psllq %xmm1, %xmm0
; X86-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X86-NEXT:    movapd %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shift1b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    psllq %xmm1, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X64-NEXT:    psllq %xmm1, %xmm0
; X64-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-NEXT:    movapd %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  ; Identity shuffle <0,1>: the two lanes keep distinct shift amounts, so a
  ; single packed shift cannot be used. Each lane is shifted with its own
  ; PSLLQ (the high amount moved down via PSHUFD) and the two results are
  ; blended back together with MOVSD.
  %shamt = shufflevector <2 x i64> %sh, <2 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shl = shl <2 x i64> %val, %shamt
  store <2 x i64> %shl, <2 x i64>* %dst
  ret void
}
56
define void @shift2a(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X86-LABEL: shift2a:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    psrlq $32, %xmm1
; X86-NEXT:    pslld %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shift2a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    psrlq $32, %xmm1
; X64-NEXT:    pslld %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  ; Splat of element 1 of the v2i32 amount: PSRLQ $32 moves that element into
  ; the low dword of xmm1, which then feeds a single PSLLD.
  %shamt = shufflevector <2 x i32> %amt, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %shl = shl <4 x i32> %val, %shamt
  store <4 x i32> %shl, <4 x i32>* %dst
  ret void
}
78
define void @shift2b(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X86-LABEL: shift2b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    psrlq $32, %xmm1
; X86-NEXT:    pslld %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shift2b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    psrlq $32, %xmm1
; X64-NEXT:    pslld %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  ; Same splat as shift2a but with one undef mask element; it must still be
  ; recognized as a uniform splat and lower to the same PSRLQ + PSLLD pair.
  %shamt = shufflevector <2 x i32> %amt, <2 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 1, i32 1>
  %shl = shl <4 x i32> %val, %shamt
  store <4 x i32> %shl, <4 x i32>* %dst
  ret void
}
100
define void @shift2c(<4 x i32> %val, <4 x i32>* %dst, <2 x i32> %amt) nounwind {
; X86-LABEL: shift2c:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    psrlq $32, %xmm1
; X86-NEXT:    pslld %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shift2c:
; X64:       # %bb.0: # %entry
; X64-NEXT:    psrlq $32, %xmm1
; X64-NEXT:    pslld %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  ; NOTE(review): IR is identical to shift2a (same fully-defined splat mask);
  ; kept as separate coverage of the uniform-splat -> PSLLD lowering.
  %shamt = shufflevector <2 x i32> %amt, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %shl = shl <4 x i32> %val, %shamt
  store <4 x i32> %shl, <4 x i32>* %dst
  ret void
}
122
define void @shift3a(<8 x i16> %val, <8 x i16>* %dst, <8 x i16> %amt) nounwind {
; X86-LABEL: shift3a:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
; X86-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-NEXT:    psllw %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shift3a:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
; X64-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X64-NEXT:    psllw %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  ; Splat of element 6 of the v8i16 amount: PSHUFHW + PSRLDQ isolate that
  ; word (zero-extended) in the low bytes of xmm1 for a single PSLLW.
  %shamt = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
  %shl = shl <8 x i16> %val, %shamt
  store <8 x i16> %shl, <8 x i16>* %dst
  ret void
}
146
define void @shift3b(<8 x i16> %val, <8 x i16>* %dst, i16 %amt) nounwind {
; X86-LABEL: shift3b:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movd %ecx, %xmm1
; X86-NEXT:    psllw %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: shift3b:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movzwl %si, %eax
; X64-NEXT:    movd %eax, %xmm1
; X64-NEXT:    psllw %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    retq
entry:
  ; A scalar i16 amount inserted into every lane is a uniform splat; it is
  ; zero-extended (movzwl) into xmm1 and used by a single PSLLW.
  %0 = insertelement <8 x i16> undef, i16 %amt, i32 0
  %1 = insertelement <8 x i16> %0, i16 %amt, i32 1
  %2 = insertelement <8 x i16> %1, i16 %amt, i32 2
  %3 = insertelement <8 x i16> %2, i16 %amt, i32 3
  %4 = insertelement <8 x i16> %3, i16 %amt, i32 4
  %5 = insertelement <8 x i16> %4, i16 %amt, i32 5
  %6 = insertelement <8 x i16> %5, i16 %amt, i32 6
  %7 = insertelement <8 x i16> %6, i16 %amt, i32 7
  %shl = shl <8 x i16> %val, %7
  store <8 x i16> %shl, <8 x i16>* %dst
  ret void
}
177
178