1; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core2 < %s | FileCheck --check-prefix=SSE2-CODEGEN %s
2; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -cost-model -analyze < %s | FileCheck --check-prefix=SSE2 %s
3
; shift2i16: shl of a <2 x i16> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 20; the
; codegen check expects "shlq %cl" to appear in the llc output.
4%shifttype = type <2 x i16>
5define %shifttype @shift2i16(%shifttype %a, %shifttype %b) {
6entry:
7  ; SSE2: shift2i16
8  ; SSE2: cost of 20 {{.*}} shl
9  ; SSE2-CODEGEN: shift2i16
10  ; SSE2-CODEGEN: shlq %cl
11
12  %0 = shl %shifttype %a , %b
13  ret %shifttype %0
14}
15
; shift4i16: shl of a <4 x i16> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 10; the
; codegen check expects "pmuludq" to appear in the llc output.
16%shifttype4i16 = type <4 x i16>
17define %shifttype4i16 @shift4i16(%shifttype4i16 %a, %shifttype4i16 %b) {
18entry:
19  ; SSE2: shift4i16
20  ; SSE2: cost of 10 {{.*}} shl
21  ; SSE2-CODEGEN: shift4i16
22  ; SSE2-CODEGEN: pmuludq
23
24  %0 = shl %shifttype4i16 %a , %b
25  ret %shifttype4i16 %0
26}
27
; shift8i16: shl of a <8 x i16> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 80; the
; codegen check expects "shll %cl" to appear in the llc output.
28%shifttype8i16 = type <8 x i16>
29define %shifttype8i16 @shift8i16(%shifttype8i16 %a, %shifttype8i16 %b) {
30entry:
31  ; SSE2: shift8i16
32  ; SSE2: cost of 80 {{.*}} shl
33  ; SSE2-CODEGEN: shift8i16
34  ; SSE2-CODEGEN: shll %cl
35
36  %0 = shl %shifttype8i16 %a , %b
37  ret %shifttype8i16 %0
38}
39
; shift16i16: shl of a <16 x i16> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 160; the
; codegen check expects "shll %cl" to appear in the llc output.
40%shifttype16i16 = type <16 x i16>
41define %shifttype16i16 @shift16i16(%shifttype16i16 %a, %shifttype16i16 %b) {
42entry:
43  ; SSE2: shift16i16
44  ; SSE2: cost of 160 {{.*}} shl
45  ; SSE2-CODEGEN: shift16i16
46  ; SSE2-CODEGEN: shll %cl
47
48  %0 = shl %shifttype16i16 %a , %b
49  ret %shifttype16i16 %0
50}
51
; shift32i16: shl of a <32 x i16> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 320; the
; codegen check expects "shll %cl" to appear in the llc output.
52%shifttype32i16 = type <32 x i16>
53define %shifttype32i16 @shift32i16(%shifttype32i16 %a, %shifttype32i16 %b) {
54entry:
55  ; SSE2: shift32i16
56  ; SSE2: cost of 320 {{.*}} shl
57  ; SSE2-CODEGEN: shift32i16
58  ; SSE2-CODEGEN: shll %cl
59
60  %0 = shl %shifttype32i16 %a , %b
61  ret %shifttype32i16 %0
62}
63
; shift2i32: shl of a <2 x i32> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 20; the
; codegen check expects "shlq %cl" to appear in the llc output.
64%shifttype2i32 = type <2 x i32>
65define %shifttype2i32 @shift2i32(%shifttype2i32 %a, %shifttype2i32 %b) {
66entry:
67  ; SSE2: shift2i32
68  ; SSE2: cost of 20 {{.*}} shl
69  ; SSE2-CODEGEN: shift2i32
70  ; SSE2-CODEGEN: shlq %cl
71
72  %0 = shl %shifttype2i32 %a , %b
73  ret %shifttype2i32 %0
74}
75
; shift4i32: shl of a <4 x i32> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 10; the
; codegen check expects "pmuludq" to appear in the llc output.
76%shifttype4i32 = type <4 x i32>
77define %shifttype4i32 @shift4i32(%shifttype4i32 %a, %shifttype4i32 %b) {
78entry:
79  ; SSE2: shift4i32
80  ; SSE2: cost of 10 {{.*}} shl
81  ; SSE2-CODEGEN: shift4i32
82  ; SSE2-CODEGEN: pmuludq
83
84  %0 = shl %shifttype4i32 %a , %b
85  ret %shifttype4i32 %0
86}
87
; shift8i32: shl of a <8 x i32> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 20; the
; codegen check expects "pmuludq" to appear in the llc output.
88%shifttype8i32 = type <8 x i32>
89define %shifttype8i32 @shift8i32(%shifttype8i32 %a, %shifttype8i32 %b) {
90entry:
91  ; SSE2: shift8i32
92  ; SSE2: cost of 20 {{.*}} shl
93  ; SSE2-CODEGEN: shift8i32
94  ; SSE2-CODEGEN: pmuludq
95
96  %0 = shl %shifttype8i32 %a , %b
97  ret %shifttype8i32 %0
98}
99
; shift16i32: shl of a <16 x i32> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 40; the
; codegen check expects "pmuludq" to appear in the llc output.
100%shifttype16i32 = type <16 x i32>
101define %shifttype16i32 @shift16i32(%shifttype16i32 %a, %shifttype16i32 %b) {
102entry:
103  ; SSE2: shift16i32
104  ; SSE2: cost of 40 {{.*}} shl
105  ; SSE2-CODEGEN: shift16i32
106  ; SSE2-CODEGEN: pmuludq
107
108  %0 = shl %shifttype16i32 %a , %b
109  ret %shifttype16i32 %0
110}
111
; shift32i32: shl of a <32 x i32> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 80; the
; codegen check expects "pmuludq" to appear in the llc output.
112%shifttype32i32 = type <32 x i32>
113define %shifttype32i32 @shift32i32(%shifttype32i32 %a, %shifttype32i32 %b) {
114entry:
115  ; SSE2: shift32i32
116  ; SSE2: cost of 80 {{.*}} shl
117  ; SSE2-CODEGEN: shift32i32
118  ; SSE2-CODEGEN: pmuludq
119
120  %0 = shl %shifttype32i32 %a , %b
121  ret %shifttype32i32 %0
122}
123
; shift2i64: shl of a <2 x i64> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 20; the
; codegen check expects "shlq %cl" to appear in the llc output.
124%shifttype2i64 = type <2 x i64>
125define %shifttype2i64 @shift2i64(%shifttype2i64 %a, %shifttype2i64 %b) {
126entry:
127  ; SSE2: shift2i64
128  ; SSE2: cost of 20 {{.*}} shl
129  ; SSE2-CODEGEN: shift2i64
130  ; SSE2-CODEGEN: shlq %cl
131
132  %0 = shl %shifttype2i64 %a , %b
133  ret %shifttype2i64 %0
134}
135
; shift4i64: shl of a <4 x i64> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 40; the
; codegen check expects "shlq %cl" to appear in the llc output.
136%shifttype4i64 = type <4 x i64>
137define %shifttype4i64 @shift4i64(%shifttype4i64 %a, %shifttype4i64 %b) {
138entry:
139  ; SSE2: shift4i64
140  ; SSE2: cost of 40 {{.*}} shl
141  ; SSE2-CODEGEN: shift4i64
142  ; SSE2-CODEGEN: shlq %cl
143
144  %0 = shl %shifttype4i64 %a , %b
145  ret %shifttype4i64 %0
146}
147
; shift8i64: shl of a <8 x i64> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 80; the
; codegen check expects "shlq %cl" to appear in the llc output.
148%shifttype8i64 = type <8 x i64>
149define %shifttype8i64 @shift8i64(%shifttype8i64 %a, %shifttype8i64 %b) {
150entry:
151  ; SSE2: shift8i64
152  ; SSE2: cost of 80 {{.*}} shl
153  ; SSE2-CODEGEN: shift8i64
154  ; SSE2-CODEGEN: shlq %cl
155
156  %0 = shl %shifttype8i64 %a , %b
157  ret %shifttype8i64 %0
158}
159
; shift16i64: shl of a <16 x i64> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 160; the
; codegen check expects "shlq %cl" to appear in the llc output.
160%shifttype16i64 = type <16 x i64>
161define %shifttype16i64 @shift16i64(%shifttype16i64 %a, %shifttype16i64 %b) {
162entry:
163  ; SSE2: shift16i64
164  ; SSE2: cost of 160 {{.*}} shl
165  ; SSE2-CODEGEN: shift16i64
166  ; SSE2-CODEGEN: shlq %cl
167
168  %0 = shl %shifttype16i64 %a , %b
169  ret %shifttype16i64 %0
170}
171
; shift32i64: shl of a <32 x i64> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 320; the
; codegen check expects "shlq %cl" to appear in the llc output.
172%shifttype32i64 = type <32 x i64>
173define %shifttype32i64 @shift32i64(%shifttype32i64 %a, %shifttype32i64 %b) {
174entry:
175  ; SSE2: shift32i64
176  ; SSE2: cost of 320 {{.*}} shl
177  ; SSE2-CODEGEN: shift32i64
178  ; SSE2-CODEGEN: shlq %cl
179
180  %0 = shl %shifttype32i64 %a , %b
181  ret %shifttype32i64 %0
182}
183
; shift2i8: shl of a <2 x i8> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 20; the
; codegen check expects "shlq %cl" to appear in the llc output.
184%shifttype2i8 = type <2 x i8>
185define %shifttype2i8 @shift2i8(%shifttype2i8 %a, %shifttype2i8 %b) {
186entry:
187  ; SSE2: shift2i8
188  ; SSE2: cost of 20 {{.*}} shl
189  ; SSE2-CODEGEN: shift2i8
190  ; SSE2-CODEGEN: shlq %cl
191
192  %0 = shl %shifttype2i8 %a , %b
193  ret %shifttype2i8 %0
194}
195
; shift4i8: shl of a <4 x i8> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 10; the
; codegen check expects "pmuludq" to appear in the llc output.
196%shifttype4i8 = type <4 x i8>
197define %shifttype4i8 @shift4i8(%shifttype4i8 %a, %shifttype4i8 %b) {
198entry:
199  ; SSE2: shift4i8
200  ; SSE2: cost of 10 {{.*}} shl
201  ; SSE2-CODEGEN: shift4i8
202  ; SSE2-CODEGEN: pmuludq
203
204  %0 = shl %shifttype4i8 %a , %b
205  ret %shifttype4i8 %0
206}
207
; shift8i8: shl of a <8 x i8> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 80; the
; codegen check expects "shll" (no operand is pinned) in the llc output.
208%shifttype8i8 = type <8 x i8>
209define %shifttype8i8 @shift8i8(%shifttype8i8 %a, %shifttype8i8 %b) {
210entry:
211  ; SSE2: shift8i8
212  ; SSE2: cost of 80 {{.*}} shl
213  ; SSE2-CODEGEN: shift8i8
214  ; SSE2-CODEGEN: shll
215
216  %0 = shl %shifttype8i8 %a , %b
217  ret %shifttype8i8 %0
218}
219
; shift16i8: shl of a <16 x i8> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 30; the
; codegen check expects the substring "cmpeqb" to appear in the llc output.
220%shifttype16i8 = type <16 x i8>
221define %shifttype16i8 @shift16i8(%shifttype16i8 %a, %shifttype16i8 %b) {
222entry:
223  ; SSE2: shift16i8
224  ; SSE2: cost of 30 {{.*}} shl
225  ; SSE2-CODEGEN: shift16i8
226  ; SSE2-CODEGEN: cmpeqb
227
228  %0 = shl %shifttype16i8 %a , %b
229  ret %shifttype16i8 %0
230}
231
; shift32i8: shl of a <32 x i8> value by the per-lane variable amounts in %b.
; The cost-model check expects the shl to be reported with a cost of 60; the
; codegen check expects the substring "cmpeqb" to appear in the llc output.
232%shifttype32i8 = type <32 x i8>
233define %shifttype32i8 @shift32i8(%shifttype32i8 %a, %shifttype32i8 %b) {
234entry:
235  ; SSE2: shift32i8
236  ; SSE2: cost of 60 {{.*}} shl
237  ; SSE2-CODEGEN: shift32i8
238  ; SSE2-CODEGEN: cmpeqb
239
240  %0 = shl %shifttype32i8 %a , %b
241  ret %shifttype32i8 %0
242}
243
244; Test shifts by a constant vector (every lane is shifted left by 3).
245
; shift2i16const: shl of a <2 x i16> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "psllq $3" to appear in the llc output.
246%shifttypec = type <2 x i16>
247define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
248entry:
249  ; SSE2: shift2i16const
250  ; SSE2: cost of 1 {{.*}} shl
251  ; SSE2-CODEGEN: shift2i16const
252  ; SSE2-CODEGEN: psllq $3
253
254  %0 = shl %shifttypec %a , <i16 3, i16 3>
255  ret %shifttypec %0
256}
257
; shift4i16const: shl of a <4 x i16> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "pslld $3" to appear in the llc output.
258%shifttypec4i16 = type <4 x i16>
259define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
260entry:
261  ; SSE2: shift4i16const
262  ; SSE2: cost of 1 {{.*}} shl
263  ; SSE2-CODEGEN: shift4i16const
264  ; SSE2-CODEGEN: pslld $3
265
266  %0 = shl %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
267  ret %shifttypec4i16 %0
268}
269
; shift8i16const: shl of a <8 x i16> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "psllw $3" to appear in the llc output.
270%shifttypec8i16 = type <8 x i16>
271define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
272entry:
273  ; SSE2: shift8i16const
274  ; SSE2: cost of 1 {{.*}} shl
275  ; SSE2-CODEGEN: shift8i16const
276  ; SSE2-CODEGEN: psllw $3
277
278  %0 = shl %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
279                                  i16 3, i16 3, i16 3, i16 3>
280  ret %shifttypec8i16 %0
281}
282
; shift16i16const: shl of a <16 x i16> value by a constant vector whose lanes
; are all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 2; the
; codegen check expects "psllw $3" to appear in the llc output.
283%shifttypec16i16 = type <16 x i16>
284define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
285                                         %shifttypec16i16 %b) {
286entry:
287  ; SSE2: shift16i16const
288  ; SSE2: cost of 2 {{.*}} shl
289  ; SSE2-CODEGEN: shift16i16const
290  ; SSE2-CODEGEN: psllw $3
291
292  %0 = shl %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
293                                   i16 3, i16 3, i16 3, i16 3,
294                                   i16 3, i16 3, i16 3, i16 3,
295                                   i16 3, i16 3, i16 3, i16 3>
296  ret %shifttypec16i16 %0
297}
298
; shift32i16const: shl of a <32 x i16> value by a constant vector whose lanes
; are all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 4; the
; codegen check expects "psllw $3" to appear in the llc output.
299%shifttypec32i16 = type <32 x i16>
300define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
301                                        %shifttypec32i16 %b) {
302entry:
303  ; SSE2: shift32i16const
304  ; SSE2: cost of 4 {{.*}} shl
305  ; SSE2-CODEGEN: shift32i16const
306  ; SSE2-CODEGEN: psllw $3
307
308  %0 = shl %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
309                                   i16 3, i16 3, i16 3, i16 3,
310                                   i16 3, i16 3, i16 3, i16 3,
311                                   i16 3, i16 3, i16 3, i16 3,
312                                   i16 3, i16 3, i16 3, i16 3,
313                                   i16 3, i16 3, i16 3, i16 3,
314                                   i16 3, i16 3, i16 3, i16 3,
315                                   i16 3, i16 3, i16 3, i16 3>
316  ret %shifttypec32i16 %0
317}
318
; shift2i32c: shl of a <2 x i32> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "psllq $3" to appear in the llc output.
319%shifttypec2i32 = type <2 x i32>
320define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
321entry:
322  ; SSE2: shift2i32c
323  ; SSE2: cost of 1 {{.*}} shl
324  ; SSE2-CODEGEN: shift2i32c
325  ; SSE2-CODEGEN: psllq $3
326
327  %0 = shl %shifttypec2i32 %a , <i32 3, i32 3>
328  ret %shifttypec2i32 %0
329}
330
; shift4i32c: shl of a <4 x i32> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "pslld $3" to appear in the llc output.
331%shifttypec4i32 = type <4 x i32>
332define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
333entry:
334  ; SSE2: shift4i32c
335  ; SSE2: cost of 1 {{.*}} shl
336  ; SSE2-CODEGEN: shift4i32c
337  ; SSE2-CODEGEN: pslld $3
338
339  %0 = shl %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
340  ret %shifttypec4i32 %0
341}
342
; shift8i32c: shl of a <8 x i32> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 2; the
; codegen check expects "pslld $3" to appear in the llc output.
343%shifttypec8i32 = type <8 x i32>
344define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
345entry:
346  ; SSE2: shift8i32c
347  ; SSE2: cost of 2 {{.*}} shl
348  ; SSE2-CODEGEN: shift8i32c
349  ; SSE2-CODEGEN: pslld $3
350
351  %0 = shl %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
352                                  i32 3, i32 3, i32 3, i32 3>
353  ret %shifttypec8i32 %0
354}
355
; shift16i32c: shl of a <16 x i32> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 4; the
; codegen check expects "pslld $3" to appear in the llc output.
356%shifttypec16i32 = type <16 x i32>
357define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
358entry:
359  ; SSE2: shift16i32c
360  ; SSE2: cost of 4 {{.*}} shl
361  ; SSE2-CODEGEN: shift16i32c
362  ; SSE2-CODEGEN: pslld $3
363
364  %0 = shl %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
365                                   i32 3, i32 3, i32 3, i32 3,
366                                   i32 3, i32 3, i32 3, i32 3,
367                                   i32 3, i32 3, i32 3, i32 3>
368  ret %shifttypec16i32 %0
369}
370
; shift32i32c: shl of a <32 x i32> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 8; the
; codegen check expects "pslld $3" to appear in the llc output.
371%shifttypec32i32 = type <32 x i32>
372define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
373entry:
374  ; SSE2: shift32i32c
375  ; SSE2: cost of 8 {{.*}} shl
376  ; SSE2-CODEGEN: shift32i32c
377  ; SSE2-CODEGEN: pslld $3

378  %0 = shl %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
379                                   i32 3, i32 3, i32 3, i32 3,
380                                   i32 3, i32 3, i32 3, i32 3,
381                                   i32 3, i32 3, i32 3, i32 3,
382                                   i32 3, i32 3, i32 3, i32 3,
383                                   i32 3, i32 3, i32 3, i32 3,
384                                   i32 3, i32 3, i32 3, i32 3,
385                                   i32 3, i32 3, i32 3, i32 3>
386  ret %shifttypec32i32 %0
387}
388
; shift2i64c: shl of a <2 x i64> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "psllq $3" to appear in the llc output.
389%shifttypec2i64 = type <2 x i64>
390define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
391entry:
392  ; SSE2: shift2i64c
393  ; SSE2: cost of 1 {{.*}} shl
394  ; SSE2-CODEGEN: shift2i64c
395  ; SSE2-CODEGEN: psllq $3
396
397  %0 = shl %shifttypec2i64 %a , <i64 3, i64 3>
398  ret %shifttypec2i64 %0
399}
400
; shift4i64c: shl of a <4 x i64> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 2; the
; codegen check expects "psllq $3" to appear in the llc output.
401%shifttypec4i64 = type <4 x i64>
402define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
403entry:
404  ; SSE2: shift4i64c
405  ; SSE2: cost of 2 {{.*}} shl
406  ; SSE2-CODEGEN: shift4i64c
407  ; SSE2-CODEGEN: psllq $3
408
409  %0 = shl %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
410  ret %shifttypec4i64 %0
411}
412
; shift8i64c: shl of a <8 x i64> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 4; the
; codegen check expects "psllq $3" to appear in the llc output.
413%shifttypec8i64 = type <8 x i64>
414define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
415entry:
416  ; SSE2: shift8i64c
417  ; SSE2: cost of 4 {{.*}} shl
418  ; SSE2-CODEGEN: shift8i64c
419  ; SSE2-CODEGEN: psllq $3
420
421  %0 = shl %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
422                                 i64 3, i64 3, i64 3, i64 3>
423  ret %shifttypec8i64 %0
424}
425
; shift16i64c: shl of a <16 x i64> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 8; the
; codegen check expects "psllq $3" to appear in the llc output.
426%shifttypec16i64 = type <16 x i64>
427define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
428entry:
429  ; SSE2: shift16i64c
430  ; SSE2: cost of 8 {{.*}} shl
431  ; SSE2-CODEGEN: shift16i64c
432  ; SSE2-CODEGEN: psllq $3
433
434  %0 = shl %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
435                                   i64 3, i64 3, i64 3, i64 3,
436                                   i64 3, i64 3, i64 3, i64 3,
437                                   i64 3, i64 3, i64 3, i64 3>
438  ret %shifttypec16i64 %0
439}
440
; shift32i64c: shl of a <32 x i64> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 16; the
; codegen check expects "psllq $3" to appear in the llc output.
441%shifttypec32i64 = type <32 x i64>
442define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
443entry:
444  ; SSE2: shift32i64c
445  ; SSE2: cost of 16 {{.*}} shl
446  ; SSE2-CODEGEN: shift32i64c
447  ; SSE2-CODEGEN: psllq $3
448
449  %0 = shl %shifttypec32i64 %a , <i64 3, i64 3, i64 3, i64 3,
450                                  i64 3, i64 3, i64 3, i64 3,
451                                  i64 3, i64 3, i64 3, i64 3,
452                                  i64 3, i64 3, i64 3, i64 3,
453                                  i64 3, i64 3, i64 3, i64 3,
454                                  i64 3, i64 3, i64 3, i64 3,
455                                  i64 3, i64 3, i64 3, i64 3,
456                                  i64 3, i64 3, i64 3, i64 3>
457  ret %shifttypec32i64 %0
458}
459
; shift2i8c: shl of a <2 x i8> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "psllq $3" to appear in the llc output.
460%shifttypec2i8 = type <2 x i8>
461define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
462entry:
463  ; SSE2: shift2i8c
464  ; SSE2: cost of 1 {{.*}} shl
465  ; SSE2-CODEGEN: shift2i8c
466  ; SSE2-CODEGEN: psllq $3
467
468  %0 = shl %shifttypec2i8 %a , <i8 3, i8 3>
469  ret %shifttypec2i8 %0
470}
471
; shift4i8c: shl of a <4 x i8> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "pslld $3" to appear in the llc output.
472%shifttypec4i8 = type <4 x i8>
473define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
474entry:
475  ; SSE2: shift4i8c
476  ; SSE2: cost of 1 {{.*}} shl
477  ; SSE2-CODEGEN: shift4i8c
478  ; SSE2-CODEGEN: pslld $3
479
480  %0 = shl %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
481  ret %shifttypec4i8 %0
482}
483
; shift8i8c: shl of a <8 x i8> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "psllw $3" to appear in the llc output.
484%shifttypec8i8 = type <8 x i8>
485define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
486entry:
487  ; SSE2: shift8i8c
488  ; SSE2: cost of 1 {{.*}} shl
489  ; SSE2-CODEGEN: shift8i8c
490  ; SSE2-CODEGEN: psllw $3
491
492  %0 = shl %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
493                                 i8 3, i8 3, i8 3, i8 3>
494  ret %shifttypec8i8 %0
495}
496
; shift16i8c: shl of a <16 x i8> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 1; the
; codegen check expects "psllw $3" to appear in the llc output.
497%shifttypec16i8 = type <16 x i8>
498define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
499entry:
500  ; SSE2: shift16i8c
501  ; SSE2: cost of 1 {{.*}} shl
502  ; SSE2-CODEGEN: shift16i8c
503  ; SSE2-CODEGEN: psllw $3
504
505  %0 = shl %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
506                                  i8 3, i8 3, i8 3, i8 3,
507                                  i8 3, i8 3, i8 3, i8 3,
508                                  i8 3, i8 3, i8 3, i8 3>
509  ret %shifttypec16i8 %0
510}
511
; shift32i8c: shl of a <32 x i8> value by a constant vector whose lanes are
; all 3. The %b parameter is not used by the body.
; The cost-model check expects the shl to be reported with a cost of 2; the
; codegen check expects "psllw $3" to appear in the llc output.
512%shifttypec32i8 = type <32 x i8>
513define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
514entry:
515  ; SSE2: shift32i8c
516  ; SSE2: cost of 2 {{.*}} shl
517  ; SSE2-CODEGEN: shift32i8c
518  ; SSE2-CODEGEN: psllw $3
519
520  %0 = shl %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
521                                  i8 3, i8 3, i8 3, i8 3,
522                                  i8 3, i8 3, i8 3, i8 3,
523                                  i8 3, i8 3, i8 3, i8 3,
524                                  i8 3, i8 3, i8 3, i8 3,
525                                  i8 3, i8 3, i8 3, i8 3,
526                                  i8 3, i8 3, i8 3, i8 3,
527                                  i8 3, i8 3, i8 3, i8 3>
528  ret %shifttypec32i8 %0
529}
530