1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512BW
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP
9
10; fold (sdiv x, 1) -> x
11define i32 @combine_sdiv_by_one(i32 %x) {
12; CHECK-LABEL: combine_sdiv_by_one:
13; CHECK:       # %bb.0:
14; CHECK-NEXT:    movl %edi, %eax
15; CHECK-NEXT:    retq
16  %1 = sdiv i32 %x, 1
17  ret i32 %1
18}
19
20define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
21; CHECK-LABEL: combine_vec_sdiv_by_one:
22; CHECK:       # %bb.0:
23; CHECK-NEXT:    retq
24  %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
25  ret <4 x i32> %1
26}
27
28; fold (sdiv x, -1) -> 0 - x
29define i32 @combine_sdiv_by_negone(i32 %x) {
30; CHECK-LABEL: combine_sdiv_by_negone:
31; CHECK:       # %bb.0:
32; CHECK-NEXT:    movl %edi, %eax
33; CHECK-NEXT:    negl %eax
34; CHECK-NEXT:    retq
35  %1 = sdiv i32 %x, -1
36  ret i32 %1
37}
38
39define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
40; SSE-LABEL: combine_vec_sdiv_by_negone:
41; SSE:       # %bb.0:
42; SSE-NEXT:    pxor %xmm1, %xmm1
43; SSE-NEXT:    psubd %xmm0, %xmm1
44; SSE-NEXT:    movdqa %xmm1, %xmm0
45; SSE-NEXT:    retq
46;
47; AVX-LABEL: combine_vec_sdiv_by_negone:
48; AVX:       # %bb.0:
49; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
50; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
51; AVX-NEXT:    retq
52  %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
53  ret <4 x i32> %1
54}
55
56; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
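; (Only x == INT_MIN has a magnitude at least |INT_MIN|, so the quotient is 1
; for that value and 0 for everything else.)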
57define i32 @combine_sdiv_by_minsigned(i32 %x) {
58; CHECK-LABEL: combine_sdiv_by_minsigned:
59; CHECK:       # %bb.0:
60; CHECK-NEXT:    xorl %eax, %eax
61; CHECK-NEXT:    cmpl $-2147483648, %edi # imm = 0x80000000
62; CHECK-NEXT:    sete %al
63; CHECK-NEXT:    retq
64  %1 = sdiv i32 %x, -2147483648
65  ret i32 %1
66}
67
68define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
69; SSE-LABEL: combine_vec_sdiv_by_minsigned:
70; SSE:       # %bb.0:
71; SSE-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
72; SSE-NEXT:    psrld $31, %xmm0
73; SSE-NEXT:    retq
74;
75; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
76; AVX1:       # %bb.0:
77; AVX1-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
78; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm0
79; AVX1-NEXT:    retq
80;
81; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
82; AVX2:       # %bb.0:
83; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
84; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
85; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
86; AVX2-NEXT:    retq
87;
88; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
89; AVX512F:       # %bb.0:
90; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
91; AVX512F-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
92; AVX512F-NEXT:    vpsrld $31, %xmm0, %xmm0
93; AVX512F-NEXT:    retq
94;
95; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
96; AVX512BW:       # %bb.0:
97; AVX512BW-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
98; AVX512BW-NEXT:    vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
99; AVX512BW-NEXT:    retq
100;
101; XOP-LABEL: combine_vec_sdiv_by_minsigned:
102; XOP:       # %bb.0:
103; XOP-NEXT:    vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
104; XOP-NEXT:    vpsrld $31, %xmm0, %xmm0
105; XOP-NEXT:    retq
106  %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
107  ret <4 x i32> %1
108}
109
110; fold (sdiv 0, x) -> 0
111define i32 @combine_sdiv_zero(i32 %x) {
112; CHECK-LABEL: combine_sdiv_zero:
113; CHECK:       # %bb.0:
114; CHECK-NEXT:    xorl %eax, %eax
115; CHECK-NEXT:    retq
116  %1 = sdiv i32 0, %x
117  ret i32 %1
118}
119
120define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
121; SSE-LABEL: combine_vec_sdiv_zero:
122; SSE:       # %bb.0:
123; SSE-NEXT:    xorps %xmm0, %xmm0
124; SSE-NEXT:    retq
125;
126; AVX-LABEL: combine_vec_sdiv_zero:
127; AVX:       # %bb.0:
128; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
129; AVX-NEXT:    retq
130  %1 = sdiv <4 x i32> zeroinitializer, %x
131  ret <4 x i32> %1
132}
133
134; fold (sdiv x, x) -> 1
135define i32 @combine_sdiv_dupe(i32 %x) {
136; CHECK-LABEL: combine_sdiv_dupe:
137; CHECK:       # %bb.0:
138; CHECK-NEXT:    movl $1, %eax
139; CHECK-NEXT:    retq
140  %1 = sdiv i32 %x, %x
141  ret i32 %1
142}
143
144define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
145; SSE-LABEL: combine_vec_sdiv_dupe:
146; SSE:       # %bb.0:
147; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1,1,1,1]
148; SSE-NEXT:    retq
149;
150; AVX1-LABEL: combine_vec_sdiv_dupe:
151; AVX1:       # %bb.0:
152; AVX1-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,1]
153; AVX1-NEXT:    retq
154;
155; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
156; AVX2ORLATER:       # %bb.0:
157; AVX2ORLATER-NEXT:    vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
158; AVX2ORLATER-NEXT:    retq
159;
160; XOP-LABEL: combine_vec_sdiv_dupe:
161; XOP:       # %bb.0:
162; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [1,1,1,1]
163; XOP-NEXT:    retq
164  %1 = sdiv <4 x i32> %x, %x
165  ret <4 x i32> %1
166}
167
168; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
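; The mask with 255 makes the dividend known non-negative, so the sdiv by 4
; below lowers to a plain logical shift right by 2 with no sign/rounding fixup.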
169define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
170; SSE-LABEL: combine_vec_sdiv_by_pos0:
171; SSE:       # %bb.0:
172; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
173; SSE-NEXT:    psrld $2, %xmm0
174; SSE-NEXT:    retq
175;
176; AVX-LABEL: combine_vec_sdiv_by_pos0:
177; AVX:       # %bb.0:
178; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
179; AVX-NEXT:    vpsrld $2, %xmm0, %xmm0
180; AVX-NEXT:    retq
181  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
182  %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
183  ret <4 x i32> %2
184}
185
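; Non-uniform positive case <1,4,8,16>: after the mask the division becomes
; per-lane logical shifts (shift+blend on SSE/AVX1, vpsrlvd on AVX2 and later,
; vpshld on XOP).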
186define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
187; SSE2-LABEL: combine_vec_sdiv_by_pos1:
188; SSE2:       # %bb.0:
189; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
190; SSE2-NEXT:    movdqa %xmm0, %xmm1
191; SSE2-NEXT:    psrld $4, %xmm1
192; SSE2-NEXT:    movdqa %xmm0, %xmm2
193; SSE2-NEXT:    psrld $3, %xmm2
194; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
195; SSE2-NEXT:    movdqa %xmm0, %xmm1
196; SSE2-NEXT:    psrld $2, %xmm1
197; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
198; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
199; SSE2-NEXT:    retq
200;
201; SSE41-LABEL: combine_vec_sdiv_by_pos1:
202; SSE41:       # %bb.0:
203; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
204; SSE41-NEXT:    movdqa %xmm0, %xmm2
205; SSE41-NEXT:    movdqa %xmm0, %xmm1
206; SSE41-NEXT:    psrld $3, %xmm1
207; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
208; SSE41-NEXT:    psrld $4, %xmm0
209; SSE41-NEXT:    psrld $2, %xmm2
210; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
211; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
212; SSE41-NEXT:    movdqa %xmm1, %xmm0
213; SSE41-NEXT:    retq
214;
215; AVX1-LABEL: combine_vec_sdiv_by_pos1:
216; AVX1:       # %bb.0:
217; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
218; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm1
219; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm2
220; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
221; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm2
222; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
223; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
224; AVX1-NEXT:    retq
225;
226; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1:
227; AVX2ORLATER:       # %bb.0:
228; AVX2ORLATER-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
229; AVX2ORLATER-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
230; AVX2ORLATER-NEXT:    retq
231;
232; XOP-LABEL: combine_vec_sdiv_by_pos1:
233; XOP:       # %bb.0:
234; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
235; XOP-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
236; XOP-NEXT:    retq
237  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
238  %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
239  ret <4 x i32> %2
240}
241
; fold (sdiv x, (1 << c)) -> (sra (add x, (srl (sra x, bw-1), bw-c)), c)
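; For a divisor of 4 the per-lane expansion below is, in scalar terms:
;   %s = ashr i32 %x, 31   ; all-ones for negative x, zero otherwise
;   %b = lshr i32 %s, 30   ; rounding bias: 3 for negative x, 0 otherwise
;   %t = add i32 %x, %b
;   %r = ashr i32 %t, 2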
243define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
244; SSE-LABEL: combine_vec_sdiv_by_pow2a:
245; SSE:       # %bb.0:
246; SSE-NEXT:    movdqa %xmm0, %xmm1
247; SSE-NEXT:    psrad $31, %xmm1
248; SSE-NEXT:    psrld $30, %xmm1
249; SSE-NEXT:    paddd %xmm0, %xmm1
250; SSE-NEXT:    psrad $2, %xmm1
251; SSE-NEXT:    movdqa %xmm1, %xmm0
252; SSE-NEXT:    retq
253;
254; AVX-LABEL: combine_vec_sdiv_by_pow2a:
255; AVX:       # %bb.0:
256; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
257; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
258; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
259; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
260; AVX-NEXT:    retq
261  %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
262  ret <4 x i32> %1
263}
264
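; A divisor of -4 reuses the expansion above and then negates the quotient
; (the trailing pxor+psubd / vpxor+vpsubd computing 0 - q).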
265define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
266; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
267; SSE:       # %bb.0:
268; SSE-NEXT:    movdqa %xmm0, %xmm1
269; SSE-NEXT:    psrad $31, %xmm1
270; SSE-NEXT:    psrld $30, %xmm1
271; SSE-NEXT:    paddd %xmm0, %xmm1
272; SSE-NEXT:    psrad $2, %xmm1
273; SSE-NEXT:    pxor %xmm0, %xmm0
274; SSE-NEXT:    psubd %xmm1, %xmm0
275; SSE-NEXT:    retq
276;
277; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
278; AVX:       # %bb.0:
279; AVX-NEXT:    vpsrad $31, %xmm0, %xmm1
280; AVX-NEXT:    vpsrld $30, %xmm1, %xmm1
281; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
282; AVX-NEXT:    vpsrad $2, %xmm0, %xmm0
283; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
284; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
285; AVX-NEXT:    retq
286  %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
287  ret <4 x i32> %1
288}
289
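; Non-uniform i8 power-of-two divisors <1,4,2,16,8,32,64,2,...>: the same
; bias-and-shift expansion applied per byte. There is no variable per-byte
; shift before XOP/AVX512, so the SSE/AVX1/AVX2 paths widen to i16 and emulate
; the shifts with word multiplies by powers of two (the [256,4,2,...] and
; [256,64,128,...] constants) followed by psrlw $8, while XOP uses
; vpshlb/vpshab and the AVX512 paths widen and use variable word/dword shifts.
; Lanes 0 and 8 (divisor 1) are blended back from the original vector.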
290define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
291; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
292; SSE2:       # %bb.0:
293; SSE2-NEXT:    pxor %xmm1, %xmm1
294; SSE2-NEXT:    pxor %xmm2, %xmm2
295; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
296; SSE2-NEXT:    movdqa %xmm2, %xmm3
297; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
298; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [256,4,2,16,8,32,64,2]
299; SSE2-NEXT:    pmullw %xmm4, %xmm3
300; SSE2-NEXT:    psrlw $8, %xmm3
301; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
302; SSE2-NEXT:    pmullw %xmm4, %xmm2
303; SSE2-NEXT:    psrlw $8, %xmm2
304; SSE2-NEXT:    packuswb %xmm3, %xmm2
305; SSE2-NEXT:    paddb %xmm0, %xmm2
306; SSE2-NEXT:    movdqa %xmm2, %xmm1
307; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
308; SSE2-NEXT:    psraw $8, %xmm1
309; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
310; SSE2-NEXT:    pmullw %xmm3, %xmm1
311; SSE2-NEXT:    psrlw $8, %xmm1
312; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
313; SSE2-NEXT:    psraw $8, %xmm2
314; SSE2-NEXT:    pmullw %xmm3, %xmm2
315; SSE2-NEXT:    psrlw $8, %xmm2
316; SSE2-NEXT:    packuswb %xmm1, %xmm2
317; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
318; SSE2-NEXT:    pand %xmm1, %xmm2
319; SSE2-NEXT:    pandn %xmm0, %xmm1
320; SSE2-NEXT:    por %xmm2, %xmm1
321; SSE2-NEXT:    movdqa %xmm1, %xmm0
322; SSE2-NEXT:    retq
323;
324; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
325; SSE41:       # %bb.0:
326; SSE41-NEXT:    movdqa %xmm0, %xmm1
327; SSE41-NEXT:    pxor %xmm0, %xmm0
328; SSE41-NEXT:    pxor %xmm3, %xmm3
329; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
330; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
331; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
332; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
333; SSE41-NEXT:    pmullw %xmm0, %xmm3
334; SSE41-NEXT:    psrlw $8, %xmm3
335; SSE41-NEXT:    pmullw %xmm0, %xmm2
336; SSE41-NEXT:    psrlw $8, %xmm2
337; SSE41-NEXT:    packuswb %xmm3, %xmm2
338; SSE41-NEXT:    paddb %xmm1, %xmm2
339; SSE41-NEXT:    movdqa %xmm2, %xmm0
340; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
341; SSE41-NEXT:    psraw $8, %xmm0
342; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
343; SSE41-NEXT:    pmullw %xmm3, %xmm0
344; SSE41-NEXT:    psrlw $8, %xmm0
345; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
346; SSE41-NEXT:    psraw $8, %xmm2
347; SSE41-NEXT:    pmullw %xmm3, %xmm2
348; SSE41-NEXT:    psrlw $8, %xmm2
349; SSE41-NEXT:    packuswb %xmm0, %xmm2
350; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
351; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
352; SSE41-NEXT:    movdqa %xmm1, %xmm0
353; SSE41-NEXT:    retq
354;
355; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
356; AVX1:       # %bb.0:
357; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
358; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
359; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
360; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [256,4,2,16,8,32,64,2]
361; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
362; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
363; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
364; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
365; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
366; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
367; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
368; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
369; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
370; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
371; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
372; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
373; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
374; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
375; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
376; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
377; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
378; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
379; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
380; AVX1-NEXT:    retq
381;
382; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
383; AVX2:       # %bb.0:
384; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
385; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
386; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
387; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
388; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
389; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
390; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
391; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
392; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
393; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
394; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
395; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
396; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
397; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
398; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
399; AVX2-NEXT:    vzeroupper
400; AVX2-NEXT:    retq
401;
402; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
403; AVX512F:       # %bb.0:
404; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
405; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
406; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
407; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
408; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
409; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
410; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
411; AVX512F-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
412; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
413; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
414; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
415; AVX512F-NEXT:    vzeroupper
416; AVX512F-NEXT:    retq
417;
418; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
419; AVX512BW:       # %bb.0:
420; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
421; AVX512BW-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
422; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
423; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
424; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
425; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
426; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
427; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
428; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
429; AVX512BW-NEXT:    movw $257, %ax # imm = 0x101
430; AVX512BW-NEXT:    kmovd %eax, %k1
431; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm1 {%k1}
432; AVX512BW-NEXT:    vmovdqa %xmm1, %xmm0
433; AVX512BW-NEXT:    vzeroupper
434; AVX512BW-NEXT:    retq
435;
436; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
437; XOP:       # %bb.0:
438; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
439; XOP-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
440; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
441; XOP-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
442; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
443; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
444; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
445; XOP-NEXT:    retq
446  %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
447  ret <16 x i8> %1
448}
449
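; i16 version of the non-uniform power-of-two expansion. On the SSE/AVX1/AVX2
; paths the srl of the sign mask is a pmulhuw by the divisor (mask*d >> 16 ==
; mask >>u (16 - log2 d)) and the final sra is a pmulhw by 2^(16 - log2 d),
; with an explicit psraw $1 blended in for the divide-by-2 lanes; AVX512BW and
; XOP use true variable shifts. Lane 0 (divisor 1) keeps the original value.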
450define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
451; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
452; SSE2:       # %bb.0:
453; SSE2-NEXT:    movdqa %xmm0, %xmm1
454; SSE2-NEXT:    psraw $15, %xmm1
455; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
456; SSE2-NEXT:    paddw %xmm0, %xmm1
457; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535]
458; SSE2-NEXT:    movdqa %xmm1, %xmm3
459; SSE2-NEXT:    pand %xmm2, %xmm3
460; SSE2-NEXT:    psraw $4, %xmm1
461; SSE2-NEXT:    pandn %xmm1, %xmm2
462; SSE2-NEXT:    por %xmm3, %xmm2
463; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535]
464; SSE2-NEXT:    movdqa %xmm2, %xmm3
465; SSE2-NEXT:    pand %xmm1, %xmm3
466; SSE2-NEXT:    psraw $2, %xmm2
467; SSE2-NEXT:    pandn %xmm2, %xmm1
468; SSE2-NEXT:    por %xmm3, %xmm1
469; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0]
470; SSE2-NEXT:    movdqa %xmm1, %xmm3
471; SSE2-NEXT:    pand %xmm2, %xmm3
472; SSE2-NEXT:    psraw $1, %xmm1
473; SSE2-NEXT:    pandn %xmm1, %xmm2
474; SSE2-NEXT:    por %xmm3, %xmm2
475; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
476; SSE2-NEXT:    pand %xmm1, %xmm2
477; SSE2-NEXT:    pandn %xmm0, %xmm1
478; SSE2-NEXT:    por %xmm2, %xmm1
479; SSE2-NEXT:    movdqa %xmm1, %xmm0
480; SSE2-NEXT:    retq
481;
482; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
483; SSE41:       # %bb.0:
484; SSE41-NEXT:    movdqa %xmm0, %xmm1
485; SSE41-NEXT:    psraw $15, %xmm1
486; SSE41-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
487; SSE41-NEXT:    paddw %xmm0, %xmm1
488; SSE41-NEXT:    movdqa %xmm1, %xmm2
489; SSE41-NEXT:    psraw $1, %xmm2
490; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
491; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
492; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
493; SSE41-NEXT:    movdqa %xmm1, %xmm0
494; SSE41-NEXT:    retq
495;
496; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
497; AVX1:       # %bb.0:
498; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm1
499; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
500; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
501; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm2
502; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
503; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
504; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
505; AVX1-NEXT:    retq
506;
507; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
508; AVX2:       # %bb.0:
509; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm1
510; AVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
511; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
512; AVX2-NEXT:    vpsraw $1, %xmm1, %xmm2
513; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
514; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
515; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
516; AVX2-NEXT:    retq
517;
518; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
519; AVX512F:       # %bb.0:
520; AVX512F-NEXT:    vpsraw $15, %xmm0, %xmm1
521; AVX512F-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
522; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
523; AVX512F-NEXT:    vpmovsxwd %xmm1, %ymm1
524; AVX512F-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
525; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
526; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
527; AVX512F-NEXT:    vzeroupper
528; AVX512F-NEXT:    retq
529;
530; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
531; AVX512BW:       # %bb.0:
532; AVX512BW-NEXT:    vpsraw $15, %xmm0, %xmm1
533; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
534; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
535; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
536; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
537; AVX512BW-NEXT:    retq
538;
539; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
540; XOP:       # %bb.0:
541; XOP-NEXT:    vpsraw $15, %xmm0, %xmm1
542; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
543; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm1
544; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
545; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
546; XOP-NEXT:    retq
547  %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
548  ret <8 x i16> %1
549}
550
551define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
552; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
553; SSE2:       # %bb.0:
554; SSE2-NEXT:    movdqa %xmm0, %xmm3
555; SSE2-NEXT:    psraw $15, %xmm0
556; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = <u,4,2,16,8,32,64,2>
557; SSE2-NEXT:    pmulhuw %xmm8, %xmm0
558; SSE2-NEXT:    paddw %xmm3, %xmm0
559; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
560; SSE2-NEXT:    movdqa %xmm0, %xmm2
561; SSE2-NEXT:    pand %xmm4, %xmm2
562; SSE2-NEXT:    psraw $4, %xmm0
563; SSE2-NEXT:    movdqa %xmm4, %xmm6
564; SSE2-NEXT:    pandn %xmm0, %xmm6
565; SSE2-NEXT:    por %xmm2, %xmm6
566; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535]
567; SSE2-NEXT:    movdqa %xmm6, %xmm0
568; SSE2-NEXT:    pand %xmm5, %xmm0
569; SSE2-NEXT:    psraw $2, %xmm6
570; SSE2-NEXT:    movdqa %xmm5, %xmm2
571; SSE2-NEXT:    pandn %xmm6, %xmm2
572; SSE2-NEXT:    por %xmm0, %xmm2
573; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,0,65535,0]
574; SSE2-NEXT:    movdqa %xmm2, %xmm0
575; SSE2-NEXT:    pand %xmm7, %xmm0
576; SSE2-NEXT:    psraw $1, %xmm2
577; SSE2-NEXT:    movdqa %xmm7, %xmm6
578; SSE2-NEXT:    pandn %xmm2, %xmm6
579; SSE2-NEXT:    por %xmm0, %xmm6
580; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
581; SSE2-NEXT:    pand %xmm2, %xmm6
582; SSE2-NEXT:    movdqa %xmm2, %xmm0
583; SSE2-NEXT:    pandn %xmm3, %xmm0
584; SSE2-NEXT:    por %xmm6, %xmm0
585; SSE2-NEXT:    movdqa %xmm1, %xmm3
586; SSE2-NEXT:    psraw $15, %xmm3
587; SSE2-NEXT:    pmulhuw %xmm8, %xmm3
588; SSE2-NEXT:    paddw %xmm1, %xmm3
589; SSE2-NEXT:    movdqa %xmm3, %xmm6
590; SSE2-NEXT:    pand %xmm4, %xmm6
591; SSE2-NEXT:    psraw $4, %xmm3
592; SSE2-NEXT:    pandn %xmm3, %xmm4
593; SSE2-NEXT:    por %xmm6, %xmm4
594; SSE2-NEXT:    movdqa %xmm4, %xmm3
595; SSE2-NEXT:    pand %xmm5, %xmm3
596; SSE2-NEXT:    psraw $2, %xmm4
597; SSE2-NEXT:    pandn %xmm4, %xmm5
598; SSE2-NEXT:    por %xmm3, %xmm5
599; SSE2-NEXT:    movdqa %xmm5, %xmm3
600; SSE2-NEXT:    pand %xmm7, %xmm3
601; SSE2-NEXT:    psraw $1, %xmm5
602; SSE2-NEXT:    pandn %xmm5, %xmm7
603; SSE2-NEXT:    por %xmm3, %xmm7
604; SSE2-NEXT:    pand %xmm2, %xmm7
605; SSE2-NEXT:    pandn %xmm1, %xmm2
606; SSE2-NEXT:    por %xmm7, %xmm2
607; SSE2-NEXT:    movdqa %xmm2, %xmm1
608; SSE2-NEXT:    retq
609;
610; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
611; SSE41:       # %bb.0:
612; SSE41-NEXT:    movdqa %xmm0, %xmm2
613; SSE41-NEXT:    psraw $15, %xmm2
614; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
615; SSE41-NEXT:    pmulhuw %xmm4, %xmm2
616; SSE41-NEXT:    paddw %xmm0, %xmm2
617; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = <u,16384,32768,4096,8192,2048,1024,32768>
618; SSE41-NEXT:    movdqa %xmm2, %xmm3
619; SSE41-NEXT:    pmulhw %xmm5, %xmm3
620; SSE41-NEXT:    psraw $1, %xmm2
621; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7]
622; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
623; SSE41-NEXT:    movdqa %xmm1, %xmm3
624; SSE41-NEXT:    psraw $15, %xmm3
625; SSE41-NEXT:    pmulhuw %xmm4, %xmm3
626; SSE41-NEXT:    paddw %xmm1, %xmm3
627; SSE41-NEXT:    pmulhw %xmm3, %xmm5
628; SSE41-NEXT:    psraw $1, %xmm3
629; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7]
630; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
631; SSE41-NEXT:    movdqa %xmm2, %xmm0
632; SSE41-NEXT:    movdqa %xmm3, %xmm1
633; SSE41-NEXT:    retq
634;
635; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
636; AVX1:       # %bb.0:
637; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
638; AVX1-NEXT:    vpsraw $15, %xmm1, %xmm2
639; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
640; AVX1-NEXT:    vpmulhuw %xmm3, %xmm2, %xmm2
641; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
642; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,16384,32768,4096,8192,2048,1024,32768>
643; AVX1-NEXT:    vpmulhw %xmm2, %xmm1, %xmm4
644; AVX1-NEXT:    vpsraw $1, %xmm1, %xmm1
645; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7]
646; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm4
647; AVX1-NEXT:    vpmulhuw %xmm3, %xmm4, %xmm3
648; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
649; AVX1-NEXT:    vpmulhw %xmm2, %xmm3, %xmm2
650; AVX1-NEXT:    vpsraw $1, %xmm3, %xmm3
651; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
652; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
653; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
654; AVX1-NEXT:    # ymm2 = mem[0,1,0,1]
655; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
656; AVX1-NEXT:    vandnps %ymm0, %ymm2, %ymm0
657; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
658; AVX1-NEXT:    retq
659;
660; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
661; AVX2:       # %bb.0:
662; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm1
663; AVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
664; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
665; AVX2-NEXT:    vpsraw $1, %ymm1, %ymm2
666; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
667; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15]
668; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
669; AVX2-NEXT:    retq
670;
671; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
672; AVX512F:       # %bb.0:
673; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm1
674; AVX512F-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
675; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
676; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
677; AVX512F-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
678; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
679; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
680; AVX512F-NEXT:    retq
681;
682; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
683; AVX512BW:       # %bb.0:
684; AVX512BW-NEXT:    vpsraw $15, %ymm0, %ymm1
685; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
686; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
687; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
688; AVX512BW-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
689; AVX512BW-NEXT:    retq
690;
691; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
692; XOP:       # %bb.0:
693; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
694; XOP-NEXT:    vpsraw $15, %xmm1, %xmm2
695; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,65522,65521,65524,65523,65525,65526,65521>
696; XOP-NEXT:    vpshlw %xmm3, %xmm2, %xmm2
697; XOP-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
698; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,65534,65535,65532,65533,65531,65530,65535>
699; XOP-NEXT:    vpshaw %xmm2, %xmm1, %xmm1
700; XOP-NEXT:    vpsraw $15, %xmm0, %xmm4
701; XOP-NEXT:    vpshlw %xmm3, %xmm4, %xmm3
702; XOP-NEXT:    vpaddw %xmm3, %xmm0, %xmm3
703; XOP-NEXT:    vpshaw %xmm2, %xmm3, %xmm2
704; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
705; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
706; XOP-NEXT:    # ymm2 = mem[0,1,0,1]
707; XOP-NEXT:    vpcmov %ymm2, %ymm0, %ymm1, %ymm0
708; XOP-NEXT:    retq
709  %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
710  ret <16 x i16> %1
711}
712
713define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
714; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
715; SSE2:       # %bb.0:
716; SSE2-NEXT:    movdqa %xmm1, %xmm8
717; SSE2-NEXT:    movdqa %xmm0, %xmm1
718; SSE2-NEXT:    psraw $15, %xmm0
719; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = <u,4,2,16,8,32,64,2>
720; SSE2-NEXT:    pmulhuw %xmm9, %xmm0
721; SSE2-NEXT:    paddw %xmm1, %xmm0
722; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,0,0,65535]
723; SSE2-NEXT:    movdqa %xmm0, %xmm4
724; SSE2-NEXT:    pand %xmm11, %xmm4
725; SSE2-NEXT:    psraw $4, %xmm0
726; SSE2-NEXT:    movdqa %xmm11, %xmm5
727; SSE2-NEXT:    pandn %xmm0, %xmm5
728; SSE2-NEXT:    por %xmm4, %xmm5
729; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535]
730; SSE2-NEXT:    movdqa %xmm5, %xmm0
731; SSE2-NEXT:    pand %xmm7, %xmm0
732; SSE2-NEXT:    psraw $2, %xmm5
733; SSE2-NEXT:    movdqa %xmm7, %xmm4
734; SSE2-NEXT:    pandn %xmm5, %xmm4
735; SSE2-NEXT:    por %xmm0, %xmm4
736; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,0]
737; SSE2-NEXT:    movdqa %xmm4, %xmm0
738; SSE2-NEXT:    pand %xmm10, %xmm0
739; SSE2-NEXT:    psraw $1, %xmm4
740; SSE2-NEXT:    movdqa %xmm10, %xmm5
741; SSE2-NEXT:    pandn %xmm4, %xmm5
742; SSE2-NEXT:    por %xmm0, %xmm5
743; SSE2-NEXT:    movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,65535,65535,65535]
744; SSE2-NEXT:    pand %xmm12, %xmm5
745; SSE2-NEXT:    movdqa %xmm12, %xmm0
746; SSE2-NEXT:    pandn %xmm1, %xmm0
747; SSE2-NEXT:    por %xmm5, %xmm0
748; SSE2-NEXT:    movdqa %xmm8, %xmm1
749; SSE2-NEXT:    psraw $15, %xmm1
750; SSE2-NEXT:    pmulhuw %xmm9, %xmm1
751; SSE2-NEXT:    paddw %xmm8, %xmm1
752; SSE2-NEXT:    movdqa %xmm1, %xmm5
753; SSE2-NEXT:    pand %xmm11, %xmm5
754; SSE2-NEXT:    psraw $4, %xmm1
755; SSE2-NEXT:    movdqa %xmm11, %xmm6
756; SSE2-NEXT:    pandn %xmm1, %xmm6
757; SSE2-NEXT:    por %xmm5, %xmm6
758; SSE2-NEXT:    movdqa %xmm6, %xmm1
759; SSE2-NEXT:    pand %xmm7, %xmm1
760; SSE2-NEXT:    psraw $2, %xmm6
761; SSE2-NEXT:    movdqa %xmm7, %xmm5
762; SSE2-NEXT:    pandn %xmm6, %xmm5
763; SSE2-NEXT:    por %xmm1, %xmm5
764; SSE2-NEXT:    movdqa %xmm5, %xmm1
765; SSE2-NEXT:    pand %xmm10, %xmm1
766; SSE2-NEXT:    psraw $1, %xmm5
767; SSE2-NEXT:    movdqa %xmm10, %xmm6
768; SSE2-NEXT:    pandn %xmm5, %xmm6
769; SSE2-NEXT:    por %xmm1, %xmm6
770; SSE2-NEXT:    pand %xmm12, %xmm6
771; SSE2-NEXT:    movdqa %xmm12, %xmm1
772; SSE2-NEXT:    pandn %xmm8, %xmm1
773; SSE2-NEXT:    por %xmm6, %xmm1
774; SSE2-NEXT:    movdqa %xmm2, %xmm5
775; SSE2-NEXT:    psraw $15, %xmm5
776; SSE2-NEXT:    pmulhuw %xmm9, %xmm5
777; SSE2-NEXT:    paddw %xmm2, %xmm5
778; SSE2-NEXT:    movdqa %xmm5, %xmm6
779; SSE2-NEXT:    pand %xmm11, %xmm6
780; SSE2-NEXT:    psraw $4, %xmm5
781; SSE2-NEXT:    movdqa %xmm11, %xmm4
782; SSE2-NEXT:    pandn %xmm5, %xmm4
783; SSE2-NEXT:    por %xmm6, %xmm4
784; SSE2-NEXT:    movdqa %xmm4, %xmm5
785; SSE2-NEXT:    pand %xmm7, %xmm5
786; SSE2-NEXT:    psraw $2, %xmm4
787; SSE2-NEXT:    movdqa %xmm7, %xmm6
788; SSE2-NEXT:    pandn %xmm4, %xmm6
789; SSE2-NEXT:    por %xmm5, %xmm6
790; SSE2-NEXT:    movdqa %xmm6, %xmm4
791; SSE2-NEXT:    pand %xmm10, %xmm4
792; SSE2-NEXT:    psraw $1, %xmm6
793; SSE2-NEXT:    movdqa %xmm10, %xmm5
794; SSE2-NEXT:    pandn %xmm6, %xmm5
795; SSE2-NEXT:    por %xmm4, %xmm5
796; SSE2-NEXT:    pand %xmm12, %xmm5
797; SSE2-NEXT:    movdqa %xmm12, %xmm8
798; SSE2-NEXT:    pandn %xmm2, %xmm8
799; SSE2-NEXT:    por %xmm5, %xmm8
800; SSE2-NEXT:    movdqa %xmm3, %xmm2
801; SSE2-NEXT:    psraw $15, %xmm2
802; SSE2-NEXT:    pmulhuw %xmm9, %xmm2
803; SSE2-NEXT:    paddw %xmm3, %xmm2
804; SSE2-NEXT:    movdqa %xmm2, %xmm4
805; SSE2-NEXT:    pand %xmm11, %xmm4
806; SSE2-NEXT:    psraw $4, %xmm2
807; SSE2-NEXT:    pandn %xmm2, %xmm11
808; SSE2-NEXT:    por %xmm4, %xmm11
809; SSE2-NEXT:    movdqa %xmm11, %xmm2
810; SSE2-NEXT:    pand %xmm7, %xmm2
811; SSE2-NEXT:    psraw $2, %xmm11
812; SSE2-NEXT:    pandn %xmm11, %xmm7
813; SSE2-NEXT:    por %xmm2, %xmm7
814; SSE2-NEXT:    movdqa %xmm7, %xmm2
815; SSE2-NEXT:    pand %xmm10, %xmm2
816; SSE2-NEXT:    psraw $1, %xmm7
817; SSE2-NEXT:    pandn %xmm7, %xmm10
818; SSE2-NEXT:    por %xmm2, %xmm10
819; SSE2-NEXT:    pand %xmm12, %xmm10
820; SSE2-NEXT:    pandn %xmm3, %xmm12
821; SSE2-NEXT:    por %xmm10, %xmm12
822; SSE2-NEXT:    movdqa %xmm8, %xmm2
823; SSE2-NEXT:    movdqa %xmm12, %xmm3
824; SSE2-NEXT:    retq
825;
826; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
827; SSE41:       # %bb.0:
828; SSE41-NEXT:    movdqa %xmm1, %xmm4
829; SSE41-NEXT:    movdqa %xmm0, %xmm1
830; SSE41-NEXT:    psraw $15, %xmm0
831; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2>
832; SSE41-NEXT:    pmulhuw %xmm7, %xmm0
833; SSE41-NEXT:    paddw %xmm1, %xmm0
834; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = <u,16384,32768,4096,8192,2048,1024,32768>
835; SSE41-NEXT:    movdqa %xmm0, %xmm5
836; SSE41-NEXT:    pmulhw %xmm6, %xmm5
837; SSE41-NEXT:    psraw $1, %xmm0
838; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7]
839; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
840; SSE41-NEXT:    movdqa %xmm4, %xmm1
841; SSE41-NEXT:    psraw $15, %xmm1
842; SSE41-NEXT:    pmulhuw %xmm7, %xmm1
843; SSE41-NEXT:    paddw %xmm4, %xmm1
844; SSE41-NEXT:    movdqa %xmm1, %xmm5
845; SSE41-NEXT:    pmulhw %xmm6, %xmm5
846; SSE41-NEXT:    psraw $1, %xmm1
847; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7]
848; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
849; SSE41-NEXT:    movdqa %xmm2, %xmm4
850; SSE41-NEXT:    psraw $15, %xmm4
851; SSE41-NEXT:    pmulhuw %xmm7, %xmm4
852; SSE41-NEXT:    paddw %xmm2, %xmm4
853; SSE41-NEXT:    movdqa %xmm4, %xmm5
854; SSE41-NEXT:    pmulhw %xmm6, %xmm5
855; SSE41-NEXT:    psraw $1, %xmm4
856; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7]
857; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
858; SSE41-NEXT:    movdqa %xmm3, %xmm5
859; SSE41-NEXT:    psraw $15, %xmm5
860; SSE41-NEXT:    pmulhuw %xmm7, %xmm5
861; SSE41-NEXT:    paddw %xmm3, %xmm5
862; SSE41-NEXT:    pmulhw %xmm5, %xmm6
863; SSE41-NEXT:    psraw $1, %xmm5
864; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
865; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7]
866; SSE41-NEXT:    movdqa %xmm4, %xmm2
867; SSE41-NEXT:    movdqa %xmm5, %xmm3
868; SSE41-NEXT:    retq
869;
870; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
871; AVX1:       # %bb.0:
872; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
873; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm3
874; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
875; AVX1-NEXT:    vpmulhuw %xmm4, %xmm3, %xmm3
876; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
877; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,16384,32768,4096,8192,2048,1024,32768>
878; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm5
879; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm2
880; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
881; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm5
882; AVX1-NEXT:    vpmulhuw %xmm4, %xmm5, %xmm5
883; AVX1-NEXT:    vpaddw %xmm5, %xmm0, %xmm5
884; AVX1-NEXT:    vpmulhw %xmm3, %xmm5, %xmm6
885; AVX1-NEXT:    vpsraw $1, %xmm5, %xmm5
886; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
887; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
888; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
889; AVX1-NEXT:    # ymm5 = mem[0,1,0,1]
890; AVX1-NEXT:    vandps %ymm5, %ymm2, %ymm2
891; AVX1-NEXT:    vandnps %ymm0, %ymm5, %ymm0
892; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
893; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
894; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm6
895; AVX1-NEXT:    vpmulhuw %xmm4, %xmm6, %xmm6
896; AVX1-NEXT:    vpaddw %xmm6, %xmm2, %xmm2
897; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm6
898; AVX1-NEXT:    vpsraw $1, %xmm2, %xmm2
899; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7]
900; AVX1-NEXT:    vpsraw $15, %xmm1, %xmm6
901; AVX1-NEXT:    vpmulhuw %xmm4, %xmm6, %xmm4
902; AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm4
903; AVX1-NEXT:    vpmulhw %xmm3, %xmm4, %xmm3
904; AVX1-NEXT:    vpsraw $1, %xmm4, %xmm4
905; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7]
906; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
907; AVX1-NEXT:    vandps %ymm5, %ymm2, %ymm2
908; AVX1-NEXT:    vandnps %ymm1, %ymm5, %ymm1
909; AVX1-NEXT:    vorps %ymm1, %ymm2, %ymm1
910; AVX1-NEXT:    retq
911;
912; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
913; AVX2:       # %bb.0:
914; AVX2-NEXT:    vpsraw $15, %ymm0, %ymm2
915; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
916; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
917; AVX2-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm2
918; AVX2-NEXT:    vpaddw %ymm2, %ymm0, %ymm2
919; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768]
920; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
921; AVX2-NEXT:    vpmulhw %ymm4, %ymm2, %ymm5
922; AVX2-NEXT:    vpsraw $1, %ymm2, %ymm2
923; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15]
924; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
925; AVX2-NEXT:    vpsraw $15, %ymm1, %ymm2
926; AVX2-NEXT:    vpmulhuw %ymm3, %ymm2, %ymm2
927; AVX2-NEXT:    vpaddw %ymm2, %ymm1, %ymm2
928; AVX2-NEXT:    vpmulhw %ymm4, %ymm2, %ymm3
929; AVX2-NEXT:    vpsraw $1, %ymm2, %ymm2
930; AVX2-NEXT:    vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15]
931; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
932; AVX2-NEXT:    retq
933;
934; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
935; AVX512F:       # %bb.0:
936; AVX512F-NEXT:    vpsraw $15, %ymm0, %ymm1
937; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2]
938; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
939; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
940; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm1
941; AVX512F-NEXT:    vpmovsxwd %ymm1, %zmm1
942; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1]
943; AVX512F-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
944; AVX512F-NEXT:    vpsravd %zmm3, %zmm1, %zmm1
945; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
946; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
947; AVX512F-NEXT:    vpsraw $15, %ymm4, %ymm5
948; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm5, %ymm2
949; AVX512F-NEXT:    vpaddw %ymm2, %ymm4, %ymm2
950; AVX512F-NEXT:    vpmovsxwd %ymm2, %zmm2
951; AVX512F-NEXT:    vpsravd %zmm3, %zmm2, %zmm2
952; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
953; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
954; AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
955; AVX512F-NEXT:    # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
956; AVX512F-NEXT:    vpternlogq $216, %zmm2, %zmm1, %zmm0
957; AVX512F-NEXT:    retq
958;
959; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
960; AVX512BW:       # %bb.0:
961; AVX512BW-NEXT:    vpsraw $15, %zmm0, %zmm1
962; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
963; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm1
964; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
965; AVX512BW-NEXT:    movl $16843009, %eax # imm = 0x1010101
966; AVX512BW-NEXT:    kmovd %eax, %k1
967; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm1 {%k1}
968; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
969; AVX512BW-NEXT:    retq
970;
971; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
972; XOP:       # %bb.0:
973; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
974; XOP-NEXT:    vpsraw $15, %xmm2, %xmm3
975; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,65522,65521,65524,65523,65525,65526,65521>
976; XOP-NEXT:    vpshlw %xmm4, %xmm3, %xmm3
977; XOP-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
978; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,65534,65535,65532,65533,65531,65530,65535>
979; XOP-NEXT:    vpshaw %xmm3, %xmm2, %xmm2
980; XOP-NEXT:    vpsraw $15, %xmm0, %xmm5
981; XOP-NEXT:    vpshlw %xmm4, %xmm5, %xmm5
982; XOP-NEXT:    vpaddw %xmm5, %xmm0, %xmm5
983; XOP-NEXT:    vpshaw %xmm3, %xmm5, %xmm5
984; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
985; XOP-NEXT:    vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
986; XOP-NEXT:    # ymm5 = mem[0,1,0,1]
987; XOP-NEXT:    vpcmov %ymm5, %ymm0, %ymm2, %ymm0
988; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
989; XOP-NEXT:    vpsraw $15, %xmm2, %xmm6
990; XOP-NEXT:    vpshlw %xmm4, %xmm6, %xmm6
991; XOP-NEXT:    vpaddw %xmm6, %xmm2, %xmm2
992; XOP-NEXT:    vpshaw %xmm3, %xmm2, %xmm2
993; XOP-NEXT:    vpsraw $15, %xmm1, %xmm6
994; XOP-NEXT:    vpshlw %xmm4, %xmm6, %xmm4
995; XOP-NEXT:    vpaddw %xmm4, %xmm1, %xmm4
996; XOP-NEXT:    vpshaw %xmm3, %xmm4, %xmm3
997; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
998; XOP-NEXT:    vpcmov %ymm5, %ymm1, %ymm2, %ymm1
999; XOP-NEXT:    retq
1000  %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
1001  ret <32 x i16> %1
1002}
1003
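; i32 version with divisors <1,4,8,16>: sign mask via psrad $31, per-lane
; logical shift of the mask to form the bias, add, per-lane arithmetic shift,
; and lane 0 blended back from the original. SSE/AVX1 build the variable
; shifts from fixed shifts plus blends; AVX2 and later use vpsrlvd/vpsravd,
; XOP uses vpshld/vpshad.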
1004define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) {
1005; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1006; SSE2:       # %bb.0:
1007; SSE2-NEXT:    movdqa %xmm0, %xmm1
1008; SSE2-NEXT:    psrad $31, %xmm1
1009; SSE2-NEXT:    movdqa %xmm1, %xmm2
1010; SSE2-NEXT:    psrld $28, %xmm2
1011; SSE2-NEXT:    movdqa %xmm1, %xmm3
1012; SSE2-NEXT:    psrld $29, %xmm3
1013; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1014; SSE2-NEXT:    psrld $30, %xmm1
1015; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1016; SSE2-NEXT:    paddd %xmm0, %xmm1
1017; SSE2-NEXT:    movdqa %xmm1, %xmm2
1018; SSE2-NEXT:    psrad $4, %xmm2
1019; SSE2-NEXT:    movdqa %xmm1, %xmm3
1020; SSE2-NEXT:    psrad $3, %xmm3
1021; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1022; SSE2-NEXT:    psrad $2, %xmm1
1023; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3]
1024; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
1025; SSE2-NEXT:    movaps %xmm1, %xmm0
1026; SSE2-NEXT:    retq
1027;
1028; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1029; SSE41:       # %bb.0:
1030; SSE41-NEXT:    movdqa %xmm0, %xmm1
1031; SSE41-NEXT:    psrad $31, %xmm1
1032; SSE41-NEXT:    movdqa %xmm1, %xmm2
1033; SSE41-NEXT:    psrld $28, %xmm2
1034; SSE41-NEXT:    movdqa %xmm1, %xmm3
1035; SSE41-NEXT:    psrld $30, %xmm3
1036; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1037; SSE41-NEXT:    psrld $29, %xmm1
1038; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1039; SSE41-NEXT:    paddd %xmm0, %xmm1
1040; SSE41-NEXT:    movdqa %xmm1, %xmm2
1041; SSE41-NEXT:    psrad $4, %xmm2
1042; SSE41-NEXT:    movdqa %xmm1, %xmm3
1043; SSE41-NEXT:    psrad $2, %xmm3
1044; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1045; SSE41-NEXT:    psrad $3, %xmm1
1046; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
1047; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1048; SSE41-NEXT:    movdqa %xmm1, %xmm0
1049; SSE41-NEXT:    retq
1050;
1051; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1052; AVX1:       # %bb.0:
1053; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
1054; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
1055; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
1056; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1057; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
1058; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1059; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
1060; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
1061; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
1062; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1063; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
1064; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1065; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1066; AVX1-NEXT:    retq
1067;
1068; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1069; AVX2ORLATER:       # %bb.0:
1070; AVX2ORLATER-NEXT:    vpsrad $31, %xmm0, %xmm1
1071; AVX2ORLATER-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1072; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
1073; AVX2ORLATER-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1074; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
1075; AVX2ORLATER-NEXT:    retq
1076;
1077; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32:
1078; XOP:       # %bb.0:
1079; XOP-NEXT:    vpsrad $31, %xmm0, %xmm1
1080; XOP-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1081; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
1082; XOP-NEXT:    vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1083; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1084; XOP-NEXT:    retq
1085  %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16>
1086  ret <4 x i32> %1
1087}
1088
1089define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
1090; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1091; SSE2:       # %bb.0:
1092; SSE2-NEXT:    movdqa %xmm0, %xmm2
1093; SSE2-NEXT:    psrad $31, %xmm0
1094; SSE2-NEXT:    movdqa %xmm0, %xmm3
1095; SSE2-NEXT:    psrld $28, %xmm3
1096; SSE2-NEXT:    movdqa %xmm0, %xmm4
1097; SSE2-NEXT:    psrld $29, %xmm4
1098; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1099; SSE2-NEXT:    psrld $30, %xmm0
1100; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1101; SSE2-NEXT:    paddd %xmm2, %xmm0
1102; SSE2-NEXT:    movdqa %xmm0, %xmm3
1103; SSE2-NEXT:    psrad $4, %xmm3
1104; SSE2-NEXT:    movdqa %xmm0, %xmm4
1105; SSE2-NEXT:    psrad $3, %xmm4
1106; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1107; SSE2-NEXT:    psrad $2, %xmm0
1108; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3]
1109; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
1110; SSE2-NEXT:    movdqa %xmm1, %xmm2
1111; SSE2-NEXT:    psrad $31, %xmm2
1112; SSE2-NEXT:    movdqa %xmm2, %xmm3
1113; SSE2-NEXT:    psrld $28, %xmm3
1114; SSE2-NEXT:    movdqa %xmm2, %xmm4
1115; SSE2-NEXT:    psrld $29, %xmm4
1116; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1117; SSE2-NEXT:    psrld $30, %xmm2
1118; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1119; SSE2-NEXT:    paddd %xmm1, %xmm2
1120; SSE2-NEXT:    movdqa %xmm2, %xmm3
1121; SSE2-NEXT:    psrad $4, %xmm3
1122; SSE2-NEXT:    movdqa %xmm2, %xmm4
1123; SSE2-NEXT:    psrad $3, %xmm4
1124; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1]
1125; SSE2-NEXT:    psrad $2, %xmm2
1126; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3]
1127; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1128; SSE2-NEXT:    movaps %xmm2, %xmm1
1129; SSE2-NEXT:    retq
1130;
1131; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1132; SSE41:       # %bb.0:
1133; SSE41-NEXT:    movdqa %xmm0, %xmm2
1134; SSE41-NEXT:    psrad $31, %xmm0
1135; SSE41-NEXT:    movdqa %xmm0, %xmm3
1136; SSE41-NEXT:    psrld $28, %xmm3
1137; SSE41-NEXT:    movdqa %xmm0, %xmm4
1138; SSE41-NEXT:    psrld $30, %xmm4
1139; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1140; SSE41-NEXT:    psrld $29, %xmm0
1141; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1142; SSE41-NEXT:    paddd %xmm2, %xmm0
1143; SSE41-NEXT:    movdqa %xmm0, %xmm3
1144; SSE41-NEXT:    psrad $4, %xmm3
1145; SSE41-NEXT:    movdqa %xmm0, %xmm4
1146; SSE41-NEXT:    psrad $2, %xmm4
1147; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1148; SSE41-NEXT:    psrad $3, %xmm0
1149; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
1150; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7]
1151; SSE41-NEXT:    movdqa %xmm1, %xmm2
1152; SSE41-NEXT:    psrad $31, %xmm2
1153; SSE41-NEXT:    movdqa %xmm2, %xmm3
1154; SSE41-NEXT:    psrld $28, %xmm3
1155; SSE41-NEXT:    movdqa %xmm2, %xmm4
1156; SSE41-NEXT:    psrld $30, %xmm4
1157; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1158; SSE41-NEXT:    psrld $29, %xmm2
1159; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1160; SSE41-NEXT:    paddd %xmm1, %xmm2
1161; SSE41-NEXT:    movdqa %xmm2, %xmm3
1162; SSE41-NEXT:    psrad $4, %xmm3
1163; SSE41-NEXT:    movdqa %xmm2, %xmm4
1164; SSE41-NEXT:    psrad $2, %xmm4
1165; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1166; SSE41-NEXT:    psrad $3, %xmm2
1167; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1168; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7]
1169; SSE41-NEXT:    movdqa %xmm2, %xmm1
1170; SSE41-NEXT:    retq
1171;
1172; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1173; AVX1:       # %bb.0:
1174; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1175; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
1176; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm3
1177; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm4
1178; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1179; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm2
1180; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1181; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
1182; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
1183; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
1184; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1185; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
1186; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1187; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
1188; AVX1-NEXT:    vpsrld $28, %xmm2, %xmm3
1189; AVX1-NEXT:    vpsrld $30, %xmm2, %xmm4
1190; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1191; AVX1-NEXT:    vpsrld $29, %xmm2, %xmm2
1192; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1193; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm2
1194; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
1195; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
1196; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1197; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm2
1198; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1199; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1200; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1201; AVX1-NEXT:    retq
1202;
1203; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1204; AVX2ORLATER:       # %bb.0:
1205; AVX2ORLATER-NEXT:    vpsrad $31, %ymm0, %ymm1
1206; AVX2ORLATER-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1207; AVX2ORLATER-NEXT:    vpaddd %ymm1, %ymm0, %ymm1
1208; AVX2ORLATER-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1209; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1210; AVX2ORLATER-NEXT:    retq
1211;
1212; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32:
1213; XOP:       # %bb.0:
1214; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm1
1215; XOP-NEXT:    vpsrad $31, %xmm1, %xmm2
1216; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,4294967266,4294967267,4294967268>
1217; XOP-NEXT:    vpshld %xmm3, %xmm2, %xmm2
1218; XOP-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
1219; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = <u,4294967294,4294967293,4294967292>
1220; XOP-NEXT:    vpshad %xmm2, %xmm1, %xmm1
1221; XOP-NEXT:    vpsrad $31, %xmm0, %xmm4
1222; XOP-NEXT:    vpshld %xmm3, %xmm4, %xmm3
1223; XOP-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
1224; XOP-NEXT:    vpshad %xmm2, %xmm3, %xmm2
1225; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1226; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
1227; XOP-NEXT:    retq
1228  %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
1229  ret <8 x i32> %1
1230}
1231
1232define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
1233; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1234; SSE2:       # %bb.0:
1235; SSE2-NEXT:    movdqa %xmm1, %xmm4
1236; SSE2-NEXT:    movdqa %xmm0, %xmm1
1237; SSE2-NEXT:    psrad $31, %xmm0
1238; SSE2-NEXT:    movdqa %xmm0, %xmm5
1239; SSE2-NEXT:    psrld $28, %xmm5
1240; SSE2-NEXT:    movdqa %xmm0, %xmm6
1241; SSE2-NEXT:    psrld $29, %xmm6
1242; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1243; SSE2-NEXT:    psrld $30, %xmm0
1244; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1245; SSE2-NEXT:    paddd %xmm1, %xmm0
1246; SSE2-NEXT:    movdqa %xmm0, %xmm5
1247; SSE2-NEXT:    psrad $4, %xmm5
1248; SSE2-NEXT:    movdqa %xmm0, %xmm6
1249; SSE2-NEXT:    psrad $3, %xmm6
1250; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1251; SSE2-NEXT:    psrad $2, %xmm0
1252; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3]
1253; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
1254; SSE2-NEXT:    movdqa %xmm4, %xmm1
1255; SSE2-NEXT:    psrad $31, %xmm1
1256; SSE2-NEXT:    movdqa %xmm1, %xmm5
1257; SSE2-NEXT:    psrld $28, %xmm5
1258; SSE2-NEXT:    movdqa %xmm1, %xmm6
1259; SSE2-NEXT:    psrld $29, %xmm6
1260; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1261; SSE2-NEXT:    psrld $30, %xmm1
1262; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1263; SSE2-NEXT:    paddd %xmm4, %xmm1
1264; SSE2-NEXT:    movdqa %xmm1, %xmm5
1265; SSE2-NEXT:    psrad $4, %xmm5
1266; SSE2-NEXT:    movdqa %xmm1, %xmm6
1267; SSE2-NEXT:    psrad $3, %xmm6
1268; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1269; SSE2-NEXT:    psrad $2, %xmm1
1270; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3]
1271; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3]
1272; SSE2-NEXT:    movdqa %xmm2, %xmm4
1273; SSE2-NEXT:    psrad $31, %xmm4
1274; SSE2-NEXT:    movdqa %xmm4, %xmm5
1275; SSE2-NEXT:    psrld $28, %xmm5
1276; SSE2-NEXT:    movdqa %xmm4, %xmm6
1277; SSE2-NEXT:    psrld $29, %xmm6
1278; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1279; SSE2-NEXT:    psrld $30, %xmm4
1280; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1281; SSE2-NEXT:    paddd %xmm2, %xmm4
1282; SSE2-NEXT:    movdqa %xmm4, %xmm5
1283; SSE2-NEXT:    psrad $4, %xmm5
1284; SSE2-NEXT:    movdqa %xmm4, %xmm6
1285; SSE2-NEXT:    psrad $3, %xmm6
1286; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1]
1287; SSE2-NEXT:    psrad $2, %xmm4
1288; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3]
1289; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1290; SSE2-NEXT:    movdqa %xmm3, %xmm5
1291; SSE2-NEXT:    psrad $31, %xmm5
1292; SSE2-NEXT:    movdqa %xmm5, %xmm2
1293; SSE2-NEXT:    psrld $28, %xmm2
1294; SSE2-NEXT:    movdqa %xmm5, %xmm6
1295; SSE2-NEXT:    psrld $29, %xmm6
1296; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1297; SSE2-NEXT:    psrld $30, %xmm5
1298; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1299; SSE2-NEXT:    paddd %xmm3, %xmm5
1300; SSE2-NEXT:    movdqa %xmm5, %xmm2
1301; SSE2-NEXT:    psrad $4, %xmm2
1302; SSE2-NEXT:    movdqa %xmm5, %xmm6
1303; SSE2-NEXT:    psrad $3, %xmm6
1304; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1]
1305; SSE2-NEXT:    psrad $2, %xmm5
1306; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3]
1307; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
1308; SSE2-NEXT:    movaps %xmm4, %xmm2
1309; SSE2-NEXT:    movaps %xmm5, %xmm3
1310; SSE2-NEXT:    retq
1311;
1312; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1313; SSE41:       # %bb.0:
1314; SSE41-NEXT:    movdqa %xmm1, %xmm4
1315; SSE41-NEXT:    movdqa %xmm0, %xmm1
1316; SSE41-NEXT:    psrad $31, %xmm0
1317; SSE41-NEXT:    movdqa %xmm0, %xmm5
1318; SSE41-NEXT:    psrld $28, %xmm5
1319; SSE41-NEXT:    movdqa %xmm0, %xmm6
1320; SSE41-NEXT:    psrld $30, %xmm6
1321; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1322; SSE41-NEXT:    psrld $29, %xmm0
1323; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
1324; SSE41-NEXT:    paddd %xmm1, %xmm0
1325; SSE41-NEXT:    movdqa %xmm0, %xmm5
1326; SSE41-NEXT:    psrad $4, %xmm5
1327; SSE41-NEXT:    movdqa %xmm0, %xmm6
1328; SSE41-NEXT:    psrad $2, %xmm6
1329; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1330; SSE41-NEXT:    psrad $3, %xmm0
1331; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7]
1332; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
1333; SSE41-NEXT:    movdqa %xmm4, %xmm1
1334; SSE41-NEXT:    psrad $31, %xmm1
1335; SSE41-NEXT:    movdqa %xmm1, %xmm5
1336; SSE41-NEXT:    psrld $28, %xmm5
1337; SSE41-NEXT:    movdqa %xmm1, %xmm6
1338; SSE41-NEXT:    psrld $30, %xmm6
1339; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1340; SSE41-NEXT:    psrld $29, %xmm1
1341; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1342; SSE41-NEXT:    paddd %xmm4, %xmm1
1343; SSE41-NEXT:    movdqa %xmm1, %xmm5
1344; SSE41-NEXT:    psrad $4, %xmm5
1345; SSE41-NEXT:    movdqa %xmm1, %xmm6
1346; SSE41-NEXT:    psrad $2, %xmm6
1347; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1348; SSE41-NEXT:    psrad $3, %xmm1
1349; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
1350; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7]
1351; SSE41-NEXT:    movdqa %xmm2, %xmm4
1352; SSE41-NEXT:    psrad $31, %xmm4
1353; SSE41-NEXT:    movdqa %xmm4, %xmm5
1354; SSE41-NEXT:    psrld $28, %xmm5
1355; SSE41-NEXT:    movdqa %xmm4, %xmm6
1356; SSE41-NEXT:    psrld $30, %xmm6
1357; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1358; SSE41-NEXT:    psrld $29, %xmm4
1359; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1360; SSE41-NEXT:    paddd %xmm2, %xmm4
1361; SSE41-NEXT:    movdqa %xmm4, %xmm5
1362; SSE41-NEXT:    psrad $4, %xmm5
1363; SSE41-NEXT:    movdqa %xmm4, %xmm6
1364; SSE41-NEXT:    psrad $2, %xmm6
1365; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1366; SSE41-NEXT:    psrad $3, %xmm4
1367; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
1368; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
1369; SSE41-NEXT:    movdqa %xmm3, %xmm5
1370; SSE41-NEXT:    psrad $31, %xmm5
1371; SSE41-NEXT:    movdqa %xmm5, %xmm2
1372; SSE41-NEXT:    psrld $28, %xmm2
1373; SSE41-NEXT:    movdqa %xmm5, %xmm6
1374; SSE41-NEXT:    psrld $30, %xmm6
1375; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1376; SSE41-NEXT:    psrld $29, %xmm5
1377; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1378; SSE41-NEXT:    paddd %xmm3, %xmm5
1379; SSE41-NEXT:    movdqa %xmm5, %xmm2
1380; SSE41-NEXT:    psrad $4, %xmm2
1381; SSE41-NEXT:    movdqa %xmm5, %xmm6
1382; SSE41-NEXT:    psrad $2, %xmm6
1383; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7]
1384; SSE41-NEXT:    psrad $3, %xmm5
1385; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1386; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7]
1387; SSE41-NEXT:    movdqa %xmm4, %xmm2
1388; SSE41-NEXT:    movdqa %xmm5, %xmm3
1389; SSE41-NEXT:    retq
1390;
1391; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1392; AVX1:       # %bb.0:
1393; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1394; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm3
1395; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
1396; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
1397; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1398; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
1399; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1400; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1401; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
1402; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
1403; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1404; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm2
1405; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1406; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm3
1407; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
1408; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
1409; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1410; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
1411; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1412; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm3
1413; AVX1-NEXT:    vpsrad $4, %xmm3, %xmm4
1414; AVX1-NEXT:    vpsrad $2, %xmm3, %xmm5
1415; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1416; AVX1-NEXT:    vpsrad $3, %xmm3, %xmm3
1417; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1418; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1419; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1420; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1421; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm3
1422; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
1423; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
1424; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1425; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
1426; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1427; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1428; AVX1-NEXT:    vpsrad $4, %xmm2, %xmm3
1429; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
1430; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1431; AVX1-NEXT:    vpsrad $3, %xmm2, %xmm2
1432; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1433; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm3
1434; AVX1-NEXT:    vpsrld $28, %xmm3, %xmm4
1435; AVX1-NEXT:    vpsrld $30, %xmm3, %xmm5
1436; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1437; AVX1-NEXT:    vpsrld $29, %xmm3, %xmm3
1438; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1439; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm3
1440; AVX1-NEXT:    vpsrad $4, %xmm3, %xmm4
1441; AVX1-NEXT:    vpsrad $2, %xmm3, %xmm5
1442; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
1443; AVX1-NEXT:    vpsrad $3, %xmm3, %xmm3
1444; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
1445; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1446; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1447; AVX1-NEXT:    retq
1448;
1449; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1450; AVX2:       # %bb.0:
1451; AVX2-NEXT:    vpsrad $31, %ymm0, %ymm2
1452; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28]
1453; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
1454; AVX2-NEXT:    vpsrlvd %ymm3, %ymm2, %ymm2
1455; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
1456; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4]
1457; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
1458; AVX2-NEXT:    vpsravd %ymm4, %ymm2, %ymm2
1459; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1460; AVX2-NEXT:    vpsrad $31, %ymm1, %ymm2
1461; AVX2-NEXT:    vpsrlvd %ymm3, %ymm2, %ymm2
1462; AVX2-NEXT:    vpaddd %ymm2, %ymm1, %ymm2
1463; AVX2-NEXT:    vpsravd %ymm4, %ymm2, %ymm2
1464; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1465; AVX2-NEXT:    retq
1466;
1467; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1468; AVX512F:       # %bb.0:
1469; AVX512F-NEXT:    vpsrad $31, %zmm0, %zmm1
1470; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1471; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
1472; AVX512F-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1473; AVX512F-NEXT:    movw $4369, %ax # imm = 0x1111
1474; AVX512F-NEXT:    kmovw %eax, %k1
1475; AVX512F-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
1476; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
1477; AVX512F-NEXT:    retq
1478;
1479; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1480; AVX512BW:       # %bb.0:
1481; AVX512BW-NEXT:    vpsrad $31, %zmm0, %zmm1
1482; AVX512BW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1483; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
1484; AVX512BW-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1485; AVX512BW-NEXT:    movw $4369, %ax # imm = 0x1111
1486; AVX512BW-NEXT:    kmovd %eax, %k1
1487; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
1488; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
1489; AVX512BW-NEXT:    retq
1490;
1491; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32:
1492; XOP:       # %bb.0:
1493; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
1494; XOP-NEXT:    vpsrad $31, %xmm2, %xmm3
1495; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,4294967266,4294967267,4294967268>
1496; XOP-NEXT:    vpshld %xmm4, %xmm3, %xmm3
1497; XOP-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
1498; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = <u,4294967294,4294967293,4294967292>
1499; XOP-NEXT:    vpshad %xmm3, %xmm2, %xmm2
1500; XOP-NEXT:    vpsrad $31, %xmm0, %xmm5
1501; XOP-NEXT:    vpshld %xmm4, %xmm5, %xmm5
1502; XOP-NEXT:    vpaddd %xmm5, %xmm0, %xmm5
1503; XOP-NEXT:    vpshad %xmm3, %xmm5, %xmm5
1504; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
1505; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7]
1506; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
1507; XOP-NEXT:    vpsrad $31, %xmm2, %xmm5
1508; XOP-NEXT:    vpshld %xmm4, %xmm5, %xmm5
1509; XOP-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
1510; XOP-NEXT:    vpshad %xmm3, %xmm2, %xmm2
1511; XOP-NEXT:    vpsrad $31, %xmm1, %xmm5
1512; XOP-NEXT:    vpshld %xmm4, %xmm5, %xmm4
1513; XOP-NEXT:    vpaddd %xmm4, %xmm1, %xmm4
1514; XOP-NEXT:    vpshad %xmm3, %xmm4, %xmm3
1515; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1516; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7]
1517; XOP-NEXT:    retq
1518  %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16>
1519  ret <16 x i32> %1
1520}
1521
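; i64 elements: without a native 64-bit arithmetic right shift (pre-AVX512), the sra-by-2 is emulated with a psrad/psrlq blend; the AVX512 runs use vpsraq directly.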
1522define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) {
1523; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1524; SSE2:       # %bb.0:
1525; SSE2-NEXT:    movdqa %xmm0, %xmm1
1526; SSE2-NEXT:    psrad $31, %xmm1
1527; SSE2-NEXT:    psrlq $62, %xmm1
1528; SSE2-NEXT:    paddq %xmm0, %xmm1
1529; SSE2-NEXT:    movdqa %xmm1, %xmm2
1530; SSE2-NEXT:    psrad $2, %xmm2
1531; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
1532; SSE2-NEXT:    psrlq $2, %xmm1
1533; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1534; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1535; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1536; SSE2-NEXT:    retq
1537;
1538; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1539; SSE41:       # %bb.0:
1540; SSE41-NEXT:    movdqa %xmm0, %xmm1
1541; SSE41-NEXT:    psrad $31, %xmm1
1542; SSE41-NEXT:    psrlq $62, %xmm1
1543; SSE41-NEXT:    paddq %xmm0, %xmm1
1544; SSE41-NEXT:    movdqa %xmm1, %xmm2
1545; SSE41-NEXT:    psrad $2, %xmm2
1546; SSE41-NEXT:    psrlq $2, %xmm1
1547; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1548; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1549; SSE41-NEXT:    movdqa %xmm1, %xmm0
1550; SSE41-NEXT:    retq
1551;
1552; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1553; AVX1:       # %bb.0:
1554; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1555; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
1556; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm1
1557; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1558; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm2
1559; AVX1-NEXT:    vpsrlq $2, %xmm1, %xmm1
1560; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1561; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1562; AVX1-NEXT:    retq
1563;
1564; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1565; AVX2:       # %bb.0:
1566; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1567; AVX2-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm1
1568; AVX2-NEXT:    vpsrlq $62, %xmm1, %xmm1
1569; AVX2-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1570; AVX2-NEXT:    vpsrad $2, %xmm1, %xmm2
1571; AVX2-NEXT:    vpsrlq $2, %xmm1, %xmm1
1572; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
1573; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1574; AVX2-NEXT:    retq
1575;
1576; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1577; AVX512F:       # %bb.0:
1578; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1579; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm1
1580; AVX512F-NEXT:    vpsrlq $62, %xmm1, %xmm1
1581; AVX512F-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1582; AVX512F-NEXT:    vpsraq $2, %zmm1, %zmm1
1583; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1584; AVX512F-NEXT:    vzeroupper
1585; AVX512F-NEXT:    retq
1586;
1587; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1588; AVX512BW:       # %bb.0:
1589; AVX512BW-NEXT:    vpsraq $63, %xmm0, %xmm1
1590; AVX512BW-NEXT:    vpsrlq $62, %xmm1, %xmm1
1591; AVX512BW-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1592; AVX512BW-NEXT:    vpsraq $2, %xmm1, %xmm1
1593; AVX512BW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1594; AVX512BW-NEXT:    retq
1595;
1596; XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64:
1597; XOP:       # %bb.0:
1598; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1599; XOP-NEXT:    vpsrlq $62, %xmm1, %xmm1
1600; XOP-NEXT:    vpaddq %xmm1, %xmm0, %xmm1
1601; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1602; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1603; XOP-NEXT:    retq
1604  %1 = sdiv <2 x i64> %x, <i64 1, i64 4>
1605  ret <2 x i64> %1
1606}
1607
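; i64 divisors 8 and 16 need unequal shift amounts, done as psrlq pairs with the sign restored afterwards by an xor/psub against 2^60/2^59.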
1608define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
1609; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1610; SSE2:       # %bb.0:
1611; SSE2-NEXT:    movdqa %xmm0, %xmm2
1612; SSE2-NEXT:    psrad $31, %xmm2
1613; SSE2-NEXT:    psrlq $62, %xmm2
1614; SSE2-NEXT:    paddq %xmm0, %xmm2
1615; SSE2-NEXT:    movdqa %xmm2, %xmm3
1616; SSE2-NEXT:    psrad $2, %xmm3
1617; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
1618; SSE2-NEXT:    psrlq $2, %xmm2
1619; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1620; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1621; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1622; SSE2-NEXT:    movdqa %xmm1, %xmm2
1623; SSE2-NEXT:    psrad $31, %xmm2
1624; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1625; SSE2-NEXT:    movdqa %xmm2, %xmm3
1626; SSE2-NEXT:    psrlq $61, %xmm3
1627; SSE2-NEXT:    psrlq $60, %xmm2
1628; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
1629; SSE2-NEXT:    paddq %xmm1, %xmm2
1630; SSE2-NEXT:    movdqa %xmm2, %xmm1
1631; SSE2-NEXT:    psrlq $3, %xmm1
1632; SSE2-NEXT:    psrlq $4, %xmm2
1633; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1634; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1635; SSE2-NEXT:    xorpd %xmm1, %xmm2
1636; SSE2-NEXT:    psubq %xmm1, %xmm2
1637; SSE2-NEXT:    movdqa %xmm2, %xmm1
1638; SSE2-NEXT:    retq
1639;
1640; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1641; SSE41:       # %bb.0:
1642; SSE41-NEXT:    movdqa %xmm0, %xmm2
1643; SSE41-NEXT:    psrad $31, %xmm0
1644; SSE41-NEXT:    psrlq $62, %xmm0
1645; SSE41-NEXT:    paddq %xmm2, %xmm0
1646; SSE41-NEXT:    movdqa %xmm0, %xmm3
1647; SSE41-NEXT:    psrad $2, %xmm3
1648; SSE41-NEXT:    psrlq $2, %xmm0
1649; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
1650; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
1651; SSE41-NEXT:    movdqa %xmm1, %xmm2
1652; SSE41-NEXT:    psrad $31, %xmm2
1653; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1654; SSE41-NEXT:    movdqa %xmm2, %xmm3
1655; SSE41-NEXT:    psrlq $60, %xmm3
1656; SSE41-NEXT:    psrlq $61, %xmm2
1657; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
1658; SSE41-NEXT:    paddq %xmm1, %xmm2
1659; SSE41-NEXT:    movdqa %xmm2, %xmm1
1660; SSE41-NEXT:    psrlq $4, %xmm1
1661; SSE41-NEXT:    psrlq $3, %xmm2
1662; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1663; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1664; SSE41-NEXT:    pxor %xmm1, %xmm2
1665; SSE41-NEXT:    psubq %xmm1, %xmm2
1666; SSE41-NEXT:    movdqa %xmm2, %xmm1
1667; SSE41-NEXT:    retq
1668;
1669; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1670; AVX1:       # %bb.0:
1671; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1672; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1673; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm3
1674; AVX1-NEXT:    vpsrlq $60, %xmm3, %xmm4
1675; AVX1-NEXT:    vpsrlq $61, %xmm3, %xmm3
1676; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1677; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
1678; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm3
1679; AVX1-NEXT:    vpsrlq $3, %xmm1, %xmm1
1680; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1681; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488]
1682; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm1
1683; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
1684; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm2
1685; AVX1-NEXT:    vpsrlq $62, %xmm2, %xmm2
1686; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
1687; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm3
1688; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm2
1689; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1690; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1691; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1692; AVX1-NEXT:    retq
1693;
1694; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1695; AVX2:       # %bb.0:
1696; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1697; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm1, %ymm1
1698; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1699; AVX2-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
1700; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1701; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1702; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm1
1703; AVX2-NEXT:    vpsubq %ymm2, %ymm1, %ymm1
1704; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1705; AVX2-NEXT:    retq
1706;
1707; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1708; AVX512F:       # %bb.0:
1709; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1710; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = <u,2,3,4>
1711; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm2
1712; AVX512F-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
1713; AVX512F-NEXT:    vpaddq %ymm2, %ymm0, %ymm2
1714; AVX512F-NEXT:    vpsravq %zmm1, %zmm2, %zmm1
1715; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1716; AVX512F-NEXT:    retq
1717;
1718; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1719; AVX512BW:       # %bb.0:
1720; AVX512BW-NEXT:    vpsraq $63, %ymm0, %ymm1
1721; AVX512BW-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1722; AVX512BW-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
1723; AVX512BW-NEXT:    vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1724; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1725; AVX512BW-NEXT:    retq
1726;
1727; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
1728; XOP:       # %bb.0:
1729; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553]
1730; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm2
1731; XOP-NEXT:    vpsrlq $62, %xmm2, %xmm2
1732; XOP-NEXT:    vpaddq %xmm2, %xmm0, %xmm2
1733; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1734; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm3
1735; XOP-NEXT:    vpshaq %xmm1, %xmm3, %xmm1
1736; XOP-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1737; XOP-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
1738; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1739; XOP-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1740; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
1741; XOP-NEXT:    retq
1742  %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16>
1743  ret <4 x i64> %1
1744}
1745
1746define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
1747; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1748; SSE2:       # %bb.0:
1749; SSE2-NEXT:    movdqa %xmm0, %xmm4
1750; SSE2-NEXT:    psrad $31, %xmm4
1751; SSE2-NEXT:    psrlq $62, %xmm4
1752; SSE2-NEXT:    paddq %xmm0, %xmm4
1753; SSE2-NEXT:    movdqa %xmm4, %xmm5
1754; SSE2-NEXT:    psrad $2, %xmm5
1755; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1756; SSE2-NEXT:    psrlq $2, %xmm4
1757; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1758; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1759; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
1760; SSE2-NEXT:    movdqa %xmm2, %xmm4
1761; SSE2-NEXT:    psrad $31, %xmm4
1762; SSE2-NEXT:    psrlq $62, %xmm4
1763; SSE2-NEXT:    paddq %xmm2, %xmm4
1764; SSE2-NEXT:    movdqa %xmm4, %xmm5
1765; SSE2-NEXT:    psrad $2, %xmm5
1766; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3]
1767; SSE2-NEXT:    psrlq $2, %xmm4
1768; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
1769; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1770; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
1771; SSE2-NEXT:    movdqa %xmm1, %xmm4
1772; SSE2-NEXT:    psrad $31, %xmm4
1773; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1774; SSE2-NEXT:    movdqa %xmm4, %xmm5
1775; SSE2-NEXT:    psrlq $61, %xmm5
1776; SSE2-NEXT:    psrlq $60, %xmm4
1777; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
1778; SSE2-NEXT:    paddq %xmm1, %xmm4
1779; SSE2-NEXT:    movdqa %xmm4, %xmm1
1780; SSE2-NEXT:    psrlq $3, %xmm1
1781; SSE2-NEXT:    psrlq $4, %xmm4
1782; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
1783; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488]
1784; SSE2-NEXT:    xorpd %xmm1, %xmm4
1785; SSE2-NEXT:    psubq %xmm1, %xmm4
1786; SSE2-NEXT:    movdqa %xmm3, %xmm5
1787; SSE2-NEXT:    psrad $31, %xmm5
1788; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
1789; SSE2-NEXT:    movdqa %xmm5, %xmm6
1790; SSE2-NEXT:    psrlq $61, %xmm6
1791; SSE2-NEXT:    psrlq $60, %xmm5
1792; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1]
1793; SSE2-NEXT:    paddq %xmm3, %xmm5
1794; SSE2-NEXT:    movdqa %xmm5, %xmm3
1795; SSE2-NEXT:    psrlq $3, %xmm3
1796; SSE2-NEXT:    psrlq $4, %xmm5
1797; SSE2-NEXT:    movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
1798; SSE2-NEXT:    xorpd %xmm1, %xmm5
1799; SSE2-NEXT:    psubq %xmm1, %xmm5
1800; SSE2-NEXT:    movdqa %xmm4, %xmm1
1801; SSE2-NEXT:    movdqa %xmm5, %xmm3
1802; SSE2-NEXT:    retq
1803;
1804; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1805; SSE41:       # %bb.0:
1806; SSE41-NEXT:    movdqa %xmm2, %xmm5
1807; SSE41-NEXT:    movdqa %xmm1, %xmm4
1808; SSE41-NEXT:    movdqa %xmm0, %xmm1
1809; SSE41-NEXT:    psrad $31, %xmm0
1810; SSE41-NEXT:    psrlq $62, %xmm0
1811; SSE41-NEXT:    paddq %xmm1, %xmm0
1812; SSE41-NEXT:    movdqa %xmm0, %xmm2
1813; SSE41-NEXT:    psrad $2, %xmm2
1814; SSE41-NEXT:    psrlq $2, %xmm0
1815; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1816; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
1817; SSE41-NEXT:    movdqa %xmm5, %xmm2
1818; SSE41-NEXT:    psrad $31, %xmm2
1819; SSE41-NEXT:    psrlq $62, %xmm2
1820; SSE41-NEXT:    paddq %xmm5, %xmm2
1821; SSE41-NEXT:    movdqa %xmm2, %xmm1
1822; SSE41-NEXT:    psrad $2, %xmm1
1823; SSE41-NEXT:    psrlq $2, %xmm2
1824; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1825; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
1826; SSE41-NEXT:    movdqa %xmm4, %xmm1
1827; SSE41-NEXT:    psrad $31, %xmm1
1828; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
1829; SSE41-NEXT:    movdqa %xmm1, %xmm5
1830; SSE41-NEXT:    psrlq $60, %xmm5
1831; SSE41-NEXT:    psrlq $61, %xmm1
1832; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
1833; SSE41-NEXT:    paddq %xmm4, %xmm1
1834; SSE41-NEXT:    movdqa %xmm1, %xmm4
1835; SSE41-NEXT:    psrlq $4, %xmm4
1836; SSE41-NEXT:    psrlq $3, %xmm1
1837; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
1838; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488]
1839; SSE41-NEXT:    pxor %xmm5, %xmm1
1840; SSE41-NEXT:    psubq %xmm5, %xmm1
1841; SSE41-NEXT:    movdqa %xmm3, %xmm4
1842; SSE41-NEXT:    psrad $31, %xmm4
1843; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
1844; SSE41-NEXT:    movdqa %xmm4, %xmm6
1845; SSE41-NEXT:    psrlq $60, %xmm6
1846; SSE41-NEXT:    psrlq $61, %xmm4
1847; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7]
1848; SSE41-NEXT:    paddq %xmm3, %xmm4
1849; SSE41-NEXT:    movdqa %xmm4, %xmm3
1850; SSE41-NEXT:    psrlq $4, %xmm3
1851; SSE41-NEXT:    psrlq $3, %xmm4
1852; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7]
1853; SSE41-NEXT:    pxor %xmm5, %xmm4
1854; SSE41-NEXT:    psubq %xmm5, %xmm4
1855; SSE41-NEXT:    movdqa %xmm4, %xmm3
1856; SSE41-NEXT:    retq
1857;
1858; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1859; AVX1:       # %bb.0:
1860; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
1861; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1862; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm4
1863; AVX1-NEXT:    vpsrlq $60, %xmm4, %xmm5
1864; AVX1-NEXT:    vpsrlq $61, %xmm4, %xmm4
1865; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7]
1866; AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
1867; AVX1-NEXT:    vpsrlq $4, %xmm3, %xmm4
1868; AVX1-NEXT:    vpsrlq $3, %xmm3, %xmm3
1869; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
1870; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488]
1871; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
1872; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
1873; AVX1-NEXT:    vpcmpgtq %xmm0, %xmm2, %xmm5
1874; AVX1-NEXT:    vpsrlq $62, %xmm5, %xmm5
1875; AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm5
1876; AVX1-NEXT:    vpsrad $2, %xmm5, %xmm6
1877; AVX1-NEXT:    vpsrlq $2, %xmm5, %xmm5
1878; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
1879; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm5, %ymm3
1880; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1881; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
1882; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm5
1883; AVX1-NEXT:    vpsrlq $60, %xmm5, %xmm6
1884; AVX1-NEXT:    vpsrlq $61, %xmm5, %xmm5
1885; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7]
1886; AVX1-NEXT:    vpaddq %xmm5, %xmm3, %xmm3
1887; AVX1-NEXT:    vpsrlq $4, %xmm3, %xmm5
1888; AVX1-NEXT:    vpsrlq $3, %xmm3, %xmm3
1889; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7]
1890; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
1891; AVX1-NEXT:    vpsubq %xmm4, %xmm3, %xmm3
1892; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm2, %xmm2
1893; AVX1-NEXT:    vpsrlq $62, %xmm2, %xmm2
1894; AVX1-NEXT:    vpaddq %xmm2, %xmm1, %xmm2
1895; AVX1-NEXT:    vpsrad $2, %xmm2, %xmm4
1896; AVX1-NEXT:    vpsrlq $2, %xmm2, %xmm2
1897; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
1898; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
1899; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1900; AVX1-NEXT:    retq
1901;
1902; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1903; AVX2:       # %bb.0:
1904; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1905; AVX2-NEXT:    vpcmpgtq %ymm0, %ymm2, %ymm3
1906; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = <u,62,61,60>
1907; AVX2-NEXT:    vpsrlvq %ymm4, %ymm3, %ymm3
1908; AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm3
1909; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = <u,2,3,4>
1910; AVX2-NEXT:    vpsrlvq %ymm5, %ymm3, %ymm3
1911; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = <u,2305843009213693952,1152921504606846976,576460752303423488>
1912; AVX2-NEXT:    vpxor %ymm6, %ymm3, %ymm3
1913; AVX2-NEXT:    vpsubq %ymm6, %ymm3, %ymm3
1914; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
1915; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm2, %ymm2
1916; AVX2-NEXT:    vpsrlvq %ymm4, %ymm2, %ymm2
1917; AVX2-NEXT:    vpaddq %ymm2, %ymm1, %ymm2
1918; AVX2-NEXT:    vpsrlvq %ymm5, %ymm2, %ymm2
1919; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
1920; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
1921; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1922; AVX2-NEXT:    retq
1923;
1924; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1925; AVX512F:       # %bb.0:
1926; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm1
1927; AVX512F-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1928; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
1929; AVX512F-NEXT:    vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1930; AVX512F-NEXT:    movb $17, %al
1931; AVX512F-NEXT:    kmovw %eax, %k1
1932; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
1933; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
1934; AVX512F-NEXT:    retq
1935;
1936; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1937; AVX512BW:       # %bb.0:
1938; AVX512BW-NEXT:    vpsraq $63, %zmm0, %zmm1
1939; AVX512BW-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1940; AVX512BW-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
1941; AVX512BW-NEXT:    vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
1942; AVX512BW-NEXT:    movb $17, %al
1943; AVX512BW-NEXT:    kmovd %eax, %k1
1944; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
1945; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
1946; AVX512BW-NEXT:    retq
1947;
1948; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64:
1949; XOP:       # %bb.0:
1950; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
1951; XOP-NEXT:    vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553]
1952; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm4
1953; XOP-NEXT:    vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556]
1954; XOP-NEXT:    vpshlq %xmm5, %xmm4, %xmm4
1955; XOP-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
1956; XOP-NEXT:    vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612]
1957; XOP-NEXT:    vpshaq %xmm4, %xmm2, %xmm2
1958; XOP-NEXT:    vpshaq %xmm3, %xmm0, %xmm6
1959; XOP-NEXT:    vpsrlq $62, %xmm6, %xmm6
1960; XOP-NEXT:    vpaddq %xmm6, %xmm0, %xmm6
1961; XOP-NEXT:    vmovdqa {{.*#+}} xmm7 = <u,18446744073709551614>
1962; XOP-NEXT:    vpshaq %xmm7, %xmm6, %xmm6
1963; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm6, %ymm2
1964; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
1965; XOP-NEXT:    vextractf128 $1, %ymm1, %xmm2
1966; XOP-NEXT:    vpshaq %xmm3, %xmm2, %xmm6
1967; XOP-NEXT:    vpshlq %xmm5, %xmm6, %xmm5
1968; XOP-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
1969; XOP-NEXT:    vpshaq %xmm4, %xmm2, %xmm2
1970; XOP-NEXT:    vpshaq %xmm3, %xmm1, %xmm3
1971; XOP-NEXT:    vpsrlq $62, %xmm3, %xmm3
1972; XOP-NEXT:    vpaddq %xmm3, %xmm1, %xmm3
1973; XOP-NEXT:    vpshaq %xmm7, %xmm3, %xmm3
1974; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
1975; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
1976; XOP-NEXT:    retq
1977  %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16>
1978  ret <8 x i64> %1
1979}
1980
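; Mixed positive and negative power-of-two divisors: the usual shift sequence divides by the magnitude, then the lanes with negative divisors are negated and blended back in.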
1981define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) {
1982; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
1983; SSE2:       # %bb.0:
1984; SSE2-NEXT:    movdqa %xmm0, %xmm1
1985; SSE2-NEXT:    psrad $31, %xmm0
1986; SSE2-NEXT:    movdqa %xmm0, %xmm2
1987; SSE2-NEXT:    psrld $28, %xmm2
1988; SSE2-NEXT:    movdqa %xmm0, %xmm3
1989; SSE2-NEXT:    psrld $29, %xmm3
1990; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1991; SSE2-NEXT:    psrld $30, %xmm0
1992; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
1993; SSE2-NEXT:    paddd %xmm1, %xmm0
1994; SSE2-NEXT:    movdqa %xmm0, %xmm2
1995; SSE2-NEXT:    psrad $4, %xmm2
1996; SSE2-NEXT:    movdqa %xmm0, %xmm3
1997; SSE2-NEXT:    psrad $3, %xmm3
1998; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1999; SSE2-NEXT:    psrad $2, %xmm0
2000; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3]
2001; SSE2-NEXT:    movaps %xmm0, %xmm2
2002; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[2,3]
2003; SSE2-NEXT:    pxor %xmm3, %xmm3
2004; SSE2-NEXT:    psubd %xmm2, %xmm3
2005; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
2006; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
2007; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2008; SSE2-NEXT:    retq
2009;
2010; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2011; SSE41:       # %bb.0:
2012; SSE41-NEXT:    movdqa %xmm0, %xmm1
2013; SSE41-NEXT:    psrad $31, %xmm1
2014; SSE41-NEXT:    movdqa %xmm1, %xmm2
2015; SSE41-NEXT:    psrld $28, %xmm2
2016; SSE41-NEXT:    movdqa %xmm1, %xmm3
2017; SSE41-NEXT:    psrld $30, %xmm3
2018; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2019; SSE41-NEXT:    psrld $29, %xmm1
2020; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
2021; SSE41-NEXT:    paddd %xmm0, %xmm1
2022; SSE41-NEXT:    movdqa %xmm1, %xmm2
2023; SSE41-NEXT:    psrad $4, %xmm2
2024; SSE41-NEXT:    movdqa %xmm1, %xmm3
2025; SSE41-NEXT:    psrad $2, %xmm3
2026; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2027; SSE41-NEXT:    pxor %xmm2, %xmm2
2028; SSE41-NEXT:    psubd %xmm3, %xmm2
2029; SSE41-NEXT:    psrad $3, %xmm1
2030; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2031; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2032; SSE41-NEXT:    movdqa %xmm1, %xmm0
2033; SSE41-NEXT:    retq
2034;
2035; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2036; AVX1:       # %bb.0:
2037; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm1
2038; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
2039; AVX1-NEXT:    vpsrld $30, %xmm1, %xmm3
2040; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2041; AVX1-NEXT:    vpsrld $29, %xmm1, %xmm1
2042; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
2043; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2044; AVX1-NEXT:    vpsrad $4, %xmm1, %xmm2
2045; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm3
2046; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
2047; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
2048; AVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
2049; AVX1-NEXT:    vpsrad $3, %xmm1, %xmm1
2050; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2051; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2052; AVX1-NEXT:    retq
2053;
2054; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2055; AVX2ORLATER:       # %bb.0:
2056; AVX2ORLATER-NEXT:    vpsrad $31, %xmm0, %xmm1
2057; AVX2ORLATER-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2058; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2059; AVX2ORLATER-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2060; AVX2ORLATER-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2061; AVX2ORLATER-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
2062; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2063; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
2064; AVX2ORLATER-NEXT:    retq
2065;
2066; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg:
2067; XOP:       # %bb.0:
2068; XOP-NEXT:    vpsrad $31, %xmm0, %xmm1
2069; XOP-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2070; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2071; XOP-NEXT:    vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2072; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2073; XOP-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
2074; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2075; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
2076; XOP-NEXT:    retq
2077  %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16>
2078  ret <4 x i32> %1
2079}
2080
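; sdiv by a divisor vector containing undef elements folds away completely (just retq).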
2081define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) {
2082; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1:
2083; CHECK:       # %bb.0:
2084; CHECK-NEXT:    retq
2085  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16>
2086  ret <4 x i32> %1
2087}
2088
2089define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) {
2090; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2:
2091; CHECK:       # %bb.0:
2092; CHECK-NEXT:    retq
2093  %1 = sdiv <4 x i32> %x, <i32 undef, i32 4, i32 undef, i32 16>
2094  ret <4 x i32> %1
2095}
2096
2097define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) {
2098; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3:
2099; CHECK:       # %bb.0:
2100; CHECK-NEXT:    retq
2101  %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16>
2102  ret <4 x i32> %1
2103}
2104
2105; PR37119
2106define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
2107; SSE-LABEL: non_splat_minus_one_divisor_0:
2108; SSE:       # %bb.0:
2109; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2110; SSE-NEXT:    pxor %xmm1, %xmm0
2111; SSE-NEXT:    psubb %xmm1, %xmm0
2112; SSE-NEXT:    retq
2113;
2114; AVX1-LABEL: non_splat_minus_one_divisor_0:
2115; AVX1:       # %bb.0:
2116; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2117; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2118; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2119; AVX1-NEXT:    retq
2120;
2121; AVX2-LABEL: non_splat_minus_one_divisor_0:
2122; AVX2:       # %bb.0:
2123; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2124; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2125; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2126; AVX2-NEXT:    retq
2127;
2128; AVX512F-LABEL: non_splat_minus_one_divisor_0:
2129; AVX512F:       # %bb.0:
2130; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2131; AVX512F-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2132; AVX512F-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2133; AVX512F-NEXT:    retq
2134;
2135; AVX512BW-LABEL: non_splat_minus_one_divisor_0:
2136; AVX512BW:       # %bb.0:
2137; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2138; AVX512BW-NEXT:    movw $443, %ax # imm = 0x1BB
2139; AVX512BW-NEXT:    kmovd %eax, %k1
2140; AVX512BW-NEXT:    vpsubb %xmm0, %xmm1, %xmm0 {%k1}
2141; AVX512BW-NEXT:    retq
2142;
2143; XOP-LABEL: non_splat_minus_one_divisor_0:
2144; XOP:       # %bb.0:
2145; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0]
2146; XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2147; XOP-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2148; XOP-NEXT:    retq
2149  %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
2150  ret <16 x i8> %div
2151}
2152
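; Divisors mixing -1, 2 and -128: the power-of-two magnitudes are divided in widened integer lanes, then the lanes with negative divisors are negated with a final xor/psub (or masked psub) step.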
2153define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
2154; SSE2-LABEL: non_splat_minus_one_divisor_1:
2155; SSE2:       # %bb.0:
2156; SSE2-NEXT:    pxor %xmm1, %xmm1
2157; SSE2-NEXT:    pxor %xmm2, %xmm2
2158; SSE2-NEXT:    pcmpgtb %xmm0, %xmm2
2159; SSE2-NEXT:    movdqa %xmm2, %xmm3
2160; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
2161; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2162; SSE2-NEXT:    psrlw $8, %xmm3
2163; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
2164; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
2165; SSE2-NEXT:    psrlw $8, %xmm2
2166; SSE2-NEXT:    packuswb %xmm3, %xmm2
2167; SSE2-NEXT:    paddb %xmm0, %xmm2
2168; SSE2-NEXT:    movdqa %xmm2, %xmm1
2169; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2170; SSE2-NEXT:    psraw $8, %xmm1
2171; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2172; SSE2-NEXT:    psrlw $8, %xmm1
2173; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2174; SSE2-NEXT:    psraw $8, %xmm2
2175; SSE2-NEXT:    psllw $7, %xmm2
2176; SSE2-NEXT:    psrlw $8, %xmm2
2177; SSE2-NEXT:    packuswb %xmm1, %xmm2
2178; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2179; SSE2-NEXT:    pand %xmm1, %xmm2
2180; SSE2-NEXT:    pandn %xmm0, %xmm1
2181; SSE2-NEXT:    por %xmm2, %xmm1
2182; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2183; SSE2-NEXT:    pxor %xmm0, %xmm1
2184; SSE2-NEXT:    psubb %xmm0, %xmm1
2185; SSE2-NEXT:    movdqa %xmm1, %xmm0
2186; SSE2-NEXT:    retq
2187;
2188; SSE41-LABEL: non_splat_minus_one_divisor_1:
2189; SSE41:       # %bb.0:
2190; SSE41-NEXT:    movdqa %xmm0, %xmm1
2191; SSE41-NEXT:    pxor %xmm0, %xmm0
2192; SSE41-NEXT:    pxor %xmm3, %xmm3
2193; SSE41-NEXT:    pcmpgtb %xmm1, %xmm3
2194; SSE41-NEXT:    pxor %xmm4, %xmm4
2195; SSE41-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
2196; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2197; SSE41-NEXT:    psllw $1, %xmm2
2198; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
2199; SSE41-NEXT:    psrlw $8, %xmm2
2200; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
2201; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
2202; SSE41-NEXT:    psrlw $8, %xmm3
2203; SSE41-NEXT:    packuswb %xmm3, %xmm2
2204; SSE41-NEXT:    paddb %xmm1, %xmm2
2205; SSE41-NEXT:    movdqa %xmm2, %xmm0
2206; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
2207; SSE41-NEXT:    psraw $8, %xmm0
2208; SSE41-NEXT:    movdqa %xmm0, %xmm3
2209; SSE41-NEXT:    psllw $1, %xmm3
2210; SSE41-NEXT:    psllw $7, %xmm0
2211; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
2212; SSE41-NEXT:    psrlw $8, %xmm0
2213; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2214; SSE41-NEXT:    psraw $8, %xmm2
2215; SSE41-NEXT:    psllw $7, %xmm2
2216; SSE41-NEXT:    psrlw $8, %xmm2
2217; SSE41-NEXT:    packuswb %xmm0, %xmm2
2218; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2219; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
2220; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2221; SSE41-NEXT:    pxor %xmm0, %xmm1
2222; SSE41-NEXT:    psubb %xmm0, %xmm1
2223; SSE41-NEXT:    movdqa %xmm1, %xmm0
2224; SSE41-NEXT:    retq
2225;
2226; AVX1-LABEL: non_splat_minus_one_divisor_1:
2227; AVX1:       # %bb.0:
2228; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2229; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
2230; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2231; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2232; AVX1-NEXT:    vpsllw $1, %xmm4, %xmm4
2233; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
2234; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
2235; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2236; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2237; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
2238; AVX1-NEXT:    vpackuswb %xmm1, %xmm3, %xmm1
2239; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
2240; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2241; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
2242; AVX1-NEXT:    vpsllw $1, %xmm2, %xmm3
2243; AVX1-NEXT:    vpsllw $7, %xmm2, %xmm2
2244; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
2245; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
2246; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2247; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
2248; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
2249; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
2250; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
2251; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2252; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2253; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2254; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2255; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2256; AVX1-NEXT:    retq
2257;
2258; AVX2-LABEL: non_splat_minus_one_divisor_1:
2259; AVX2:       # %bb.0:
2260; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2261; AVX2-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
2262; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2263; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2264; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
2265; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2266; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
2267; AVX2-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
2268; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
2269; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
2270; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
2271; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2272; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
2273; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2274; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2275; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2276; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2277; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2278; AVX2-NEXT:    vzeroupper
2279; AVX2-NEXT:    retq
2280;
2281; AVX512F-LABEL: non_splat_minus_one_divisor_1:
2282; AVX512F:       # %bb.0:
2283; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2284; AVX512F-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
2285; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2286; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2287; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
2288; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
2289; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
2290; AVX512F-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2291; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
2292; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2293; AVX512F-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2294; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2295; AVX512F-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2296; AVX512F-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2297; AVX512F-NEXT:    vzeroupper
2298; AVX512F-NEXT:    retq
2299;
2300; AVX512BW-LABEL: non_splat_minus_one_divisor_1:
2301; AVX512BW:       # %bb.0:
2302; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2303; AVX512BW-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
2304; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2305; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2306; AVX512BW-NEXT:    vpmovwb %ymm2, %xmm2
2307; AVX512BW-NEXT:    vpaddb %xmm2, %xmm0, %xmm2
2308; AVX512BW-NEXT:    vpmovsxbw %xmm2, %ymm2
2309; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
2310; AVX512BW-NEXT:    vpmovwb %ymm2, %xmm2
2311; AVX512BW-NEXT:    movw $443, %ax # imm = 0x1BB
2312; AVX512BW-NEXT:    kmovd %eax, %k1
2313; AVX512BW-NEXT:    vmovdqu8 %xmm0, %xmm2 {%k1}
2314; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm0
2315; AVX512BW-NEXT:    movw $24132, %ax # imm = 0x5E44
2316; AVX512BW-NEXT:    kmovd %eax, %k1
2317; AVX512BW-NEXT:    vmovdqu8 %xmm2, %xmm0 {%k1}
2318; AVX512BW-NEXT:    vzeroupper
2319; AVX512BW-NEXT:    retq
2320;
2321; XOP-LABEL: non_splat_minus_one_divisor_1:
2322; XOP:       # %bb.0:
2323; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2324; XOP-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm1
2325; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2326; XOP-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
2327; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2328; XOP-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255]
2329; XOP-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
2330; XOP-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255]
2331; XOP-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2332; XOP-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
2333; XOP-NEXT:    retq
2334  %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128>
2335  ret <16 x i8> %div
2336}
2337
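; Divisors <-1, 1, 2, -2>: only the power-of-two lanes need the shift sequence; the lanes with negative divisors are then blended in from a negated copy.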
2338define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) {
2339; SSE2-LABEL: non_splat_minus_one_divisor_2:
2340; SSE2:       # %bb.0:
2341; SSE2-NEXT:    movdqa %xmm0, %xmm1
2342; SSE2-NEXT:    psrld $31, %xmm1
2343; SSE2-NEXT:    paddd %xmm0, %xmm1
2344; SSE2-NEXT:    psrad $1, %xmm1
2345; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2346; SSE2-NEXT:    pxor %xmm0, %xmm0
2347; SSE2-NEXT:    psubd %xmm1, %xmm0
2348; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2]
2349; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
2350; SSE2-NEXT:    retq
2351;
2352; SSE41-LABEL: non_splat_minus_one_divisor_2:
2353; SSE41:       # %bb.0:
2354; SSE41-NEXT:    movdqa %xmm0, %xmm1
2355; SSE41-NEXT:    psrld $31, %xmm1
2356; SSE41-NEXT:    paddd %xmm0, %xmm1
2357; SSE41-NEXT:    psrad $1, %xmm1
2358; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2359; SSE41-NEXT:    pxor %xmm0, %xmm0
2360; SSE41-NEXT:    psubd %xmm1, %xmm0
2361; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7]
2362; SSE41-NEXT:    movdqa %xmm1, %xmm0
2363; SSE41-NEXT:    retq
2364;
2365; AVX1-LABEL: non_splat_minus_one_divisor_2:
2366; AVX1:       # %bb.0:
2367; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
2368; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2369; AVX1-NEXT:    vpsrad $1, %xmm1, %xmm1
2370; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2371; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2372; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
2373; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2374; AVX1-NEXT:    retq
2375;
2376; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2:
2377; AVX2ORLATER:       # %bb.0:
2378; AVX2ORLATER-NEXT:    vpsrld $31, %xmm0, %xmm1
2379; AVX2ORLATER-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2380; AVX2ORLATER-NEXT:    vpsrad $1, %xmm1, %xmm1
2381; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2382; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2383; AVX2ORLATER-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
2384; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
2385; AVX2ORLATER-NEXT:    retq
2386;
2387; XOP-LABEL: non_splat_minus_one_divisor_2:
2388; XOP:       # %bb.0:
2389; XOP-NEXT:    vpsrld $31, %xmm0, %xmm1
2390; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
2391; XOP-NEXT:    vpsrad $1, %xmm1, %xmm1
2392; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2393; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2394; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
2395; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7]
2396; XOP-NEXT:    retq
2397  %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2>
2398  ret <4 x i32> %div
2399}
2400
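; Non-power-of-two divisors (3 and 22): lowered to pmulhw with per-lane magic constants plus a psrlw $15 / paddw sign adjustment.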
define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) {
; SSE-LABEL: combine_vec_sdiv_nonuniform:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    paddw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_sdiv_nonuniform:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22>
  ret <8 x i16> %1
}
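; divisors 24 and 25 need different post-shift amounts, so the lanes are shifted separately and blended back together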
define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psraw $2, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psraw $1, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psraw $1, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psraw $2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    psrlw $15, %xmm0
; SSE41-NEXT:    paddw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX512F-NEXT:    vpsraw $2, %xmm0, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform2:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25>
  ret <8 x i16> %1
}
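; divisors 23 and 511 use magic constants that require adding the dividend back after the multiply-high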
define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
; SSE2-NEXT:    pmulhw %xmm0, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    paddw %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833]
; SSE41-NEXT:    pmulhw %xmm0, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psraw $8, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    psrlw $15, %xmm1
; SSE41-NEXT:    paddw %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX512F-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform3:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511>
  ret <8 x i16> %1
}
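; divisors -23 and -510 use magic constants that require subtracting the dividend after the multiply-high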
define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
; SSE2-NEXT:    pmulhw %xmm0, %xmm1
; SSE2-NEXT:    psubw %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    paddw %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639]
; SSE41-NEXT:    pmulhw %xmm0, %xmm1
; SSE41-NEXT:    psubw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psraw $8, %xmm0
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $4, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    psrlw $15, %xmm1
; SSE41-NEXT:    paddw %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpsraw $8, %xmm0, %xmm1
; AVX512F-NEXT:    vpsraw $4, %xmm0, %xmm2
; AVX512F-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform4:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpsubw %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510>
  ret <8 x i16> %1
}
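; fully non-uniform mix of positive and negative divisors: a pmullw/pmulhw pair plus per-lane shifts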
define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psraw $4, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm2
; SSE2-NEXT:    por %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535]
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    psraw $2, %xmm2
; SSE2-NEXT:    pandn %xmm2, %xmm1
; SSE2-NEXT:    por %xmm3, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    psraw $1, %xmm1
; SSE2-NEXT:    pandn %xmm1, %xmm2
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    paddw %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform5:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1]
; SSE41-NEXT:    pmullw %xmm0, %xmm1
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,32768,512,256>
; SSE41-NEXT:    pmulhw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psraw $1, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; SSE41-NEXT:    psrlw $15, %xmm0
; SSE41-NEXT:    paddw %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT:    vpsraw $1, %xmm0, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform5:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511>
  ret <8 x i16> %1
}
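; divisors include INT16_MIN, -1 and +1; the +/-1 lanes take no sign-bit correction (blended with zero)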
define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE2-NEXT:    pmullw %xmm0, %xmm1
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $6, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psraw $12, %xmm5
; SSE2-NEXT:    pandn %xmm5, %xmm4
; SSE2-NEXT:    por %xmm1, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pandn %xmm2, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    psraw $1, %xmm3
; SSE2-NEXT:    pandn %xmm3, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0]
; SSE41-NEXT:    pmullw %xmm0, %xmm1
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    paddw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = <4,256,256,u,u,512,256,8>
; SSE41-NEXT:    pmulhw %xmm0, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
; SSE41-NEXT:    psrlw $15, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
; SSE41-NEXT:    paddw %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX1-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX1-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_vec_sdiv_nonuniform6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7]
; AVX2-NEXT:    vpaddw %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512F-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512BW-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform6:
; XOP:       # %bb.0:
; XOP-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpsrlw $15, %xmm0, %xmm1
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767>
  ret <8 x i16> %1
}
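; fold lanes dividing by -1 into (0 - x) and blend them with the unchanged +1 lanes; no multiply is needed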
define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_nonuniform7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    psubw %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_sdiv_nonuniform7:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    psubw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_vec_sdiv_nonuniform7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
; AVX2ORLATER:       # %bb.0:
; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2ORLATER-NEXT:    retq
;
; XOP-LABEL: combine_vec_sdiv_nonuniform7:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; XOP-NEXT:    retq
  %1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1>
  ret <8 x i16> %1
}
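; PR38658: <16 x i8> sdiv where only the last lane has a non-unit divisor (7)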
define <16 x i8> @pr38658(<16 x i8> %x) {
; SSE2-LABEL: pr38658:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    packuswb %xmm3, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: pr38658:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm1
; SSE41-NEXT:    paddb %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT:    psraw $8, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllw $6, %xmm2
; SSE41-NEXT:    psllw $8, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    packuswb %xmm0, %xmm2
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: pr38658:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $6, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: pr38658:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: pr38658:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
; AVX512F-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    vpsrlw $7, %xmm0, %xmm1
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: pr38658:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; XOP-LABEL: pr38658:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; XOP-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15]
; XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
  %1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7>
  ret <16 x i8> %1
}
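; fold (sdiv x, y) -> x for i1 and <4 x i1>: the only well-defined divisor is true (-1), and negating an i1 is a no-op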
define i1 @bool_sdiv(i1 %x, i1 %y) {
; CHECK-LABEL: bool_sdiv:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %r = sdiv i1 %x, %y
  ret i1 %r
}

define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) {
; CHECK-LABEL: boolvec_sdiv:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %r = sdiv <4 x i1> %x, %y
  ret <4 x i1> %r
}
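; fold (sdiv x, 2) -> (x + (x >>u 31)) >>s 1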
define i32 @combine_sdiv_two(i32 %x) {
; CHECK-LABEL: combine_sdiv_two:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    shrl $31, %eax
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    sarl %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, 2
  ret i32 %1
}
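; fold (sdiv x, -2) -> 0 - ((x + (x >>u 31)) >>s 1)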
define i32 @combine_sdiv_negtwo(i32 %x) {
; CHECK-LABEL: combine_sdiv_negtwo:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    shrl $31, %eax
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    sarl %eax
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -2
  ret i32 %1
}
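; fold (sdiv x, +/-2^k) -> add a rounding bias of 2^k-1 when x is negative, shift right arithmetically by k, and negate for negative divisors; checked for i8/i16/i32/i64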
define i8 @combine_i8_sdiv_pow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_pow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    shrb $4, %al
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    sarb $4, %al
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %1 = sdiv i8 %x, 16
  ret i8 %1
}

define i8 @combine_i8_sdiv_negpow2(i8 %x) {
; CHECK-LABEL: combine_i8_sdiv_negpow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    sarb $7, %al
; CHECK-NEXT:    shrb $2, %al
; CHECK-NEXT:    addl %edi, %eax
; CHECK-NEXT:    sarb $6, %al
; CHECK-NEXT:    negb %al
; CHECK-NEXT:    # kill: def $al killed $al killed $eax
; CHECK-NEXT:    retq
  %1 = sdiv i8 %x, -64
  ret i8 %1
}

define i16 @combine_i16_sdiv_pow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_pow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    leal 15(%rdi), %eax
; CHECK-NEXT:    testw %di, %di
; CHECK-NEXT:    cmovnsl %edi, %eax
; CHECK-NEXT:    cwtl
; CHECK-NEXT:    shrl $4, %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %1 = sdiv i16 %x, 16
  ret i16 %1
}

define i16 @combine_i16_sdiv_negpow2(i16 %x) {
; CHECK-LABEL: combine_i16_sdiv_negpow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    leal 255(%rdi), %eax
; CHECK-NEXT:    testw %di, %di
; CHECK-NEXT:    cmovnsl %edi, %eax
; CHECK-NEXT:    cwtl
; CHECK-NEXT:    sarl $8, %eax
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
; CHECK-NEXT:    retq
  %1 = sdiv i16 %x, -256
  ret i16 %1
}

define i32 @combine_i32_sdiv_pow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_pow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    leal 15(%rdi), %eax
; CHECK-NEXT:    testl %edi, %edi
; CHECK-NEXT:    cmovnsl %edi, %eax
; CHECK-NEXT:    sarl $4, %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, 16
  ret i32 %1
}

define i32 @combine_i32_sdiv_negpow2(i32 %x) {
; CHECK-LABEL: combine_i32_sdiv_negpow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    leal 255(%rdi), %eax
; CHECK-NEXT:    testl %edi, %edi
; CHECK-NEXT:    cmovnsl %edi, %eax
; CHECK-NEXT:    sarl $8, %eax
; CHECK-NEXT:    negl %eax
; CHECK-NEXT:    retq
  %1 = sdiv i32 %x, -256
  ret i32 %1
}

define i64 @combine_i64_sdiv_pow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_pow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    leaq 15(%rdi), %rax
; CHECK-NEXT:    testq %rdi, %rdi
; CHECK-NEXT:    cmovnsq %rdi, %rax
; CHECK-NEXT:    sarq $4, %rax
; CHECK-NEXT:    retq
  %1 = sdiv i64 %x, 16
  ret i64 %1
}

define i64 @combine_i64_sdiv_negpow2(i64 %x) {
; CHECK-LABEL: combine_i64_sdiv_negpow2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    leaq 255(%rdi), %rax
; CHECK-NEXT:    testq %rdi, %rdi
; CHECK-NEXT:    cmovnsq %rdi, %rax
; CHECK-NEXT:    sarq $8, %rax
; CHECK-NEXT:    negq %rax
; CHECK-NEXT:    retq
  %1 = sdiv i64 %x, -256
  ret i64 %1
}