; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;
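; Rough sketch of the lowering (hand-written note, not part of the
; autogenerated checks): sdiv by a constant is expanded into a signed
; magic-number multiply (Hacker's Delight style) instead of a hardware
; divide. For i64 by 7 the magic M = 0x4924924924924925 satisfies
; 7*M = 2^65 + 3, so the signed high half of x*M, shifted right by one,
; approximates x/7; adding the sign bit (logical shift by 63) rounds the
; quotient toward zero. There is no 64-bit vector multiply, so the
; <2 x i64> case goes through scalar imulq one lane at a time.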

define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_div7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = sdiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

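; For <4 x i32> by 7 the magic is 2454267027 (0x92492493); 7 * 2454267027
; = 2^34 + 5. pmuludq/pmuldq only multiply the even lanes, so the odd
; lanes go through a second multiply and are re-interleaved. SSE2 has no
; signed high multiply, so the unsigned result is fixed up with the
; pcmpgtd/pand/psubd sequence; because the magic is negative as an i32,
; the dividend is also added back (paddd) before the final arithmetic
; shift by 2 plus sign bit (psrld $31). (Explanatory note, not
; autogenerated.)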
define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm1, %xmm2
; SSE41-NEXT:    pmuldq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    psrad $2, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

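; For <8 x i16> pmulhw yields the signed high half directly: 18725 * 7 =
; 2^17 + 3, so q = (mulhi(x, 18725) >> 1) plus the sign bit (psrlw $15).
; (Explanatory note, not autogenerated.)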
define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

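; There is no byte multiply, so <16 x i8> is widened to i16: each byte is
; placed in the high byte of a word (interleave with zero, or pmovsxbw on
; AVX2+), multiplied by the magic -109 (37632 = 0x9300 = -109 << 8), and
; narrowed back with psrlw $8 + packuswb. Bytes also lack an arithmetic
; shift, so sra-by-2 is emulated as ((x >>u 2) ^ 32) - 32 after masking,
; and the sign bit comes from psrlw $7 + pand. (Explanatory note, not
; autogenerated.)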
define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_div7_16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT:    pmulhw %xmm3, %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    pmulhw %xmm3, %xmm0
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $2, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE-NEXT:    pxor %xmm2, %xmm1
; SSE-NEXT:    psrlw $7, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; sdiv by non-splat constant
;

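; With distinct divisors every lane gets its own magic and shift amount.
; The per-lane multiplier constants and the pre-add mask (255 in lanes
; whose magic requires adding the dividend back) come from constant-pool
; loads; the variable arithmetic shift is emulated with per-lane
; multiplies, except on AVX-512BW where vpsravw shifts each word by its
; own amount. (Explanatory note, not autogenerated.)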
define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_divconstant_16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    packuswb %xmm2, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT:    psraw $8, %xmm2
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    packuswb %xmm1, %xmm2
; SSE-NEXT:    psrlw $7, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_divconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_divconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 9, i8 7>
  ret <16 x i8> %res
}

;
; srem by 7
;

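; The remainder is recovered as x - 7*q from the quotient computed as in
; the sdiv tests above; the scalar i64 path forms 8*q without a multiply
; via leaq (,%rdx,8) and subtracts to get 7*q. (Explanatory note, not
; autogenerated.)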
define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = srem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}

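; Same quotient as test_div7_4i32; 7*q is then formed as (q << 3) - q on
; SSE2 (pslld/psubd) or with pmulld on SSE4.1+, and subtracted from x.
; (Explanatory note, not autogenerated.)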
define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    psrad $2, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    psubd %xmm1, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm2, %xmm1
; SSE41-NEXT:    pmuldq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrld $31, %xmm1
; SSE41-NEXT:    psrad $2, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psubd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

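; For words the multiply is cheap: pmullw by 7 and psubw recover the
; remainder from the test_div7_8i16 quotient. (Explanatory note, not
; autogenerated.)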
define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT:    pmulhw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $15, %xmm2
; SSE-NEXT:    psraw $1, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT:    vpsraw $1, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}

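; For bytes 7*q is built from the shift (q << 3), done word-wise with
; psllw $3 and masked per byte to drop the shifted-in bits, followed by a
; byte subtract. (Explanatory note, not autogenerated.)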
define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_rem7_16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT:    pmulhw %xmm3, %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    packuswb %xmm2, %xmm1
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $2, %xmm2
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE-NEXT:    pxor %xmm3, %xmm2
; SSE-NEXT:    psrlw $7, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    paddb %xmm2, %xmm1
; SSE-NEXT:    psubb %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllw $3, %xmm2
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    psubb %xmm2, %xmm1
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_rem7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}

;
; srem by non-splat constant
;

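; Per-lane quotient as in test_divconstant_16i8, followed by a widening
; pmullw of the quotient by the per-lane divisors, narrowed with a 255
; mask plus packuswb, and a final psubb. (Explanatory note, not
; autogenerated.)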
define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_remconstant_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm3, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_remconstant_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT:    psraw $8, %xmm2
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE41-NEXT:    psraw $8, %xmm3
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm2, %xmm3
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm3, %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm1
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    psubb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_remconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_remconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2NOBW-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm2
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2NOBW-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_remconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmovsxbw %xmm2, %ymm3
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm2, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 9, i8 7>
  ret <16 x i8> %res
}

; This test is just to show what a scalarized v16i8 division looks like.
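; (Explanatory note, not autogenerated.) With a variable divisor there is
; no magic-number strategy, so each byte pair is extracted, sign-extended
; with cbtw, divided with idivb, and the remainder read out of %ah before
; the results are reassembled into a vector.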
define <16 x i8> @test_rem_variable_16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: test_rem_variable_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm4
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem_variable_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrb $1, %xmm1, %ecx
; SSE41-NEXT:    pextrb $1, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %ecx
; SSE41-NEXT:    movd %xmm1, %edx
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %dl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    movd %eax, %xmm2
; SSE41-NEXT:    pinsrb $1, %ecx, %xmm2
; SSE41-NEXT:    pextrb $2, %xmm1, %ecx
; SSE41-NEXT:    pextrb $2, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $2, %eax, %xmm2
; SSE41-NEXT:    pextrb $3, %xmm1, %ecx
; SSE41-NEXT:    pextrb $3, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $3, %eax, %xmm2
; SSE41-NEXT:    pextrb $4, %xmm1, %ecx
; SSE41-NEXT:    pextrb $4, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $4, %eax, %xmm2
; SSE41-NEXT:    pextrb $5, %xmm1, %ecx
; SSE41-NEXT:    pextrb $5, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $5, %eax, %xmm2
; SSE41-NEXT:    pextrb $6, %xmm1, %ecx
; SSE41-NEXT:    pextrb $6, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $6, %eax, %xmm2
; SSE41-NEXT:    pextrb $7, %xmm1, %ecx
; SSE41-NEXT:    pextrb $7, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $7, %eax, %xmm2
; SSE41-NEXT:    pextrb $8, %xmm1, %ecx
; SSE41-NEXT:    pextrb $8, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $8, %eax, %xmm2
; SSE41-NEXT:    pextrb $9, %xmm1, %ecx
; SSE41-NEXT:    pextrb $9, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $9, %eax, %xmm2
; SSE41-NEXT:    pextrb $10, %xmm1, %ecx
; SSE41-NEXT:    pextrb $10, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $10, %eax, %xmm2
; SSE41-NEXT:    pextrb $11, %xmm1, %ecx
; SSE41-NEXT:    pextrb $11, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $11, %eax, %xmm2
; SSE41-NEXT:    pextrb $12, %xmm1, %ecx
; SSE41-NEXT:    pextrb $12, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $12, %eax, %xmm2
; SSE41-NEXT:    pextrb $13, %xmm1, %ecx
; SSE41-NEXT:    pextrb $13, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $13, %eax, %xmm2
; SSE41-NEXT:    pextrb $14, %xmm1, %ecx
; SSE41-NEXT:    pextrb $14, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $14, %eax, %xmm2
; SSE41-NEXT:    pextrb $15, %xmm1, %ecx
; SSE41-NEXT:    pextrb $15, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $15, %eax, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem_variable_16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrb $1, %xmm1, %ecx
; AVX-NEXT:    vpextrb $1, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %ecx
; AVX-NEXT:    vmovd %xmm1, %edx
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %dl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vmovd %eax, %xmm2
; AVX-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $2, %xmm1, %ecx
; AVX-NEXT:    vpextrb $2, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $3, %xmm1, %ecx
; AVX-NEXT:    vpextrb $3, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $4, %xmm1, %ecx
; AVX-NEXT:    vpextrb $4, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $5, %xmm1, %ecx
; AVX-NEXT:    vpextrb $5, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $6, %xmm1, %ecx
; AVX-NEXT:    vpextrb $6, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $7, %xmm1, %ecx
; AVX-NEXT:    vpextrb $7, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $8, %xmm1, %ecx
; AVX-NEXT:    vpextrb $8, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $9, %xmm1, %ecx
; AVX-NEXT:    vpextrb $9, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $10, %xmm1, %ecx
; AVX-NEXT:    vpextrb $10, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $11, %xmm1, %ecx
; AVX-NEXT:    vpextrb $11, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $12, %xmm1, %ecx
; AVX-NEXT:    vpextrb $12, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $13, %xmm1, %ecx
; AVX-NEXT:    vpextrb $13, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $14, %xmm1, %ecx
; AVX-NEXT:    vpextrb $14, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $15, %xmm1, %ecx
; AVX-NEXT:    vpextrb $15, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm0
; AVX-NEXT:    retq
  %res = srem <16 x i8> %a, %b
  ret <16 x i8> %res
}
