; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2

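; The checks below exercise the classic multiply-by-magic-constant expansion
; (Granlund & Montgomery) of urem by a constant: x % d == x - (x / d) * d,
; where x / d is a widening multiply plus shifts. Worked example for d = 95,
; using the constants visible below: (x * 44151) >> 22 == x / 95 for any
; i16 x (44151 * 95 = 4194345 >= 2^22); e.g. x = 190 gives
; (190 * 44151) >> 22 = 8388690 >> 22 = 2. Even divisors such as 98 and 124
; are first shifted right (the shrl before the multiply) and divided by
; their odd part (49 and 31).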
define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrw $1, %xmm0, %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    shrl $2, %ecx
; SSE-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
; SSE-NEXT:    shrl $19, %ecx
; SSE-NEXT:    imull $124, %ecx, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    movd %xmm0, %ecx
; SSE-NEXT:    movzwl %cx, %edx
; SSE-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
; SSE-NEXT:    shrl $22, %edx
; SSE-NEXT:    imull $95, %edx, %edx
; SSE-NEXT:    subl %edx, %ecx
; SSE-NEXT:    movd %ecx, %xmm1
; SSE-NEXT:    pinsrw $1, %eax, %xmm1
; SSE-NEXT:    pextrw $2, %xmm0, %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    shrl %ecx
; SSE-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
; SSE-NEXT:    shrl $17, %ecx
; SSE-NEXT:    imull $98, %ecx, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $2, %eax, %xmm1
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    imull $1373, %eax, %ecx # imm = 0x55D
; SSE-NEXT:    shrl $16, %ecx
; SSE-NEXT:    movl %eax, %edx
; SSE-NEXT:    subl %ecx, %edx
; SSE-NEXT:    movzwl %dx, %edx
; SSE-NEXT:    shrl %edx
; SSE-NEXT:    addl %ecx, %edx
; SSE-NEXT:    shrl $9, %edx
; SSE-NEXT:    imull $1003, %edx, %ecx # imm = 0x3EB
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_urem_vec_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $1, %xmm0, %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    shrl $2, %ecx
; AVX-NEXT:    imull $16913, %ecx, %ecx # imm = 0x4211
; AVX-NEXT:    shrl $19, %ecx
; AVX-NEXT:    imull $124, %ecx, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vmovd %xmm0, %ecx
; AVX-NEXT:    movzwl %cx, %edx
; AVX-NEXT:    imull $44151, %edx, %edx # imm = 0xAC77
; AVX-NEXT:    shrl $22, %edx
; AVX-NEXT:    imull $95, %edx, %edx
; AVX-NEXT:    subl %edx, %ecx
; AVX-NEXT:    vmovd %ecx, %xmm1
; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $2, %xmm0, %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    shrl %ecx
; AVX-NEXT:    imull $2675, %ecx, %ecx # imm = 0xA73
; AVX-NEXT:    shrl $17, %ecx
; AVX-NEXT:    imull $98, %ecx, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    imull $1373, %eax, %ecx # imm = 0x55D
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    movl %eax, %edx
; AVX-NEXT:    subl %ecx, %edx
; AVX-NEXT:    movzwl %dx, %edx
; AVX-NEXT:    shrl %edx
; AVX-NEXT:    addl %ecx, %edx
; AVX-NEXT:    shrl $9, %edx
; AVX-NEXT:    imull $1003, %edx, %ecx # imm = 0x3EB
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
  ret <4 x i16> %1
}
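; The divisor-1003 lane above needs the round-up fixup form of the expansion
; (the case where a plain 16-bit magic constant does not suffice): with
; t = (x * 1373) >> 16, the quotient is q = (((x - t) >> 1) + t) >> 9.
; E.g. x = 65535: t = 1372, q = ((64163 >> 1) + 1372) >> 9 = 65, and
; 65535 - 65 * 1003 = 340 == 65535 % 1003.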

define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; SSE-LABEL: fold_urem_vec_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psrlw $6, %xmm1
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_urem_vec_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $6, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  ret <4 x i16> %1
}
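; With a splat divisor the expansion stays in vector form: pmulhuw yields the
; high 16 bits of x * 44151 (i.e. (x * 44151) >> 16), psrlw $6 completes the
; total shift of 22, and pmullw/psubw form x - (x / 95) * 95 per lane.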

; Don't fold if we can combine urem with udiv.
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; SSE-LABEL: combine_urem_udiv:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
; SSE-NEXT:    pmulhuw %xmm0, %xmm1
; SSE-NEXT:    psrlw $6, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
; SSE-NEXT:    pmullw %xmm1, %xmm2
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_urem_udiv:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $6, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX-NEXT:    vpsubw %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %2 = udiv <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
  %3 = add <4 x i16> %1, %2
  ret <4 x i16> %3
}
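; The quotient in %xmm1 is computed once and reused: pmullw/psubw derive the
; remainder from it, and the final paddw adds the udiv result back in, so the
; urem and udiv share a single multiply chain.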

; Don't fold for divisors that are a power of two.
define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_power_of_two:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    imull $44151, %eax, %ecx # imm = 0xAC77
; SSE-NEXT:    shrl $22, %ecx
; SSE-NEXT:    imull $95, %ecx, %ecx
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pextrw $1, %xmm0, %ecx
; SSE-NEXT:    andl $31, %ecx
; SSE-NEXT:    movd %xmm0, %edx
; SSE-NEXT:    andl $63, %edx
; SSE-NEXT:    movd %edx, %xmm1
; SSE-NEXT:    pinsrw $1, %ecx, %xmm1
; SSE-NEXT:    pextrw $2, %xmm0, %ecx
; SSE-NEXT:    andl $7, %ecx
; SSE-NEXT:    pinsrw $2, %ecx, %xmm1
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: dont_fold_urem_power_of_two:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    imull $44151, %eax, %ecx # imm = 0xAC77
; AVX-NEXT:    shrl $22, %ecx
; AVX-NEXT:    imull $95, %ecx, %ecx
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpextrw $1, %xmm0, %ecx
; AVX-NEXT:    andl $31, %ecx
; AVX-NEXT:    vmovd %xmm0, %edx
; AVX-NEXT:    andl $63, %edx
; AVX-NEXT:    vmovd %edx, %xmm1
; AVX-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $2, %xmm0, %ecx
; AVX-NEXT:    andl $7, %ecx
; AVX-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm0
; AVX-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
  ret <4 x i16> %1
}
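; The power-of-two lanes above need no multiply: x % 2^k == x & (2^k - 1),
; e.g. andl $63 computes x % 64. Only the divisor-95 lane keeps the
; magic-number sequence.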

; Don't fold if the divisor is one.
define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; SSE-LABEL: dont_fold_urem_one:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrw $2, %xmm0, %eax
; SSE-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
; SSE-NEXT:    shrl $16, %ecx
; SSE-NEXT:    movl %eax, %edx
; SSE-NEXT:    subl %ecx, %edx
; SSE-NEXT:    movzwl %dx, %edx
; SSE-NEXT:    shrl %edx
; SSE-NEXT:    addl %ecx, %edx
; SSE-NEXT:    shrl $4, %edx
; SSE-NEXT:    leal (%rdx,%rdx,2), %ecx
; SSE-NEXT:    shll $3, %ecx
; SSE-NEXT:    subl %ecx, %edx
; SSE-NEXT:    addl %eax, %edx
; SSE-NEXT:    pextrw $1, %xmm0, %eax
; SSE-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
; SSE-NEXT:    shrl $25, %ecx
; SSE-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pinsrw $1, %eax, %xmm1
; SSE-NEXT:    pinsrw $2, %edx, %xmm1
; SSE-NEXT:    pextrw $3, %xmm0, %eax
; SSE-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
; SSE-NEXT:    shrl $26, %ecx
; SSE-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
; SSE-NEXT:    subl %ecx, %eax
; SSE-NEXT:    pinsrw $3, %eax, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: dont_fold_urem_one:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrw $2, %xmm0, %eax
; AVX-NEXT:    imull $25645, %eax, %ecx # imm = 0x642D
; AVX-NEXT:    shrl $16, %ecx
; AVX-NEXT:    movl %eax, %edx
; AVX-NEXT:    subl %ecx, %edx
; AVX-NEXT:    movzwl %dx, %edx
; AVX-NEXT:    shrl %edx
; AVX-NEXT:    addl %ecx, %edx
; AVX-NEXT:    shrl $4, %edx
; AVX-NEXT:    leal (%rdx,%rdx,2), %ecx
; AVX-NEXT:    shll $3, %ecx
; AVX-NEXT:    subl %ecx, %edx
; AVX-NEXT:    addl %eax, %edx
; AVX-NEXT:    vpextrw $1, %xmm0, %eax
; AVX-NEXT:    imull $51307, %eax, %ecx # imm = 0xC86B
; AVX-NEXT:    shrl $25, %ecx
; AVX-NEXT:    imull $654, %ecx, %ecx # imm = 0x28E
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; AVX-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
; AVX-NEXT:    vpextrw $3, %xmm0, %eax
; AVX-NEXT:    imull $12375, %eax, %ecx # imm = 0x3057
; AVX-NEXT:    shrl $26, %ecx
; AVX-NEXT:    imull $5423, %ecx, %ecx # imm = 0x152F
; AVX-NEXT:    subl %ecx, %eax
; AVX-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
  ret <4 x i16> %1
}
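; Lane 0 above (divisor 1) always has remainder 0, so it is simply left as
; the zero produced by pxor/vpxor. The divisor-23 lane also avoids a second
; multiply: leal forms 3 * q, shll $3 scales that to 24 * q, and the
; subtract/add pair yields q - 24 * q + x = x - 23 * q.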

; Don't fold if the divisor is 2^16.
define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; CHECK-LABEL: dont_fold_urem_i16_smax:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = urem <4 x i16> %x, <i16 1, i16 65536, i16 23, i16 5423>
  ret <4 x i16> %1
}
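; Presumably the out-of-range i16 65536 constant is what lets the whole
; expansion fold away here; only the plain return of %x survives.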

; Don't fold i64 urem.
define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; SSE-LABEL: dont_fold_urem_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %xmm1, %rcx
; SSE-NEXT:    movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    mulq %rdx
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    subq %rdx, %rax
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    addq %rdx, %rax
; SSE-NEXT:    shrq $4, %rax
; SSE-NEXT:    leaq (%rax,%rax,2), %rdx
; SSE-NEXT:    shlq $3, %rdx
; SSE-NEXT:    subq %rdx, %rax
; SSE-NEXT:    addq %rcx, %rax
; SSE-NEXT:    movq %rax, %xmm2
; SSE-NEXT:    pextrq $1, %xmm1, %rcx
; SSE-NEXT:    movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    mulq %rdx
; SSE-NEXT:    shrq $12, %rdx
; SSE-NEXT:    imulq $5423, %rdx, %rax # imm = 0x152F
; SSE-NEXT:    subq %rax, %rcx
; SSE-NEXT:    movq %rcx, %xmm1
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    pextrq $1, %xmm0, %rcx
; SSE-NEXT:    movq %rcx, %rax
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; SSE-NEXT:    mulq %rdx
; SSE-NEXT:    shrq $7, %rdx
; SSE-NEXT:    imulq $654, %rdx, %rax # imm = 0x28E
; SSE-NEXT:    subq %rax, %rcx
; SSE-NEXT:    movq %rcx, %xmm0
; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; SSE-NEXT:    movdqa %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: dont_fold_urem_i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovq %xmm1, %rcx
; AVX1-NEXT:    movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rdx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    addq %rdx, %rax
; AVX1-NEXT:    shrq $4, %rax
; AVX1-NEXT:    leaq (%rax,%rax,2), %rdx
; AVX1-NEXT:    shlq $3, %rdx
; AVX1-NEXT:    subq %rdx, %rax
; AVX1-NEXT:    addq %rcx, %rax
; AVX1-NEXT:    vmovq %rax, %xmm2
; AVX1-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX1-NEXT:    movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    mulq %rdx
; AVX1-NEXT:    shrq $12, %rdx
; AVX1-NEXT:    imulq $5423, %rdx, %rax # imm = 0x152F
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX1-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX1-NEXT:    movq %rcx, %rax
; AVX1-NEXT:    shrq %rax
; AVX1-NEXT:    movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX1-NEXT:    mulq %rdx
; AVX1-NEXT:    shrq $7, %rdx
; AVX1-NEXT:    imulq $654, %rdx, %rax # imm = 0x28E
; AVX1-NEXT:    subq %rax, %rcx
; AVX1-NEXT:    vmovq %rcx, %xmm0
; AVX1-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: dont_fold_urem_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovq %xmm1, %rcx
; AVX2-NEXT:    movabsq $7218291159277650633, %rdx # imm = 0x642C8590B21642C9
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rdx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    addq %rdx, %rax
; AVX2-NEXT:    shrq $4, %rax
; AVX2-NEXT:    leaq (%rax,%rax,2), %rdx
; AVX2-NEXT:    shlq $3, %rdx
; AVX2-NEXT:    subq %rdx, %rax
; AVX2-NEXT:    addq %rcx, %rax
; AVX2-NEXT:    vmovq %rax, %xmm2
; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
; AVX2-NEXT:    movabsq $-4513890722074972339, %rdx # imm = 0xC15B704DCBCA2F4D
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    mulq %rdx
; AVX2-NEXT:    shrq $12, %rdx
; AVX2-NEXT:    imulq $5423, %rdx, %rax # imm = 0x152F
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm1
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX2-NEXT:    movq %rcx, %rax
; AVX2-NEXT:    shrq %rax
; AVX2-NEXT:    movabsq $7220743857598845893, %rdx # imm = 0x64353C48064353C5
; AVX2-NEXT:    mulq %rdx
; AVX2-NEXT:    shrq $7, %rdx
; AVX2-NEXT:    imulq $654, %rdx, %rax # imm = 0x28E
; AVX2-NEXT:    subq %rax, %rcx
; AVX2-NEXT:    vmovq %rcx, %xmm0
; AVX2-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
  %1 = urem <4 x i64> %x, <i64 1, i64 654, i64 23, i64 5423>
  ret <4 x i64> %1
}
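; Notes on the i64 path above, assuming the same magic-number scheme as the
; i16 cases: the high 64 bits of the 128-bit product come from scalar mulq
; (in %rdx); divisor 654 is even, so x is shifted right once before the
; multiply; divisor 23 forms 23 * q without an imul via leaq/shlq
; (3 * q -> 24 * q -> x - 23 * q); and lane 0 (divisor 1) is left as the
; zero that pslldq shifts in below the divisor-654 result.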