; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown   -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64

; If the target does not have a single div/rem operation,
; the -div-rem-pairs pass will decompose the remainder calculation as:
;   X % Y --> X - ((X / Y) * Y)
; But if the target does have a single div/rem operation,
; the opposite transform is likely beneficial.
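;
; As an illustrative sketch (not checked by this test): the functions below use
; the decomposed form, e.g.
;   %div = udiv i32 %x, %y
;   %t1  = mul i32 %div, %y
;   %t2  = sub i32 %x, %t1
; which, on a target with a combined div/rem instruction, would ideally be
; re-formed back into the pair
;   %div = udiv i32 %x, %y
;   %rem = urem i32 %x, %y
; so that both results come from a single hardware division.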

define i8 @scalar_i8(i8 %x, i8 %y, i8* %divdst) nounwind {
; X86-LABEL: scalar_i8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    movzbl %cl, %eax
; X86-NEXT:    divb %ch
; X86-NEXT:    movb %al, (%edx)
; X86-NEXT:    mulb %ch
; X86-NEXT:    subb %al, %cl
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl %dil, %ecx
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    divb %sil
; X64-NEXT:    movb %al, (%rdx)
; X64-NEXT:    mulb %sil
; X64-NEXT:    subb %al, %cl
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
  %div = udiv i8 %x, %y
  store i8 %div, i8* %divdst, align 4
  %t1 = mul i8 %div, %y
  %t2 = sub i8 %x, %t1
  ret i8 %t2
}

define i16 @scalar_i16(i16 %x, i16 %y, i16* %divdst) nounwind {
; X86-LABEL: scalar_i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movw %ax, (%edi)
; X86-NEXT:    imull %eax, %esi
; X86-NEXT:    subl %esi, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i16:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %si
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movw %ax, (%rcx)
; X64-NEXT:    imull %eax, %esi
; X64-NEXT:    subl %esi, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = udiv i16 %x, %y
  store i16 %div, i16* %divdst, align 4
  %t1 = mul i16 %div, %y
  %t2 = sub i16 %x, %t1
  ret i16 %t2
}

define i32 @scalar_i32(i32 %x, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: scalar_i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %edi, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i32:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    subl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

define i64 @scalar_i64(i64 %x, i64 %y, i64* %divdst) nounwind {
; X86-LABEL: scalar_i64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ecx, 4(%edx)
; X86-NEXT:    movl %eax, (%edx)
; X86-NEXT:    imull %eax, %ebx
; X86-NEXT:    mull %ebp
; X86-NEXT:    addl %ebx, %edx
; X86-NEXT:    imull %ebp, %ecx
; X86-NEXT:    addl %edx, %ecx
; X86-NEXT:    subl %eax, %esi
; X86-NEXT:    sbbl %ecx, %edi
; X86-NEXT:    movl %esi, %eax
; X86-NEXT:    movl %edi, %edx
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i64:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divq %rsi
; X64-NEXT:    movq %rax, (%rcx)
; X64-NEXT:    imulq %rsi, %rax
; X64-NEXT:    subq %rax, %rdi
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %div = udiv i64 %x, %y
  store i64 %div, i64* %divdst, align 4
  %t1 = mul i64 %div, %y
  %t2 = sub i64 %x, %t1
  ret i64 %t2
}

define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
; X86-LABEL: scalar_i128:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 44(%ebp), %edi
; X86-NEXT:    movl 28(%ebp), %ecx
; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
; X86-NEXT:    pushl 40(%ebp)
; X86-NEXT:    pushl 36(%ebp)
; X86-NEXT:    pushl 32(%ebp)
; X86-NEXT:    pushl %ecx
; X86-NEXT:    movl %ecx, %ebx
; X86-NEXT:    pushl 24(%ebp)
; X86-NEXT:    pushl 20(%ebp)
; X86-NEXT:    pushl 16(%ebp)
; X86-NEXT:    pushl 12(%ebp)
; X86-NEXT:    pushl %eax
; X86-NEXT:    calll __udivti3
; X86-NEXT:    addl $32, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %ecx, 12(%edi)
; X86-NEXT:    movl %esi, 8(%edi)
; X86-NEXT:    movl %eax, 4(%edi)
; X86-NEXT:    movl %edx, (%edi)
; X86-NEXT:    movl %edx, %edi
; X86-NEXT:    movl %ebx, %eax
; X86-NEXT:    imull %ebx, %ecx
; X86-NEXT:    mull %esi
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    addl %ecx, %edx
; X86-NEXT:    imull 32(%ebp), %esi
; X86-NEXT:    addl %edx, %esi
; X86-NEXT:    movl 36(%ebp), %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT:    imull %ebx, %ecx
; X86-NEXT:    mull %edi
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    addl %ecx, %edx
; X86-NEXT:    movl %edi, %eax
; X86-NEXT:    movl 40(%ebp), %edi
; X86-NEXT:    imull %eax, %edi
; X86-NEXT:    addl %edx, %edi
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT:    adcl %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %eax, %edi
; X86-NEXT:    movl 28(%ebp), %ecx
; X86-NEXT:    mull %ecx
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %ebx, %eax
; X86-NEXT:    mull %ecx
; X86-NEXT:    movl %edx, %esi
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    adcl $0, %esi
; X86-NEXT:    movl %edi, %eax
; X86-NEXT:    mull 32(%ebp)
; X86-NEXT:    movl %edx, %edi
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    movl %eax, %ebx
; X86-NEXT:    adcl %esi, %edi
; X86-NEXT:    setb %cl
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    mull 32(%ebp)
; X86-NEXT:    addl %edi, %eax
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    adcl %ecx, %edx
; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT:    movl 12(%ebp), %ecx
; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    movl 16(%ebp), %esi
; X86-NEXT:    sbbl %ebx, %esi
; X86-NEXT:    movl 20(%ebp), %edi
; X86-NEXT:    sbbl %eax, %edi
; X86-NEXT:    movl 24(%ebp), %ebx
; X86-NEXT:    sbbl %edx, %ebx
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %esi, 4(%eax)
; X86-NEXT:    movl %edi, 8(%eax)
; X86-NEXT:    movl %ebx, 12(%eax)
; X86-NEXT:    leal -12(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl $4
;
; X64-LABEL: scalar_i128:
; X64:       # %bb.0:
; X64-NEXT:    pushq %r15
; X64-NEXT:    pushq %r14
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq %r8, %r14
; X64-NEXT:    movq %rcx, %rbx
; X64-NEXT:    movq %rdx, %r15
; X64-NEXT:    movq %rsi, %r12
; X64-NEXT:    movq %rdi, %r13
; X64-NEXT:    callq __udivti3@PLT
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movq %rdx, 8(%r14)
; X64-NEXT:    movq %rax, (%r14)
; X64-NEXT:    imulq %rax, %rbx
; X64-NEXT:    mulq %r15
; X64-NEXT:    addq %rbx, %rdx
; X64-NEXT:    imulq %r15, %rcx
; X64-NEXT:    addq %rdx, %rcx
; X64-NEXT:    subq %rax, %r13
; X64-NEXT:    sbbq %rcx, %r12
; X64-NEXT:    movq %r13, %rax
; X64-NEXT:    movq %r12, %rdx
; X64-NEXT:    popq %rbx
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %r14
; X64-NEXT:    popq %r15
; X64-NEXT:    retq
  %div = udiv i128 %x, %y
  store i128 %div, i128* %divdst, align 4
  %t1 = mul i128 %div, %y
  %t2 = sub i128 %x, %t1
  ret i128 %t2
}

define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst) nounwind {
; X86-LABEL: vector_i128_i8:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movdqa %xmm0, (%esp)
; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm6
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %edx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %esi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %edi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %ebx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    movzbl (%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT:    movd %edx, %xmm4
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-NEXT:    movd %esi, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; X86-NEXT:    movd %edi, %xmm5
; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; X86-NEXT:    movd %ebx, %xmm4
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movd %ecx, %xmm6
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; X86-NEXT:    movdqa %xmm5, %xmm2
; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT:    movdqa %xmm2, (%ecx)
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    movdqa %xmm1, %xmm2
; X86-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-NEXT:    pmullw %xmm3, %xmm2
; X86-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-NEXT:    pand %xmm3, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    pmullw %xmm5, %xmm1
; X86-NEXT:    pand %xmm3, %xmm1
; X86-NEXT:    packuswb %xmm2, %xmm1
; X86-NEXT:    psubb %xmm1, %xmm0
; X86-NEXT:    leal -12(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i8:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    pushq %r15
; X64-NEXT:    pushq %r14
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r8d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r9d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r10d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r11d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r14d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r15d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r12d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r13d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %edi
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %esi
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %ebx
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %ebp
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %edx
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movd %r8d, %xmm3
; X64-NEXT:    movd %r9d, %xmm4
; X64-NEXT:    movd %r10d, %xmm5
; X64-NEXT:    movd %r11d, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X64-NEXT:    movd %r14d, %xmm2
; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; X64-NEXT:    movd %r15d, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT:    movd %r12d, %xmm3
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; X64-NEXT:    movd %r13d, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X64-NEXT:    movd %edi, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-NEXT:    movd %esi, %xmm2
; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; X64-NEXT:    movd %ebx, %xmm5
; X64-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
; X64-NEXT:    movd %ebp, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; X64-NEXT:    movd %edx, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X64-NEXT:    movzbl %cl, %ecx
; X64-NEXT:    movd %ecx, %xmm4
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    movd %eax, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; X64-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; X64-NEXT:    movdqa %xmm6, %xmm2
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT:    movdqa %xmm2, (%rax)
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    movdqa %xmm1, %xmm2
; X64-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-NEXT:    pmullw %xmm3, %xmm2
; X64-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X64-NEXT:    pand %xmm3, %xmm2
; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    pmullw %xmm6, %xmm1
; X64-NEXT:    pand %xmm3, %xmm1
; X64-NEXT:    packuswb %xmm2, %xmm1
; X64-NEXT:    psubb %xmm1, %xmm0
; X64-NEXT:    popq %rbx
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %r14
; X64-NEXT:    popq %r15
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %div = udiv <16 x i8> %x, %y
  store <16 x i8> %div, <16 x i8>* %divdst, align 16
  %t1 = mul <16 x i8> %div, %y
  %t2 = sub <16 x i8> %x, %t1
  ret <16 x i8> %t2
}

define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst) nounwind {
; X86-LABEL: vector_i128_i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pextrw $7, %xmm0, %eax
; X86-NEXT:    pextrw $7, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pextrw $6, %xmm0, %eax
; X86-NEXT:    pextrw $6, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-NEXT:    pextrw $5, %xmm0, %eax
; X86-NEXT:    pextrw $5, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    pextrw $4, %xmm0, %eax
; X86-NEXT:    pextrw $4, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT:    pextrw $3, %xmm0, %eax
; X86-NEXT:    pextrw $3, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    pextrw $2, %xmm0, %eax
; X86-NEXT:    pextrw $2, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X86-NEXT:    pextrw $1, %xmm0, %eax
; X86-NEXT:    pextrw $1, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    movd %xmm0, %eax
; X86-NEXT:    movd %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X86-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X86-NEXT:    movdqa %xmm5, (%ecx)
; X86-NEXT:    pmullw %xmm1, %xmm5
; X86-NEXT:    psubw %xmm5, %xmm0
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i16:
; X64:       # %bb.0:
; X64-NEXT:    pextrw $7, %xmm0, %eax
; X64-NEXT:    pextrw $7, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pextrw $6, %xmm0, %eax
; X64-NEXT:    pextrw $6, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-NEXT:    pextrw $5, %xmm0, %eax
; X64-NEXT:    pextrw $5, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    pextrw $4, %xmm0, %eax
; X64-NEXT:    pextrw $4, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT:    pextrw $3, %xmm0, %eax
; X64-NEXT:    pextrw $3, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    pextrw $2, %xmm0, %eax
; X64-NEXT:    pextrw $2, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NEXT:    pextrw $1, %xmm0, %eax
; X64-NEXT:    pextrw $1, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    movd %xmm0, %eax
; X64-NEXT:    movd %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm5
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X64-NEXT:    movdqa %xmm5, (%rdi)
; X64-NEXT:    pmullw %xmm1, %xmm5
; X64-NEXT:    psubw %xmm5, %xmm0
; X64-NEXT:    retq
  %div = udiv <8 x i16> %x, %y
  store <8 x i16> %div, <8 x i16>* %divdst, align 16
  %t1 = mul <8 x i16> %div, %y
  %t2 = sub <8 x i16> %x, %t1
  ret <8 x i16> %t2
}

define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst) nounwind {
; X86-LABEL: vector_i128_i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm2, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm2, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT:    movd %xmm3, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X86-NEXT:    movd %xmm3, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-NEXT:    movd %xmm0, %eax
; X86-NEXT:    movd %xmm1, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X86-NEXT:    movd %xmm4, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm4, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT:    movdqa %xmm2, (%ecx)
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X86-NEXT:    pmuludq %xmm1, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-NEXT:    pmuludq %xmm3, %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT:    psubd %xmm2, %xmm0
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i32:
; X64:       # %bb.0:
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT:    movd %xmm2, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X64-NEXT:    movd %xmm2, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT:    movd %xmm3, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movd %xmm3, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT:    movd %xmm0, %eax
; X64-NEXT:    movd %xmm1, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X64-NEXT:    movd %xmm4, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X64-NEXT:    movd %xmm4, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movdqa %xmm2, (%rdi)
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X64-NEXT:    pmuludq %xmm1, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT:    pmuludq %xmm3, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    psubd %xmm2, %xmm0
; X64-NEXT:    retq
  %div = udiv <4 x i32> %x, %y
  store <4 x i32> %div, <4 x i32>* %divdst, align 16
  %t1 = mul <4 x i32> %div, %y
  %t2 = sub <4 x i32> %x, %t1
  ret <4 x i32> %t2
}

define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst) nounwind {
; X86-LABEL: vector_i128_i64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    movd %xmm1, (%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, (%esp)
; X86-NEXT:    movd %edx, %xmm0
; X86-NEXT:    movd %eax, %xmm1
; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    movd %edx, %xmm0
; X86-NEXT:    movd %eax, %xmm1
; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X86-NEXT:    movdqa %xmm1, (%esi)
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
; X86-NEXT:    movdqa %xmm3, %xmm0
; X86-NEXT:    psrlq $32, %xmm0
; X86-NEXT:    pmuludq %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm1, %xmm2
; X86-NEXT:    psrlq $32, %xmm2
; X86-NEXT:    pmuludq %xmm3, %xmm2
; X86-NEXT:    paddq %xmm0, %xmm2
; X86-NEXT:    psllq $32, %xmm2
; X86-NEXT:    pmuludq %xmm3, %xmm1
; X86-NEXT:    paddq %xmm2, %xmm1
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    psubq %xmm1, %xmm0
; X86-NEXT:    addl $64, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i64:
; X64:       # %bb.0:
; X64-NEXT:    movq %xmm0, %rax
; X64-NEXT:    movq %xmm1, %rcx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divq %rcx
; X64-NEXT:    movq %rax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT:    movq %xmm3, %rax
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm3, %rcx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divq %rcx
; X64-NEXT:    movq %rax, %xmm3
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movdqa %xmm2, (%rdi)
; X64-NEXT:    movdqa %xmm1, %xmm3
; X64-NEXT:    psrlq $32, %xmm3
; X64-NEXT:    pmuludq %xmm2, %xmm3
; X64-NEXT:    movdqa %xmm2, %xmm4
; X64-NEXT:    psrlq $32, %xmm4
; X64-NEXT:    pmuludq %xmm1, %xmm4
; X64-NEXT:    paddq %xmm3, %xmm4
; X64-NEXT:    psllq $32, %xmm4
; X64-NEXT:    pmuludq %xmm1, %xmm2
; X64-NEXT:    paddq %xmm4, %xmm2
; X64-NEXT:    psubq %xmm2, %xmm0
; X64-NEXT:    retq
  %div = udiv <2 x i64> %x, %y
  store <2 x i64> %div, <2 x i64>* %divdst, align 16
  %t1 = mul <2 x i64> %div, %y
  %t2 = sub <2 x i64> %x, %t1
  ret <2 x i64> %t2
}

; Special tests.

define i32 @scalar_i32_commutative(i32 %x, i32* %ysrc, i32* %divdst) nounwind {
; X86-LABEL: scalar_i32_commutative:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl (%eax), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %eax, %edi
; X86-NEXT:    subl %edi, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i32_commutative:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl (%rsi), %esi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %eax, %esi
; X64-NEXT:    subl %esi, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %y = load i32, i32* %ysrc, align 4
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %y, %div ; commutative
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; We do not care about extra uses.
define i32 @extrause(i32 %x, i32 %y, i32* %divdst, i32* %t1dst) nounwind {
; X86-LABEL: extrause:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ebx
; X86-NEXT:    movl %eax, (%edi)
; X86-NEXT:    imull %ebx, %eax
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: extrause:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%r8)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    subl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  store i32 %t1, i32* %t1dst, align 4
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; 'rem' should appear next to 'div'.
define i32 @multiple_bb(i32 %x, i32 %y, i32* %divdst, i1 zeroext %store_urem, i32* %uremdst) nounwind {
; X86-LABEL: multiple_bb:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movl %eax, (%edi)
; X86-NEXT:    testb %bl, %bl
; X86-NEXT:    je .LBB11_2
; X86-NEXT:  # %bb.1: # %do_urem
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %eax, %edi
; X86-NEXT:    imull %esi, %edi
; X86-NEXT:    subl %edi, %ecx
; X86-NEXT:    movl %ecx, (%edx)
; X86-NEXT:  .LBB11_2: # %end
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: multiple_bb:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%r9)
; X64-NEXT:    testl %ecx, %ecx
; X64-NEXT:    je .LBB11_2
; X64-NEXT:  # %bb.1: # %do_urem
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    imull %esi, %ecx
; X64-NEXT:    subl %ecx, %edi
; X64-NEXT:    movl %edi, (%r8)
; X64-NEXT:  .LBB11_2: # %end
; X64-NEXT:    retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  br i1 %store_urem, label %do_urem, label %end
do_urem:
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  store i32 %t2, i32* %uremdst, align 4
  br label %end
end:
  ret i32 %div
}

define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: negative_different_x:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %edi, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: negative_different_x:
; X64:       # %bb.0:
; X64-NEXT:    movl %edx, %r8d
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %r8d
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %r8d, %eax
; X64-NEXT:    subl %eax, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x0, %y ; not %x1
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x1, %t1 ; not %x0
  ret i32 %t2
}

define i32 @negative_different_y(i32 %x0, i32 %x1, i32 %y, i32 %z, i32* %divdst) nounwind {
; X86-LABEL: negative_different_y:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl {{[0-9]+}}(%esp)
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: negative_different_y:
; X64:       # %bb.0:
; X64-NEXT:    movl %edx, %edi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movl %eax, (%r8)
; X64-NEXT:    imull %eax, %edi
; X64-NEXT:    subl %edi, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x1, %z ; not %x0
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x1, %t1
  ret i32 %t2
}

define i32 @negative_inverted_division(i32 %x0, i32 %x1, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: negative_inverted_division:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ecx
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %ecx, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: negative_inverted_division:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    subl %eax, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x0, %x1 ; inverted division
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %x1
  %t2 = sub i32 %x1, %t1
  ret i32 %t2
}