; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64

; If the target does not have a single div/rem operation,
; the -div-rem-pairs pass will decompose the remainder calculation as:
;   X % Y --> X - ((X / Y) * Y)
; But if the target does have a single div/rem operation,
; the opposite transform is likely beneficial.
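;
; For example (illustrative IR only, not matched by any check line below),
; a target with a single div/rem operation would rather keep the pair
;   %div = udiv i32 %x, %y
;   %rem = urem i32 %x, %y
; than the decomposed form that the tests in this file exercise:
;   %div = udiv i32 %x, %y
;   %t1 = mul i32 %div, %y
;   %rem = sub i32 %x, %t1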

define i8 @scalar_i8(i8 %x, i8 %y, i8* %divdst) nounwind {
; X86-LABEL: scalar_i8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    movzbl %cl, %eax
; X86-NEXT:    divb %ch
; X86-NEXT:    movb %al, (%edx)
; X86-NEXT:    mulb %ch
; X86-NEXT:    subb %al, %cl
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl %dil, %ecx
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    divb %sil
; X64-NEXT:    movb %al, (%rdx)
; X64-NEXT:    mulb %sil
; X64-NEXT:    subb %al, %cl
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
  %div = udiv i8 %x, %y
  store i8 %div, i8* %divdst, align 4
  %t1 = mul i8 %div, %y
  %t2 = sub i8 %x, %t1
  ret i8 %t2
}

define i16 @scalar_i16(i16 %x, i16 %y, i16* %divdst) nounwind {
; X86-LABEL: scalar_i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movw %ax, (%edi)
; X86-NEXT:    imull %eax, %esi
; X86-NEXT:    subl %esi, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i16:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %si
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movw %ax, (%rcx)
; X64-NEXT:    imull %eax, %esi
; X64-NEXT:    subl %esi, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = udiv i16 %x, %y
  store i16 %div, i16* %divdst, align 4
  %t1 = mul i16 %div, %y
  %t2 = sub i16 %x, %t1
  ret i16 %t2
}

define i32 @scalar_i32(i32 %x, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: scalar_i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %edi, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i32:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    subl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

define i64 @scalar_i64(i64 %x, i64 %y, i64* %divdst) nounwind {
; X86-LABEL: scalar_i64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ecx, 4(%edx)
; X86-NEXT:    movl %eax, (%edx)
; X86-NEXT:    imull %eax, %ebx
; X86-NEXT:    mull %ebp
; X86-NEXT:    addl %ebx, %edx
; X86-NEXT:    imull %ebp, %ecx
; X86-NEXT:    addl %edx, %ecx
; X86-NEXT:    subl %eax, %esi
; X86-NEXT:    sbbl %ecx, %edi
; X86-NEXT:    movl %esi, %eax
; X86-NEXT:    movl %edi, %edx
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i64:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divq %rsi
; X64-NEXT:    movq %rax, (%rcx)
; X64-NEXT:    imulq %rsi, %rax
; X64-NEXT:    subq %rax, %rdi
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %div = udiv i64 %x, %y
  store i64 %div, i64* %divdst, align 4
  %t1 = mul i64 %div, %y
  %t2 = sub i64 %x, %t1
  ret i64 %t2
}

define i128 @scalar_i128(i128 %x, i128 %y, i128* %divdst) nounwind {
; X86-LABEL: scalar_i128:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 44(%ebp), %edi
; X86-NEXT:    movl 28(%ebp), %ecx
; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
; X86-NEXT:    pushl 40(%ebp)
; X86-NEXT:    pushl 36(%ebp)
; X86-NEXT:    pushl 32(%ebp)
; X86-NEXT:    pushl %ecx
; X86-NEXT:    movl %ecx, %ebx
; X86-NEXT:    pushl 24(%ebp)
; X86-NEXT:    pushl 20(%ebp)
; X86-NEXT:    pushl 16(%ebp)
; X86-NEXT:    pushl 12(%ebp)
; X86-NEXT:    pushl %eax
; X86-NEXT:    calll __udivti3
; X86-NEXT:    addl $32, %esp
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %ecx, 12(%edi)
; X86-NEXT:    movl %esi, 8(%edi)
; X86-NEXT:    movl %eax, 4(%edi)
; X86-NEXT:    movl %edx, (%edi)
; X86-NEXT:    movl %edx, %edi
; X86-NEXT:    movl %ebx, %eax
; X86-NEXT:    imull %ebx, %ecx
; X86-NEXT:    mull %esi
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    addl %ecx, %edx
; X86-NEXT:    imull 32(%ebp), %esi
; X86-NEXT:    addl %edx, %esi
; X86-NEXT:    movl 36(%ebp), %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT:    imull %ebx, %ecx
; X86-NEXT:    mull %edi
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    addl %ecx, %edx
; X86-NEXT:    movl %edi, %eax
; X86-NEXT:    movl 40(%ebp), %edi
; X86-NEXT:    imull %eax, %edi
; X86-NEXT:    addl %edx, %edi
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT:    adcl %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %eax, %edi
; X86-NEXT:    movl 28(%ebp), %ecx
; X86-NEXT:    mull %ecx
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %ebx, %eax
; X86-NEXT:    mull %ecx
; X86-NEXT:    movl %edx, %esi
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    adcl $0, %esi
; X86-NEXT:    movl %edi, %eax
; X86-NEXT:    mull 32(%ebp)
; X86-NEXT:    movl %edx, %edi
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    movl %eax, %ebx
; X86-NEXT:    adcl %esi, %edi
; X86-NEXT:    setb %cl
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    mull 32(%ebp)
; X86-NEXT:    addl %edi, %eax
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    adcl %ecx, %edx
; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT:    movl 12(%ebp), %ecx
; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    movl 16(%ebp), %esi
; X86-NEXT:    sbbl %ebx, %esi
; X86-NEXT:    movl 20(%ebp), %edi
; X86-NEXT:    sbbl %eax, %edi
; X86-NEXT:    movl 24(%ebp), %ebx
; X86-NEXT:    sbbl %edx, %ebx
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    movl %esi, 4(%eax)
; X86-NEXT:    movl %edi, 8(%eax)
; X86-NEXT:    movl %ebx, 12(%eax)
; X86-NEXT:    leal -12(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl $4
;
; X64-LABEL: scalar_i128:
; X64:       # %bb.0:
; X64-NEXT:    pushq %r15
; X64-NEXT:    pushq %r14
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq %r8, %r14
; X64-NEXT:    movq %rcx, %rbx
; X64-NEXT:    movq %rdx, %r15
; X64-NEXT:    movq %rsi, %r12
; X64-NEXT:    movq %rdi, %r13
; X64-NEXT:    callq __udivti3@PLT
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movq %rdx, 8(%r14)
; X64-NEXT:    movq %rax, (%r14)
; X64-NEXT:    imulq %rax, %rbx
; X64-NEXT:    mulq %r15
; X64-NEXT:    addq %rbx, %rdx
; X64-NEXT:    imulq %r15, %rcx
; X64-NEXT:    addq %rdx, %rcx
; X64-NEXT:    subq %rax, %r13
; X64-NEXT:    sbbq %rcx, %r12
; X64-NEXT:    movq %r13, %rax
; X64-NEXT:    movq %r12, %rdx
; X64-NEXT:    popq %rbx
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %r14
; X64-NEXT:    popq %r15
; X64-NEXT:    retq
  %div = udiv i128 %x, %y
  store i128 %div, i128* %divdst, align 4
  %t1 = mul i128 %div, %y
  %t2 = sub i128 %x, %t1
  ret i128 %t2
}
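
; Note: SSE2 has no vector integer division, so the vector tests below are
; fully scalarized: each element is divided with a scalar div instruction
; (or, for <2 x i64> on i686, a __udivdi3 libcall).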

define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, <16 x i8>* %divdst) nounwind {
; X86-LABEL: vector_i128_i8:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movdqa %xmm0, (%esp)
; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm6
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %edx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %esi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %edi
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %ebx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    movzbl (%esp), %eax
; X86-NEXT:    divb {{[0-9]+}}(%esp)
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT:    movd %edx, %xmm4
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-NEXT:    movd %esi, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; X86-NEXT:    movd %edi, %xmm5
; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; X86-NEXT:    movd %ebx, %xmm4
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movd %ecx, %xmm6
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; X86-NEXT:    movdqa %xmm5, %xmm2
; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT:    movdqa %xmm2, (%ecx)
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    movdqa %xmm1, %xmm2
; X86-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-NEXT:    pmullw %xmm3, %xmm2
; X86-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-NEXT:    pand %xmm3, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    pmullw %xmm5, %xmm1
; X86-NEXT:    pand %xmm3, %xmm1
; X86-NEXT:    packuswb %xmm2, %xmm1
; X86-NEXT:    psubb %xmm1, %xmm0
; X86-NEXT:    leal -12(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i8:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    pushq %r15
; X64-NEXT:    pushq %r14
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r8d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r9d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r10d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r11d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r14d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r15d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r12d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r13d
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %edi
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %esi
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %ebx
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %ebp
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %edx
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    divb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movd %r8d, %xmm3
; X64-NEXT:    movd %r9d, %xmm4
; X64-NEXT:    movd %r10d, %xmm5
; X64-NEXT:    movd %r11d, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X64-NEXT:    movd %r14d, %xmm2
; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; X64-NEXT:    movd %r15d, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT:    movd %r12d, %xmm3
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; X64-NEXT:    movd %r13d, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X64-NEXT:    movd %edi, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-NEXT:    movd %esi, %xmm2
; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; X64-NEXT:    movd %ebx, %xmm5
; X64-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
; X64-NEXT:    movd %ebp, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; X64-NEXT:    movd %edx, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X64-NEXT:    movzbl %cl, %ecx
; X64-NEXT:    movd %ecx, %xmm4
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    movd %eax, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
; X64-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; X64-NEXT:    movdqa %xmm6, %xmm2
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT:    movdqa %xmm2, (%rax)
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    movdqa %xmm1, %xmm2
; X64-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-NEXT:    pmullw %xmm3, %xmm2
; X64-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X64-NEXT:    pand %xmm3, %xmm2
; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    pmullw %xmm6, %xmm1
; X64-NEXT:    pand %xmm3, %xmm1
; X64-NEXT:    packuswb %xmm2, %xmm1
; X64-NEXT:    psubb %xmm1, %xmm0
; X64-NEXT:    popq %rbx
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %r14
; X64-NEXT:    popq %r15
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %div = udiv <16 x i8> %x, %y
  store <16 x i8> %div, <16 x i8>* %divdst, align 16
  %t1 = mul <16 x i8> %div, %y
  %t2 = sub <16 x i8> %x, %t1
  ret <16 x i8> %t2
}

define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16>* %divdst) nounwind {
; X86-LABEL: vector_i128_i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pextrw $7, %xmm0, %eax
; X86-NEXT:    pextrw $7, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pextrw $6, %xmm0, %eax
; X86-NEXT:    pextrw $6, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-NEXT:    pextrw $5, %xmm0, %eax
; X86-NEXT:    pextrw $5, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    pextrw $4, %xmm0, %eax
; X86-NEXT:    pextrw $4, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT:    pextrw $3, %xmm0, %eax
; X86-NEXT:    pextrw $3, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    pextrw $2, %xmm0, %eax
; X86-NEXT:    pextrw $2, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X86-NEXT:    pextrw $1, %xmm0, %eax
; X86-NEXT:    pextrw $1, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    movd %xmm0, %eax
; X86-NEXT:    movd %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X86-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X86-NEXT:    movdqa %xmm5, (%ecx)
; X86-NEXT:    pmullw %xmm1, %xmm5
; X86-NEXT:    psubw %xmm5, %xmm0
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i16:
; X64:       # %bb.0:
; X64-NEXT:    pextrw $7, %xmm0, %eax
; X64-NEXT:    pextrw $7, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pextrw $6, %xmm0, %eax
; X64-NEXT:    pextrw $6, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-NEXT:    pextrw $5, %xmm0, %eax
; X64-NEXT:    pextrw $5, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    pextrw $4, %xmm0, %eax
; X64-NEXT:    pextrw $4, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT:    pextrw $3, %xmm0, %eax
; X64-NEXT:    pextrw $3, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    pextrw $2, %xmm0, %eax
; X64-NEXT:    pextrw $2, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NEXT:    pextrw $1, %xmm0, %eax
; X64-NEXT:    pextrw $1, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    movd %xmm0, %eax
; X64-NEXT:    movd %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm5
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X64-NEXT:    movdqa %xmm5, (%rdi)
; X64-NEXT:    pmullw %xmm1, %xmm5
; X64-NEXT:    psubw %xmm5, %xmm0
; X64-NEXT:    retq
  %div = udiv <8 x i16> %x, %y
  store <8 x i16> %div, <8 x i16>* %divdst, align 16
  %t1 = mul <8 x i16> %div, %y
  %t2 = sub <8 x i16> %x, %t1
  ret <8 x i16> %t2
}

define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %divdst) nounwind {
; X86-LABEL: vector_i128_i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm2, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm2, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X86-NEXT:    movd %xmm3, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X86-NEXT:    movd %xmm3, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-NEXT:    movd %xmm0, %eax
; X86-NEXT:    movd %xmm1, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X86-NEXT:    movd %xmm4, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm4, %esi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X86-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-NEXT:    movdqa %xmm2, (%ecx)
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X86-NEXT:    pmuludq %xmm1, %xmm2
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-NEXT:    pmuludq %xmm3, %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X86-NEXT:    psubd %xmm2, %xmm0
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i32:
; X64:       # %bb.0:
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT:    movd %xmm2, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X64-NEXT:    movd %xmm2, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT:    movd %xmm3, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movd %xmm3, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT:    movd %xmm0, %eax
; X64-NEXT:    movd %xmm1, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X64-NEXT:    movd %xmm4, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X64-NEXT:    movd %xmm4, %ecx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movdqa %xmm2, (%rdi)
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X64-NEXT:    pmuludq %xmm1, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT:    pmuludq %xmm3, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    psubd %xmm2, %xmm0
; X64-NEXT:    retq
  %div = udiv <4 x i32> %x, %y
  store <4 x i32> %div, <4 x i32>* %divdst, align 16
  %t1 = mul <4 x i32> %div, %y
  %t2 = sub <4 x i32> %x, %t1
  ret <4 x i32> %t2
}

define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, <2 x i64>* %divdst) nounwind {
; X86-LABEL: vector_i128_i64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    movd %xmm1, (%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, (%esp)
; X86-NEXT:    movd %edx, %xmm0
; X86-NEXT:    movd %eax, %xmm1
; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    movd %edx, %xmm0
; X86-NEXT:    movd %eax, %xmm1
; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; X86-NEXT:    movdqa %xmm1, (%esi)
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
; X86-NEXT:    movdqa %xmm3, %xmm0
; X86-NEXT:    psrlq $32, %xmm0
; X86-NEXT:    pmuludq %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm1, %xmm2
; X86-NEXT:    psrlq $32, %xmm2
; X86-NEXT:    pmuludq %xmm3, %xmm2
; X86-NEXT:    paddq %xmm0, %xmm2
; X86-NEXT:    psllq $32, %xmm2
; X86-NEXT:    pmuludq %xmm3, %xmm1
; X86-NEXT:    paddq %xmm2, %xmm1
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    psubq %xmm1, %xmm0
; X86-NEXT:    addl $64, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i64:
; X64:       # %bb.0:
; X64-NEXT:    movq %xmm0, %rax
; X64-NEXT:    movq %xmm1, %rcx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divq %rcx
; X64-NEXT:    movq %rax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT:    movq %xmm3, %rax
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm3, %rcx
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divq %rcx
; X64-NEXT:    movq %rax, %xmm3
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movdqa %xmm2, (%rdi)
; X64-NEXT:    movdqa %xmm1, %xmm3
; X64-NEXT:    psrlq $32, %xmm3
; X64-NEXT:    pmuludq %xmm2, %xmm3
; X64-NEXT:    movdqa %xmm2, %xmm4
; X64-NEXT:    psrlq $32, %xmm4
; X64-NEXT:    pmuludq %xmm1, %xmm4
; X64-NEXT:    paddq %xmm3, %xmm4
; X64-NEXT:    psllq $32, %xmm4
; X64-NEXT:    pmuludq %xmm1, %xmm2
; X64-NEXT:    paddq %xmm4, %xmm2
; X64-NEXT:    psubq %xmm2, %xmm0
; X64-NEXT:    retq
  %div = udiv <2 x i64> %x, %y
  store <2 x i64> %div, <2 x i64>* %divdst, align 16
  %t1 = mul <2 x i64> %div, %y
  %t2 = sub <2 x i64> %x, %t1
  ret <2 x i64> %t2
}

; Special tests.

define i32 @scalar_i32_commutative(i32 %x, i32* %ysrc, i32* %divdst) nounwind {
; X86-LABEL: scalar_i32_commutative:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl (%eax), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %eax, %edi
; X86-NEXT:    subl %edi, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i32_commutative:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl (%rsi), %esi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %eax, %esi
; X64-NEXT:    subl %esi, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %y = load i32, i32* %ysrc, align 4
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %y, %div ; commutative
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; We do not care about extra uses.
define i32 @extrause(i32 %x, i32 %y, i32* %divdst, i32* %t1dst) nounwind {
; X86-LABEL: extrause:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ebx
; X86-NEXT:    movl %eax, (%edi)
; X86-NEXT:    imull %ebx, %eax
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: extrause:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%r8)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    subl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  store i32 %t1, i32* %t1dst, align 4
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; 'rem' should appear next to 'div'.
define i32 @multiple_bb(i32 %x, i32 %y, i32* %divdst, i1 zeroext %store_urem, i32* %uremdst) nounwind {
; X86-LABEL: multiple_bb:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movb {{[0-9]+}}(%esp), %bl
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %esi
; X86-NEXT:    movl %eax, (%edi)
; X86-NEXT:    testb %bl, %bl
; X86-NEXT:    je .LBB11_2
; X86-NEXT:  # %bb.1: # %do_urem
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %eax, %edi
; X86-NEXT:    imull %esi, %edi
; X86-NEXT:    subl %edi, %ecx
; X86-NEXT:    movl %ecx, (%edx)
; X86-NEXT:  .LBB11_2: # %end
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: multiple_bb:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%r9)
; X64-NEXT:    testl %ecx, %ecx
; X64-NEXT:    je .LBB11_2
; X64-NEXT:  # %bb.1: # %do_urem
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    imull %esi, %ecx
; X64-NEXT:    subl %ecx, %edi
; X64-NEXT:    movl %edi, (%r8)
; X64-NEXT:  .LBB11_2: # %end
; X64-NEXT:    retq
  %div = udiv i32 %x, %y
  store i32 %div, i32* %divdst, align 4
  br i1 %store_urem, label %do_urem, label %end
do_urem:
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  store i32 %t2, i32* %uremdst, align 4
  br label %end
end:
  ret i32 %div
}

define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: negative_different_x:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %edi, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: negative_different_x:
; X64:       # %bb.0:
; X64-NEXT:    movl %edx, %r8d
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %r8d
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %r8d, %eax
; X64-NEXT:    subl %eax, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x0, %y ; not %x1
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x1, %t1 ; not %x0
  ret i32 %t2
}

define i32 @negative_different_y(i32 %x0, i32 %x1, i32 %y, i32 %z, i32* %divdst) nounwind {
; X86-LABEL: negative_different_y:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl {{[0-9]+}}(%esp)
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: negative_different_y:
; X64:       # %bb.0:
; X64-NEXT:    movl %edx, %edi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %ecx
; X64-NEXT:    movl %eax, (%r8)
; X64-NEXT:    imull %eax, %edi
; X64-NEXT:    subl %edi, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x1, %z ; not %x0
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x1, %t1
  ret i32 %t2
}

define i32 @negative_inverted_division(i32 %x0, i32 %x1, i32 %y, i32* %divdst) nounwind {
; X86-LABEL: negative_inverted_division:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    divl %ecx
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %ecx, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: negative_inverted_division:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %edx, %edx
; X64-NEXT:    divl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    subl %eax, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = udiv i32 %x0, %x1 ; inverted division
  store i32 %div, i32* %divdst, align 4
  %t1 = mul i32 %div, %x1
  %t2 = sub i32 %x1, %t1
  ret i32 %t2
}