; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX,AVX2ORLATER,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefixes=CHECK,AVX,XOP

; NOTE(review): exercises DAGCombiner sdiv-by-constant folds on x86 across
; SSE2/SSE4.1/AVX/AVX2/AVX-512/XOP; check lines are machine-generated —
; regenerate with update_llc_test_checks.py rather than editing by hand.

; fold (sdiv x, 1) -> x
define i32 @combine_sdiv_by_one(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_one:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, 1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_one(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_sdiv_by_one:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  ret <4 x i32> %1
}

; fold (sdiv x, -1) -> 0 - x
define i32 @combine_sdiv_by_negone(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_negone:
; CHECK: # %bb.0:
; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: negl %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, -1
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_negone(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_negone:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: psubd %xmm0, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_negone:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
  ret <4 x i32> %1
}

; fold (sdiv x, INT_MIN) -> select((icmp eq x, INT_MIN), 1, 0)
define i32 @combine_sdiv_by_minsigned(i32 %x) {
; CHECK-LABEL: combine_sdiv_by_minsigned:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $-2147483648, %edi # imm = 0x80000000
; CHECK-NEXT: sete %al
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, -2147483648
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_by_minsigned(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_minsigned:
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_minsigned:
; AVX1: # %bb.0:
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_by_minsigned:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; AVX512F-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpsrld $31, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_minsigned:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
; AVX512BW-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_minsigned:
; XOP: # %bb.0:
; XOP-NEXT: vpcomeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpsrld $31, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  ret <4 x i32> %1
}

; fold (sdiv 0, x) -> 0
define i32 @combine_sdiv_zero(i32 %x) {
; CHECK-LABEL: combine_sdiv_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 0, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sdiv <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (sdiv x, x) -> 1
define i32 @combine_sdiv_dupe(i32 %x) {
; CHECK-LABEL: combine_sdiv_dupe:
; CHECK: # %bb.0:
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: retq
  %1 = sdiv i32 %x, %x
  ret i32 %1
}

define <4 x i32> @combine_vec_sdiv_dupe(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_dupe:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_dupe:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; AVX1-NEXT: retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1]
; AVX2ORLATER-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_dupe:
; XOP: # %bb.0:
; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1]
; XOP-NEXT: retq
  %1 = sdiv <4 x i32> %x, %x
  ret <4 x i32> %1
}

; fold (sdiv x, y) -> (udiv x, y) iff x and y are positive
define <4 x i32> @combine_vec_sdiv_by_pos0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pos0:
; SSE: # %bb.0:
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: psrld $2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_pos0:
; AVX: # %bb.0:
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsrld $2, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_sdiv_by_pos1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pos1:
; SSE2: # %bb.0:
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $4, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: psrld $3, %xmm2
; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrld $2, %xmm1
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pos1:
; SSE41: # %bb.0:
; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psrld $3, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: psrld $4, %xmm0
; SSE41-NEXT: psrld $2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pos1:
; AVX1: # %bb.0:
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpsrld $4, %xmm0, %xmm1
; AVX1-NEXT: vpsrld $2, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT: vpsrld $3, %xmm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT: retq
;
; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pos1:
; AVX2ORLATER: # %bb.0:
; AVX2ORLATER-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2ORLATER-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_pos1:
; XOP: # %bb.0:
; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
  %2 = sdiv <4 x i32> %1, <i32 1, i32 4, i32 8, i32 16>
  ret <4 x i32> %2
}

; fold (sdiv x, (1 << c)) -> x >>u c
define <4 x i32> @combine_vec_sdiv_by_pow2a(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: psrad $2, %xmm1
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a:
; AVX: # %bb.0:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_sdiv_by_pow2a_neg(<4 x i32> %x) {
; SSE-LABEL: combine_vec_sdiv_by_pow2a_neg:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: psrad $31, %xmm1
; SSE-NEXT: psrld $30, %xmm1
; SSE-NEXT: paddd %xmm0, %xmm1
; SSE-NEXT: psrad $2, %xmm1
; SSE-NEXT: pxor %xmm0, %xmm0
; SSE-NEXT: psubd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vec_sdiv_by_pow2a_neg:
; AVX: # %bb.0:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm1
; AVX-NEXT: vpsrld $30, %xmm1, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsrad $2, %xmm0, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpsubd %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = sdiv <4 x i32> %x, <i32 -4, i32 -4, i32 -4, i32 -4>
  ret <4 x i32> %1
}

define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [256,4,2,16,8,32,64,2]
; SSE2-NEXT: pmullw %xmm4, %xmm3
; SSE2-NEXT: psrlw $8, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT: pmullw %xmm4, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: paddb %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: psrlw $8, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: psrlw $8, %xmm2
; SSE2-NEXT: packuswb %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pxor %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pcmpgtb %xmm1, %xmm3
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [256,4,2,16,8,32,64,2]
; SSE41-NEXT: pmullw %xmm0, %xmm3
; SSE41-NEXT: psrlw $8, %xmm3
; SSE41-NEXT: pmullw %xmm0, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm3, %xmm2
; SSE41-NEXT: paddb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
; SSE41-NEXT: pmullw %xmm3, %xmm0
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: psraw $8, %xmm2
; SSE41-NEXT: pmullw %xmm3, %xmm2
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,4,2,16,8,32,64,2]
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [256,64,128,16,32,8,4,128]
; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1
; AVX512BW-NEXT: movw $257, %ax # imm = 0x101
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255]
; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
  %1 = sdiv <16 x i8> %x, <i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2, i8 1, i8 4, i8 2, i8 16, i8 8, i8 32, i8 64, i8 2>
  ret <16 x i8> %1
}

define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: paddw %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,0,65535,0,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: psraw $2, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm3, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,0,0,65535,0]
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: psraw $1, %xmm1
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm0, %xmm1
; SSE2-NEXT: por %xmm2, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psraw $15, %xmm1
; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: paddw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psraw $1, %xmm2
; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX1-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpsraw $1, %xmm1, %xmm2
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vpsraw $1, %xmm1, %xmm2
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6],xmm2[7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsraw $15, %xmm0, %xmm1
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpsraw $15, %xmm0, %xmm1
; XOP-NEXT: vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm1
; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; XOP-NEXT: retq
  %1 = sdiv <8 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
  ret <8 x i16> %1
}

define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = <u,4,2,16,8,32,64,2>
; SSE2-NEXT: pmulhuw %xmm8, %xmm0
; SSE2-NEXT: paddw %xmm3, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: psraw $4, %xmm0
; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: pandn %xmm0, %xmm6
; SSE2-NEXT: por %xmm2, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,0,65535]
; SSE2-NEXT: movdqa %xmm6, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: psraw $2, %xmm6
; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: pandn %xmm6, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,0,0,65535,0]
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: psraw $1, %xmm2
; SSE2-NEXT: movdqa %xmm7, %xmm6
; SSE2-NEXT: pandn %xmm2, %xmm6
; SSE2-NEXT: por %xmm0, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm2, %xmm6
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm3, %xmm0
; SSE2-NEXT: por %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psraw $15, %xmm3
; SSE2-NEXT: pmulhuw %xmm8, %xmm3
; SSE2-NEXT: paddw %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: pand %xmm4, %xmm6
; SSE2-NEXT: psraw $4, %xmm3
; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: psraw $2, %xmm4
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm3, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: psraw $1, %xmm5
; SSE2-NEXT: pandn %xmm5, %xmm7
; SSE2-NEXT: por %xmm3, %xmm7
; SSE2-NEXT: pand %xmm2, %xmm7
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm7, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psraw $15, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
; SSE41-NEXT: pmulhuw %xmm4, %xmm2
; SSE41-NEXT: paddw %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,16384,32768,4096,8192,2048,1024,32768>
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: pmulhw %xmm5, %xmm3
; SSE41-NEXT: psraw $1, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6],xmm2[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: psraw $15, %xmm3
; SSE41-NEXT: pmulhuw %xmm4, %xmm3
; SSE41-NEXT: paddw %xmm1, %xmm3
; SSE41-NEXT: pmulhw %xmm3, %xmm5
; SSE41-NEXT: psraw $1, %xmm3
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4,5,6],xmm3[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,16384,32768,4096,8192,2048,1024,32768>
; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm4
; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7]
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm4
; AVX1-NEXT: vpmulhuw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm0, %xmm3
; AVX1-NEXT: vpmulhw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: # ymm2 = mem[0,1,0,1]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsraw $15, %ymm0, %ymm1
; AVX2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpsraw $1, %ymm1, %ymm2
; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7],ymm1[8,9],ymm2[10],ymm1[11,12,13,14],ymm2[15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1
; AVX512F-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm1
; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm1
; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX512BW-NEXT: retq
;
; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
; XOP: # %bb.0:
; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOP-NEXT: vpsraw $15, %xmm1, %xmm2
; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65522,65521,65524,65523,65525,65526,65521>
; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2
; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1
; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,65534,65535,65532,65533,65531,65530,65535>
; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1
; XOP-NEXT: vpsraw $15, %xmm0, %xmm4
; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3
; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3
; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2
; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; XOP-NEXT: # ymm2 = mem[0,1,0,1]
; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0
; XOP-NEXT: retq
  %1 = sdiv <16 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2>
  ret <16 x i16> %1
}

define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm8
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psraw $15, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = <u,4,2,16,8,32,64,2>
; SSE2-NEXT: pmulhuw %xmm9, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,0,65535,0,0,65535]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pand %xmm11, %xmm4
; SSE2-NEXT: psraw $4, %xmm0
; SSE2-NEXT: movdqa %xmm11, %xmm5
; SSE2-NEXT: pandn %xmm0, %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [65535,0,65535,65535,0,65535,0,65535]
; SSE2-NEXT: movdqa %xmm5, %xmm0
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: psraw $2, %xmm5
; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,0,0,65535,0]
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: psraw $1, %xmm4
; SSE2-NEXT: movdqa %xmm10, %xmm5
; SSE2-NEXT: pandn %xmm4, %xmm5
; SSE2-NEXT: por %xmm0, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm12 = [0,65535,65535,65535,65535,65535,65535,65535]
; SSE2-NEXT: pand %xmm12, %xmm5
; SSE2-NEXT: movdqa %xmm12, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: por %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: psraw $15, %xmm1
; SSE2-NEXT: pmulhuw %xmm9, %xmm1
; SSE2-NEXT: paddw %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pand %xmm11, %xmm5
; SSE2-NEXT: psraw $4, %xmm1
; SSE2-NEXT: movdqa %xmm11, %xmm6
; SSE2-NEXT: pandn %xmm1, %xmm6
; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm1
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: psraw $2, %xmm6
; SSE2-NEXT: movdqa %xmm7, %xmm5
; SSE2-NEXT: pandn %xmm6, %xmm5
; SSE2-NEXT: por %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: pand %xmm10, %xmm1
; SSE2-NEXT: psraw $1, %xmm5
; SSE2-NEXT: movdqa %xmm10, %xmm6
; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: por %xmm1, %xmm6
; SSE2-NEXT: pand %xmm12, %xmm6
; SSE2-NEXT: movdqa %xmm12, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm1
; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psraw $15, %xmm5
; SSE2-NEXT: pmulhuw %xmm9, %xmm5
; SSE2-NEXT: paddw %xmm2, %xmm5
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pand %xmm11, %xmm6
; SSE2-NEXT: psraw $4, %xmm5
; SSE2-NEXT: movdqa %xmm11, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm4
; SSE2-NEXT: por %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pand %xmm7, %xmm5
; SSE2-NEXT: psraw $2, %xmm4
; SSE2-NEXT: movdqa %xmm7, %xmm6
; SSE2-NEXT: pandn %xmm4, %xmm6
; SSE2-NEXT: por %xmm5, %xmm6
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pand %xmm10, %xmm4
; SSE2-NEXT: psraw $1, %xmm6
; SSE2-NEXT: movdqa %xmm10, %xmm5
; SSE2-NEXT: pandn %xmm6, %xmm5
; SSE2-NEXT: por %xmm4, %xmm5
; SSE2-NEXT: pand %xmm12, %xmm5
; SSE2-NEXT: movdqa %xmm12, %xmm8
; SSE2-NEXT: pandn %xmm2, %xmm8
; SSE2-NEXT: por %xmm5, %xmm8
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: psraw $15, %xmm2
; SSE2-NEXT: pmulhuw %xmm9, %xmm2
; SSE2-NEXT: paddw %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: pand %xmm11, %xmm4
; SSE2-NEXT: psraw $4, %xmm2
; SSE2-NEXT: pandn %xmm2, %xmm11
; SSE2-NEXT: por %xmm4, %xmm11
; SSE2-NEXT: movdqa %xmm11, %xmm2
; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: psraw $2, %xmm11
; SSE2-NEXT: pandn %xmm11, %xmm7
; SSE2-NEXT: por %xmm2, %xmm7
; SSE2-NEXT: movdqa %xmm7, %xmm2
; SSE2-NEXT: pand %xmm10, %xmm2
; SSE2-NEXT: psraw $1, %xmm7
; SSE2-NEXT: pandn %xmm7, %xmm10
; SSE2-NEXT: por %xmm2, %xmm10
; SSE2-NEXT: pand %xmm12, %xmm10
; SSE2-NEXT: pandn %xmm3, %xmm12
; SSE2-NEXT: por %xmm10, %xmm12
; SSE2-NEXT: movdqa %xmm8, %xmm2
; SSE2-NEXT: movdqa %xmm12, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: psraw $15, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2>
; SSE41-NEXT: pmulhuw %xmm7, %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <u,16384,32768,4096,8192,2048,1024,32768>
; SSE41-NEXT: movdqa %xmm0, %xmm5
; SSE41-NEXT: pmulhw %xmm6, %xmm5
; SSE41-NEXT: psraw $1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2],xmm5[3,4,5,6],xmm0[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: psraw $15, %xmm1
; SSE41-NEXT: pmulhuw %xmm7, %xmm1
; SSE41-NEXT: paddw %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm5
; SSE41-NEXT: pmulhw %xmm6, %xmm5
; SSE41-NEXT: psraw $1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4,5,6],xmm1[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm2, %xmm4
; SSE41-NEXT: psraw $15, %xmm4
; SSE41-NEXT: pmulhuw %xmm7, %xmm4
; SSE41-NEXT: paddw %xmm2, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pmulhw %xmm6, %xmm5
; SSE41-NEXT: psraw $1, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4,5,6],xmm4[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm3, %xmm5
; SSE41-NEXT: psraw $15, %xmm5
; SSE41-NEXT: pmulhuw %xmm7, %xmm5
; SSE41-NEXT: paddw %xmm3, %xmm5
; SSE41-NEXT: pmulhw %xmm5, %xmm6
; SSE41-NEXT: psraw $1, %xmm5
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3,4,5,6,7]
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm5, %xmm3
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,16384,32768,4096,8192,2048,1024,32768>
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm5
; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
; AVX1-NEXT: vpsraw $15, %xmm0, %xmm5
; AVX1-NEXT: vpmulhuw %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpaddw %xmm5, %xmm0, %xmm5
; AVX1-NEXT: vpmulhw %xmm3, %xmm5, %xmm6
; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5
; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: # ymm5 = mem[0,1,0,1]
; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsraw $15, %xmm2, %xmm6
; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm6
; AVX1-NEXT: vpaddw %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpmulhw
%xmm3, %xmm2, %xmm6 898; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2 899; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm2[2],xmm6[3,4,5,6],xmm2[7] 900; AVX1-NEXT: vpsraw $15, %xmm1, %xmm6 901; AVX1-NEXT: vpmulhuw %xmm4, %xmm6, %xmm4 902; AVX1-NEXT: vpaddw %xmm4, %xmm1, %xmm4 903; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3 904; AVX1-NEXT: vpsraw $1, %xmm4, %xmm4 905; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5,6],xmm4[7] 906; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 907; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2 908; AVX1-NEXT: vandnps %ymm1, %ymm5, %ymm1 909; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 910; AVX1-NEXT: retq 911; 912; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 913; AVX2: # %bb.0: 914; AVX2-NEXT: vpsraw $15, %ymm0, %ymm2 915; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] 916; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 917; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 918; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2 919; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768] 920; AVX2-NEXT: # ymm4 = mem[0,1,0,1] 921; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm5 922; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2 923; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15] 924; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] 925; AVX2-NEXT: vpsraw $15, %ymm1, %ymm2 926; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 927; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm2 928; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm3 929; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2 930; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11,12,13,14],ymm2[15] 931; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] 932; AVX2-NEXT: retq 933; 934; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 935; AVX512F: # %bb.0: 936; 
AVX512F-NEXT: vpsraw $15, %ymm0, %ymm1 937; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] 938; AVX512F-NEXT: # ymm2 = mem[0,1,0,1] 939; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 940; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm1 941; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1 942; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,1,4,3,5,6,1,0,2,1,4,3,5,6,1] 943; AVX512F-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] 944; AVX512F-NEXT: vpsravd %zmm3, %zmm1, %zmm1 945; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 946; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 947; AVX512F-NEXT: vpsraw $15, %ymm4, %ymm5 948; AVX512F-NEXT: vpmulhuw %ymm2, %ymm5, %ymm2 949; AVX512F-NEXT: vpaddw %ymm2, %ymm4, %ymm2 950; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2 951; AVX512F-NEXT: vpsravd %zmm3, %zmm2, %zmm2 952; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 953; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 954; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 955; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 956; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0 957; AVX512F-NEXT: retq 958; 959; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 960; AVX512BW: # %bb.0: 961; AVX512BW-NEXT: vpsraw $15, %zmm0, %zmm1 962; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 963; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm1 964; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 965; AVX512BW-NEXT: movl $16843009, %eax # imm = 0x1010101 966; AVX512BW-NEXT: kmovd %eax, %k1 967; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1} 968; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 969; AVX512BW-NEXT: retq 970; 971; XOP-LABEL: combine_vec_sdiv_by_pow2b_v32i16: 972; XOP: # %bb.0: 973; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 974; XOP-NEXT: vpsraw $15, %xmm2, %xmm3 975; XOP-NEXT: vmovdqa {{.*#+}} 
xmm4 = <u,65522,65521,65524,65523,65525,65526,65521> 976; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3 977; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2 978; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65534,65535,65532,65533,65531,65530,65535> 979; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 980; XOP-NEXT: vpsraw $15, %xmm0, %xmm5 981; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5 982; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5 983; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5 984; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 985; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] 986; XOP-NEXT: # ymm5 = mem[0,1,0,1] 987; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0 988; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 989; XOP-NEXT: vpsraw $15, %xmm2, %xmm6 990; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm6 991; XOP-NEXT: vpaddw %xmm6, %xmm2, %xmm2 992; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2 993; XOP-NEXT: vpsraw $15, %xmm1, %xmm6 994; XOP-NEXT: vpshlw %xmm4, %xmm6, %xmm4 995; XOP-NEXT: vpaddw %xmm4, %xmm1, %xmm4 996; XOP-NEXT: vpshaw %xmm3, %xmm4, %xmm3 997; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 998; XOP-NEXT: vpcmov %ymm5, %ymm1, %ymm2, %ymm1 999; XOP-NEXT: retq 1000 %1 = sdiv <32 x i16> %x, <i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2, i16 1, i16 4, i16 2, i16 16, i16 8, i16 32, i16 64, i16 2> 1001 ret <32 x i16> %1 1002} 1003 1004define <4 x i32> @combine_vec_sdiv_by_pow2b_v4i32(<4 x i32> %x) { 1005; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1006; SSE2: # %bb.0: 1007; SSE2-NEXT: movdqa %xmm0, %xmm1 1008; SSE2-NEXT: psrad $31, %xmm1 1009; SSE2-NEXT: movdqa %xmm1, %xmm2 1010; SSE2-NEXT: psrld $28, %xmm2 1011; SSE2-NEXT: movdqa %xmm1, %xmm3 1012; SSE2-NEXT: psrld $29, %xmm3 1013; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1014; SSE2-NEXT: psrld $30, %xmm1 1015; SSE2-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[0,1],xmm3[0,3] 1016; SSE2-NEXT: paddd %xmm0, %xmm1 1017; SSE2-NEXT: movdqa %xmm1, %xmm2 1018; SSE2-NEXT: psrad $4, %xmm2 1019; SSE2-NEXT: movdqa %xmm1, %xmm3 1020; SSE2-NEXT: psrad $3, %xmm3 1021; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1022; SSE2-NEXT: psrad $2, %xmm1 1023; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] 1024; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1025; SSE2-NEXT: movaps %xmm1, %xmm0 1026; SSE2-NEXT: retq 1027; 1028; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1029; SSE41: # %bb.0: 1030; SSE41-NEXT: movdqa %xmm0, %xmm1 1031; SSE41-NEXT: psrad $31, %xmm1 1032; SSE41-NEXT: movdqa %xmm1, %xmm2 1033; SSE41-NEXT: psrld $28, %xmm2 1034; SSE41-NEXT: movdqa %xmm1, %xmm3 1035; SSE41-NEXT: psrld $30, %xmm3 1036; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1037; SSE41-NEXT: psrld $29, %xmm1 1038; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1039; SSE41-NEXT: paddd %xmm0, %xmm1 1040; SSE41-NEXT: movdqa %xmm1, %xmm2 1041; SSE41-NEXT: psrad $4, %xmm2 1042; SSE41-NEXT: movdqa %xmm1, %xmm3 1043; SSE41-NEXT: psrad $2, %xmm3 1044; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1045; SSE41-NEXT: psrad $3, %xmm1 1046; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1047; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1048; SSE41-NEXT: movdqa %xmm1, %xmm0 1049; SSE41-NEXT: retq 1050; 1051; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1052; AVX1: # %bb.0: 1053; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 1054; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2 1055; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3 1056; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1057; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1 1058; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1059; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1060; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 1061; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 1062; AVX1-NEXT: 
vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1063; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 1064; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1065; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1066; AVX1-NEXT: retq 1067; 1068; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1069; AVX2ORLATER: # %bb.0: 1070; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1 1071; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1072; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1073; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1074; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1075; AVX2ORLATER-NEXT: retq 1076; 1077; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i32: 1078; XOP: # %bb.0: 1079; XOP-NEXT: vpsrad $31, %xmm0, %xmm1 1080; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1081; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 1082; XOP-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1083; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1084; XOP-NEXT: retq 1085 %1 = sdiv <4 x i32> %x, <i32 1, i32 4, i32 8, i32 16> 1086 ret <4 x i32> %1 1087} 1088 1089define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) { 1090; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1091; SSE2: # %bb.0: 1092; SSE2-NEXT: movdqa %xmm0, %xmm2 1093; SSE2-NEXT: psrad $31, %xmm0 1094; SSE2-NEXT: movdqa %xmm0, %xmm3 1095; SSE2-NEXT: psrld $28, %xmm3 1096; SSE2-NEXT: movdqa %xmm0, %xmm4 1097; SSE2-NEXT: psrld $29, %xmm4 1098; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1099; SSE2-NEXT: psrld $30, %xmm0 1100; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] 1101; SSE2-NEXT: paddd %xmm2, %xmm0 1102; SSE2-NEXT: movdqa %xmm0, %xmm3 1103; SSE2-NEXT: psrad $4, %xmm3 1104; SSE2-NEXT: movdqa %xmm0, %xmm4 1105; SSE2-NEXT: psrad $3, %xmm4 1106; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1107; SSE2-NEXT: psrad $2, %xmm0 1108; SSE2-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] 1109; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] 1110; SSE2-NEXT: movdqa %xmm1, %xmm2 1111; SSE2-NEXT: psrad $31, %xmm2 1112; SSE2-NEXT: movdqa %xmm2, %xmm3 1113; SSE2-NEXT: psrld $28, %xmm3 1114; SSE2-NEXT: movdqa %xmm2, %xmm4 1115; SSE2-NEXT: psrld $29, %xmm4 1116; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1117; SSE2-NEXT: psrld $30, %xmm2 1118; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] 1119; SSE2-NEXT: paddd %xmm1, %xmm2 1120; SSE2-NEXT: movdqa %xmm2, %xmm3 1121; SSE2-NEXT: psrad $4, %xmm3 1122; SSE2-NEXT: movdqa %xmm2, %xmm4 1123; SSE2-NEXT: psrad $3, %xmm4 1124; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] 1125; SSE2-NEXT: psrad $2, %xmm2 1126; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] 1127; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1128; SSE2-NEXT: movaps %xmm2, %xmm1 1129; SSE2-NEXT: retq 1130; 1131; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1132; SSE41: # %bb.0: 1133; SSE41-NEXT: movdqa %xmm0, %xmm2 1134; SSE41-NEXT: psrad $31, %xmm0 1135; SSE41-NEXT: movdqa %xmm0, %xmm3 1136; SSE41-NEXT: psrld $28, %xmm3 1137; SSE41-NEXT: movdqa %xmm0, %xmm4 1138; SSE41-NEXT: psrld $30, %xmm4 1139; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1140; SSE41-NEXT: psrld $29, %xmm0 1141; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] 1142; SSE41-NEXT: paddd %xmm2, %xmm0 1143; SSE41-NEXT: movdqa %xmm0, %xmm3 1144; SSE41-NEXT: psrad $4, %xmm3 1145; SSE41-NEXT: movdqa %xmm0, %xmm4 1146; SSE41-NEXT: psrad $2, %xmm4 1147; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1148; SSE41-NEXT: psrad $3, %xmm0 1149; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] 1150; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] 1151; SSE41-NEXT: movdqa %xmm1, %xmm2 1152; SSE41-NEXT: psrad $31, %xmm2 1153; SSE41-NEXT: movdqa %xmm2, %xmm3 1154; SSE41-NEXT: psrld $28, %xmm3 1155; SSE41-NEXT: 
movdqa %xmm2, %xmm4 1156; SSE41-NEXT: psrld $30, %xmm4 1157; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1158; SSE41-NEXT: psrld $29, %xmm2 1159; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1160; SSE41-NEXT: paddd %xmm1, %xmm2 1161; SSE41-NEXT: movdqa %xmm2, %xmm3 1162; SSE41-NEXT: psrad $4, %xmm3 1163; SSE41-NEXT: movdqa %xmm2, %xmm4 1164; SSE41-NEXT: psrad $2, %xmm4 1165; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1166; SSE41-NEXT: psrad $3, %xmm2 1167; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1168; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] 1169; SSE41-NEXT: movdqa %xmm2, %xmm1 1170; SSE41-NEXT: retq 1171; 1172; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1173; AVX1: # %bb.0: 1174; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1175; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 1176; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3 1177; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4 1178; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1179; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2 1180; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1181; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1182; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 1183; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 1184; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 1185; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 1186; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1187; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 1188; AVX1-NEXT: vpsrld $28, %xmm2, %xmm3 1189; AVX1-NEXT: vpsrld $30, %xmm2, %xmm4 1190; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1191; AVX1-NEXT: vpsrld $29, %xmm2, %xmm2 1192; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1193; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2 1194; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1195; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1196; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm4[0,1,2,3],xmm3[4,5,6,7] 1197; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2 1198; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1199; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1200; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1201; AVX1-NEXT: retq 1202; 1203; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1204; AVX2ORLATER: # %bb.0: 1205; AVX2ORLATER-NEXT: vpsrad $31, %ymm0, %ymm1 1206; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1207; AVX2ORLATER-NEXT: vpaddd %ymm1, %ymm0, %ymm1 1208; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1209; AVX2ORLATER-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1210; AVX2ORLATER-NEXT: retq 1211; 1212; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i32: 1213; XOP: # %bb.0: 1214; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1 1215; XOP-NEXT: vpsrad $31, %xmm1, %xmm2 1216; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967266,4294967267,4294967268> 1217; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2 1218; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1 1219; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,4294967294,4294967293,4294967292> 1220; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1 1221; XOP-NEXT: vpsrad $31, %xmm0, %xmm4 1222; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3 1223; XOP-NEXT: vpaddd %xmm3, %xmm0, %xmm3 1224; XOP-NEXT: vpshad %xmm2, %xmm3, %xmm2 1225; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1226; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] 1227; XOP-NEXT: retq 1228 %1 = sdiv <8 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16> 1229 ret <8 x i32> %1 1230} 1231 1232define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) { 1233; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1234; SSE2: # %bb.0: 1235; SSE2-NEXT: movdqa %xmm1, %xmm4 1236; SSE2-NEXT: movdqa %xmm0, %xmm1 1237; SSE2-NEXT: psrad $31, %xmm0 1238; SSE2-NEXT: movdqa %xmm0, %xmm5 1239; SSE2-NEXT: psrld $28, %xmm5 1240; 
SSE2-NEXT: movdqa %xmm0, %xmm6 1241; SSE2-NEXT: psrld $29, %xmm6 1242; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1243; SSE2-NEXT: psrld $30, %xmm0 1244; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] 1245; SSE2-NEXT: paddd %xmm1, %xmm0 1246; SSE2-NEXT: movdqa %xmm0, %xmm5 1247; SSE2-NEXT: psrad $4, %xmm5 1248; SSE2-NEXT: movdqa %xmm0, %xmm6 1249; SSE2-NEXT: psrad $3, %xmm6 1250; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1251; SSE2-NEXT: psrad $2, %xmm0 1252; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] 1253; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1254; SSE2-NEXT: movdqa %xmm4, %xmm1 1255; SSE2-NEXT: psrad $31, %xmm1 1256; SSE2-NEXT: movdqa %xmm1, %xmm5 1257; SSE2-NEXT: psrld $28, %xmm5 1258; SSE2-NEXT: movdqa %xmm1, %xmm6 1259; SSE2-NEXT: psrld $29, %xmm6 1260; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1261; SSE2-NEXT: psrld $30, %xmm1 1262; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] 1263; SSE2-NEXT: paddd %xmm4, %xmm1 1264; SSE2-NEXT: movdqa %xmm1, %xmm5 1265; SSE2-NEXT: psrad $4, %xmm5 1266; SSE2-NEXT: movdqa %xmm1, %xmm6 1267; SSE2-NEXT: psrad $3, %xmm6 1268; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1269; SSE2-NEXT: psrad $2, %xmm1 1270; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] 1271; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] 1272; SSE2-NEXT: movdqa %xmm2, %xmm4 1273; SSE2-NEXT: psrad $31, %xmm4 1274; SSE2-NEXT: movdqa %xmm4, %xmm5 1275; SSE2-NEXT: psrld $28, %xmm5 1276; SSE2-NEXT: movdqa %xmm4, %xmm6 1277; SSE2-NEXT: psrld $29, %xmm6 1278; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1279; SSE2-NEXT: psrld $30, %xmm4 1280; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] 1281; SSE2-NEXT: paddd %xmm2, %xmm4 1282; SSE2-NEXT: movdqa %xmm4, %xmm5 1283; SSE2-NEXT: psrad $4, %xmm5 1284; SSE2-NEXT: movdqa %xmm4, %xmm6 1285; SSE2-NEXT: psrad $3, %xmm6 1286; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] 1287; SSE2-NEXT: psrad $2, %xmm4 
1288; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] 1289; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] 1290; SSE2-NEXT: movdqa %xmm3, %xmm5 1291; SSE2-NEXT: psrad $31, %xmm5 1292; SSE2-NEXT: movdqa %xmm5, %xmm2 1293; SSE2-NEXT: psrld $28, %xmm2 1294; SSE2-NEXT: movdqa %xmm5, %xmm6 1295; SSE2-NEXT: psrld $29, %xmm6 1296; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] 1297; SSE2-NEXT: psrld $30, %xmm5 1298; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] 1299; SSE2-NEXT: paddd %xmm3, %xmm5 1300; SSE2-NEXT: movdqa %xmm5, %xmm2 1301; SSE2-NEXT: psrad $4, %xmm2 1302; SSE2-NEXT: movdqa %xmm5, %xmm6 1303; SSE2-NEXT: psrad $3, %xmm6 1304; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] 1305; SSE2-NEXT: psrad $2, %xmm5 1306; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] 1307; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] 1308; SSE2-NEXT: movaps %xmm4, %xmm2 1309; SSE2-NEXT: movaps %xmm5, %xmm3 1310; SSE2-NEXT: retq 1311; 1312; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1313; SSE41: # %bb.0: 1314; SSE41-NEXT: movdqa %xmm1, %xmm4 1315; SSE41-NEXT: movdqa %xmm0, %xmm1 1316; SSE41-NEXT: psrad $31, %xmm0 1317; SSE41-NEXT: movdqa %xmm0, %xmm5 1318; SSE41-NEXT: psrld $28, %xmm5 1319; SSE41-NEXT: movdqa %xmm0, %xmm6 1320; SSE41-NEXT: psrld $30, %xmm6 1321; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1322; SSE41-NEXT: psrld $29, %xmm0 1323; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] 1324; SSE41-NEXT: paddd %xmm1, %xmm0 1325; SSE41-NEXT: movdqa %xmm0, %xmm5 1326; SSE41-NEXT: psrad $4, %xmm5 1327; SSE41-NEXT: movdqa %xmm0, %xmm6 1328; SSE41-NEXT: psrad $2, %xmm6 1329; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1330; SSE41-NEXT: psrad $3, %xmm0 1331; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5],xmm6[6,7] 1332; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 1333; SSE41-NEXT: movdqa %xmm4, %xmm1 1334; SSE41-NEXT: psrad $31, 
%xmm1 1335; SSE41-NEXT: movdqa %xmm1, %xmm5 1336; SSE41-NEXT: psrld $28, %xmm5 1337; SSE41-NEXT: movdqa %xmm1, %xmm6 1338; SSE41-NEXT: psrld $30, %xmm6 1339; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1340; SSE41-NEXT: psrld $29, %xmm1 1341; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 1342; SSE41-NEXT: paddd %xmm4, %xmm1 1343; SSE41-NEXT: movdqa %xmm1, %xmm5 1344; SSE41-NEXT: psrad $4, %xmm5 1345; SSE41-NEXT: movdqa %xmm1, %xmm6 1346; SSE41-NEXT: psrad $2, %xmm6 1347; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1348; SSE41-NEXT: psrad $3, %xmm1 1349; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 1350; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] 1351; SSE41-NEXT: movdqa %xmm2, %xmm4 1352; SSE41-NEXT: psrad $31, %xmm4 1353; SSE41-NEXT: movdqa %xmm4, %xmm5 1354; SSE41-NEXT: psrld $28, %xmm5 1355; SSE41-NEXT: movdqa %xmm4, %xmm6 1356; SSE41-NEXT: psrld $30, %xmm6 1357; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1358; SSE41-NEXT: psrld $29, %xmm4 1359; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1360; SSE41-NEXT: paddd %xmm2, %xmm4 1361; SSE41-NEXT: movdqa %xmm4, %xmm5 1362; SSE41-NEXT: psrad $4, %xmm5 1363; SSE41-NEXT: movdqa %xmm4, %xmm6 1364; SSE41-NEXT: psrad $2, %xmm6 1365; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm5[4,5,6,7] 1366; SSE41-NEXT: psrad $3, %xmm4 1367; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 1368; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] 1369; SSE41-NEXT: movdqa %xmm3, %xmm5 1370; SSE41-NEXT: psrad $31, %xmm5 1371; SSE41-NEXT: movdqa %xmm5, %xmm2 1372; SSE41-NEXT: psrld $28, %xmm2 1373; SSE41-NEXT: movdqa %xmm5, %xmm6 1374; SSE41-NEXT: psrld $30, %xmm6 1375; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] 1376; SSE41-NEXT: psrld $29, %xmm5 1377; SSE41-NEXT: pblendw {{.*#+}} xmm5 = 
xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1378; SSE41-NEXT: paddd %xmm3, %xmm5 1379; SSE41-NEXT: movdqa %xmm5, %xmm2 1380; SSE41-NEXT: psrad $4, %xmm2 1381; SSE41-NEXT: movdqa %xmm5, %xmm6 1382; SSE41-NEXT: psrad $2, %xmm6 1383; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] 1384; SSE41-NEXT: psrad $3, %xmm5 1385; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1386; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] 1387; SSE41-NEXT: movdqa %xmm4, %xmm2 1388; SSE41-NEXT: movdqa %xmm5, %xmm3 1389; SSE41-NEXT: retq 1390; 1391; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1392; AVX1: # %bb.0: 1393; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1394; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 1395; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1396; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1397; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1398; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1399; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1400; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1401; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1402; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1403; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1404; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2 1405; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1406; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 1407; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1408; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1409; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1410; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1411; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1412; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm3 1413; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4 1414; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5 1415; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1416; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3 1417; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1418; AVX1-NEXT: 
vinsertf128 $1, %xmm2, %ymm3, %ymm2 1419; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1420; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1421; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3 1422; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1423; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1424; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1425; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1426; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1427; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1428; AVX1-NEXT: vpsrad $4, %xmm2, %xmm3 1429; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1430; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1431; AVX1-NEXT: vpsrad $3, %xmm2, %xmm2 1432; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1433; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 1434; AVX1-NEXT: vpsrld $28, %xmm3, %xmm4 1435; AVX1-NEXT: vpsrld $30, %xmm3, %xmm5 1436; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1437; AVX1-NEXT: vpsrld $29, %xmm3, %xmm3 1438; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1439; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3 1440; AVX1-NEXT: vpsrad $4, %xmm3, %xmm4 1441; AVX1-NEXT: vpsrad $2, %xmm3, %xmm5 1442; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 1443; AVX1-NEXT: vpsrad $3, %xmm3, %xmm3 1444; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] 1445; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1446; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1447; AVX1-NEXT: retq 1448; 1449; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1450; AVX2: # %bb.0: 1451; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2 1452; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28] 1453; AVX2-NEXT: # ymm3 = mem[0,1,0,1] 1454; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 1455; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 1456; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4] 1457; 
AVX2-NEXT: # ymm4 = mem[0,1,0,1] 1458; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2 1459; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1460; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2 1461; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 1462; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm2 1463; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2 1464; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1465; AVX2-NEXT: retq 1466; 1467; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1468; AVX512F: # %bb.0: 1469; AVX512F-NEXT: vpsrad $31, %zmm0, %zmm1 1470; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1471; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm1 1472; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1473; AVX512F-NEXT: movw $4369, %ax # imm = 0x1111 1474; AVX512F-NEXT: kmovw %eax, %k1 1475; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 1476; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 1477; AVX512F-NEXT: retq 1478; 1479; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1480; AVX512BW: # %bb.0: 1481; AVX512BW-NEXT: vpsrad $31, %zmm0, %zmm1 1482; AVX512BW-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1483; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm1 1484; AVX512BW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1485; AVX512BW-NEXT: movw $4369, %ax # imm = 0x1111 1486; AVX512BW-NEXT: kmovd %eax, %k1 1487; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} 1488; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 1489; AVX512BW-NEXT: retq 1490; 1491; XOP-LABEL: combine_vec_sdiv_by_pow2b_v16i32: 1492; XOP: # %bb.0: 1493; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 1494; XOP-NEXT: vpsrad $31, %xmm2, %xmm3 1495; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4294967266,4294967267,4294967268> 1496; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3 1497; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1498; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967294,4294967293,4294967292> 1499; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 1500; XOP-NEXT: vpsrad $31, %xmm0, %xmm5 1501; 
XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 1502; XOP-NEXT: vpaddd %xmm5, %xmm0, %xmm5 1503; XOP-NEXT: vpshad %xmm3, %xmm5, %xmm5 1504; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 1505; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] 1506; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 1507; XOP-NEXT: vpsrad $31, %xmm2, %xmm5 1508; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5 1509; XOP-NEXT: vpaddd %xmm5, %xmm2, %xmm2 1510; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2 1511; XOP-NEXT: vpsrad $31, %xmm1, %xmm5 1512; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm4 1513; XOP-NEXT: vpaddd %xmm4, %xmm1, %xmm4 1514; XOP-NEXT: vpshad %xmm3, %xmm4, %xmm3 1515; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1516; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] 1517; XOP-NEXT: retq 1518 %1 = sdiv <16 x i32> %x, <i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16, i32 1, i32 4, i32 8, i32 16> 1519 ret <16 x i32> %1 1520} 1521 1522define <2 x i64> @combine_vec_sdiv_by_pow2b_v2i64(<2 x i64> %x) { 1523; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1524; SSE2: # %bb.0: 1525; SSE2-NEXT: movdqa %xmm0, %xmm1 1526; SSE2-NEXT: psrad $31, %xmm1 1527; SSE2-NEXT: psrlq $62, %xmm1 1528; SSE2-NEXT: paddq %xmm0, %xmm1 1529; SSE2-NEXT: movdqa %xmm1, %xmm2 1530; SSE2-NEXT: psrad $2, %xmm2 1531; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] 1532; SSE2-NEXT: psrlq $2, %xmm1 1533; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1534; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1535; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1536; SSE2-NEXT: retq 1537; 1538; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1539; SSE41: # %bb.0: 1540; SSE41-NEXT: movdqa %xmm0, %xmm1 1541; SSE41-NEXT: psrad $31, %xmm1 1542; SSE41-NEXT: psrlq $62, %xmm1 1543; SSE41-NEXT: paddq %xmm0, %xmm1 1544; SSE41-NEXT: movdqa %xmm1, %xmm2 1545; SSE41-NEXT: psrad $2, %xmm2 1546; SSE41-NEXT: psrlq $2, %xmm1 1547; SSE41-NEXT: pblendw {{.*#+}} xmm1 
= xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1548; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1549; SSE41-NEXT: movdqa %xmm1, %xmm0 1550; SSE41-NEXT: retq 1551; 1552; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1553; AVX1: # %bb.0: 1554; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1555; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 1556; AVX1-NEXT: vpsrlq $62, %xmm1, %xmm1 1557; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1558; AVX1-NEXT: vpsrad $2, %xmm1, %xmm2 1559; AVX1-NEXT: vpsrlq $2, %xmm1, %xmm1 1560; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1561; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1562; AVX1-NEXT: retq 1563; 1564; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1565; AVX2: # %bb.0: 1566; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1567; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 1568; AVX2-NEXT: vpsrlq $62, %xmm1, %xmm1 1569; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1570; AVX2-NEXT: vpsrad $2, %xmm1, %xmm2 1571; AVX2-NEXT: vpsrlq $2, %xmm1, %xmm1 1572; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] 1573; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1574; AVX2-NEXT: retq 1575; 1576; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1577; AVX512F: # %bb.0: 1578; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1579; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1 1580; AVX512F-NEXT: vpsrlq $62, %xmm1, %xmm1 1581; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1582; AVX512F-NEXT: vpsraq $2, %zmm1, %zmm1 1583; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1584; AVX512F-NEXT: vzeroupper 1585; AVX512F-NEXT: retq 1586; 1587; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1588; AVX512BW: # %bb.0: 1589; AVX512BW-NEXT: vpsraq $63, %xmm0, %xmm1 1590; AVX512BW-NEXT: vpsrlq $62, %xmm1, %xmm1 1591; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1592; AVX512BW-NEXT: vpsraq $2, %xmm1, %xmm1 1593; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 1594; AVX512BW-NEXT: retq 1595; 1596; 
XOP-LABEL: combine_vec_sdiv_by_pow2b_v2i64: 1597; XOP: # %bb.0: 1598; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1599; XOP-NEXT: vpsrlq $62, %xmm1, %xmm1 1600; XOP-NEXT: vpaddq %xmm1, %xmm0, %xmm1 1601; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1602; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1603; XOP-NEXT: retq 1604 %1 = sdiv <2 x i64> %x, <i64 1, i64 4> 1605 ret <2 x i64> %1 1606} 1607 1608define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) { 1609; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1610; SSE2: # %bb.0: 1611; SSE2-NEXT: movdqa %xmm0, %xmm2 1612; SSE2-NEXT: psrad $31, %xmm2 1613; SSE2-NEXT: psrlq $62, %xmm2 1614; SSE2-NEXT: paddq %xmm0, %xmm2 1615; SSE2-NEXT: movdqa %xmm2, %xmm3 1616; SSE2-NEXT: psrad $2, %xmm3 1617; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] 1618; SSE2-NEXT: psrlq $2, %xmm2 1619; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1620; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 1621; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] 1622; SSE2-NEXT: movdqa %xmm1, %xmm2 1623; SSE2-NEXT: psrad $31, %xmm2 1624; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1625; SSE2-NEXT: movdqa %xmm2, %xmm3 1626; SSE2-NEXT: psrlq $61, %xmm3 1627; SSE2-NEXT: psrlq $60, %xmm2 1628; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1] 1629; SSE2-NEXT: paddq %xmm1, %xmm2 1630; SSE2-NEXT: movdqa %xmm2, %xmm1 1631; SSE2-NEXT: psrlq $3, %xmm1 1632; SSE2-NEXT: psrlq $4, %xmm2 1633; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1634; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] 1635; SSE2-NEXT: xorpd %xmm1, %xmm2 1636; SSE2-NEXT: psubq %xmm1, %xmm2 1637; SSE2-NEXT: movdqa %xmm2, %xmm1 1638; SSE2-NEXT: retq 1639; 1640; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1641; SSE41: # %bb.0: 1642; SSE41-NEXT: movdqa %xmm0, %xmm2 1643; SSE41-NEXT: psrad $31, %xmm0 1644; SSE41-NEXT: psrlq $62, %xmm0 1645; SSE41-NEXT: paddq %xmm2, %xmm0 
1646; SSE41-NEXT: movdqa %xmm0, %xmm3 1647; SSE41-NEXT: psrad $2, %xmm3 1648; SSE41-NEXT: psrlq $2, %xmm0 1649; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 1650; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] 1651; SSE41-NEXT: movdqa %xmm1, %xmm2 1652; SSE41-NEXT: psrad $31, %xmm2 1653; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1654; SSE41-NEXT: movdqa %xmm2, %xmm3 1655; SSE41-NEXT: psrlq $60, %xmm3 1656; SSE41-NEXT: psrlq $61, %xmm2 1657; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] 1658; SSE41-NEXT: paddq %xmm1, %xmm2 1659; SSE41-NEXT: movdqa %xmm2, %xmm1 1660; SSE41-NEXT: psrlq $4, %xmm1 1661; SSE41-NEXT: psrlq $3, %xmm2 1662; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1663; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] 1664; SSE41-NEXT: pxor %xmm1, %xmm2 1665; SSE41-NEXT: psubq %xmm1, %xmm2 1666; SSE41-NEXT: movdqa %xmm2, %xmm1 1667; SSE41-NEXT: retq 1668; 1669; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1670; AVX1: # %bb.0: 1671; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1672; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1673; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 1674; AVX1-NEXT: vpsrlq $60, %xmm3, %xmm4 1675; AVX1-NEXT: vpsrlq $61, %xmm3, %xmm3 1676; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 1677; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 1678; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3 1679; AVX1-NEXT: vpsrlq $3, %xmm1, %xmm1 1680; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 1681; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1152921504606846976,576460752303423488] 1682; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 1683; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 1684; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2 1685; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 1686; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm2 1687; AVX1-NEXT: vpsrad $2, %xmm2, %xmm3 1688; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 1689; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1690; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1691; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1692; AVX1-NEXT: retq 1693; 1694; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1695; AVX2: # %bb.0: 1696; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1697; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 1698; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1699; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1700; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1701; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2305843009213693952,1152921504606846976,576460752303423488> 1702; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1 1703; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1 1704; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1705; AVX2-NEXT: retq 1706; 1707; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1708; AVX512F: # %bb.0: 1709; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1710; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,2,3,4> 1711; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2 1712; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 1713; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2 1714; AVX512F-NEXT: vpsravq %zmm1, %zmm2, %zmm1 1715; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1716; AVX512F-NEXT: retq 1717; 1718; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1719; AVX512BW: # %bb.0: 1720; AVX512BW-NEXT: vpsraq $63, %ymm0, %ymm1 1721; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1722; AVX512BW-NEXT: vpaddq %ymm1, %ymm0, %ymm1 1723; AVX512BW-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 1724; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1725; AVX512BW-NEXT: retq 1726; 1727; XOP-LABEL: combine_vec_sdiv_by_pow2b_v4i64: 1728; XOP: # %bb.0: 1729; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551553,18446744073709551553] 1730; XOP-NEXT: vpshaq %xmm1, %xmm0, %xmm2 1731; XOP-NEXT: vpsrlq $62, %xmm2, 
%xmm2 1732; XOP-NEXT: vpaddq %xmm2, %xmm0, %xmm2 1733; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 1734; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 1735; XOP-NEXT: vpshaq %xmm1, %xmm3, %xmm1 1736; XOP-NEXT: vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1737; XOP-NEXT: vpaddq %xmm1, %xmm3, %xmm1 1738; XOP-NEXT: vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1739; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1740; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] 1741; XOP-NEXT: retq 1742 %1 = sdiv <4 x i64> %x, <i64 1, i64 4, i64 8, i64 16> 1743 ret <4 x i64> %1 1744} 1745 1746define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) { 1747; SSE2-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1748; SSE2: # %bb.0: 1749; SSE2-NEXT: movdqa %xmm0, %xmm4 1750; SSE2-NEXT: psrad $31, %xmm4 1751; SSE2-NEXT: psrlq $62, %xmm4 1752; SSE2-NEXT: paddq %xmm0, %xmm4 1753; SSE2-NEXT: movdqa %xmm4, %xmm5 1754; SSE2-NEXT: psrad $2, %xmm5 1755; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 1756; SSE2-NEXT: psrlq $2, %xmm4 1757; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1758; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1759; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] 1760; SSE2-NEXT: movdqa %xmm2, %xmm4 1761; SSE2-NEXT: psrad $31, %xmm4 1762; SSE2-NEXT: psrlq $62, %xmm4 1763; SSE2-NEXT: paddq %xmm2, %xmm4 1764; SSE2-NEXT: movdqa %xmm4, %xmm5 1765; SSE2-NEXT: psrad $2, %xmm5 1766; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,3,2,3] 1767; SSE2-NEXT: psrlq $2, %xmm4 1768; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1769; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1770; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] 1771; SSE2-NEXT: movdqa %xmm1, %xmm4 1772; SSE2-NEXT: psrad $31, %xmm4 1773; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1774; SSE2-NEXT: movdqa %xmm4, %xmm5 1775; SSE2-NEXT: psrlq $61, %xmm5 1776; SSE2-NEXT: psrlq $60, %xmm4 1777; SSE2-NEXT: movsd {{.*#+}} 
xmm4 = xmm5[0],xmm4[1] 1778; SSE2-NEXT: paddq %xmm1, %xmm4 1779; SSE2-NEXT: movdqa %xmm4, %xmm1 1780; SSE2-NEXT: psrlq $3, %xmm1 1781; SSE2-NEXT: psrlq $4, %xmm4 1782; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] 1783; SSE2-NEXT: movapd {{.*#+}} xmm1 = [1152921504606846976,576460752303423488] 1784; SSE2-NEXT: xorpd %xmm1, %xmm4 1785; SSE2-NEXT: psubq %xmm1, %xmm4 1786; SSE2-NEXT: movdqa %xmm3, %xmm5 1787; SSE2-NEXT: psrad $31, %xmm5 1788; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1789; SSE2-NEXT: movdqa %xmm5, %xmm6 1790; SSE2-NEXT: psrlq $61, %xmm6 1791; SSE2-NEXT: psrlq $60, %xmm5 1792; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm6[0],xmm5[1] 1793; SSE2-NEXT: paddq %xmm3, %xmm5 1794; SSE2-NEXT: movdqa %xmm5, %xmm3 1795; SSE2-NEXT: psrlq $3, %xmm3 1796; SSE2-NEXT: psrlq $4, %xmm5 1797; SSE2-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1] 1798; SSE2-NEXT: xorpd %xmm1, %xmm5 1799; SSE2-NEXT: psubq %xmm1, %xmm5 1800; SSE2-NEXT: movdqa %xmm4, %xmm1 1801; SSE2-NEXT: movdqa %xmm5, %xmm3 1802; SSE2-NEXT: retq 1803; 1804; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1805; SSE41: # %bb.0: 1806; SSE41-NEXT: movdqa %xmm2, %xmm5 1807; SSE41-NEXT: movdqa %xmm1, %xmm4 1808; SSE41-NEXT: movdqa %xmm0, %xmm1 1809; SSE41-NEXT: psrad $31, %xmm0 1810; SSE41-NEXT: psrlq $62, %xmm0 1811; SSE41-NEXT: paddq %xmm1, %xmm0 1812; SSE41-NEXT: movdqa %xmm0, %xmm2 1813; SSE41-NEXT: psrad $2, %xmm2 1814; SSE41-NEXT: psrlq $2, %xmm0 1815; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1816; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1817; SSE41-NEXT: movdqa %xmm5, %xmm2 1818; SSE41-NEXT: psrad $31, %xmm2 1819; SSE41-NEXT: psrlq $62, %xmm2 1820; SSE41-NEXT: paddq %xmm5, %xmm2 1821; SSE41-NEXT: movdqa %xmm2, %xmm1 1822; SSE41-NEXT: psrad $2, %xmm1 1823; SSE41-NEXT: psrlq $2, %xmm2 1824; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] 1825; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] 1826; SSE41-NEXT: 
movdqa %xmm4, %xmm1 1827; SSE41-NEXT: psrad $31, %xmm1 1828; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1829; SSE41-NEXT: movdqa %xmm1, %xmm5 1830; SSE41-NEXT: psrlq $60, %xmm5 1831; SSE41-NEXT: psrlq $61, %xmm1 1832; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7] 1833; SSE41-NEXT: paddq %xmm4, %xmm1 1834; SSE41-NEXT: movdqa %xmm1, %xmm4 1835; SSE41-NEXT: psrlq $4, %xmm4 1836; SSE41-NEXT: psrlq $3, %xmm1 1837; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] 1838; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [1152921504606846976,576460752303423488] 1839; SSE41-NEXT: pxor %xmm5, %xmm1 1840; SSE41-NEXT: psubq %xmm5, %xmm1 1841; SSE41-NEXT: movdqa %xmm3, %xmm4 1842; SSE41-NEXT: psrad $31, %xmm4 1843; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1844; SSE41-NEXT: movdqa %xmm4, %xmm6 1845; SSE41-NEXT: psrlq $60, %xmm6 1846; SSE41-NEXT: psrlq $61, %xmm4 1847; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] 1848; SSE41-NEXT: paddq %xmm3, %xmm4 1849; SSE41-NEXT: movdqa %xmm4, %xmm3 1850; SSE41-NEXT: psrlq $4, %xmm3 1851; SSE41-NEXT: psrlq $3, %xmm4 1852; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] 1853; SSE41-NEXT: pxor %xmm5, %xmm4 1854; SSE41-NEXT: psubq %xmm5, %xmm4 1855; SSE41-NEXT: movdqa %xmm4, %xmm3 1856; SSE41-NEXT: retq 1857; 1858; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1859; AVX1: # %bb.0: 1860; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1861; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1862; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 1863; AVX1-NEXT: vpsrlq $60, %xmm4, %xmm5 1864; AVX1-NEXT: vpsrlq $61, %xmm4, %xmm4 1865; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4,5,6,7] 1866; AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3 1867; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm4 1868; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3 1869; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 1870; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1152921504606846976,576460752303423488] 1871; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 
1872; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 1873; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm5 1874; AVX1-NEXT: vpsrlq $62, %xmm5, %xmm5 1875; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm5 1876; AVX1-NEXT: vpsrad $2, %xmm5, %xmm6 1877; AVX1-NEXT: vpsrlq $2, %xmm5, %xmm5 1878; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1879; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 1880; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] 1881; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1882; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm5 1883; AVX1-NEXT: vpsrlq $60, %xmm5, %xmm6 1884; AVX1-NEXT: vpsrlq $61, %xmm5, %xmm5 1885; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4,5,6,7] 1886; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3 1887; AVX1-NEXT: vpsrlq $4, %xmm3, %xmm5 1888; AVX1-NEXT: vpsrlq $3, %xmm3, %xmm3 1889; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5,6,7] 1890; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 1891; AVX1-NEXT: vpsubq %xmm4, %xmm3, %xmm3 1892; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2 1893; AVX1-NEXT: vpsrlq $62, %xmm2, %xmm2 1894; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm2 1895; AVX1-NEXT: vpsrad $2, %xmm2, %xmm4 1896; AVX1-NEXT: vpsrlq $2, %xmm2, %xmm2 1897; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] 1898; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1899; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1900; AVX1-NEXT: retq 1901; 1902; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1903; AVX2: # %bb.0: 1904; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1905; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 1906; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,62,61,60> 1907; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3 1908; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3 1909; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <u,2,3,4> 1910; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3 1911; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <u,2305843009213693952,1152921504606846976,576460752303423488> 1912; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3 1913; 
AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3 1914; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] 1915; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2 1916; AVX2-NEXT: vpsrlvq %ymm4, %ymm2, %ymm2 1917; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm2 1918; AVX2-NEXT: vpsrlvq %ymm5, %ymm2, %ymm2 1919; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 1920; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2 1921; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1922; AVX2-NEXT: retq 1923; 1924; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1925; AVX512F: # %bb.0: 1926; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1 1927; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1928; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 1929; AVX512F-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1930; AVX512F-NEXT: movb $17, %al 1931; AVX512F-NEXT: kmovw %eax, %k1 1932; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 1933; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 1934; AVX512F-NEXT: retq 1935; 1936; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1937; AVX512BW: # %bb.0: 1938; AVX512BW-NEXT: vpsraq $63, %zmm0, %zmm1 1939; AVX512BW-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1940; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 1941; AVX512BW-NEXT: vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 1942; AVX512BW-NEXT: movb $17, %al 1943; AVX512BW-NEXT: kmovd %eax, %k1 1944; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 1945; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 1946; AVX512BW-NEXT: retq 1947; 1948; XOP-LABEL: combine_vec_sdiv_by_pow2b_v8i64: 1949; XOP: # %bb.0: 1950; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2 1951; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551553,18446744073709551553] 1952; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm4 1953; XOP-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551555,18446744073709551556] 1954; XOP-NEXT: vpshlq %xmm5, %xmm4, %xmm4 1955; XOP-NEXT: vpaddq %xmm4, %xmm2, %xmm2 1956; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709551613,18446744073709551612] 
1957; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2 1958; XOP-NEXT: vpshaq %xmm3, %xmm0, %xmm6 1959; XOP-NEXT: vpsrlq $62, %xmm6, %xmm6 1960; XOP-NEXT: vpaddq %xmm6, %xmm0, %xmm6 1961; XOP-NEXT: vmovdqa {{.*#+}} xmm7 = <u,18446744073709551614> 1962; XOP-NEXT: vpshaq %xmm7, %xmm6, %xmm6 1963; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 1964; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] 1965; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 1966; XOP-NEXT: vpshaq %xmm3, %xmm2, %xmm6 1967; XOP-NEXT: vpshlq %xmm5, %xmm6, %xmm5 1968; XOP-NEXT: vpaddq %xmm5, %xmm2, %xmm2 1969; XOP-NEXT: vpshaq %xmm4, %xmm2, %xmm2 1970; XOP-NEXT: vpshaq %xmm3, %xmm1, %xmm3 1971; XOP-NEXT: vpsrlq $62, %xmm3, %xmm3 1972; XOP-NEXT: vpaddq %xmm3, %xmm1, %xmm3 1973; XOP-NEXT: vpshaq %xmm7, %xmm3, %xmm3 1974; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 1975; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1976; XOP-NEXT: retq 1977 %1 = sdiv <8 x i64> %x, <i64 1, i64 4, i64 8, i64 16, i64 1, i64 4, i64 8, i64 16> 1978 ret <8 x i64> %1 1979} 1980 1981define <4 x i32> @combine_vec_sdiv_by_pow2b_PosAndNeg(<4 x i32> %x) { 1982; SSE2-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 1983; SSE2: # %bb.0: 1984; SSE2-NEXT: movdqa %xmm0, %xmm1 1985; SSE2-NEXT: psrad $31, %xmm0 1986; SSE2-NEXT: movdqa %xmm0, %xmm2 1987; SSE2-NEXT: psrld $28, %xmm2 1988; SSE2-NEXT: movdqa %xmm0, %xmm3 1989; SSE2-NEXT: psrld $29, %xmm3 1990; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1991; SSE2-NEXT: psrld $30, %xmm0 1992; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3] 1993; SSE2-NEXT: paddd %xmm1, %xmm0 1994; SSE2-NEXT: movdqa %xmm0, %xmm2 1995; SSE2-NEXT: psrad $4, %xmm2 1996; SSE2-NEXT: movdqa %xmm0, %xmm3 1997; SSE2-NEXT: psrad $3, %xmm3 1998; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] 1999; SSE2-NEXT: psrad $2, %xmm0 2000; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3] 2001; SSE2-NEXT: movaps %xmm0, %xmm2 2002; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm0[2,3] 2003; 
SSE2-NEXT: pxor %xmm3, %xmm3 2004; SSE2-NEXT: psubd %xmm2, %xmm3 2005; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2006; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,2,3] 2007; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2008; SSE2-NEXT: retq 2009; 2010; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2011; SSE41: # %bb.0: 2012; SSE41-NEXT: movdqa %xmm0, %xmm1 2013; SSE41-NEXT: psrad $31, %xmm1 2014; SSE41-NEXT: movdqa %xmm1, %xmm2 2015; SSE41-NEXT: psrld $28, %xmm2 2016; SSE41-NEXT: movdqa %xmm1, %xmm3 2017; SSE41-NEXT: psrld $30, %xmm3 2018; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2019; SSE41-NEXT: psrld $29, %xmm1 2020; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 2021; SSE41-NEXT: paddd %xmm0, %xmm1 2022; SSE41-NEXT: movdqa %xmm1, %xmm2 2023; SSE41-NEXT: psrad $4, %xmm2 2024; SSE41-NEXT: movdqa %xmm1, %xmm3 2025; SSE41-NEXT: psrad $2, %xmm3 2026; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2027; SSE41-NEXT: pxor %xmm2, %xmm2 2028; SSE41-NEXT: psubd %xmm3, %xmm2 2029; SSE41-NEXT: psrad $3, %xmm1 2030; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2031; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 2032; SSE41-NEXT: movdqa %xmm1, %xmm0 2033; SSE41-NEXT: retq 2034; 2035; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2036; AVX1: # %bb.0: 2037; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 2038; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2 2039; AVX1-NEXT: vpsrld $30, %xmm1, %xmm3 2040; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2041; AVX1-NEXT: vpsrld $29, %xmm1, %xmm1 2042; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 2043; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2044; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 2045; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 2046; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 2047; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 2048; AVX1-NEXT: vpsubd 
%xmm2, %xmm3, %xmm2 2049; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 2050; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2051; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 2052; AVX1-NEXT: retq 2053; 2054; AVX2ORLATER-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2055; AVX2ORLATER: # %bb.0: 2056; AVX2ORLATER-NEXT: vpsrad $31, %xmm0, %xmm1 2057; AVX2ORLATER-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2058; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2059; AVX2ORLATER-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2060; AVX2ORLATER-NEXT: vpxor %xmm2, %xmm2, %xmm2 2061; AVX2ORLATER-NEXT: vpsubd %xmm1, %xmm2, %xmm2 2062; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2063; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 2064; AVX2ORLATER-NEXT: retq 2065; 2066; XOP-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: 2067; XOP: # %bb.0: 2068; XOP-NEXT: vpsrad $31, %xmm0, %xmm1 2069; XOP-NEXT: vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2070; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2071; XOP-NEXT: vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2072; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 2073; XOP-NEXT: vpsubd %xmm1, %xmm2, %xmm2 2074; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2075; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 2076; XOP-NEXT: retq 2077 %1 = sdiv <4 x i32> %x, <i32 1, i32 -4, i32 8, i32 -16> 2078 ret <4 x i32> %1 2079} 2080 2081define <4 x i32> @combine_vec_sdiv_by_pow2b_undef1(<4 x i32> %x) { 2082; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef1: 2083; CHECK: # %bb.0: 2084; CHECK-NEXT: retq 2085 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 -16> 2086 ret <4 x i32> %1 2087} 2088 2089define <4 x i32> @combine_vec_sdiv_by_pow2b_undef2(<4 x i32> %x) { 2090; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef2: 2091; CHECK: # %bb.0: 2092; CHECK-NEXT: retq 2093 %1 = sdiv <4 x i32> %x, <i32 undef, i32 
4, i32 undef, i32 16> 2094 ret <4 x i32> %1 2095} 2096 2097define <4 x i32> @combine_vec_sdiv_by_pow2b_undef3(<4 x i32> %x) { 2098; CHECK-LABEL: combine_vec_sdiv_by_pow2b_undef3: 2099; CHECK: # %bb.0: 2100; CHECK-NEXT: retq 2101 %1 = sdiv <4 x i32> %x, <i32 undef, i32 -4, i32 undef, i32 16> 2102 ret <4 x i32> %1 2103} 2104 2105; PR37119 2106define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) { 2107; SSE-LABEL: non_splat_minus_one_divisor_0: 2108; SSE: # %bb.0: 2109; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2110; SSE-NEXT: pxor %xmm1, %xmm0 2111; SSE-NEXT: psubb %xmm1, %xmm0 2112; SSE-NEXT: retq 2113; 2114; AVX1-LABEL: non_splat_minus_one_divisor_0: 2115; AVX1: # %bb.0: 2116; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2117; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2118; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2119; AVX1-NEXT: retq 2120; 2121; AVX2-LABEL: non_splat_minus_one_divisor_0: 2122; AVX2: # %bb.0: 2123; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2124; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 2125; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2126; AVX2-NEXT: retq 2127; 2128; AVX512F-LABEL: non_splat_minus_one_divisor_0: 2129; AVX512F: # %bb.0: 2130; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2131; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 2132; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2133; AVX512F-NEXT: retq 2134; 2135; AVX512BW-LABEL: non_splat_minus_one_divisor_0: 2136; AVX512BW: # %bb.0: 2137; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2138; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB 2139; AVX512BW-NEXT: kmovd %eax, %k1 2140; AVX512BW-NEXT: vpsubb %xmm0, %xmm1, %xmm0 {%k1} 2141; AVX512BW-NEXT: retq 2142; 2143; XOP-LABEL: non_splat_minus_one_divisor_0: 2144; XOP: # %bb.0: 2145; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,0,0,0] 2146; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 2147; 
XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2148; XOP-NEXT: retq 2149 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2150 ret <16 x i8> %div 2151} 2152 2153define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) { 2154; SSE2-LABEL: non_splat_minus_one_divisor_1: 2155; SSE2: # %bb.0: 2156; SSE2-NEXT: pxor %xmm1, %xmm1 2157; SSE2-NEXT: pxor %xmm2, %xmm2 2158; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 2159; SSE2-NEXT: movdqa %xmm2, %xmm3 2160; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] 2161; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2162; SSE2-NEXT: psrlw $8, %xmm3 2163; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 2164; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2165; SSE2-NEXT: psrlw $8, %xmm2 2166; SSE2-NEXT: packuswb %xmm3, %xmm2 2167; SSE2-NEXT: paddb %xmm0, %xmm2 2168; SSE2-NEXT: movdqa %xmm2, %xmm1 2169; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 2170; SSE2-NEXT: psraw $8, %xmm1 2171; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2172; SSE2-NEXT: psrlw $8, %xmm1 2173; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2174; SSE2-NEXT: psraw $8, %xmm2 2175; SSE2-NEXT: psllw $7, %xmm2 2176; SSE2-NEXT: psrlw $8, %xmm2 2177; SSE2-NEXT: packuswb %xmm1, %xmm2 2178; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2179; SSE2-NEXT: pand %xmm1, %xmm2 2180; SSE2-NEXT: pandn %xmm0, %xmm1 2181; SSE2-NEXT: por %xmm2, %xmm1 2182; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2183; 
SSE2-NEXT: pxor %xmm0, %xmm1 2184; SSE2-NEXT: psubb %xmm0, %xmm1 2185; SSE2-NEXT: movdqa %xmm1, %xmm0 2186; SSE2-NEXT: retq 2187; 2188; SSE41-LABEL: non_splat_minus_one_divisor_1: 2189; SSE41: # %bb.0: 2190; SSE41-NEXT: movdqa %xmm0, %xmm1 2191; SSE41-NEXT: pxor %xmm0, %xmm0 2192; SSE41-NEXT: pxor %xmm3, %xmm3 2193; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 2194; SSE41-NEXT: pxor %xmm4, %xmm4 2195; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 2196; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2197; SSE41-NEXT: psllw $1, %xmm2 2198; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7] 2199; SSE41-NEXT: psrlw $8, %xmm2 2200; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 2201; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2202; SSE41-NEXT: psrlw $8, %xmm3 2203; SSE41-NEXT: packuswb %xmm3, %xmm2 2204; SSE41-NEXT: paddb %xmm1, %xmm2 2205; SSE41-NEXT: movdqa %xmm2, %xmm0 2206; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 2207; SSE41-NEXT: psraw $8, %xmm0 2208; SSE41-NEXT: movdqa %xmm0, %xmm3 2209; SSE41-NEXT: psllw $1, %xmm3 2210; SSE41-NEXT: psllw $7, %xmm0 2211; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7] 2212; SSE41-NEXT: psrlw $8, %xmm0 2213; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2214; SSE41-NEXT: psraw $8, %xmm2 2215; SSE41-NEXT: psllw $7, %xmm2 2216; SSE41-NEXT: psrlw $8, %xmm2 2217; SSE41-NEXT: packuswb %xmm0, %xmm2 2218; SSE41-NEXT: movaps {{.*#+}} xmm0 = 
[0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2219; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 2220; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2221; SSE41-NEXT: pxor %xmm0, %xmm1 2222; SSE41-NEXT: psubb %xmm0, %xmm1 2223; SSE41-NEXT: movdqa %xmm1, %xmm0 2224; SSE41-NEXT: retq 2225; 2226; AVX1-LABEL: non_splat_minus_one_divisor_1: 2227; AVX1: # %bb.0: 2228; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2229; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 2230; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 2231; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2232; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4 2233; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7] 2234; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 2235; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2236; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2237; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2238; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 2239; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2240; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2241; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 2242; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3 2243; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2 2244; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7] 2245; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2246; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 2247; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 2248; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 2249; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 2250; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2251; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2252; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2253; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2254; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2255; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2256; AVX1-NEXT: retq 2257; 2258; AVX2-LABEL: non_splat_minus_one_divisor_1: 2259; AVX2: # %bb.0: 2260; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2261; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 2262; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2263; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2264; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2265; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2266; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2267; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2268; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1 2269; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2270; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2271; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 2272; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2273; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2274; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2275; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2276; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 2277; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2278; AVX2-NEXT: vzeroupper 2279; AVX2-NEXT: retq 2280; 2281; AVX512F-LABEL: non_splat_minus_one_divisor_1: 2282; AVX512F: # %bb.0: 2283; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2284; AVX512F-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 2285; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 2286; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2287; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 2288; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2289; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 2290; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2291; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 2292; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2293; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2294; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2295; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 2296; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2297; AVX512F-NEXT: vzeroupper 2298; AVX512F-NEXT: retq 2299; 2300; AVX512BW-LABEL: non_splat_minus_one_divisor_1: 2301; AVX512BW: # %bb.0: 2302; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 2303; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2 2304; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 2305; AVX512BW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2306; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2 2307; AVX512BW-NEXT: vpaddb %xmm2, %xmm0, %xmm2 2308; AVX512BW-NEXT: vpmovsxbw %xmm2, %ymm2 2309; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2310; AVX512BW-NEXT: vpmovwb %ymm2, %xmm2 2311; AVX512BW-NEXT: movw $443, %ax # imm = 0x1BB 2312; AVX512BW-NEXT: kmovd %eax, %k1 2313; AVX512BW-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1} 2314; 
AVX512BW-NEXT: vpsubb %xmm2, %xmm1, %xmm0 2315; AVX512BW-NEXT: movw $24132, %ax # imm = 0x5E44 2316; AVX512BW-NEXT: kmovd %eax, %k1 2317; AVX512BW-NEXT: vmovdqu8 %xmm2, %xmm0 {%k1} 2318; AVX512BW-NEXT: vzeroupper 2319; AVX512BW-NEXT: retq 2320; 2321; XOP-LABEL: non_splat_minus_one_divisor_1: 2322; XOP: # %bb.0: 2323; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 2324; XOP-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1 2325; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2326; XOP-NEXT: vpaddb %xmm1, %xmm0, %xmm1 2327; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2328; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,0,0,0,255,0,0,255,255,255,255,255,255,255] 2329; XOP-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 2330; XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,0,255,255,0,0,0,0,255,0,255] 2331; XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0 2332; XOP-NEXT: vpsubb %xmm1, %xmm0, %xmm0 2333; XOP-NEXT: retq 2334 %div = sdiv <16 x i8> %A, <i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 2, i8 2, i8 2, i8 2, i8 -128, i8 2, i8 -128> 2335 ret <16 x i8> %div 2336} 2337 2338define <4 x i32> @non_splat_minus_one_divisor_2(<4 x i32> %A) { 2339; SSE2-LABEL: non_splat_minus_one_divisor_2: 2340; SSE2: # %bb.0: 2341; SSE2-NEXT: movdqa %xmm0, %xmm1 2342; SSE2-NEXT: psrld $31, %xmm1 2343; SSE2-NEXT: paddd %xmm0, %xmm1 2344; SSE2-NEXT: psrad $1, %xmm1 2345; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2346; SSE2-NEXT: pxor %xmm0, %xmm0 2347; SSE2-NEXT: psubd %xmm1, %xmm0 2348; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 2349; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 2350; SSE2-NEXT: retq 2351; 2352; SSE41-LABEL: non_splat_minus_one_divisor_2: 2353; SSE41: # %bb.0: 2354; SSE41-NEXT: movdqa %xmm0, %xmm1 2355; SSE41-NEXT: psrld $31, %xmm1 2356; SSE41-NEXT: paddd %xmm0, %xmm1 2357; SSE41-NEXT: psrad $1, %xmm1 2358; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2359; SSE41-NEXT: pxor %xmm0, %xmm0 2360; SSE41-NEXT: psubd %xmm1, 
%xmm0 2361; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] 2362; SSE41-NEXT: movdqa %xmm1, %xmm0 2363; SSE41-NEXT: retq 2364; 2365; AVX1-LABEL: non_splat_minus_one_divisor_2: 2366; AVX1: # %bb.0: 2367; AVX1-NEXT: vpsrld $31, %xmm0, %xmm1 2368; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2369; AVX1-NEXT: vpsrad $1, %xmm1, %xmm1 2370; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2371; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2372; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm1 2373; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 2374; AVX1-NEXT: retq 2375; 2376; AVX2ORLATER-LABEL: non_splat_minus_one_divisor_2: 2377; AVX2ORLATER: # %bb.0: 2378; AVX2ORLATER-NEXT: vpsrld $31, %xmm0, %xmm1 2379; AVX2ORLATER-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2380; AVX2ORLATER-NEXT: vpsrad $1, %xmm1, %xmm1 2381; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] 2382; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1 2383; AVX2ORLATER-NEXT: vpsubd %xmm0, %xmm1, %xmm1 2384; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 2385; AVX2ORLATER-NEXT: retq 2386; 2387; XOP-LABEL: non_splat_minus_one_divisor_2: 2388; XOP: # %bb.0: 2389; XOP-NEXT: vpsrld $31, %xmm0, %xmm1 2390; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm1 2391; XOP-NEXT: vpsrad $1, %xmm1, %xmm1 2392; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 2393; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 2394; XOP-NEXT: vpsubd %xmm0, %xmm1, %xmm1 2395; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 2396; XOP-NEXT: retq 2397 %div = sdiv <4 x i32> %A, <i32 -1, i32 1, i32 2, i32 -2> 2398 ret <4 x i32> %div 2399} 2400 2401define <8 x i16> @combine_vec_sdiv_nonuniform(<8 x i16> %x) { 2402; SSE-LABEL: combine_vec_sdiv_nonuniform: 2403; SSE: # %bb.0: 2404; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2405; SSE-NEXT: movdqa %xmm0, %xmm1 2406; SSE-NEXT: psrlw $15, %xmm1 2407; SSE-NEXT: paddw %xmm0, %xmm1 2408; SSE-NEXT: movdqa %xmm1, %xmm0 2409; 
SSE-NEXT: retq 2410; 2411; AVX-LABEL: combine_vec_sdiv_nonuniform: 2412; AVX: # %bb.0: 2413; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2414; AVX-NEXT: vpsrlw $15, %xmm0, %xmm1 2415; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2416; AVX-NEXT: retq 2417 %1 = sdiv <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 22, i16 22, i16 22, i16 22> 2418 ret <8 x i16> %1 2419} 2420 2421define <8 x i16> @combine_vec_sdiv_nonuniform2(<8 x i16> %x) { 2422; SSE2-LABEL: combine_vec_sdiv_nonuniform2: 2423; SSE2: # %bb.0: 2424; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2425; SSE2-NEXT: movdqa %xmm0, %xmm1 2426; SSE2-NEXT: psraw $2, %xmm1 2427; SSE2-NEXT: movdqa %xmm0, %xmm2 2428; SSE2-NEXT: psraw $1, %xmm2 2429; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 2430; SSE2-NEXT: psrlw $15, %xmm0 2431; SSE2-NEXT: paddw %xmm2, %xmm0 2432; SSE2-NEXT: retq 2433; 2434; SSE41-LABEL: combine_vec_sdiv_nonuniform2: 2435; SSE41: # %bb.0: 2436; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2437; SSE41-NEXT: movdqa %xmm0, %xmm1 2438; SSE41-NEXT: psraw $1, %xmm1 2439; SSE41-NEXT: movdqa %xmm0, %xmm2 2440; SSE41-NEXT: psraw $2, %xmm2 2441; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2442; SSE41-NEXT: psrlw $15, %xmm0 2443; SSE41-NEXT: paddw %xmm2, %xmm0 2444; SSE41-NEXT: retq 2445; 2446; AVX1-LABEL: combine_vec_sdiv_nonuniform2: 2447; AVX1: # %bb.0: 2448; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2449; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 2450; AVX1-NEXT: vpsraw $2, %xmm0, %xmm2 2451; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2452; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2453; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2454; AVX1-NEXT: retq 2455; 2456; AVX2-LABEL: combine_vec_sdiv_nonuniform2: 2457; AVX2: # %bb.0: 2458; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2459; AVX2-NEXT: vpsraw $1, %xmm0, %xmm1 2460; AVX2-NEXT: vpsraw $2, %xmm0, %xmm2 2461; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = 
xmm2[0,1],xmm1[2,3] 2462; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2463; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2464; AVX2-NEXT: retq 2465; 2466; AVX512F-LABEL: combine_vec_sdiv_nonuniform2: 2467; AVX512F: # %bb.0: 2468; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2469; AVX512F-NEXT: vpsraw $1, %xmm0, %xmm1 2470; AVX512F-NEXT: vpsraw $2, %xmm0, %xmm2 2471; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2472; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 2473; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2474; AVX512F-NEXT: retq 2475; 2476; AVX512BW-LABEL: combine_vec_sdiv_nonuniform2: 2477; AVX512BW: # %bb.0: 2478; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2479; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2480; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2481; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2482; AVX512BW-NEXT: retq 2483; 2484; XOP-LABEL: combine_vec_sdiv_nonuniform2: 2485; XOP: # %bb.0: 2486; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2487; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2488; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2489; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2490; XOP-NEXT: retq 2491 %1 = sdiv <8 x i16> %x, <i16 24, i16 24, i16 24, i16 24, i16 25, i16 25, i16 25, i16 25> 2492 ret <8 x i16> %1 2493} 2494 2495define <8 x i16> @combine_vec_sdiv_nonuniform3(<8 x i16> %x) { 2496; SSE2-LABEL: combine_vec_sdiv_nonuniform3: 2497; SSE2: # %bb.0: 2498; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833] 2499; SSE2-NEXT: pmulhw %xmm0, %xmm1 2500; SSE2-NEXT: paddw %xmm0, %xmm1 2501; SSE2-NEXT: movdqa %xmm1, %xmm0 2502; SSE2-NEXT: psraw $4, %xmm0 2503; SSE2-NEXT: movdqa %xmm1, %xmm2 2504; SSE2-NEXT: psraw $8, %xmm2 2505; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 2506; SSE2-NEXT: psrlw $15, %xmm1 2507; SSE2-NEXT: paddw %xmm2, %xmm1 2508; SSE2-NEXT: movdqa %xmm1, %xmm0 2509; SSE2-NEXT: retq 2510; 2511; SSE41-LABEL: 
combine_vec_sdiv_nonuniform3: 2512; SSE41: # %bb.0: 2513; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [45591,45591,45591,45591,32833,32833,32833,32833] 2514; SSE41-NEXT: pmulhw %xmm0, %xmm1 2515; SSE41-NEXT: paddw %xmm0, %xmm1 2516; SSE41-NEXT: movdqa %xmm1, %xmm0 2517; SSE41-NEXT: psraw $8, %xmm0 2518; SSE41-NEXT: movdqa %xmm1, %xmm2 2519; SSE41-NEXT: psraw $4, %xmm2 2520; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] 2521; SSE41-NEXT: psrlw $15, %xmm1 2522; SSE41-NEXT: paddw %xmm2, %xmm1 2523; SSE41-NEXT: movdqa %xmm1, %xmm0 2524; SSE41-NEXT: retq 2525; 2526; AVX1-LABEL: combine_vec_sdiv_nonuniform3: 2527; AVX1: # %bb.0: 2528; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2529; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2530; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1 2531; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 2532; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2533; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2534; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2535; AVX1-NEXT: retq 2536; 2537; AVX2-LABEL: combine_vec_sdiv_nonuniform3: 2538; AVX2: # %bb.0: 2539; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2540; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2541; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1 2542; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2 2543; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2544; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2545; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2546; AVX2-NEXT: retq 2547; 2548; AVX512F-LABEL: combine_vec_sdiv_nonuniform3: 2549; AVX512F: # %bb.0: 2550; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2551; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2552; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1 2553; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2 2554; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2555; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 2556; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2557; AVX512F-NEXT: retq 2558; 2559; AVX512BW-LABEL: combine_vec_sdiv_nonuniform3: 2560; AVX512BW: 
# %bb.0: 2561; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2562; AVX512BW-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2563; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2564; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2565; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2566; AVX512BW-NEXT: retq 2567; 2568; XOP-LABEL: combine_vec_sdiv_nonuniform3: 2569; XOP: # %bb.0: 2570; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2571; XOP-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2572; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2573; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2574; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2575; XOP-NEXT: retq 2576 %1 = sdiv <8 x i16> %x, <i16 23, i16 23, i16 23, i16 23, i16 511, i16 511, i16 511, i16 511> 2577 ret <8 x i16> %1 2578} 2579 2580define <8 x i16> @combine_vec_sdiv_nonuniform4(<8 x i16> %x) { 2581; SSE2-LABEL: combine_vec_sdiv_nonuniform4: 2582; SSE2: # %bb.0: 2583; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639] 2584; SSE2-NEXT: pmulhw %xmm0, %xmm1 2585; SSE2-NEXT: psubw %xmm0, %xmm1 2586; SSE2-NEXT: movdqa %xmm1, %xmm0 2587; SSE2-NEXT: psraw $4, %xmm0 2588; SSE2-NEXT: movdqa %xmm1, %xmm2 2589; SSE2-NEXT: psraw $8, %xmm2 2590; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] 2591; SSE2-NEXT: psrlw $15, %xmm1 2592; SSE2-NEXT: paddw %xmm2, %xmm1 2593; SSE2-NEXT: movdqa %xmm1, %xmm0 2594; SSE2-NEXT: retq 2595; 2596; SSE41-LABEL: combine_vec_sdiv_nonuniform4: 2597; SSE41: # %bb.0: 2598; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [19945,19945,19945,19945,32639,32639,32639,32639] 2599; SSE41-NEXT: pmulhw %xmm0, %xmm1 2600; SSE41-NEXT: psubw %xmm0, %xmm1 2601; SSE41-NEXT: movdqa %xmm1, %xmm0 2602; SSE41-NEXT: psraw $8, %xmm0 2603; SSE41-NEXT: movdqa %xmm1, %xmm2 2604; SSE41-NEXT: psraw $4, %xmm2 2605; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4,5,6,7] 2606; SSE41-NEXT: psrlw $15, %xmm1 2607; SSE41-NEXT: paddw %xmm2, %xmm1 2608; SSE41-NEXT: movdqa %xmm1, 
%xmm0 2609; SSE41-NEXT: retq 2610; 2611; AVX1-LABEL: combine_vec_sdiv_nonuniform4: 2612; AVX1: # %bb.0: 2613; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2614; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2615; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1 2616; AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 2617; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 2618; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2619; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2620; AVX1-NEXT: retq 2621; 2622; AVX2-LABEL: combine_vec_sdiv_nonuniform4: 2623; AVX2: # %bb.0: 2624; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2625; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2626; AVX2-NEXT: vpsraw $8, %xmm0, %xmm1 2627; AVX2-NEXT: vpsraw $4, %xmm0, %xmm2 2628; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2629; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2630; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2631; AVX2-NEXT: retq 2632; 2633; AVX512F-LABEL: combine_vec_sdiv_nonuniform4: 2634; AVX512F: # %bb.0: 2635; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2636; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2637; AVX512F-NEXT: vpsraw $8, %xmm0, %xmm1 2638; AVX512F-NEXT: vpsraw $4, %xmm0, %xmm2 2639; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] 2640; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm0 2641; AVX512F-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2642; AVX512F-NEXT: retq 2643; 2644; AVX512BW-LABEL: combine_vec_sdiv_nonuniform4: 2645; AVX512BW: # %bb.0: 2646; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2647; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2648; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2649; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2650; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2651; AVX512BW-NEXT: retq 2652; 2653; XOP-LABEL: combine_vec_sdiv_nonuniform4: 2654; XOP: # %bb.0: 2655; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2656; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm0 2657; XOP-NEXT: vpsrlw 
$15, %xmm0, %xmm1 2658; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2659; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2660; XOP-NEXT: retq 2661 %1 = sdiv <8 x i16> %x, <i16 -23, i16 -23, i16 -23, i16 -23, i16 -510, i16 -510, i16 -510, i16 -510> 2662 ret <8 x i16> %1 2663} 2664 2665define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { 2666; SSE2-LABEL: combine_vec_sdiv_nonuniform5: 2667; SSE2: # %bb.0: 2668; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] 2669; SSE2-NEXT: pmullw %xmm0, %xmm1 2670; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2671; SSE2-NEXT: paddw %xmm1, %xmm0 2672; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] 2673; SSE2-NEXT: movdqa %xmm0, %xmm2 2674; SSE2-NEXT: pand %xmm1, %xmm2 2675; SSE2-NEXT: movdqa %xmm0, %xmm3 2676; SSE2-NEXT: psraw $8, %xmm3 2677; SSE2-NEXT: pandn %xmm3, %xmm1 2678; SSE2-NEXT: por %xmm2, %xmm1 2679; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] 2680; SSE2-NEXT: pand %xmm2, %xmm1 2681; SSE2-NEXT: movdqa %xmm0, %xmm3 2682; SSE2-NEXT: psraw $4, %xmm3 2683; SSE2-NEXT: pandn %xmm3, %xmm2 2684; SSE2-NEXT: por %xmm1, %xmm2 2685; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] 2686; SSE2-NEXT: movdqa %xmm2, %xmm3 2687; SSE2-NEXT: pand %xmm1, %xmm3 2688; SSE2-NEXT: psraw $2, %xmm2 2689; SSE2-NEXT: pandn %xmm2, %xmm1 2690; SSE2-NEXT: por %xmm3, %xmm1 2691; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535] 2692; SSE2-NEXT: movdqa %xmm1, %xmm3 2693; SSE2-NEXT: pand %xmm2, %xmm3 2694; SSE2-NEXT: psraw $1, %xmm1 2695; SSE2-NEXT: pandn %xmm1, %xmm2 2696; SSE2-NEXT: por %xmm3, %xmm2 2697; SSE2-NEXT: psrlw $15, %xmm0 2698; SSE2-NEXT: paddw %xmm2, %xmm0 2699; SSE2-NEXT: retq 2700; 2701; SSE41-LABEL: combine_vec_sdiv_nonuniform5: 2702; SSE41: # %bb.0: 2703; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] 2704; SSE41-NEXT: pmullw %xmm0, %xmm1 2705; SSE41-NEXT: pmulhw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2706; SSE41-NEXT: paddw %xmm1, %xmm0 2707; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,32768,512,256> 2708; SSE41-NEXT: pmulhw %xmm0, %xmm1 2709; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 2710; SSE41-NEXT: movdqa %xmm0, %xmm2 2711; SSE41-NEXT: psraw $1, %xmm2 2712; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] 2713; SSE41-NEXT: psrlw $15, %xmm0 2714; SSE41-NEXT: paddw %xmm2, %xmm0 2715; SSE41-NEXT: retq 2716; 2717; AVX1-LABEL: combine_vec_sdiv_nonuniform5: 2718; AVX1: # %bb.0: 2719; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2720; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2721; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2722; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2723; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 2724; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 2725; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] 2726; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2727; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2728; AVX1-NEXT: retq 2729; 2730; AVX2-LABEL: combine_vec_sdiv_nonuniform5: 2731; AVX2: # %bb.0: 2732; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2733; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2734; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2735; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2736; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 2737; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2 2738; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] 2739; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2740; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2741; AVX2-NEXT: retq 2742; 2743; AVX512F-LABEL: combine_vec_sdiv_nonuniform5: 2744; AVX512F: # %bb.0: 2745; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2746; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 
2747; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2748; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1 2749; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 2750; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2751; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 2752; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2753; AVX512F-NEXT: vzeroupper 2754; AVX512F-NEXT: retq 2755; 2756; AVX512BW-LABEL: combine_vec_sdiv_nonuniform5: 2757; AVX512BW: # %bb.0: 2758; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2759; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2760; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2761; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2762; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2763; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2764; AVX512BW-NEXT: retq 2765; 2766; XOP-LABEL: combine_vec_sdiv_nonuniform5: 2767; XOP: # %bb.0: 2768; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2769; XOP-NEXT: vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2770; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2771; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2772; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2773; XOP-NEXT: retq 2774 %1 = sdiv <8 x i16> %x, <i16 -510, i16 -24, i16 -23, i16 3, i16 22, i16 25, i16 255, i16 511> 2775 ret <8 x i16> %1 2776} 2777 2778define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { 2779; SSE2-LABEL: combine_vec_sdiv_nonuniform6: 2780; SSE2: # %bb.0: 2781; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0] 2782; SSE2-NEXT: pmullw %xmm0, %xmm1 2783; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2784; SSE2-NEXT: paddw %xmm1, %xmm0 2785; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] 2786; SSE2-NEXT: movdqa %xmm0, %xmm2 2787; SSE2-NEXT: psraw $8, %xmm2 2788; SSE2-NEXT: pand %xmm1, %xmm2 2789; SSE2-NEXT: pandn %xmm0, %xmm1 2790; SSE2-NEXT: por %xmm2, %xmm1 2791; SSE2-NEXT: movdqa %xmm1, %xmm2 2792; 
SSE2-NEXT: psraw $6, %xmm2 2793; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535] 2794; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0] 2795; SSE2-NEXT: pand %xmm4, %xmm1 2796; SSE2-NEXT: movdqa %xmm0, %xmm5 2797; SSE2-NEXT: psraw $12, %xmm5 2798; SSE2-NEXT: pandn %xmm5, %xmm4 2799; SSE2-NEXT: por %xmm1, %xmm4 2800; SSE2-NEXT: pand %xmm3, %xmm4 2801; SSE2-NEXT: pandn %xmm2, %xmm3 2802; SSE2-NEXT: por %xmm4, %xmm3 2803; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0] 2804; SSE2-NEXT: movdqa %xmm3, %xmm2 2805; SSE2-NEXT: pand %xmm1, %xmm2 2806; SSE2-NEXT: psraw $1, %xmm3 2807; SSE2-NEXT: pandn %xmm3, %xmm1 2808; SSE2-NEXT: por %xmm2, %xmm1 2809; SSE2-NEXT: psrlw $15, %xmm0 2810; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2811; SSE2-NEXT: paddw %xmm1, %xmm0 2812; SSE2-NEXT: retq 2813; 2814; SSE41-LABEL: combine_vec_sdiv_nonuniform6: 2815; SSE41: # %bb.0: 2816; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0] 2817; SSE41-NEXT: pmullw %xmm0, %xmm1 2818; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2819; SSE41-NEXT: paddw %xmm1, %xmm0 2820; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <4,256,256,u,u,512,256,8> 2821; SSE41-NEXT: pmulhw %xmm0, %xmm2 2822; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 2823; SSE41-NEXT: psrlw $15, %xmm0 2824; SSE41-NEXT: pxor %xmm1, %xmm1 2825; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] 2826; SSE41-NEXT: paddw %xmm2, %xmm1 2827; SSE41-NEXT: movdqa %xmm1, %xmm0 2828; SSE41-NEXT: retq 2829; 2830; AVX1-LABEL: combine_vec_sdiv_nonuniform6: 2831; AVX1: # %bb.0: 2832; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2833; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2834; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2835; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2836; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 
2837; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 2838; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 2839; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] 2840; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2841; AVX1-NEXT: retq 2842; 2843; AVX2-LABEL: combine_vec_sdiv_nonuniform6: 2844; AVX2: # %bb.0: 2845; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2846; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2847; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2848; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2849; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 2850; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 2851; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2852; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] 2853; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 2854; AVX2-NEXT: retq 2855; 2856; AVX512F-LABEL: combine_vec_sdiv_nonuniform6: 2857; AVX512F: # %bb.0: 2858; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2859; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2860; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2861; AVX512F-NEXT: vpsrlw $15, %xmm0, %xmm1 2862; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 2863; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] 2864; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 2865; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2866; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 2867; AVX512F-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2868; AVX512F-NEXT: vzeroupper 2869; AVX512F-NEXT: retq 2870; 2871; AVX512BW-LABEL: combine_vec_sdiv_nonuniform6: 2872; AVX512BW: # %bb.0: 2873; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2874; AVX512BW-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2875; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2876; AVX512BW-NEXT: vpsrlw $15, %xmm0, %xmm1 2877; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 2878; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] 2879; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2880; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2881; AVX512BW-NEXT: retq 2882; 2883; XOP-LABEL: combine_vec_sdiv_nonuniform6: 2884; XOP: # %bb.0: 2885; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2886; XOP-NEXT: vpmacsww %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2887; XOP-NEXT: vpsrlw $15, %xmm0, %xmm1 2888; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 2889; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] 2890; XOP-NEXT: vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2891; XOP-NEXT: vpaddw %xmm1, %xmm0, %xmm0 2892; XOP-NEXT: retq 2893 %1 = sdiv <8 x i16> %x, <i16 -32768, i16 -512, i16 -511, i16 -1, i16 1, i16 255, i16 512, i16 32767> 2894 ret <8 x i16> %1 2895} 2896 2897define <8 x i16> @combine_vec_sdiv_nonuniform7(<8 x i16> %x) { 2898; SSE2-LABEL: combine_vec_sdiv_nonuniform7: 2899; SSE2: # %bb.0: 2900; SSE2-NEXT: pxor %xmm1, %xmm1 2901; SSE2-NEXT: psubw %xmm0, %xmm1 2902; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2903; SSE2-NEXT: retq 2904; 2905; SSE41-LABEL: combine_vec_sdiv_nonuniform7: 2906; SSE41: # %bb.0: 2907; SSE41-NEXT: pxor %xmm1, %xmm1 2908; SSE41-NEXT: psubw %xmm0, %xmm1 2909; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 2910; SSE41-NEXT: retq 2911; 2912; AVX1-LABEL: combine_vec_sdiv_nonuniform7: 2913; AVX1: # %bb.0: 2914; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2915; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 2916; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 2917; AVX1-NEXT: retq 2918; 2919; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7: 2920; AVX2ORLATER: # %bb.0: 2921; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1 2922; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1 2923; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 2924; AVX2ORLATER-NEXT: retq 2925; 2926; XOP-LABEL: combine_vec_sdiv_nonuniform7: 2927; XOP: # %bb.0: 2928; XOP-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 2929; XOP-NEXT: vpsubw %xmm0, %xmm1, %xmm1 2930; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 2931; XOP-NEXT: retq 2932 %1 = sdiv <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 1, i16 1, i16 1, i16 1> 2933 ret <8 x i16> %1 2934} 2935 2936define <16 x i8> @pr38658(<16 x i8> %x) { 2937; SSE2-LABEL: pr38658: 2938; SSE2: # %bb.0: 2939; SSE2-NEXT: pxor %xmm2, %xmm2 2940; SSE2-NEXT: pxor %xmm3, %xmm3 2941; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] 2942; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2943; SSE2-NEXT: psrlw $8, %xmm3 2944; SSE2-NEXT: pxor %xmm1, %xmm1 2945; SSE2-NEXT: packuswb %xmm3, %xmm1 2946; SSE2-NEXT: paddb %xmm0, %xmm1 2947; SSE2-NEXT: movdqa %xmm1, %xmm0 2948; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 2949; SSE2-NEXT: movdqa %xmm1, %xmm2 2950; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 2951; SSE2-NEXT: psraw $8, %xmm2 2952; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2953; SSE2-NEXT: psrlw $8, %xmm2 2954; SSE2-NEXT: packuswb %xmm2, %xmm0 2955; SSE2-NEXT: psrlw $7, %xmm1 2956; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2957; SSE2-NEXT: paddb %xmm0, %xmm1 2958; SSE2-NEXT: movdqa %xmm1, %xmm0 2959; SSE2-NEXT: retq 2960; 2961; SSE41-LABEL: pr38658: 2962; SSE41: # %bb.0: 2963; SSE41-NEXT: pxor %xmm1, %xmm1 2964; SSE41-NEXT: pxor %xmm2, %xmm2 2965; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] 2966; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2967; 
SSE41-NEXT: psrlw $8, %xmm2 2968; SSE41-NEXT: packuswb %xmm2, %xmm1 2969; SSE41-NEXT: paddb %xmm0, %xmm1 2970; SSE41-NEXT: movdqa %xmm1, %xmm0 2971; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] 2972; SSE41-NEXT: psraw $8, %xmm0 2973; SSE41-NEXT: movdqa %xmm0, %xmm2 2974; SSE41-NEXT: psllw $6, %xmm2 2975; SSE41-NEXT: psllw $8, %xmm0 2976; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] 2977; SSE41-NEXT: psrlw $8, %xmm0 2978; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2979; SSE41-NEXT: packuswb %xmm0, %xmm2 2980; SSE41-NEXT: psrlw $7, %xmm1 2981; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2982; SSE41-NEXT: paddb %xmm2, %xmm1 2983; SSE41-NEXT: movdqa %xmm1, %xmm0 2984; SSE41-NEXT: retq 2985; 2986; AVX1-LABEL: pr38658: 2987; AVX1: # %bb.0: 2988; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2989; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 2990; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 2991; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2992; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 2993; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 2994; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2995; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 2996; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2 2997; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 2998; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] 2999; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 3000; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 3001; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 3002; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 
3003; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3004; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3005; AVX1-NEXT: retq 3006; 3007; AVX2-LABEL: pr38658: 3008; AVX2: # %bb.0: 3009; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 3010; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3011; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 3012; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3013; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 3014; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3015; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 3016; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3017; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 3018; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 3019; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 3020; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 3021; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3022; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3023; AVX2-NEXT: vzeroupper 3024; AVX2-NEXT: retq 3025; 3026; AVX512F-LABEL: pr38658: 3027; AVX512F: # %bb.0: 3028; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1 3029; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3030; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 3031; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 3032; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 3033; AVX512F-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3034; AVX512F-NEXT: vpsrlw $7, %xmm0, %xmm1 3035; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 3036; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 3037; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 3038; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3039; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0 3040; AVX512F-NEXT: vzeroupper 3041; AVX512F-NEXT: retq 3042; 3043; AVX512BW-LABEL: pr38658: 3044; AVX512BW: # %bb.0: 3045; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm1 3046; AVX512BW-NEXT: 
vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 3047; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 3048; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 3049; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3050; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm1 3051; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 3052; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 3053; AVX512BW-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3054; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 3055; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 3056; AVX512BW-NEXT: vzeroupper 3057; AVX512BW-NEXT: retq 3058; 3059; XOP-LABEL: pr38658: 3060; XOP: # %bb.0: 3061; XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1 3062; XOP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 3063; XOP-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 3064; XOP-NEXT: vpperm {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15],xmm2[1,3,5,7,9,11,13,15] 3065; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3066; XOP-NEXT: vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 3067; XOP-NEXT: vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3068; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3069; XOP-NEXT: vpaddb %xmm0, %xmm1, %xmm0 3070; XOP-NEXT: retq 3071 %1 = sdiv <16 x i8> %x, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 7> 3072 ret <16 x i8> %1 3073} 3074 3075define i1 @bool_sdiv(i1 %x, i1 %y) { 3076; CHECK-LABEL: bool_sdiv: 3077; CHECK: # %bb.0: 3078; CHECK-NEXT: movl %edi, %eax 3079; CHECK-NEXT: # kill: def $al killed $al killed $eax 3080; CHECK-NEXT: retq 3081 %r = sdiv i1 %x, %y 3082 ret i1 %r 3083} 3084 3085define <4 x i1> @boolvec_sdiv(<4 x i1> %x, <4 x i1> %y) { 3086; CHECK-LABEL: boolvec_sdiv: 3087; CHECK: # %bb.0: 3088; CHECK-NEXT: retq 3089 %r = sdiv <4 x i1> %x, %y 3090 ret <4 x i1> %r 3091} 3092 3093define i32 @combine_sdiv_two(i32 %x) { 3094; CHECK-LABEL: 
combine_sdiv_two: 3095; CHECK: # %bb.0: 3096; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3097; CHECK-NEXT: movl %edi, %eax 3098; CHECK-NEXT: shrl $31, %eax 3099; CHECK-NEXT: addl %edi, %eax 3100; CHECK-NEXT: sarl %eax 3101; CHECK-NEXT: retq 3102 %1 = sdiv i32 %x, 2 3103 ret i32 %1 3104} 3105 3106define i32 @combine_sdiv_negtwo(i32 %x) { 3107; CHECK-LABEL: combine_sdiv_negtwo: 3108; CHECK: # %bb.0: 3109; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3110; CHECK-NEXT: movl %edi, %eax 3111; CHECK-NEXT: shrl $31, %eax 3112; CHECK-NEXT: addl %edi, %eax 3113; CHECK-NEXT: sarl %eax 3114; CHECK-NEXT: negl %eax 3115; CHECK-NEXT: retq 3116 %1 = sdiv i32 %x, -2 3117 ret i32 %1 3118} 3119 3120define i8 @combine_i8_sdiv_pow2(i8 %x) { 3121; CHECK-LABEL: combine_i8_sdiv_pow2: 3122; CHECK: # %bb.0: 3123; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3124; CHECK-NEXT: movl %edi, %eax 3125; CHECK-NEXT: sarb $7, %al 3126; CHECK-NEXT: shrb $4, %al 3127; CHECK-NEXT: addl %edi, %eax 3128; CHECK-NEXT: sarb $4, %al 3129; CHECK-NEXT: # kill: def $al killed $al killed $eax 3130; CHECK-NEXT: retq 3131 %1 = sdiv i8 %x, 16 3132 ret i8 %1 3133} 3134 3135define i8 @combine_i8_sdiv_negpow2(i8 %x) { 3136; CHECK-LABEL: combine_i8_sdiv_negpow2: 3137; CHECK: # %bb.0: 3138; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3139; CHECK-NEXT: movl %edi, %eax 3140; CHECK-NEXT: sarb $7, %al 3141; CHECK-NEXT: shrb $2, %al 3142; CHECK-NEXT: addl %edi, %eax 3143; CHECK-NEXT: sarb $6, %al 3144; CHECK-NEXT: negb %al 3145; CHECK-NEXT: # kill: def $al killed $al killed $eax 3146; CHECK-NEXT: retq 3147 %1 = sdiv i8 %x, -64 3148 ret i8 %1 3149} 3150 3151define i16 @combine_i16_sdiv_pow2(i16 %x) { 3152; CHECK-LABEL: combine_i16_sdiv_pow2: 3153; CHECK: # %bb.0: 3154; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3155; CHECK-NEXT: leal 15(%rdi), %eax 3156; CHECK-NEXT: testw %di, %di 3157; CHECK-NEXT: cmovnsl %edi, %eax 3158; CHECK-NEXT: cwtl 3159; CHECK-NEXT: shrl $4, %eax 3160; CHECK-NEXT: # kill: 
def $ax killed $ax killed $eax 3161; CHECK-NEXT: retq 3162 %1 = sdiv i16 %x, 16 3163 ret i16 %1 3164} 3165 3166define i16 @combine_i16_sdiv_negpow2(i16 %x) { 3167; CHECK-LABEL: combine_i16_sdiv_negpow2: 3168; CHECK: # %bb.0: 3169; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3170; CHECK-NEXT: leal 255(%rdi), %eax 3171; CHECK-NEXT: testw %di, %di 3172; CHECK-NEXT: cmovnsl %edi, %eax 3173; CHECK-NEXT: cwtl 3174; CHECK-NEXT: sarl $8, %eax 3175; CHECK-NEXT: negl %eax 3176; CHECK-NEXT: # kill: def $ax killed $ax killed $eax 3177; CHECK-NEXT: retq 3178 %1 = sdiv i16 %x, -256 3179 ret i16 %1 3180} 3181 3182define i32 @combine_i32_sdiv_pow2(i32 %x) { 3183; CHECK-LABEL: combine_i32_sdiv_pow2: 3184; CHECK: # %bb.0: 3185; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3186; CHECK-NEXT: leal 15(%rdi), %eax 3187; CHECK-NEXT: testl %edi, %edi 3188; CHECK-NEXT: cmovnsl %edi, %eax 3189; CHECK-NEXT: sarl $4, %eax 3190; CHECK-NEXT: retq 3191 %1 = sdiv i32 %x, 16 3192 ret i32 %1 3193} 3194 3195define i32 @combine_i32_sdiv_negpow2(i32 %x) { 3196; CHECK-LABEL: combine_i32_sdiv_negpow2: 3197; CHECK: # %bb.0: 3198; CHECK-NEXT: # kill: def $edi killed $edi def $rdi 3199; CHECK-NEXT: leal 255(%rdi), %eax 3200; CHECK-NEXT: testl %edi, %edi 3201; CHECK-NEXT: cmovnsl %edi, %eax 3202; CHECK-NEXT: sarl $8, %eax 3203; CHECK-NEXT: negl %eax 3204; CHECK-NEXT: retq 3205 %1 = sdiv i32 %x, -256 3206 ret i32 %1 3207} 3208 3209define i64 @combine_i64_sdiv_pow2(i64 %x) { 3210; CHECK-LABEL: combine_i64_sdiv_pow2: 3211; CHECK: # %bb.0: 3212; CHECK-NEXT: leaq 15(%rdi), %rax 3213; CHECK-NEXT: testq %rdi, %rdi 3214; CHECK-NEXT: cmovnsq %rdi, %rax 3215; CHECK-NEXT: sarq $4, %rax 3216; CHECK-NEXT: retq 3217 %1 = sdiv i64 %x, 16 3218 ret i64 %1 3219} 3220 3221define i64 @combine_i64_sdiv_negpow2(i64 %x) { 3222; CHECK-LABEL: combine_i64_sdiv_negpow2: 3223; CHECK: # %bb.0: 3224; CHECK-NEXT: leaq 255(%rdi), %rax 3225; CHECK-NEXT: testq %rdi, %rdi 3226; CHECK-NEXT: cmovnsq %rdi, %rax 3227; CHECK-NEXT: sarq 
$8, %rax 3228; CHECK-NEXT: negq %rax 3229; CHECK-NEXT: retq 3230 %1 = sdiv i64 %x, -256 3231 ret i64 %1 3232} 3233