; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2NOBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX512BW

;
; sdiv by 7
;

define <2 x i64> @test_div7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_div7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    imulq %rcx
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rax
; SSE41-NEXT:    imulq %rcx
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_div7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rax
; AVX-NEXT:    imulq %rcx
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = sdiv <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}
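
; The i64 lanes are moved to scalar registers and divided with what looks
; like the standard signed magic-number sequence (Granlund-Montgomery /
; Hacker's Delight): with M = ceil(2^65 / 7) = 5270498306774157605
; (0x4924924924924925), q = ((sext(x) * M) >> 65) + (x < 0 ? 1 : 0).
; imulq leaves the high 64 bits of x*M in %rdx, sarq performs the
; remaining shift by one, and shrq $63 extracts the sign bit for the
; correction. Sanity check: 7 * M = 2^65 + 3, so (7 * M) >> 65 = 1.
; A scalar IR sketch of the same computation (illustrative only, not part
; of the checked output; the name @sdiv7_ref is made up):
;
;   define i64 @sdiv7_ref(i64 %x) {
;     %w = sext i64 %x to i128
;     %m = mul i128 %w, 5270498306774157605
;     %s = ashr i128 %m, 65
;     %q = trunc i128 %s to i64
;     %neg = icmp slt i64 %x, 0
;     %c = zext i1 %neg to i64
;     %r = add i64 %q, %c
;     ret i64 %r
;   }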

define <4 x i32> @test_div7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_div7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrad $2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_div7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm1, %xmm2
; SSE41-NEXT:    pmuldq %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $31, %xmm0
; SSE41-NEXT:    psrad $2, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_div7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_div7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm1
; AVX2-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_div7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $15, %xmm1
; SSE-NEXT:    psraw $1, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_div7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $15, %xmm0, %xmm1
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
; AVX-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}
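
; The narrower widths use the same scheme with width-specific magic
; constants (a reading of the constants above, not checked output): for
; v4i32, M = ceil(2^34 / 7) = 2454267027 (0x92492493); M does not fit in
; a signed i32, so the expansion adds x back after taking the high half
; of the multiply, then shifts right by 2 and adds the sign bit. SSE2 has
; no pmuldq, so the signed high half is rebuilt from pmuludq plus a
; pcmpgtd-based fixup. For v8i16, M = ceil(2^17 / 7) = 18725 (0x4925),
; and q = (pmulhw(x, M) >> 1) + signbit.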

define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_div7_16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT:    pmulhw %xmm3, %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT:    pmulhw %xmm3, %xmm0
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $2, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE-NEXT:    pxor %xmm2, %xmm1
; SSE-NEXT:    psrlw $7, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_div7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_div7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}
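
; There is no byte multiply, so the v16i8 expansion widens to i16: the
; bytes are either placed in the high halves of words (punpck + pmulhw
; against 37632 = 0x9300, whose high byte 0x93 = -109 appears to be the
; byte magic ceil(2^10 / 7) - 256) or sign-extended with vpmovsxbw, and
; the high byte of each product is kept. After the add-back of x, the
; byte-wise arithmetic shift right by 2 is emulated: logical word shift,
; mask to 6 bits, then the xor-32 / subtract-32 pair re-extends the sign.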

;
; sdiv by non-splat constant
;

define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_divconstant_16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    packuswb %xmm2, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT:    psraw $8, %xmm2
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    packuswb %xmm1, %xmm2
; SSE-NEXT:    psrlw $7, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    paddb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_divconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_divconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 9, i8 7>
  ret <16 x i8> %res
}
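
; With distinct divisors each lane needs its own magic and shift, so the
; multiplier constants above are non-splat vectors in the constant pool.
; The add-back of x is applied only in the lanes that need it, via an
; all-ones/zero byte mask, and AVX512BW can perform the per-lane
; post-shifts directly with vpsravw and the [2,2,1,2,3,...] shift vector.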

;
; srem by 7
;

define <2 x i64> @test_rem7_2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_rem7_2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm0, %rcx
; SSE2-NEXT:    movq %rcx, %rax
; SSE2-NEXT:    imulq %rsi
; SSE2-NEXT:    movq %rdx, %rax
; SSE2-NEXT:    shrq $63, %rax
; SSE2-NEXT:    sarq %rdx
; SSE2-NEXT:    addq %rax, %rdx
; SSE2-NEXT:    leaq (,%rdx,8), %rax
; SSE2-NEXT:    subq %rax, %rdx
; SSE2-NEXT:    addq %rcx, %rdx
; SSE2-NEXT:    movq %rdx, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrq $1, %xmm0, %rcx
; SSE41-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm1
; SSE41-NEXT:    movq %xmm0, %rcx
; SSE41-NEXT:    movq %rcx, %rax
; SSE41-NEXT:    imulq %rsi
; SSE41-NEXT:    movq %rdx, %rax
; SSE41-NEXT:    shrq $63, %rax
; SSE41-NEXT:    sarq %rdx
; SSE41-NEXT:    addq %rax, %rdx
; SSE41-NEXT:    leaq (,%rdx,8), %rax
; SSE41-NEXT:    subq %rax, %rdx
; SSE41-NEXT:    addq %rcx, %rdx
; SSE41-NEXT:    movq %rdx, %xmm0
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem7_2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrq $1, %xmm0, %rcx
; AVX-NEXT:    movabsq $5270498306774157605, %rsi # imm = 0x4924924924924925
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm1
; AVX-NEXT:    vmovq %xmm0, %rcx
; AVX-NEXT:    movq %rcx, %rax
; AVX-NEXT:    imulq %rsi
; AVX-NEXT:    movq %rdx, %rax
; AVX-NEXT:    shrq $63, %rax
; AVX-NEXT:    sarq %rdx
; AVX-NEXT:    addq %rax, %rdx
; AVX-NEXT:    leaq (,%rdx,8), %rax
; AVX-NEXT:    subq %rax, %rdx
; AVX-NEXT:    addq %rcx, %rdx
; AVX-NEXT:    vmovq %rdx, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %res = srem <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %res
}
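
; The remainder is reconstructed from the quotient as r = x - 7*q, with
; the multiply by 7 strength-reduced: leaq (,%rdx,8) forms 8*q, subq
; turns it into q - 8*q = -7*q, and the final addq of the original lane
; value yields x - 7*q.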

define <4 x i32> @test_rem7_4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_rem7_4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    psubd %xmm3, %xmm2
; SSE2-NEXT:    paddd %xmm0, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    psrad $2, %xmm2
; SSE2-NEXT:    paddd %xmm1, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    pslld $3, %xmm1
; SSE2-NEXT:    psubd %xmm1, %xmm2
; SSE2-NEXT:    paddd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem7_4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; SSE41-NEXT:    pmuldq %xmm2, %xmm1
; SSE41-NEXT:    pmuldq %xmm0, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; SSE41-NEXT:    paddd %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm1
; SSE41-NEXT:    psrld $31, %xmm1
; SSE41-NEXT:    psrad $2, %xmm2
; SSE41-NEXT:    paddd %xmm1, %xmm2
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psubd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_rem7_4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX1-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX1-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_rem7_4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
; AVX2-NEXT:    vpmuldq %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpmuldq %xmm2, %xmm0, %xmm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm1
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm2
; AVX2-NEXT:    vpsrad $2, %xmm1, %xmm1
; AVX2-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
; AVX2-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %res = srem <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
  ret <4 x i32> %res
}

define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: test_rem7_8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [18725,18725,18725,18725,18725,18725,18725,18725]
; SSE-NEXT:    pmulhw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $15, %xmm2
; SSE-NEXT:    psraw $1, %xmm1
; SSE-NEXT:    paddw %xmm2, %xmm1
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psubw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_rem7_8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-NEXT:    vpsrlw $15, %xmm1, %xmm2
; AVX-NEXT:    vpsraw $1, %xmm1, %xmm1
; AVX-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %res = srem <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ret <8 x i16> %res
}
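
; The same r = x - 7*q pattern, kept in vector registers: v4i32
; multiplies the quotient back with pmulld (or pslld $3 plus psubd on
; SSE2, which lacks pmulld), and v8i16 uses pmullw with a splat of 7
; before the final subtract.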

define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: test_rem7_16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pxor %xmm2, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT:    pmulhw %xmm3, %xmm2
; SSE-NEXT:    psrlw $8, %xmm2
; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE-NEXT:    pmulhw %xmm3, %xmm1
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    packuswb %xmm2, %xmm1
; SSE-NEXT:    paddb %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psrlw $2, %xmm2
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; SSE-NEXT:    pxor %xmm3, %xmm2
; SSE-NEXT:    psrlw $7, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    paddb %xmm2, %xmm1
; SSE-NEXT:    psubb %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm2
; SSE-NEXT:    psllw $3, %xmm2
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    psubb %xmm2, %xmm1
; SSE-NEXT:    paddb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_rem7_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_rem7_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX2NOBW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_rem7_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpaddb %xmm0, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw $2, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX512BW-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsubb %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllw $3, %xmm1, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpsubb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  ret <16 x i8> %res
}
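
; For bytes, 7*q is again built as 8*q - q since there is no byte
; multiply: psllw $3 shifts whole words, and the pand clears the bits
; that crossed byte boundaries before the psubb and the final paddb.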

;
; srem by non-splat constant
;

define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_remconstant_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm1, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    psraw $8, %xmm2
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE2-NEXT:    psraw $8, %xmm3
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm2, %xmm3
; SSE2-NEXT:    psrlw $7, %xmm1
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    paddb %xmm3, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_remconstant_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,0,0,255,0,0,255,255,0,0,255,0,0,0,255]
; SSE41-NEXT:    pand %xmm0, %xmm1
; SSE41-NEXT:    paddb %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE41-NEXT:    psraw $8, %xmm2
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE41-NEXT:    psraw $8, %xmm3
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    packuswb %xmm2, %xmm3
; SSE41-NEXT:    psrlw $7, %xmm1
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    paddb %xmm3, %xmm1
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm3, %xmm1
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    pand %xmm3, %xmm2
; SSE41-NEXT:    packuswb %xmm1, %xmm2
; SSE41-NEXT:    psubb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_remconstant_16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2NOBW-LABEL: test_remconstant_16i8:
; AVX2NOBW:       # %bb.0:
; AVX2NOBW-NEXT:    vpmovsxbw %xmm0, %ymm1
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2NOBW-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpmovsxbw %xmm1, %ymm2
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2NOBW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm2, %xmm3
; AVX2NOBW-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX2NOBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpaddb %xmm1, %xmm2, %xmm1
; AVX2NOBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2NOBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2NOBW-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT:    vzeroupper
; AVX2NOBW-NEXT:    retq
;
; AVX512BW-LABEL: test_remconstant_16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,2,1,2,3,1,2,3,3,2,1,3,2,1,1,2]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm2
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
; AVX512BW-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpmovsxbw %xmm2, %ymm3
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsrlw $7, %xmm2, %xmm2
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %res = srem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 9, i8 7>
  ret <16 x i8> %res
}
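
; The multiply-back in the non-splat case uses the divisor vector
; <7,8,9,...,9,7> itself: the quotient bytes are widened to words
; (punpck / pmovzxbw), multiplied with pmullw, masked back to their low
; bytes, and repacked with packuswb before the final psubb.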

; This test is just to show what a scalarized v16i8 division looks like.
define <16 x i8> @test_rem_variable_16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: test_rem_variable_16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm4
; SSE2-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    idivb -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movsbl %ah, %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_rem_variable_16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pextrb $1, %xmm1, %ecx
; SSE41-NEXT:    pextrb $1, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %ecx
; SSE41-NEXT:    movd %xmm1, %edx
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %dl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    movd %eax, %xmm2
; SSE41-NEXT:    pinsrb $1, %ecx, %xmm2
; SSE41-NEXT:    pextrb $2, %xmm1, %ecx
; SSE41-NEXT:    pextrb $2, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $2, %eax, %xmm2
; SSE41-NEXT:    pextrb $3, %xmm1, %ecx
; SSE41-NEXT:    pextrb $3, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $3, %eax, %xmm2
; SSE41-NEXT:    pextrb $4, %xmm1, %ecx
; SSE41-NEXT:    pextrb $4, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $4, %eax, %xmm2
; SSE41-NEXT:    pextrb $5, %xmm1, %ecx
; SSE41-NEXT:    pextrb $5, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $5, %eax, %xmm2
; SSE41-NEXT:    pextrb $6, %xmm1, %ecx
; SSE41-NEXT:    pextrb $6, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $6, %eax, %xmm2
; SSE41-NEXT:    pextrb $7, %xmm1, %ecx
; SSE41-NEXT:    pextrb $7, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $7, %eax, %xmm2
; SSE41-NEXT:    pextrb $8, %xmm1, %ecx
; SSE41-NEXT:    pextrb $8, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $8, %eax, %xmm2
; SSE41-NEXT:    pextrb $9, %xmm1, %ecx
; SSE41-NEXT:    pextrb $9, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $9, %eax, %xmm2
; SSE41-NEXT:    pextrb $10, %xmm1, %ecx
; SSE41-NEXT:    pextrb $10, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $10, %eax, %xmm2
; SSE41-NEXT:    pextrb $11, %xmm1, %ecx
; SSE41-NEXT:    pextrb $11, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $11, %eax, %xmm2
; SSE41-NEXT:    pextrb $12, %xmm1, %ecx
; SSE41-NEXT:    pextrb $12, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $12, %eax, %xmm2
; SSE41-NEXT:    pextrb $13, %xmm1, %ecx
; SSE41-NEXT:    pextrb $13, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $13, %eax, %xmm2
; SSE41-NEXT:    pextrb $14, %xmm1, %ecx
; SSE41-NEXT:    pextrb $14, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $14, %eax, %xmm2
; SSE41-NEXT:    pextrb $15, %xmm1, %ecx
; SSE41-NEXT:    pextrb $15, %xmm0, %eax
; SSE41-NEXT:    cbtw
; SSE41-NEXT:    idivb %cl
; SSE41-NEXT:    movsbl %ah, %eax
; SSE41-NEXT:    pinsrb $15, %eax, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_rem_variable_16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrb $1, %xmm1, %ecx
; AVX-NEXT:    vpextrb $1, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %ecx
; AVX-NEXT:    vmovd %xmm1, %edx
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %dl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vmovd %eax, %xmm2
; AVX-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $2, %xmm1, %ecx
; AVX-NEXT:    vpextrb $2, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $3, %xmm1, %ecx
; AVX-NEXT:    vpextrb $3, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $4, %xmm1, %ecx
; AVX-NEXT:    vpextrb $4, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $5, %xmm1, %ecx
; AVX-NEXT:    vpextrb $5, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $6, %xmm1, %ecx
; AVX-NEXT:    vpextrb $6, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $7, %xmm1, %ecx
; AVX-NEXT:    vpextrb $7, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $8, %xmm1, %ecx
; AVX-NEXT:    vpextrb $8, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $9, %xmm1, %ecx
; AVX-NEXT:    vpextrb $9, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $10, %xmm1, %ecx
; AVX-NEXT:    vpextrb $10, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $11, %xmm1, %ecx
; AVX-NEXT:    vpextrb $11, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $12, %xmm1, %ecx
; AVX-NEXT:    vpextrb $12, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $13, %xmm1, %ecx
; AVX-NEXT:    vpextrb $13, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $14, %xmm1, %ecx
; AVX-NEXT:    vpextrb $14, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
; AVX-NEXT:    vpextrb $15, %xmm1, %ecx
; AVX-NEXT:    vpextrb $15, %xmm0, %eax
; AVX-NEXT:    cbtw
; AVX-NEXT:    idivb %cl
; AVX-NEXT:    movsbl %ah, %eax
; AVX-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm0
; AVX-NEXT:    retq
  %res = srem <16 x i8> %a, %b
  ret <16 x i8> %res
}
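
; Each idivb above leaves the quotient in %al and the remainder in %ah,
; so every division is followed by movsbl %ah to recover the remainder
; before the lane is reinserted (pinsrb, or the punpcklbw/punpcklwd
; ladder on SSE2).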