; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ

;
; add
;

define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm3, %xmm1
; SSE-NEXT:    paddq %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq %xmm6, %xmm2
; SSE-NEXT:    paddq %xmm7, %xmm3
; SSE-NEXT:    paddq %xmm4, %xmm0
; SSE-NEXT:    paddq %xmm5, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm2, %xmm0
; SSE-NEXT:    paddd %xmm3, %xmm1
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    paddq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vpaddq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddq %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpaddq %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpaddq %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpaddq %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddd %xmm4, %xmm0
; SSE-NEXT:    paddd %xmm5, %xmm1
; SSE-NEXT:    paddd %xmm6, %xmm2
; SSE-NEXT:    paddd %xmm7, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    paddw %xmm2, %xmm0
; SSE-NEXT:    paddw %xmm3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_add_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = add <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $16, %xmm2
; SSE-NEXT:    psrad $16, %xmm2
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    packssdw %xmm2, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    paddw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = sext <8 x i8> %1 to <8 x i32>
  %3 = add <8 x i32> %2, %a1
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}

;
; add to constant
;

define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; sub
;

define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psubq %xmm3, %xmm1
; SSE-NEXT:    psubq %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psubq %xmm6, %xmm2
; SSE-NEXT:    psubq %xmm7, %xmm3
; SSE-NEXT:    psubq %xmm4, %xmm0
; SSE-NEXT:    psubq %xmm5, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm2, %xmm0
; SSE-NEXT:    psubd %xmm3, %xmm1
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT:    psubq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm5
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vpsubq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT:    vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT:    vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT:    vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubq %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpsubq %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpsubq %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT:    vpsubq %zmm3, %zmm1, %zmm1
; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubd %xmm4, %xmm0
; SSE-NEXT:    psubd %xmm5, %xmm1
; SSE-NEXT:    psubd %xmm6, %xmm2
; SSE-NEXT:    psubd %xmm7, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT:    vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubw %xmm2, %xmm0
; SSE-NEXT:    psubw %xmm3, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = sub <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = zext <16 x i8> %x to <16 x i16>
  %b = zext <16 x i8> %y to <16 x i16>
  %c = sub <16 x i16> %a, %b
  %d = trunc <16 x i16> %c to <16 x i8>
  ret <16 x i8> %d
}

;
; sub to constant
;

define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
; AVX512-NEXT:    vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT:    psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pslld $16, %xmm1
; SSE-NEXT:    psrad $16, %xmm1
; SSE-NEXT:    pslld $16, %xmm0
; SSE-NEXT:    psrad $16, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512-NEXT:    vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT:    pand %xmm8, %xmm7
; SSE-NEXT:    pand %xmm8, %xmm6
; SSE-NEXT:    packuswb %xmm7, %xmm6
; SSE-NEXT:    pand %xmm8, %xmm5
; SSE-NEXT:    pand %xmm8, %xmm4
; SSE-NEXT:    packuswb %xmm5, %xmm4
; SSE-NEXT:    packuswb %xmm6, %xmm4
; SSE-NEXT:    pand %xmm8, %xmm3
; SSE-NEXT:    pand %xmm8, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm8, %xmm1
; SSE-NEXT:    pand %xmm8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm4, %xmm0
; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT:    vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT:    vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT:    vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqb %zmm1, %xmm1
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT:    pand %xmm4, %xmm3
; SSE-NEXT:    pand %xmm4, %xmm2
; SSE-NEXT:    packuswb %xmm3, %xmm2
; SSE-NEXT:    pand %xmm4, %xmm1
; SSE-NEXT:    pand %xmm4, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT:    pand %xmm2, %xmm1
; SSE-NEXT:    pand %xmm2, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
  %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %a = zext <16 x i8> %x to <16 x i16>
  %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %c = trunc <16 x i16> %b to <16 x i8>
  ret <16 x i8> %c
}

define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; SSE-NEXT:    psubb %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %a = zext <16 x i8> %x to <16 x i16>
  %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
  %c = trunc <16 x i16> %b to <16 x i8>
  ret <16 x i8> %c
}

;
; mul
;

define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_mul_v4i64_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmuludq %xmm3, %xmm1
; SSE-NEXT:    pmuludq %xmm2, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1-LABEL: trunc_mul_v4i64_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-FAST-PERLANE-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT:    #
kill: def $ymm0 killed $ymm0 def $zmm0 1635; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 1636; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1637; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1638; AVX512F-NEXT: vzeroupper 1639; AVX512F-NEXT: retq 1640; 1641; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: 1642; AVX512BW: # %bb.0: 1643; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1644; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1645; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 1646; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1647; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1648; AVX512BW-NEXT: vzeroupper 1649; AVX512BW-NEXT: retq 1650; 1651; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: 1652; AVX512DQ: # %bb.0: 1653; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1654; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1655; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1656; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 1657; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1658; AVX512DQ-NEXT: vzeroupper 1659; AVX512DQ-NEXT: retq 1660 %1 = mul <4 x i64> %a0, %a1 1661 %2 = trunc <4 x i64> %1 to <4 x i32> 1662 ret <4 x i32> %2 1663} 1664 1665define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 1666; SSE-LABEL: trunc_mul_v8i64_v8i16: 1667; SSE: # %bb.0: 1668; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 1669; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] 1670; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1671; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 1672; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1673; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] 1674; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] 1675; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 1676; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] 1677; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1678; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] 1679; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1680; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1681; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1682; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 1683; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1684; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 1685; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 1686; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1687; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1688; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1689; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 1690; SSE-NEXT: pmullw %xmm6, %xmm0 1691; SSE-NEXT: retq 1692; 1693; AVX1-LABEL: trunc_mul_v8i64_v8i16: 1694; AVX1: # %bb.0: 1695; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] 1696; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 1697; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 1698; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 1699; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 1700; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 1701; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 1702; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1703; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 1704; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1705; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1706; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 1707; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1708; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 1709; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1710; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1711; AVX1-NEXT: vzeroupper 1712; AVX1-NEXT: retq 1713; 1714; 
AVX2-LABEL: trunc_mul_v8i64_v8i16: 1715; AVX2: # %bb.0: 1716; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 1717; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15] 1718; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15] 1719; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1720; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1721; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1722; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15] 1723; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15] 1724; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1725; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1726; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1727; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1728; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1729; AVX2-NEXT: vzeroupper 1730; AVX2-NEXT: retq 1731; 1732; AVX512F-LABEL: trunc_mul_v8i64_v8i16: 1733; AVX512F: # %bb.0: 1734; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 1735; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 1736; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1737; AVX512F-NEXT: vzeroupper 1738; AVX512F-NEXT: retq 1739; 1740; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: 1741; AVX512BW: # %bb.0: 1742; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 1743; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 1744; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1745; AVX512BW-NEXT: vzeroupper 1746; AVX512BW-NEXT: retq 1747; 1748; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: 1749; AVX512DQ: # %bb.0: 1750; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1751; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 1752; AVX512DQ-NEXT: vzeroupper 1753; AVX512DQ-NEXT: retq 1754 %1 = mul <8 x i64> %a0, %a1 1755 %2 = trunc <8 x i64> %1 to <8 x i16> 1756 ret <8 x i16> %2 1757} 1758 1759define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 1760; SSE-LABEL: trunc_mul_v8i32_v8i16: 1761; SSE: # %bb.0: 1762; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 1763; SSE-NEXT: pmuludq %xmm2, %xmm0 1764; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1765; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1766; SSE-NEXT: pmuludq %xmm4, %xmm2 1767; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1768; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1769; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1770; SSE-NEXT: pmuludq %xmm3, %xmm1 1771; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1772; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1773; SSE-NEXT: pmuludq %xmm2, %xmm3 1774; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 1775; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1776; SSE-NEXT: pslld $16, %xmm1 1777; SSE-NEXT: psrad $16, %xmm1 1778; SSE-NEXT: pslld $16, %xmm0 1779; SSE-NEXT: psrad $16, %xmm0 1780; SSE-NEXT: packssdw %xmm1, %xmm0 1781; SSE-NEXT: retq 1782; 1783; AVX1-LABEL: trunc_mul_v8i32_v8i16: 1784; AVX1: # %bb.0: 1785; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 1786; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1787; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1788; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1789; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 1790; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1791; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 1792; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1793; AVX1-NEXT: vzeroupper 1794; AVX1-NEXT: retq 1795; 1796; AVX2-LABEL: trunc_mul_v8i32_v8i16: 1797; AVX2: # 
%bb.0: 1798; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1799; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 1800; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1801; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1802; AVX2-NEXT: vzeroupper 1803; AVX2-NEXT: retq 1804; 1805; AVX512-LABEL: trunc_mul_v8i32_v8i16: 1806; AVX512: # %bb.0: 1807; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1808; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1809; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1810; AVX512-NEXT: vzeroupper 1811; AVX512-NEXT: retq 1812 %1 = mul <8 x i32> %a0, %a1 1813 %2 = trunc <8 x i32> %1 to <8 x i16> 1814 ret <8 x i16> %2 1815} 1816 1817define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 1818; SSE-LABEL: trunc_mul_v16i64_v16i8: 1819; SSE: # %bb.0: 1820; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0 1821; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1 1822; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2 1823; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3 1824; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4 1825; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5 1826; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6 1827; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7 1828; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1829; SSE-NEXT: pand %xmm8, %xmm7 1830; SSE-NEXT: pand %xmm8, %xmm6 1831; SSE-NEXT: packuswb %xmm7, %xmm6 1832; SSE-NEXT: pand %xmm8, %xmm5 1833; SSE-NEXT: pand %xmm8, %xmm4 1834; SSE-NEXT: packuswb %xmm5, %xmm4 1835; SSE-NEXT: packuswb %xmm6, %xmm4 1836; SSE-NEXT: pand %xmm8, %xmm3 1837; SSE-NEXT: pand %xmm8, %xmm2 1838; SSE-NEXT: packuswb %xmm3, %xmm2 1839; SSE-NEXT: pand %xmm8, %xmm1 1840; SSE-NEXT: pand %xmm8, %xmm0 1841; SSE-NEXT: packuswb %xmm1, %xmm0 1842; SSE-NEXT: packuswb %xmm2, %xmm0 1843; SSE-NEXT: packuswb %xmm4, %xmm0 1844; SSE-NEXT: retq 1845; 1846; AVX1-LABEL: trunc_mul_v16i64_v16i8: 1847; AVX1: # %bb.0: 1848; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8 1849; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 1850; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1851; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 1852; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 1853; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 1854; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1855; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 1856; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5 1857; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 1858; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1859; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 1860; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6 1861; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 1862; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1863; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 1864; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] 1865; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 1866; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 1867; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 1868; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 1869; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 1870; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 1871; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1872; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 1873; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 1874; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 1875; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 1876; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 1877; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 1878; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1879; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1880; AVX1-NEXT: vzeroupper 1881; AVX1-NEXT: retq 1882; 1883; AVX2-LABEL: trunc_mul_v16i64_v16i8: 1884; AVX2: # %bb.0: 1885; 
AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 1886; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1 1887; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2 1888; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3 1889; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 1890; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 1891; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 1892; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1893; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 1894; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 1895; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 1896; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1897; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1898; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 1899; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1900; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1901; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1902; AVX2-NEXT: vzeroupper 1903; AVX2-NEXT: retq 1904; 1905; AVX512F-LABEL: trunc_mul_v16i64_v16i8: 1906; AVX512F: # %bb.0: 1907; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 1908; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 1909; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 1910; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 1911; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1912; AVX512F-NEXT: vzeroupper 1913; AVX512F-NEXT: retq 1914; 1915; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: 1916; AVX512BW: # %bb.0: 1917; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 1918; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 1919; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 1920; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 1921; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1922; AVX512BW-NEXT: vzeroupper 1923; AVX512BW-NEXT: retq 1924; 1925; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: 1926; AVX512DQ: # %bb.0: 1927; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 1928; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 1929; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 1930; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 1931; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1932; AVX512DQ-NEXT: vzeroupper 1933; AVX512DQ-NEXT: retq 1934 %1 = mul <16 x i64> %a0, %a1 1935 %2 = trunc <16 x i64> %1 to <16 x i8> 1936 ret <16 x i8> %2 1937} 1938 1939define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 1940; SSE-LABEL: trunc_mul_v16i32_v16i8: 1941; SSE: # %bb.0: 1942; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] 1943; SSE-NEXT: pmuludq %xmm4, %xmm0 1944; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1945; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1946; SSE-NEXT: pmuludq %xmm8, %xmm4 1947; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1948; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 1949; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 1950; SSE-NEXT: pmuludq %xmm5, %xmm1 1951; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1952; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1953; SSE-NEXT: pmuludq %xmm4, %xmm5 1954; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1955; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 1956; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 1957; SSE-NEXT: pmuludq %xmm6, %xmm2 1958; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1959; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] 1960; SSE-NEXT: pmuludq %xmm4, %xmm5 1961; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1962; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 1963; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 1964; SSE-NEXT: pmuludq %xmm7, %xmm3 1965; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1966; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 1967; SSE-NEXT: 
pmuludq %xmm4, %xmm5 1968; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1969; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 1970; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1971; SSE-NEXT: pand %xmm4, %xmm3 1972; SSE-NEXT: pand %xmm4, %xmm2 1973; SSE-NEXT: packuswb %xmm3, %xmm2 1974; SSE-NEXT: pand %xmm4, %xmm1 1975; SSE-NEXT: pand %xmm4, %xmm0 1976; SSE-NEXT: packuswb %xmm1, %xmm0 1977; SSE-NEXT: packuswb %xmm2, %xmm0 1978; SSE-NEXT: retq 1979; 1980; AVX1-LABEL: trunc_mul_v16i32_v16i8: 1981; AVX1: # %bb.0: 1982; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 1983; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1984; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1985; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 1986; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 1987; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1988; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1989; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 1990; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] 1991; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1992; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1993; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 1994; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1995; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 1996; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 1997; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1998; AVX1-NEXT: vzeroupper 1999; AVX1-NEXT: retq 2000; 2001; AVX2-LABEL: trunc_mul_v16i32_v16i8: 2002; AVX2: # %bb.0: 2003; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 2004; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 2005; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 2006; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2007; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2008; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2009; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2010; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2011; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2012; AVX2-NEXT: vzeroupper 2013; AVX2-NEXT: retq 2014; 2015; AVX512-LABEL: trunc_mul_v16i32_v16i8: 2016; AVX512: # %bb.0: 2017; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 2018; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2019; AVX512-NEXT: vzeroupper 2020; AVX512-NEXT: retq 2021 %1 = mul <16 x i32> %a0, %a1 2022 %2 = trunc <16 x i32> %1 to <16 x i8> 2023 ret <16 x i8> %2 2024} 2025 2026define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 2027; SSE-LABEL: trunc_mul_v16i16_v16i8: 2028; SSE: # %bb.0: 2029; SSE-NEXT: pmullw %xmm2, %xmm0 2030; SSE-NEXT: pmullw %xmm3, %xmm1 2031; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2032; SSE-NEXT: pand %xmm2, %xmm1 2033; SSE-NEXT: pand %xmm2, %xmm0 2034; SSE-NEXT: packuswb %xmm1, %xmm0 2035; SSE-NEXT: retq 2036; 2037; AVX1-LABEL: trunc_mul_v16i16_v16i8: 2038; AVX1: # %bb.0: 2039; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2040; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2041; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2042; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2043; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 2044; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 2045; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 2046; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2047; AVX1-NEXT: vzeroupper 2048; AVX1-NEXT: retq 2049; 2050; AVX2-LABEL: trunc_mul_v16i16_v16i8: 2051; AVX2: # %bb.0: 2052; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2053; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2054; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2055; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2056; AVX2-NEXT: vzeroupper 2057; AVX2-NEXT: retq 2058; 2059; AVX512F-LABEL: 
trunc_mul_v16i16_v16i8: 2060; AVX512F: # %bb.0: 2061; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2062; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2063; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2064; AVX512F-NEXT: vzeroupper 2065; AVX512F-NEXT: retq 2066; 2067; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: 2068; AVX512BW: # %bb.0: 2069; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2070; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2071; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2072; AVX512BW-NEXT: vzeroupper 2073; AVX512BW-NEXT: retq 2074; 2075; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: 2076; AVX512DQ: # %bb.0: 2077; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2078; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2079; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2080; AVX512DQ-NEXT: vzeroupper 2081; AVX512DQ-NEXT: retq 2082 %1 = mul <16 x i16> %a0, %a1 2083 %2 = trunc <16 x i16> %1 to <16 x i8> 2084 ret <16 x i8> %2 2085} 2086 2087define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { 2088; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2089; SSE: # %bb.0: 2090; SSE-NEXT: pxor %xmm3, %xmm3 2091; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 2092; SSE-NEXT: pslld $16, %xmm2 2093; SSE-NEXT: psrad $16, %xmm2 2094; SSE-NEXT: pslld $16, %xmm1 2095; SSE-NEXT: psrad $16, %xmm1 2096; SSE-NEXT: packssdw %xmm2, %xmm1 2097; SSE-NEXT: pmullw %xmm1, %xmm0 2098; SSE-NEXT: retq 2099; 2100; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2101; AVX1: # %bb.0: 2102; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2103; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 2104; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2105; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2106; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2107; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2108; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2109; AVX1-NEXT: vzeroupper 2110; AVX1-NEXT: retq 2111; 2112; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2113; AVX2: # %bb.0: 2114; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2115; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2116; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2117; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2118; AVX2-NEXT: vzeroupper 2119; AVX2-NEXT: retq 2120; 2121; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2122; AVX512: # %bb.0: 2123; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2124; AVX512-NEXT: vpmovdw %zmm1, %ymm1 2125; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2126; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2127; AVX512-NEXT: vzeroupper 2128; AVX512-NEXT: retq 2129 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, 
i32 6, i32 7> 2130 %2 = zext <8 x i8> %1 to <8 x i32> 2131 %3 = mul <8 x i32> %2, %a1 2132 %4 = trunc <8 x i32> %3 to <8 x i16> 2133 ret <8 x i16> %4 2134} 2135 2136; 2137; mul to constant 2138; 2139 2140define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 2141; SSE-LABEL: trunc_mul_const_v4i64_v4i32: 2142; SSE: # %bb.0: 2143; SSE-NEXT: xorps %xmm2, %xmm2 2144; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 2145; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2146; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] 2147; SSE-NEXT: movaps %xmm2, %xmm0 2148; SSE-NEXT: retq 2149; 2150; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: 2151; AVX1: # %bb.0: 2152; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2153; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2154; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2155; AVX1-NEXT: vzeroupper 2156; AVX1-NEXT: retq 2157; 2158; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: 2159; AVX2-SLOW: # %bb.0: 2160; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2161; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2162; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2163; AVX2-SLOW-NEXT: vzeroupper 2164; AVX2-SLOW-NEXT: retq 2165; 2166; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32: 2167; AVX2-FAST-ALL: # %bb.0: 2168; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 2169; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 2170; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2171; AVX2-FAST-ALL-NEXT: vzeroupper 2172; AVX2-FAST-ALL-NEXT: retq 2173; 2174; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32: 2175; AVX2-FAST-PERLANE: # %bb.0: 2176; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2177; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2178; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2179; AVX2-FAST-PERLANE-NEXT: vzeroupper 2180; AVX2-FAST-PERLANE-NEXT: retq 2181; 2182; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: 2183; AVX512: # %bb.0: 2184; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2185; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2186; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2187; AVX512-NEXT: vzeroupper 2188; AVX512-NEXT: retq 2189 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2190 %2 = trunc <4 x i64> %1 to <4 x i32> 2191 ret <4 x i32> %2 2192} 2193 2194define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 2195; SSE-LABEL: trunc_mul_const_v8i64_v8i16: 2196; SSE: # %bb.0: 2197; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2198; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2199; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2200; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2201; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2202; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2203; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2204; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2205; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2206; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2207; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2208; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2209; SSE-NEXT: retq 2210; 2211; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: 2212; AVX1: # %bb.0: 2213; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 2214; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2215; AVX1-NEXT: vextractf128 $1, %ymm1, 
%xmm3 2216; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2217; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2218; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2219; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2220; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2221; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2222; AVX1-NEXT: vzeroupper 2223; AVX1-NEXT: retq 2224; 2225; AVX2-LABEL: trunc_mul_const_v8i64_v8i16: 2226; AVX2: # %bb.0: 2227; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2228; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2229; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2230; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2231; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2232; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2233; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2234; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2235; AVX2-NEXT: vzeroupper 2236; AVX2-NEXT: retq 2237; 2238; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: 2239; AVX512: # %bb.0: 2240; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2241; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2242; AVX512-NEXT: vzeroupper 2243; AVX512-NEXT: retq 2244 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 2245 %2 = trunc <8 x i64> %1 to <8 x i16> 2246 ret <8 x i16> %2 2247} 2248 2249define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 2250; SSE-LABEL: trunc_mul_const_v8i32_v8i16: 2251; SSE: # %bb.0: 2252; SSE-NEXT: pslld $16, %xmm1 2253; SSE-NEXT: psrad $16, %xmm1 2254; SSE-NEXT: pslld $16, %xmm0 2255; SSE-NEXT: psrad $16, %xmm0 2256; SSE-NEXT: packssdw %xmm1, %xmm0 2257; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2258; SSE-NEXT: retq 2259; 2260; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: 2261; AVX1: # %bb.0: 2262; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2263; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 2264; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2265; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2266; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2267; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2268; AVX1-NEXT: vzeroupper 2269; AVX1-NEXT: retq 2270; 2271; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: 2272; AVX2: # %bb.0: 2273; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2274; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2275; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2276; AVX2-NEXT: vzeroupper 2277; AVX2-NEXT: retq 2278; 2279; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: 2280; AVX512: # %bb.0: 2281; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2282; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2283; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2284; AVX512-NEXT: vzeroupper 2285; AVX512-NEXT: retq 2286 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2287 %2 = trunc <8 x i32> %1 to <8 x i16> 2288 ret <8 x i16> %2 2289} 2290 2291define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 2292; SSE-LABEL: trunc_mul_const_v16i64_v16i8: 2293; SSE: # %bb.0: 2294; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2295; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2296; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2297; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2298; 
SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 2299; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 2300; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 2301; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2302; SSE-NEXT: pand %xmm8, %xmm7 2303; SSE-NEXT: pand %xmm8, %xmm6 2304; SSE-NEXT: packuswb %xmm7, %xmm6 2305; SSE-NEXT: pand %xmm8, %xmm5 2306; SSE-NEXT: pand %xmm8, %xmm4 2307; SSE-NEXT: packuswb %xmm5, %xmm4 2308; SSE-NEXT: packuswb %xmm6, %xmm4 2309; SSE-NEXT: pand %xmm8, %xmm3 2310; SSE-NEXT: pand %xmm8, %xmm2 2311; SSE-NEXT: packuswb %xmm3, %xmm2 2312; SSE-NEXT: pand %xmm8, %xmm1 2313; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2314; SSE-NEXT: packuswb %xmm1, %xmm0 2315; SSE-NEXT: packuswb %xmm2, %xmm0 2316; SSE-NEXT: packuswb %xmm4, %xmm0 2317; SSE-NEXT: retq 2318; 2319; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: 2320; AVX1: # %bb.0: 2321; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8 2322; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2323; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2324; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5 2325; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2326; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2327; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6 2328; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2329; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 2330; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 2331; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2332; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 2333; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] 2334; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2335; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 2336; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 2337; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2338; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 2339; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 2340; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2341; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2342; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 2343; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2344; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2345; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 2346; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 2347; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2348; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2349; AVX1-NEXT: vzeroupper 2350; AVX1-NEXT: retq 2351; 2352; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: 2353; AVX2: # %bb.0: 2354; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2355; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2356; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2357; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 2358; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 2359; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 2360; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 2361; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 2362; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 2363; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 2364; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 2365; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2366; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 2367; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 2368; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2369; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2370; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2371; AVX2-NEXT: vzeroupper 2372; AVX2-NEXT: retq 2373; 2374; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: 2375; 
AVX512F: # %bb.0: 2376; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2377; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2378; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 2379; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 2380; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2381; AVX512F-NEXT: vzeroupper 2382; AVX512F-NEXT: retq 2383; 2384; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: 2385; AVX512BW: # %bb.0: 2386; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2387; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2388; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 2389; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 2390; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2391; AVX512BW-NEXT: vzeroupper 2392; AVX512BW-NEXT: retq 2393; 2394; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: 2395; AVX512DQ: # %bb.0: 2396; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2397; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2398; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 2399; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 2400; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2401; AVX512DQ-NEXT: vzeroupper 2402; AVX512DQ-NEXT: retq 2403 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 2404 %2 = trunc <16 x i64> %1 to <16 x i8> 2405 ret <16 x i8> %2 2406} 2407 2408define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 2409; SSE-LABEL: trunc_mul_const_v16i32_v16i8: 2410; SSE: # %bb.0: 2411; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3] 2412; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 2413; SSE-NEXT: pmuludq %xmm4, %xmm0 2414; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2415; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2416; SSE-NEXT: pmuludq %xmm5, %xmm4 2417; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2418; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2419; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7] 2420; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 2421; SSE-NEXT: pmuludq %xmm4, %xmm1 2422; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2423; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2424; SSE-NEXT: pmuludq %xmm5, %xmm4 2425; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2426; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2427; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11] 2428; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] 2429; SSE-NEXT: pmuludq %xmm4, %xmm2 2430; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2431; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2432; SSE-NEXT: pmuludq %xmm5, %xmm4 2433; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2434; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2435; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15] 2436; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 2437; SSE-NEXT: pmuludq %xmm4, %xmm3 2438; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2439; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2440; SSE-NEXT: pmuludq %xmm5, %xmm4 2441; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2442; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2443; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2444; SSE-NEXT: pand %xmm4, %xmm3 2445; SSE-NEXT: pand %xmm4, %xmm2 2446; SSE-NEXT: packuswb %xmm3, %xmm2 2447; SSE-NEXT: pand %xmm4, %xmm1 2448; SSE-NEXT: pand %xmm4, %xmm0 2449; SSE-NEXT: packuswb %xmm1, %xmm0 
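; Note on the SSE2 lowering in this block: pmuludq only multiplies the even
; 32-bit lanes, so each pshufd/pmuludq/punpckldq triple above emulates a full
; v4i32 multiply by handling the odd lanes separately. The pand ops then keep
; just the low byte of every 32-bit lane, and the packuswb below merges the
; two packed halves (elements 0-7 in xmm0, 8-15 in xmm2) into the v16i8 result.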
2450; SSE-NEXT: packuswb %xmm2, %xmm0 2451; SSE-NEXT: retq 2452; 2453; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: 2454; AVX1: # %bb.0: 2455; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 2456; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2457; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2458; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 2459; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2460; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2461; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255] 2462; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2463; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2464; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2465; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2466; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2467; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 2468; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2469; AVX1-NEXT: vzeroupper 2470; AVX1-NEXT: retq 2471; 2472; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: 2473; AVX2: # %bb.0: 2474; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2475; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2476; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 2477; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2478; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2479; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2480; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2481; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2482; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2483; AVX2-NEXT: vzeroupper 2484; AVX2-NEXT: retq 2485; 2486; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: 2487; AVX512: # %bb.0: 2488; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2489; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2490; AVX512-NEXT: vzeroupper 2491; AVX512-NEXT: retq 2492 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2493 %2 = trunc <16 x i32> %1 to <16 x i8> 2494 ret <16 x i8> %2 2495} 2496 2497define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 2498; SSE-LABEL: trunc_mul_const_v16i16_v16i8: 2499; SSE: # %bb.0: 2500; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2501; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2502; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2503; SSE-NEXT: pand %xmm2, %xmm1 2504; SSE-NEXT: pand %xmm2, %xmm0 2505; SSE-NEXT: packuswb %xmm1, %xmm0 2506; SSE-NEXT: retq 2507; 2508; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: 2509; AVX1: # %bb.0: 2510; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2511; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2512; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2513; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2514; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2515; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 2516; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2517; AVX1-NEXT: vzeroupper 2518; AVX1-NEXT: retq 2519; 2520; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: 2521; AVX2: # %bb.0: 2522; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2523; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2524; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2525; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2526; AVX2-NEXT: vzeroupper 2527; AVX2-NEXT: retq 2528; 2529; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: 2530; AVX512F: # %bb.0: 2531; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2532; 
AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2533; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2534; AVX512F-NEXT: vzeroupper 2535; AVX512F-NEXT: retq 2536; 2537; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: 2538; AVX512BW: # %bb.0: 2539; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2540; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2541; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2542; AVX512BW-NEXT: vzeroupper 2543; AVX512BW-NEXT: retq 2544; 2545; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: 2546; AVX512DQ: # %bb.0: 2547; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2548; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2549; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2550; AVX512DQ-NEXT: vzeroupper 2551; AVX512DQ-NEXT: retq 2552 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 2553 %2 = trunc <16 x i16> %1 to <16 x i8> 2554 ret <16 x i8> %2 2555} 2556 2557; 2558; and 2559; 2560 2561define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2562; SSE-LABEL: trunc_and_v4i64_v4i32: 2563; SSE: # %bb.0: 2564; SSE-NEXT: andps %xmm3, %xmm1 2565; SSE-NEXT: andps %xmm2, %xmm0 2566; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2567; SSE-NEXT: retq 2568; 2569; AVX1-LABEL: trunc_and_v4i64_v4i32: 2570; AVX1: # %bb.0: 2571; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2572; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2573; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2574; AVX1-NEXT: vzeroupper 2575; AVX1-NEXT: retq 2576; 2577; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: 2578; AVX2-SLOW: # %bb.0: 2579; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 2580; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2581; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2582; AVX2-SLOW-NEXT: vzeroupper 2583; AVX2-SLOW-NEXT: retq 2584; 2585; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32: 2586; AVX2-FAST-ALL: # %bb.0: 2587; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 2588; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 2589; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 2590; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2591; AVX2-FAST-ALL-NEXT: vzeroupper 2592; AVX2-FAST-ALL-NEXT: retq 2593; 2594; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32: 2595; AVX2-FAST-PERLANE: # %bb.0: 2596; AVX2-FAST-PERLANE-NEXT: vandps %ymm1, %ymm0, %ymm0 2597; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2598; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2599; AVX2-FAST-PERLANE-NEXT: vzeroupper 2600; AVX2-FAST-PERLANE-NEXT: retq 2601; 2602; AVX512-LABEL: trunc_and_v4i64_v4i32: 2603; AVX512: # %bb.0: 2604; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2605; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2606; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2607; AVX512-NEXT: vzeroupper 2608; AVX512-NEXT: retq 2609 %1 = and <4 x i64> %a0, %a1 2610 %2 = trunc <4 x i64> %1 to <4 x i32> 2611 ret <4 x i32> %2 2612} 2613 2614define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 2615; 
SSE-LABEL: trunc_and_v8i64_v8i16: 2616; SSE: # %bb.0: 2617; SSE-NEXT: pand %xmm6, %xmm2 2618; SSE-NEXT: pand %xmm7, %xmm3 2619; SSE-NEXT: pand %xmm4, %xmm0 2620; SSE-NEXT: pand %xmm5, %xmm1 2621; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2622; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2623; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2624; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2625; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2626; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2627; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2628; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2629; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2630; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2631; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2632; SSE-NEXT: retq 2633; 2634; AVX1-LABEL: trunc_and_v8i64_v8i16: 2635; AVX1: # %bb.0: 2636; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] 2637; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 2638; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2639; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2640; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2641; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 2642; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2643; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2644; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2645; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2646; AVX1-NEXT: vzeroupper 2647; AVX1-NEXT: retq 2648; 2649; AVX2-LABEL: trunc_and_v8i64_v8i16: 2650; AVX2: # %bb.0: 2651; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2652; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2653; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2654; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2655; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2656; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2657; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2658; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2659; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2660; AVX2-NEXT: vzeroupper 2661; AVX2-NEXT: retq 2662; 2663; AVX512-LABEL: trunc_and_v8i64_v8i16: 2664; AVX512: # %bb.0: 2665; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 2666; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2667; AVX512-NEXT: vzeroupper 2668; AVX512-NEXT: retq 2669 %1 = and <8 x i64> %a0, %a1 2670 %2 = trunc <8 x i64> %1 to <8 x i16> 2671 ret <8 x i16> %2 2672} 2673 2674define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 2675; SSE-LABEL: trunc_and_v8i32_v8i16: 2676; SSE: # %bb.0: 2677; SSE-NEXT: pand %xmm2, %xmm0 2678; SSE-NEXT: pand %xmm3, %xmm1 2679; SSE-NEXT: pslld $16, %xmm1 2680; SSE-NEXT: psrad $16, %xmm1 2681; SSE-NEXT: pslld $16, %xmm0 2682; SSE-NEXT: psrad $16, %xmm0 2683; SSE-NEXT: packssdw %xmm1, %xmm0 2684; SSE-NEXT: retq 2685; 2686; AVX1-LABEL: trunc_and_v8i32_v8i16: 2687; AVX1: # %bb.0: 2688; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2689; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2690; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 2691; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2692; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2693; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2694; AVX1-NEXT: vzeroupper 2695; AVX1-NEXT: retq 2696; 2697; AVX2-LABEL: trunc_and_v8i32_v8i16: 2698; AVX2: # %bb.0: 2699; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2700; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_and_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7
; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm6, %ymm8, %ymm6
; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm3
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm8, %ymm3
; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm7, %ymm8, %ymm7
; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm6, %ymm8, %ymm6
; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm5, %ymm8, %ymm3
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm8, %ymm3
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_and_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm3, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: pand %xmm2, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm1, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm5, %xmm0
; SSE-NEXT: packuswb %xmm6, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_and_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm1, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_and_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = and <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; and to constant
;
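; Note: when the logic op takes a build vector of constants, the constant can
; itself be truncated at compile time, so the checks below expect the 'and' to
; be applied to the already-narrowed value via a narrow constant-pool operand.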

define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_and_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; xor
;
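; As with the 'add' and 'and' tests above, the 'xor' is expected to execute at
; the source width, with only the final result being truncated.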

define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_xor_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm3, %xmm1
; SSE-NEXT: xorps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_xor_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm6, %xmm2
; SSE-NEXT: pxor %xmm7, %xmm3
; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: pxor %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_xor_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_xor_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_xor_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm4, %xmm0
; SSE-NEXT: pxor %xmm5, %xmm1
; SSE-NEXT: pxor %xmm6, %xmm2
; SSE-NEXT: pxor %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_xor_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: pxor %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = xor <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; xor to constant
;
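; Constant variants of the 'xor' tests: as in the 'and to constant' group, the
; constant folds through the truncation and is applied at the narrow width.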

define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; or
;
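; 'or' variants of the same trunc(binop(x,y)) pattern tested above.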

define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: orps %xmm3, %xmm1
; SSE-NEXT: orps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_or_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: por %xmm7, %xmm3
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: por %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; or to constant
;
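; Constant variants of the 'or' tests, mirroring the 'and to constant' and
; 'xor to constant' groups above.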
define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

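; Without AVX, the v8i32 -> v8i16 truncation below sign-fills each dword
; (pslld $16 + psrad $16) so that packssdw can narrow the lanes without
; the signed saturation altering any of the low 16 bits.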
define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

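; vpmovqb only narrows eight i64 elements per zmm register, so for v16i64
; the AVX512 lowering below should need two vpmovqb results joined by a
; vpunpcklqdq before the constant is or'd in.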
define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

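; A direct word-to-byte truncate (vpmovwb) requires AVX512BW; the AVX512F
; and AVX512DQ runs are expected to zero-extend to dwords first and then
; truncate with vpmovdb.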
define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; complex patterns - often created by vectorizer
;
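; Each test below sign-extends two v4i32 inputs to v4i64, multiplies, adds
; and truncates back to v4i32. Since only the low 32 bits of each lane are
; demanded, the AVX runs should collapse the whole sequence to vpmulld and
; vpaddd; plain SSE2 lacks pmulld (an SSE4.1 instruction), so the multiply
; is expanded via pmuludq on shuffled halves instead.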
define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %1, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}