1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 10 11; 12; Variable Rotates 13; 14 15define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { 16; AVX1-LABEL: var_rotate_v4i64: 17; AVX1: # %bb.0: 18; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] 19; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 20; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 21; AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 22; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 23; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm6 24; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] 25; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 26; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] 27; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm6 28; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] 29; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 30; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] 31; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 32; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm4 33; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] 34; AVX1-NEXT: vpsrlq %xmm2, %xmm5, %xmm2 35; AVX1-NEXT: 
vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] 36; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm4 37; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] 38; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm0 39; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7] 40; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 41; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 42; AVX1-NEXT: retq 43; 44; AVX2-LABEL: var_rotate_v4i64: 45; AVX2: # %bb.0: 46; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64] 47; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm2 48; AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm1 49; AVX2-NEXT: vpsrlvq %ymm2, %ymm0, %ymm0 50; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 51; AVX2-NEXT: retq 52; 53; AVX512F-LABEL: var_rotate_v4i64: 54; AVX512F: # %bb.0: 55; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 56; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 57; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 58; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 59; AVX512F-NEXT: retq 60; 61; AVX512VL-LABEL: var_rotate_v4i64: 62; AVX512VL: # %bb.0: 63; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0 64; AVX512VL-NEXT: retq 65; 66; AVX512BW-LABEL: var_rotate_v4i64: 67; AVX512BW: # %bb.0: 68; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 69; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 70; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 71; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 72; AVX512BW-NEXT: retq 73; 74; AVX512VLBW-LABEL: var_rotate_v4i64: 75; AVX512VLBW: # %bb.0: 76; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0 77; AVX512VLBW-NEXT: retq 78; 79; XOPAVX1-LABEL: var_rotate_v4i64: 80; XOPAVX1: # %bb.0: 81; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 82; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 83; XOPAVX1-NEXT: vprotq %xmm2, %xmm3, %xmm2 84; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 85; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 86; XOPAVX1-NEXT: retq 87; 88; XOPAVX2-LABEL: var_rotate_v4i64: 89; XOPAVX2: # %bb.0: 90; XOPAVX2-NEXT: 
vextracti128 $1, %ymm1, %xmm2 91; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 92; XOPAVX2-NEXT: vprotq %xmm2, %xmm3, %xmm2 93; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 94; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 95; XOPAVX2-NEXT: retq 96 %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b 97 %shl = shl <4 x i64> %a, %b 98 %lshr = lshr <4 x i64> %a, %b64 99 %or = or <4 x i64> %shl, %lshr 100 ret <4 x i64> %or 101} 102 103define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { 104; AVX1-LABEL: var_rotate_v8i32: 105; AVX1: # %bb.0: 106; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 107; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,31,31,31] 108; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 109; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 110; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216] 111; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 112; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 113; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] 114; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 115; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] 116; AVX1-NEXT: vpmuludq %xmm5, %xmm7, %xmm5 117; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm2 118; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] 119; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] 120; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2] 121; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] 122; AVX1-NEXT: vpor %xmm6, %xmm2, %xmm2 123; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 124; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 125; AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 126; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 127; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 128; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 129; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 130; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 131; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 132; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 133; AVX1-NEXT: vpshufd 
{{.*#+}} xmm3 = xmm3[0,0,2,2] 134; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 135; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 136; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 137; AVX1-NEXT: retq 138; 139; AVX2-LABEL: var_rotate_v8i32: 140; AVX2: # %bb.0: 141; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31] 142; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 143; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 144; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32] 145; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1 146; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 147; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 148; AVX2-NEXT: retq 149; 150; AVX512F-LABEL: var_rotate_v8i32: 151; AVX512F: # %bb.0: 152; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 153; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 154; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 155; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 156; AVX512F-NEXT: retq 157; 158; AVX512VL-LABEL: var_rotate_v8i32: 159; AVX512VL: # %bb.0: 160; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0 161; AVX512VL-NEXT: retq 162; 163; AVX512BW-LABEL: var_rotate_v8i32: 164; AVX512BW: # %bb.0: 165; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 166; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 167; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 168; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 169; AVX512BW-NEXT: retq 170; 171; AVX512VLBW-LABEL: var_rotate_v8i32: 172; AVX512VLBW: # %bb.0: 173; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0 174; AVX512VLBW-NEXT: retq 175; 176; XOPAVX1-LABEL: var_rotate_v8i32: 177; XOPAVX1: # %bb.0: 178; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 179; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 180; XOPAVX1-NEXT: vprotd %xmm2, %xmm3, %xmm2 181; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 182; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 183; XOPAVX1-NEXT: retq 184; 185; XOPAVX2-LABEL: var_rotate_v8i32: 186; XOPAVX2: # %bb.0: 
187; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 188; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 189; XOPAVX2-NEXT: vprotd %xmm2, %xmm3, %xmm2 190; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 191; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 192; XOPAVX2-NEXT: retq 193 %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b 194 %shl = shl <8 x i32> %a, %b 195 %lshr = lshr <8 x i32> %a, %b32 196 %or = or <8 x i32> %shl, %lshr 197 ret <8 x i32> %or 198} 199 200define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { 201; AVX1-LABEL: var_rotate_v16i16: 202; AVX1: # %bb.0: 203; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 204; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] 205; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 206; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 207; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 208; AVX1-NEXT: vpslld $23, %xmm5, %xmm5 209; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216] 210; AVX1-NEXT: vpaddd %xmm6, %xmm5, %xmm5 211; AVX1-NEXT: vcvttps2dq %xmm5, %xmm5 212; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero 213; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 214; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2 215; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 216; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 217; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 218; AVX1-NEXT: vpmulhuw %xmm2, %xmm5, %xmm7 219; AVX1-NEXT: vpmullw %xmm2, %xmm5, %xmm2 220; AVX1-NEXT: vpor %xmm7, %xmm2, %xmm2 221; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 222; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] 223; AVX1-NEXT: vpslld $23, %xmm3, %xmm3 224; AVX1-NEXT: vpaddd %xmm6, %xmm3, %xmm3 225; AVX1-NEXT: vcvttps2dq %xmm3, %xmm3 226; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 227; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 228; 
AVX1-NEXT: vpaddd %xmm6, %xmm1, %xmm1 229; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 230; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 231; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3 232; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 233; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 234; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 235; AVX1-NEXT: retq 236; 237; AVX2-LABEL: var_rotate_v16i16: 238; AVX2: # %bb.0: 239; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 240; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] 241; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 242; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] 243; AVX2-NEXT: vpsllvd %ymm4, %ymm3, %ymm4 244; AVX2-NEXT: vpsrld $16, %ymm4, %ymm4 245; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] 246; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] 247; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5 248; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5 249; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4 250; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 251; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1 252; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] 253; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3 254; AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 255; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] 256; AVX2-NEXT: vpsrlvd 
%ymm1, %ymm0, %ymm0 257; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 258; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 259; AVX2-NEXT: vpor %ymm0, %ymm4, %ymm0 260; AVX2-NEXT: retq 261; 262; AVX512F-LABEL: var_rotate_v16i16: 263; AVX512F: # %bb.0: 264; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 265; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 266; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 267; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 268; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 269; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1 270; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 271; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 272; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0 273; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 274; AVX512F-NEXT: retq 275; 276; AVX512VL-LABEL: var_rotate_v16i16: 277; AVX512VL: # %bb.0: 278; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 279; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 280; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm2 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 281; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 282; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 283; AVX512VL-NEXT: vpsubw %ymm1, %ymm3, %ymm1 284; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 285; AVX512VL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 286; AVX512VL-NEXT: vpord %zmm0, %zmm2, %zmm0 287; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 288; AVX512VL-NEXT: retq 289; 290; AVX512BW-LABEL: var_rotate_v16i16: 291; AVX512BW: # %bb.0: 292; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 293; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 294; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 295; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 296; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 297; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 298; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 299; AVX512BW-NEXT: retq 300; 301; AVX512VLBW-LABEL: var_rotate_v16i16: 302; AVX512VLBW: # %bb.0: 303; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 304; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm2 305; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] 306; AVX512VLBW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 307; AVX512VLBW-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 308; AVX512VLBW-NEXT: vpor %ymm0, %ymm2, %ymm0 309; AVX512VLBW-NEXT: retq 310; 311; XOPAVX1-LABEL: var_rotate_v16i16: 312; XOPAVX1: # %bb.0: 313; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 314; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 315; XOPAVX1-NEXT: vprotw %xmm2, %xmm3, %xmm2 316; XOPAVX1-NEXT: vprotw 
%xmm1, %xmm0, %xmm0 317; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 318; XOPAVX1-NEXT: retq 319; 320; XOPAVX2-LABEL: var_rotate_v16i16: 321; XOPAVX2: # %bb.0: 322; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 323; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 324; XOPAVX2-NEXT: vprotw %xmm2, %xmm3, %xmm2 325; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 326; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 327; XOPAVX2-NEXT: retq 328 %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b 329 %shl = shl <16 x i16> %a, %b 330 %lshr = lshr <16 x i16> %a, %b16 331 %or = or <16 x i16> %shl, %lshr 332 ret <16 x i16> %or 333} 334 335define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { 336; AVX1-LABEL: var_rotate_v32i8: 337; AVX1: # %bb.0: 338; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 339; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 340; AVX1-NEXT: vmovdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] 341; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3 342; AVX1-NEXT: vpsllw $4, %xmm2, %xmm5 343; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] 344; AVX1-NEXT: vpand %xmm9, %xmm5, %xmm5 345; AVX1-NEXT: vpor %xmm3, %xmm5, %xmm3 346; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 347; AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 348; AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 349; AVX1-NEXT: vpsrlw $6, %xmm2, %xmm3 350; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] 351; AVX1-NEXT: vpand %xmm10, %xmm3, %xmm3 352; AVX1-NEXT: vpsllw $2, %xmm2, %xmm4 353; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] 354; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 355; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 356; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm4 357; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 358; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm3 359; AVX1-NEXT: vmovdqa 
{{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 360; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 361; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm7 362; AVX1-NEXT: vpor %xmm3, %xmm7, %xmm3 363; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 364; AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm2, %xmm2 365; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 366; AVX1-NEXT: vpand %xmm8, %xmm3, %xmm3 367; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 368; AVX1-NEXT: vpand %xmm9, %xmm4, %xmm4 369; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 370; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 371; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 372; AVX1-NEXT: vpsrlw $6, %xmm0, %xmm3 373; AVX1-NEXT: vpand %xmm10, %xmm3, %xmm3 374; AVX1-NEXT: vpsllw $2, %xmm0, %xmm4 375; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 376; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 377; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 378; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 379; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm3 380; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 381; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm4 382; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3 383; AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 384; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 385; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 386; AVX1-NEXT: retq 387; 388; AVX2-LABEL: var_rotate_v32i8: 389; AVX2: # %bb.0: 390; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 391; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 392; AVX2-NEXT: vpsllw $4, %ymm0, %ymm3 393; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 394; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 395; AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 396; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 397; AVX2-NEXT: vpsrlw $6, %ymm0, %ymm2 398; AVX2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 399; AVX2-NEXT: vpsllw $2, %ymm0, %ymm3 400; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 401; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 402; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 403; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 404; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 405; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm3 406; AVX2-NEXT: vpand 
{{.*}}(%rip), %ymm3, %ymm3 407; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 408; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 409; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 410; AVX2-NEXT: retq 411; 412; AVX512F-LABEL: var_rotate_v32i8: 413; AVX512F: # %bb.0: 414; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm2 415; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 416; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm3 417; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 418; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 419; AVX512F-NEXT: vpsllw $5, %ymm1, %ymm1 420; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 421; AVX512F-NEXT: vpsrlw $6, %ymm0, %ymm2 422; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 423; AVX512F-NEXT: vpsllw $2, %ymm0, %ymm3 424; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 425; AVX512F-NEXT: vpor %ymm2, %ymm3, %ymm2 426; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 427; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 428; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm2 429; AVX512F-NEXT: vpsrlw $7, %ymm0, %ymm3 430; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 431; AVX512F-NEXT: vpor %ymm3, %ymm2, %ymm2 432; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm1 433; AVX512F-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 434; AVX512F-NEXT: retq 435; 436; AVX512VL-LABEL: var_rotate_v32i8: 437; AVX512VL: # %bb.0: 438; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2 439; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 440; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm3 441; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 442; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2 443; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 444; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 445; AVX512VL-NEXT: vpsrlw $6, %ymm0, %ymm2 446; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 447; AVX512VL-NEXT: vpsllw $2, %ymm0, %ymm3 448; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 449; AVX512VL-NEXT: vpor %ymm2, %ymm3, %ymm2 450; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 451; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 452; 
AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 453; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 454; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 455; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 456; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 457; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 458; AVX512VL-NEXT: retq 459; 460; AVX512BW-LABEL: var_rotate_v32i8: 461; AVX512BW: # %bb.0: 462; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 463; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2 464; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 465; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 466; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 467; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero 468; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 469; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 470; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 471; AVX512BW-NEXT: retq 472; 473; AVX512VLBW-LABEL: var_rotate_v32i8: 474; AVX512VLBW: # %bb.0: 475; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 476; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2 477; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 478; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 479; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 480; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero 481; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 482; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 483; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 484; AVX512VLBW-NEXT: retq 485; 486; XOPAVX1-LABEL: var_rotate_v32i8: 487; XOPAVX1: # %bb.0: 488; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 489; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 490; XOPAVX1-NEXT: vprotb %xmm2, %xmm3, %xmm2 491; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 492; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 493; XOPAVX1-NEXT: retq 494; 495; XOPAVX2-LABEL: var_rotate_v32i8: 496; XOPAVX2: # %bb.0: 497; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 498; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 499; XOPAVX2-NEXT: vprotb %xmm2, %xmm3, %xmm2 500; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 501; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 502; XOPAVX2-NEXT: retq 503 %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 504 %shl = shl <32 x i8> %a, %b 505 %lshr = lshr <32 x i8> %a, %b8 506 %or = or <32 x i8> %shl, %lshr 507 ret <32 x i8> %or 508} 509 510; 511; Uniform Variable Rotates 512; 513 514define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { 515; AVX1-LABEL: splatvar_rotate_v4i64: 516; AVX1: # %bb.0: 517; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm1[0,0] 518; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64] 519; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 520; AVX1-NEXT: vextractf128 $1, %ymm0, 
%xmm3 521; AVX1-NEXT: vpsllq %xmm1, %xmm3, %xmm4 522; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 523; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 524; AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm3 525; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 526; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 527; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 528; AVX1-NEXT: retq 529; 530; AVX2-LABEL: splatvar_rotate_v4i64: 531; AVX2: # %bb.0: 532; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2 533; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64] 534; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1 535; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 536; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 537; AVX2-NEXT: retq 538; 539; AVX512F-LABEL: splatvar_rotate_v4i64: 540; AVX512F: # %bb.0: 541; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 542; AVX512F-NEXT: vpbroadcastq %xmm1, %ymm1 543; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 544; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 545; AVX512F-NEXT: retq 546; 547; AVX512VL-LABEL: splatvar_rotate_v4i64: 548; AVX512VL: # %bb.0: 549; AVX512VL-NEXT: vpbroadcastq %xmm1, %ymm1 550; AVX512VL-NEXT: vprolvq %ymm1, %ymm0, %ymm0 551; AVX512VL-NEXT: retq 552; 553; AVX512BW-LABEL: splatvar_rotate_v4i64: 554; AVX512BW: # %bb.0: 555; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 556; AVX512BW-NEXT: vpbroadcastq %xmm1, %ymm1 557; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 558; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 559; AVX512BW-NEXT: retq 560; 561; AVX512VLBW-LABEL: splatvar_rotate_v4i64: 562; AVX512VLBW: # %bb.0: 563; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %ymm1 564; AVX512VLBW-NEXT: vprolvq %ymm1, %ymm0, %ymm0 565; AVX512VLBW-NEXT: retq 566; 567; XOPAVX1-LABEL: splatvar_rotate_v4i64: 568; XOPAVX1: # %bb.0: 569; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] 570; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 571; XOPAVX1-NEXT: vprotq %xmm1, %xmm2, %xmm2 572; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 573; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 574; 
XOPAVX1-NEXT: retq 575; 576; XOPAVX2-LABEL: splatvar_rotate_v4i64: 577; XOPAVX2: # %bb.0: 578; XOPAVX2-NEXT: vpbroadcastq %xmm1, %ymm1 579; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 580; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 581; XOPAVX2-NEXT: vprotq %xmm3, %xmm2, %xmm2 582; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 583; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 584; XOPAVX2-NEXT: retq 585 %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer 586 %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat 587 %shl = shl <4 x i64> %a, %splat 588 %lshr = lshr <4 x i64> %a, %splat64 589 %or = or <4 x i64> %shl, %lshr 590 ret <4 x i64> %or 591} 592 593define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { 594; AVX1-LABEL: splatvar_rotate_v8i32: 595; AVX1: # %bb.0: 596; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 597; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 598; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 599; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero 600; AVX1-NEXT: vpslld %xmm3, %xmm2, %xmm4 601; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32] 602; AVX1-NEXT: vpsubd %xmm1, %xmm5, %xmm1 603; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 604; AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 605; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 606; AVX1-NEXT: vpslld %xmm3, %xmm0, %xmm3 607; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 608; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 609; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 610; AVX1-NEXT: retq 611; 612; AVX2-LABEL: splatvar_rotate_v8i32: 613; AVX2: # %bb.0: 614; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 615; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] 616; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 617; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 618; AVX2-NEXT: vpslld %xmm2, %ymm0, %ymm2 619; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 620; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 621; AVX2-NEXT: vpmovzxdq 
{{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 622; AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 623; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 624; AVX2-NEXT: retq 625; 626; AVX512F-LABEL: splatvar_rotate_v8i32: 627; AVX512F: # %bb.0: 628; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 629; AVX512F-NEXT: vpbroadcastd %xmm1, %ymm1 630; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 631; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 632; AVX512F-NEXT: retq 633; 634; AVX512VL-LABEL: splatvar_rotate_v8i32: 635; AVX512VL: # %bb.0: 636; AVX512VL-NEXT: vpbroadcastd %xmm1, %ymm1 637; AVX512VL-NEXT: vprolvd %ymm1, %ymm0, %ymm0 638; AVX512VL-NEXT: retq 639; 640; AVX512BW-LABEL: splatvar_rotate_v8i32: 641; AVX512BW: # %bb.0: 642; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 643; AVX512BW-NEXT: vpbroadcastd %xmm1, %ymm1 644; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 645; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 646; AVX512BW-NEXT: retq 647; 648; AVX512VLBW-LABEL: splatvar_rotate_v8i32: 649; AVX512VLBW: # %bb.0: 650; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %ymm1 651; AVX512VLBW-NEXT: vprolvd %ymm1, %ymm0, %ymm0 652; AVX512VLBW-NEXT: retq 653; 654; XOPAVX1-LABEL: splatvar_rotate_v8i32: 655; XOPAVX1: # %bb.0: 656; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 657; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 658; XOPAVX1-NEXT: vprotd %xmm1, %xmm2, %xmm2 659; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 660; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 661; XOPAVX1-NEXT: retq 662; 663; XOPAVX2-LABEL: splatvar_rotate_v8i32: 664; XOPAVX2: # %bb.0: 665; XOPAVX2-NEXT: vpbroadcastd %xmm1, %ymm1 666; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 667; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 668; XOPAVX2-NEXT: vprotd %xmm3, %xmm2, %xmm2 669; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 670; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 671; XOPAVX2-NEXT: retq 672 %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer 673 %splat32 = 
; @splatvar_rotate_v16i16 (its define appears on the next line of this file):
; rotate-left idiom on <16 x i16> with a splatted variable amount. The IR
; broadcasts element 0 of %b through a shufflevector with a zeroinitializer
; mask, then ORs (%a shl amount) with (%a lshr (16 - amount)).
sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat 674 %shl = shl <8 x i32> %a, %splat 675 %lshr = lshr <8 x i32> %a, %splat32 676 %or = or <8 x i32> %shl, %lshr 677 ret <8 x i32> %or 678} 679 680define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { 681; AVX1-LABEL: splatvar_rotate_v16i16: 682; AVX1: # %bb.0: 683; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 684; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 685; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 686; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 687; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 688; AVX1-NEXT: vpsllw %xmm3, %xmm2, %xmm4 689; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [16,16,16,16,16,16,16,16] 690; AVX1-NEXT: vpsubw %xmm1, %xmm5, %xmm1 691; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 692; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 693; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 694; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 695; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 696; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 697; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 698; AVX1-NEXT: retq 699; 700; AVX2-LABEL: splatvar_rotate_v16i16: 701; AVX2: # %bb.0: 702; AVX2-NEXT: vpbroadcastw %xmm1, %ymm1 703; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 704; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 705; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm2 706; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 707; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1 708; AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 709; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 710; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 711; AVX2-NEXT: retq 712; 713; AVX512-LABEL: splatvar_rotate_v16i16: 714; AVX512: # %bb.0: 715; AVX512-NEXT: vpbroadcastw %xmm1, %ymm1 716; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 717; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 =
; @splatvar_rotate_v32i8 (its define appears on the next line of this file):
; rotate-left idiom on <32 x i8> with a splatted variable amount. The IR
; broadcasts element 0 of %b through a shufflevector with a zeroinitializer
; mask, then ORs (%a shl amount) with (%a lshr (8 - amount)).
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 718; AVX512-NEXT: vpsllw %xmm2, %ymm0, %ymm2 719; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 720; AVX512-NEXT: vpsubw %xmm1, %xmm3, %xmm1 721; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 722; AVX512-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 723; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0 724; AVX512-NEXT: retq 725; 726; XOPAVX1-LABEL: splatvar_rotate_v16i16: 727; XOPAVX1: # %bb.0: 728; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] 729; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 730; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 731; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 732; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 733; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 734; XOPAVX1-NEXT: retq 735; 736; XOPAVX2-LABEL: splatvar_rotate_v16i16: 737; XOPAVX2: # %bb.0: 738; XOPAVX2-NEXT: vpbroadcastw %xmm1, %ymm1 739; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 740; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 741; XOPAVX2-NEXT: vprotw %xmm3, %xmm2, %xmm2 742; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 743; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 744; XOPAVX2-NEXT: retq 745 %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer 746 %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat 747 %shl = shl <16 x i16> %a, %splat 748 %lshr = lshr <16 x i16> %a, %splat16 749 %or = or <16 x i16> %shl, %lshr 750 ret <16 x i16> %or 751} 752 753define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { 754; AVX1-LABEL: splatvar_rotate_v32i8: 755; AVX1: # %bb.0: 756; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 757; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 758; AVX1-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 759; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 =
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 760; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 761; AVX1-NEXT: vpsllw %xmm3, %xmm4, %xmm5 762; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 763; AVX1-NEXT: vpsllw %xmm3, %xmm6, %xmm7 764; AVX1-NEXT: vpshufb %xmm2, %xmm7, %xmm2 765; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 766; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 767; AVX1-NEXT: vpsubb %xmm1, %xmm7, %xmm1 768; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 769; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm4 770; AVX1-NEXT: vpsrlw %xmm1, %xmm6, %xmm6 771; AVX1-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 772; AVX1-NEXT: vpand %xmm6, %xmm4, %xmm4 773; AVX1-NEXT: vpor %xmm4, %xmm5, %xmm4 774; AVX1-NEXT: vpsllw %xmm3, %xmm0, %xmm3 775; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 776; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 777; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm0 778; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 779; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 780; AVX1-NEXT: retq 781; 782; AVX2-LABEL: splatvar_rotate_v32i8: 783; AVX2: # %bb.0: 784; AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 785; AVX2-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 786; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 787; AVX2-NEXT: vpsllw %xmm2, %ymm0, %ymm3 788; AVX2-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 789; AVX2-NEXT: vpsllw %xmm2, %ymm4, %ymm2 790; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 791; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2 792; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 793; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1 794; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 795; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 796; AVX2-NEXT: vpsrlw %xmm1, %ymm4, %ymm1 797; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 798; 
AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 799; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 800; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 801; AVX2-NEXT: retq 802; 803; AVX512F-LABEL: splatvar_rotate_v32i8: 804; AVX512F: # %bb.0: 805; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 806; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 807; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 808; AVX512F-NEXT: vpsllw %xmm2, %ymm0, %ymm3 809; AVX512F-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 810; AVX512F-NEXT: vpsllw %xmm2, %ymm4, %ymm2 811; AVX512F-NEXT: vpbroadcastb %xmm2, %ymm2 812; AVX512F-NEXT: vpand %ymm2, %ymm3, %ymm2 813; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 814; AVX512F-NEXT: vpsubb %xmm1, %xmm3, %xmm1 815; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 816; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 817; AVX512F-NEXT: vpsrlw %xmm1, %ymm4, %ymm1 818; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 819; AVX512F-NEXT: vpbroadcastb %xmm1, %ymm1 820; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 821; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 822; AVX512F-NEXT: retq 823; 824; AVX512VL-LABEL: splatvar_rotate_v32i8: 825; AVX512VL: # %bb.0: 826; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 827; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 828; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 829; AVX512VL-NEXT: vpsllw %xmm2, %ymm0, %ymm3 830; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 831; AVX512VL-NEXT: vpsllw %xmm2, %ymm4, %ymm2 832; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 833; AVX512VL-NEXT: vpand %ymm2, %ymm3, %ymm2 834; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 835; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1 836; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 837; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 838; AVX512VL-NEXT: vpsrlw %xmm1, %ymm4, %ymm1 839; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 840; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 841; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 842; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 843; AVX512VL-NEXT: retq 844; 845; AVX512BW-LABEL: splatvar_rotate_v32i8: 846; AVX512BW: # %bb.0: 847; AVX512BW-NEXT: vpbroadcastb %xmm1, %ymm1 848; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 849; AVX512BW-NEXT: vpsubb %ymm1, %ymm2, %ymm2 850; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 851; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 852; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 853; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm2 = 
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero 854; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 855; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 856; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 857; AVX512BW-NEXT: retq 858; 859; AVX512VLBW-LABEL: splatvar_rotate_v32i8: 860; AVX512VLBW: # %bb.0: 861; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %ymm1 862; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 863; AVX512VLBW-NEXT: vpsubb %ymm1, %ymm2, %ymm2 864; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 865; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 866; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 867; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm2 = 
; @constant_rotate_v4i64 (its define appears on the next line of this file):
; rotate-left idiom on <4 x i64> with a different constant amount per lane.
; The IR ORs (%a shl <4,14,50,60>) with (%a lshr <60,50,14,4>); the two
; amounts of every lane add up to 64.
ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero 868; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 869; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 870; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 871; AVX512VLBW-NEXT: retq 872; 873; XOPAVX1-LABEL: splatvar_rotate_v32i8: 874; XOPAVX1: # %bb.0: 875; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 876; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 877; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 878; XOPAVX1-NEXT: vprotb %xmm1, %xmm2, %xmm2 879; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 880; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 881; XOPAVX1-NEXT: retq 882; 883; XOPAVX2-LABEL: splatvar_rotate_v32i8: 884; XOPAVX2: # %bb.0: 885; XOPAVX2-NEXT: vpbroadcastb %xmm1, %ymm1 886; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 887; XOPAVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 888; XOPAVX2-NEXT: vprotb %xmm3, %xmm2, %xmm2 889; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 890; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 891; XOPAVX2-NEXT: retq 892 %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer 893 %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat 894 %shl = shl <32 x i8> %a, %splat 895 %lshr = lshr <32 x i8> %a, %splat8 896 %or = or <32 x i8> %shl, %lshr 897 ret <32 x i8> %or 898} 899 900; 901; Constant Rotates 902; 903 904define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind { 905; AVX1-LABEL: constant_rotate_v4i64: 906; AVX1: # %bb.0: 907;
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 908; AVX1-NEXT: vpsllq $60, %xmm1, %xmm2 909; AVX1-NEXT: vpsllq $50, %xmm1, %xmm3 910; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] 911; AVX1-NEXT: vpsllq $14, %xmm0, %xmm3 912; AVX1-NEXT: vpsllq $4, %xmm0, %xmm4 913; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] 914; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 915; AVX1-NEXT: vpsrlq $4, %xmm1, %xmm3 916; AVX1-NEXT: vpsrlq $14, %xmm1, %xmm1 917; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] 918; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm3 919; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 920; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] 921; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 922; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 923; AVX1-NEXT: retq 924; 925; AVX2-LABEL: constant_rotate_v4i64: 926; AVX2: # %bb.0: 927; AVX2-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm1 928; AVX2-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 929; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 930; AVX2-NEXT: retq 931; 932; AVX512F-LABEL: constant_rotate_v4i64: 933; AVX512F: # %bb.0: 934; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 935; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] 936; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 937; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 938; AVX512F-NEXT: retq 939; 940; AVX512VL-LABEL: constant_rotate_v4i64: 941; AVX512VL: # %bb.0: 942; AVX512VL-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0 943; AVX512VL-NEXT: retq 944; 945; AVX512BW-LABEL: constant_rotate_v4i64: 946; AVX512BW: # %bb.0: 947; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 948; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,14,50,60] 949; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 950; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 951; AVX512BW-NEXT: retq 952; 953; AVX512VLBW-LABEL: constant_rotate_v4i64: 954; AVX512VLBW: # %bb.0: 955; AVX512VLBW-NEXT: vprolvq {{.*}}(%rip), %ymm0, %ymm0 956; AVX512VLBW-NEXT: 
; @constant_rotate_v8i32 (its define appears on the next line of this file):
; rotate-left idiom on <8 x i32> with a different constant amount per lane.
; The IR ORs (%a shl <4,5,6,7,8,9,10,11>) with
; (%a lshr <28,27,26,25,24,23,22,21>); the two amounts of every lane add up
; to 32.
retq 957; 958; XOPAVX1-LABEL: constant_rotate_v4i64: 959; XOPAVX1: # %bb.0: 960; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 961; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 962; XOPAVX1-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 963; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 964; XOPAVX1-NEXT: retq 965; 966; XOPAVX2-LABEL: constant_rotate_v4i64: 967; XOPAVX2: # %bb.0: 968; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm1 969; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 970; XOPAVX2-NEXT: vprotq {{.*}}(%rip), %xmm0, %xmm0 971; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 972; XOPAVX2-NEXT: retq 973 %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60> 974 %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4> 975 %or = or <4 x i64> %shl, %lshr 976 ret <4 x i64> %or 977} 978 979define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind { 980; AVX1-LABEL: constant_rotate_v8i32: 981; AVX1: # %bb.0: 982; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [256,512,1024,2048] 983; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 984; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 985; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 986; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm2 987; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 988; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 989; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 990; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 991; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 992; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 993; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [16,32,64,128] 994; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 995; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 996; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 997; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 998; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 999; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 1000; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2] 1001;
AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] 1002; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 1003; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1004; AVX1-NEXT: retq 1005; 1006; AVX2-LABEL: constant_rotate_v8i32: 1007; AVX2: # %bb.0: 1008; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm1 1009; AVX2-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 1010; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 1011; AVX2-NEXT: retq 1012; 1013; AVX512F-LABEL: constant_rotate_v8i32: 1014; AVX512F: # %bb.0: 1015; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1016; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] 1017; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1018; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1019; AVX512F-NEXT: retq 1020; 1021; AVX512VL-LABEL: constant_rotate_v8i32: 1022; AVX512VL: # %bb.0: 1023; AVX512VL-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0 1024; AVX512VL-NEXT: retq 1025; 1026; AVX512BW-LABEL: constant_rotate_v8i32: 1027; AVX512BW: # %bb.0: 1028; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1029; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11] 1030; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1031; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 1032; AVX512BW-NEXT: retq 1033; 1034; AVX512VLBW-LABEL: constant_rotate_v8i32: 1035; AVX512VLBW: # %bb.0: 1036; AVX512VLBW-NEXT: vprolvd {{.*}}(%rip), %ymm0, %ymm0 1037; AVX512VLBW-NEXT: retq 1038; 1039; XOPAVX1-LABEL: constant_rotate_v8i32: 1040; XOPAVX1: # %bb.0: 1041; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 1042; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1043; XOPAVX1-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 1044; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1045; XOPAVX1-NEXT: retq 1046; 1047; XOPAVX2-LABEL: constant_rotate_v8i32: 1048; XOPAVX2: # %bb.0: 1049; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm1 1050; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1051; XOPAVX2-NEXT: vprotd {{.*}}(%rip), %xmm0, %xmm0 1052; XOPAVX2-NEXT: 
; @constant_rotate_v16i16 (its define appears on the next line of this file):
; rotate-left idiom on <16 x i16> with a different constant amount per lane.
; The IR ORs (%a shl <0,1,...,15>) with (%a lshr <16,15,...,1>); the two
; amounts of every lane add up to 16.
; NOTE(review): lane 0 pairs shl by 0 with lshr by 16. A shift amount equal to
; the element width yields poison per the LLVM LangRef, so that lane is not a
; well-defined rotate. Presumably this is deliberate test input - confirm
; before changing it; any IR change requires regenerating the assertions with
; utils/update_llc_test_checks.py.
vinserti128 $1, %xmm0, %ymm1, %ymm0 1053; XOPAVX2-NEXT: retq 1054 %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> 1055 %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21> 1056 %or = or <8 x i32> %shl, %lshr 1057 ret <8 x i32> %or 1058} 1059 1060define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind { 1061; AVX1-LABEL: constant_rotate_v16i16: 1062; AVX1: # %bb.0: 1063; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1064; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] 1065; AVX1-NEXT: vpmulhuw %xmm2, %xmm1, %xmm3 1066; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 1067; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 1068; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] 1069; AVX1-NEXT: vpmulhuw %xmm2, %xmm0, %xmm3 1070; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1071; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 1072; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1073; AVX1-NEXT: retq 1074; 1075; AVX2-LABEL: constant_rotate_v16i16: 1076; AVX2: # %bb.0: 1077; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] 1078; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 1079; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1080; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 1081; AVX2-NEXT: retq 1082; 1083; AVX512F-LABEL: constant_rotate_v16i16: 1084; AVX512F: # %bb.0: 1085; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] 1086; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 1087; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1088; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 1089; AVX512F-NEXT: retq 1090; 1091; AVX512VL-LABEL: constant_rotate_v16i16: 1092; AVX512VL: # %bb.0: 1093; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] 1094; AVX512VL-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 1095; AVX512VL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1096; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0 1097;
; @constant_rotate_v32i8 (its define appears on the next line of this file):
; rotate-left idiom on <32 x i8> with a different constant amount per lane.
; The IR ORs (%a shl <0,1,...,7,8,7,...,1> repeated twice) with
; (%a lshr <8,7,...,1,0,1,...,7> repeated twice); the two amounts of every
; lane add up to 8.
; NOTE(review): lanes 0 and 16 use lshr by 8 and lanes 8 and 24 use shl by 8.
; A shift amount equal to the element width yields poison per the LLVM
; LangRef, so those lanes are not well-defined rotates. Presumably this is
; deliberate test input - confirm before changing it; any IR change requires
; regenerating the assertions with utils/update_llc_test_checks.py.
AVX512VL-NEXT: retq 1098; 1099; AVX512BW-LABEL: constant_rotate_v16i16: 1100; AVX512BW: # %bb.0: 1101; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1102; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1103; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1] 1104; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 1105; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 1106; AVX512BW-NEXT: vpor %ymm2, %ymm0, %ymm0 1107; AVX512BW-NEXT: retq 1108; 1109; AVX512VLBW-LABEL: constant_rotate_v16i16: 1110; AVX512VLBW: # %bb.0: 1111; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm1 1112; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 1113; AVX512VLBW-NEXT: vpor %ymm1, %ymm0, %ymm0 1114; AVX512VLBW-NEXT: retq 1115; 1116; XOPAVX1-LABEL: constant_rotate_v16i16: 1117; XOPAVX1: # %bb.0: 1118; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 1119; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1120; XOPAVX1-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 1121; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1122; XOPAVX1-NEXT: retq 1123; 1124; XOPAVX2-LABEL: constant_rotate_v16i16: 1125; XOPAVX2: # %bb.0: 1126; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm1 1127; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 1128; XOPAVX2-NEXT: vprotw {{.*}}(%rip), %xmm0, %xmm0 1129; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 1130; XOPAVX2-NEXT: retq 1131 %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1132 %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1> 1133 %or = or <16 x i16> %shl, %lshr 1134 ret <16 x i16> %or 1135} 1136 1137define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind { 1138; AVX1-LABEL: constant_rotate_v32i8: 1139; AVX1: # %bb.0: 1140; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1141; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8
1142; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 1143; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [256,128,64,32,16,8,4,2] 1144; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3 1145; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 1146; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1147; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [256,2,4,8,16,32,64,128] 1148; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm7 1149; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm7 1150; AVX1-NEXT: vpackuswb %xmm3, %xmm7, %xmm3 1151; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] 1152; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [1,128,64,32,16,8,4,2] 1153; AVX1-NEXT: vpmullw %xmm7, %xmm1, %xmm1 1154; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1155; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1156; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] 1157; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm5 1158; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm5 1159; AVX1-NEXT: vpackuswb %xmm1, %xmm5, %xmm1 1160; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 1161; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 1162; AVX1-NEXT: vpmullw %xmm9, %xmm3, %xmm3 1163; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 1164; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1165; AVX1-NEXT: vpmullw %xmm6, %xmm5, %xmm6 1166; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm6 1167; AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3 1168; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1169; AVX1-NEXT: 
vpmullw %xmm7, %xmm0, %xmm0 1170; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 1171; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 1172; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 1173; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 1174; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0 1175; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1176; AVX1-NEXT: retq 1177; 1178; AVX2-LABEL: constant_rotate_v32i8: 1179; AVX2: # %bb.0: 1180; AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 1181; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 1182; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] 1183; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 1184; AVX2-NEXT: vpsllw $2, %ymm1, %ymm3 1185; AVX2-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 1186; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1187; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1188; AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm3 1189; AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1190; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1191; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1192; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1193; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 1194; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm3 1195; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1196; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 1197; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1198; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 1199; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 1200; AVX2-NEXT: retq 1201; 1202; 
AVX512F-LABEL: constant_rotate_v32i8: 1203; AVX512F: # %bb.0: 1204; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm1 1205; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 1206; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] 1207; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 1208; AVX512F-NEXT: vpsllw $2, %ymm1, %ymm3 1209; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 1210; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1211; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1212; AVX512F-NEXT: vpaddb %ymm1, %ymm1, %ymm3 1213; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1214; AVX512F-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1215; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 1216; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1217; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm3, %ymm3 1218; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 1219; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1220; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 1221; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 1222; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 1223; AVX512F-NEXT: vpor %ymm0, %ymm1, %ymm0 1224; AVX512F-NEXT: retq 1225; 1226; AVX512VL-LABEL: constant_rotate_v32i8: 1227; AVX512VL: # %bb.0: 1228; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 1229; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 1230; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = 
[8192,24640,41088,57536,57344,41152,24704,8256,8192,24640,41088,57536,57344,41152,24704,8256] 1231; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm1 1232; AVX512VL-NEXT: vpsllw $2, %ymm1, %ymm3 1233; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 1234; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1235; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1236; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm3 1237; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 1238; AVX512VL-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 1239; AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] 1240; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 1241; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm2, %ymm2 1242; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 1243; AVX512VL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] 1244; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 1245; AVX512VL-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 1246; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 1247; AVX512VL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 1248; AVX512VL-NEXT: vpor %ymm0, %ymm1, %ymm0 1249; AVX512VL-NEXT: retq 1250; 1251; AVX512BW-LABEL: constant_rotate_v32i8: 1252; AVX512BW: # %bb.0: 1253; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1254; AVX512BW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 1255; AVX512BW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 1256; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 1257; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1258; AVX512BW-NEXT: retq 1259; 
1260; AVX512VLBW-LABEL: constant_rotate_v32i8: 1261; AVX512VLBW: # %bb.0: 1262; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1263; AVX512VLBW-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm1 1264; AVX512VLBW-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 1265; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0 1266; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0 1267; AVX512VLBW-NEXT: retq 1268; 1269; XOPAVX1-LABEL: constant_rotate_v32i8: 1270; XOPAVX1: # %bb.0: 1271; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1272; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1273; XOPAVX1-NEXT: vprotb %xmm2, %xmm1, %xmm1 1274; XOPAVX1-NEXT: vprotb %xmm2, %xmm0, %xmm0 1275; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1276; XOPAVX1-NEXT: retq 1277; 1278; XOPAVX2-LABEL: constant_rotate_v32i8: 1279; XOPAVX2: # %bb.0: 1280; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1281; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1282; XOPAVX2-NEXT: vprotb %xmm2, %xmm1, %xmm1 1283; XOPAVX2-NEXT: vprotb %xmm2, %xmm0, %xmm0 1284; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1285; XOPAVX2-NEXT: retq 1286 %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 1287 %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Uniform Constant Rotates
;

; Rotate-left by a splat constant on each i64 lane: (a << 14) | (a >> 50),
; where 14 + 50 == 64 (the lane width).
; Expected lowering per the autogenerated checks below:
;  - AVX1/AVX2: explicit shift-left / shift-right / or sequence.
;  - AVX512F/AVX512BW: vprolq $14 on the widened zmm register;
;    AVX512VL/AVX512VLBW: vprolq $14 directly on ymm.
;  - XOP: vprotq $14 on each 128-bit half, then re-inserted into ymm.
define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsllq $14, %xmm2, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm0
; AVX1-NEXT: vpsrlq $50, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllq $14, %ymm0, %ymm1
; AVX2-NEXT: vpsrlq $50, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolq $14, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $14, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

; Rotate-left by a splat constant on each i32 lane: (a << 4) | (a >> 28),
; where 4 + 28 == 32 (the lane width).
; Expected lowering per the checks below: shift/shift/or on AVX1/AVX2,
; vprold $4 on the AVX512 configurations, and vprotd $4 per 128-bit half
; on the XOP configurations.
define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

; Rotate-left by a splat constant on each i16 lane: (a << 7) | (a >> 9),
; where 7 + 9 == 16 (the lane width).
; Expected lowering per the checks below: AVX1, AVX2 and every AVX512
; configuration (they share the common AVX512 prefix here) use
; vpsrlw/vpsllw/vpor; the XOP configurations use vprotw $7 per 128-bit half.
define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $9, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $9, %ymm0, %ymm1
; AVX512-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

; Rotate by a splat constant on each i8 lane: (a << 4) | (a >> 4),
; where 4 + 4 == 8 (the lane width).
; Expected lowering per the checks below:
;  - AVX1/AVX2/AVX512*: 16-bit shifts (vpsrlw/vpsllw $4), each followed by a
;    vpand, then vpor. The AVX1 checks show the masks as splats of 15 (for
;    the right-shifted value) and 240 (for the left-shifted value); the other
;    configurations load their masks from the constant pool.
;  - XOP: vprotb $4 per 128-bit half.
define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

; Masked rotate on i64 lanes: ((a << 15) & lmask) | ((a >> 49) & rmask),
; where 15 + 49 == 64 (the lane width).
; Every lmask element (33, 65, 129, 257) is below 2^15, so it only selects
; bits that the shift-left by 15 has already cleared; the left half therefore
; contributes nothing to the result. The AVX1/AVX2 checks reflect this: they
; expect only vpsrlq $49 followed by an AND. The AVX512 and XOP checks still
; expect a full rotate by 15 (vprolq / vprotq) followed by an AND.
define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprolq $15, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprolq $15, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprolq $15, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotq $15, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotq $15, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}

; Masked rotate on i32 lanes: ((a << 4) & lmask) | ((a >> 28) & rmask),
; where 4 + 28 == 32 (the lane width). Both masks are non-uniform, with a
; different constant per lane.
; Per the checks below, every configuration is expected to form the plain
; rotate first (shift/shift/or, vprold $4, or vprotd $4) and then apply a
; single AND with a constant-pool operand.
define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT: vpslld $4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT: vpslld $4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vprold $4, %zmm0, %zmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vprold $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}

; Masked rotate on i16 lanes: ((a << 5) & 33) | ((a >> 11) & 55),
; where 5 + 11 == 16 (the lane width) and both masks are splats.
; Per the checks below, every configuration is expected to form the plain
; rotate first (vpsrlw/vpsllw/vpor on AVX1, AVX2 and the shared AVX512 prefix;
; vprotw $5 on XOP) and then apply a single AND with a constant-pool operand.
define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm1, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $11, %xmm0, %xmm2
; AVX1-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX2-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrlw $11, %ymm0, %ymm1
; AVX512-NEXT: vpsllw $5, %ymm0, %ymm0
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}

; Masked rotate on i8 lanes: ((a << 4) & 33) | ((a >> 4) & 55),
; where 4 + 4 == 8 (the lane width) and both masks are splats.
; Expected lowering per the checks below:
;  - AVX1/AVX2/AVX512F/AVX512VL: the full byte-rotate expansion (word shifts,
;    two ANDs, OR) followed by one additional AND.
;  - AVX512BW/AVX512VLBW: shift-left, shift-right, one AND on each half, then
;    OR, with no trailing AND.
;  - XOP: vprotb $4 per 128-bit half followed by a single AND.
define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX2-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm1
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512BW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2: # %bb.0:
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT: vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT: retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}
