1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2 12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1 13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2 14 15; Just one 32-bit run to make sure we do reasonable things for i64 rotates. 
16; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2 17 18; 19; Variable Rotates 20; 21 22define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 23; SSE2-LABEL: var_rotate_v2i64: 24; SSE2: # %bb.0: 25; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,64] 26; SSE2-NEXT: psubq %xmm1, %xmm2 27; SSE2-NEXT: movdqa %xmm0, %xmm3 28; SSE2-NEXT: psllq %xmm1, %xmm3 29; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 30; SSE2-NEXT: movdqa %xmm0, %xmm4 31; SSE2-NEXT: psllq %xmm1, %xmm4 32; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] 33; SSE2-NEXT: movdqa %xmm0, %xmm1 34; SSE2-NEXT: psrlq %xmm2, %xmm1 35; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 36; SSE2-NEXT: psrlq %xmm2, %xmm0 37; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 38; SSE2-NEXT: orpd %xmm4, %xmm0 39; SSE2-NEXT: retq 40; 41; SSE41-LABEL: var_rotate_v2i64: 42; SSE41: # %bb.0: 43; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [64,64] 44; SSE41-NEXT: psubq %xmm1, %xmm2 45; SSE41-NEXT: movdqa %xmm0, %xmm3 46; SSE41-NEXT: psllq %xmm1, %xmm3 47; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 48; SSE41-NEXT: movdqa %xmm0, %xmm4 49; SSE41-NEXT: psllq %xmm1, %xmm4 50; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7] 51; SSE41-NEXT: movdqa %xmm0, %xmm1 52; SSE41-NEXT: psrlq %xmm2, %xmm1 53; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 54; SSE41-NEXT: psrlq %xmm2, %xmm0 55; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 56; SSE41-NEXT: por %xmm4, %xmm0 57; SSE41-NEXT: retq 58; 59; AVX1-LABEL: var_rotate_v2i64: 60; AVX1: # %bb.0: 61; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] 62; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 63; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 64; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 65; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 66; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7] 67; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm3 68; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 69; AVX1-NEXT: 
vpsrlq %xmm2, %xmm0, %xmm0 70; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] 71; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 72; AVX1-NEXT: retq 73; 74; AVX2-LABEL: var_rotate_v2i64: 75; AVX2: # %bb.0: 76; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] 77; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 78; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 79; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 80; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 81; AVX2-NEXT: retq 82; 83; AVX512F-LABEL: var_rotate_v2i64: 84; AVX512F: # %bb.0: 85; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 86; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 87; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 88; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 89; AVX512F-NEXT: vzeroupper 90; AVX512F-NEXT: retq 91; 92; AVX512VL-LABEL: var_rotate_v2i64: 93; AVX512VL: # %bb.0: 94; AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0 95; AVX512VL-NEXT: retq 96; 97; AVX512BW-LABEL: var_rotate_v2i64: 98; AVX512BW: # %bb.0: 99; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 100; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 101; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 102; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 103; AVX512BW-NEXT: vzeroupper 104; AVX512BW-NEXT: retq 105; 106; AVX512VLBW-LABEL: var_rotate_v2i64: 107; AVX512VLBW: # %bb.0: 108; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0 109; AVX512VLBW-NEXT: retq 110; 111; AVX512VBMI2-LABEL: var_rotate_v2i64: 112; AVX512VBMI2: # %bb.0: 113; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 114; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 115; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 116; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 117; AVX512VBMI2-NEXT: vzeroupper 118; AVX512VBMI2-NEXT: retq 119; 120; AVX512VLVBMI2-LABEL: var_rotate_v2i64: 121; AVX512VLVBMI2: # %bb.0: 122; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0 123; AVX512VLVBMI2-NEXT: retq 124; 125; XOP-LABEL: 
var_rotate_v2i64: 126; XOP: # %bb.0: 127; XOP-NEXT: vprotq %xmm1, %xmm0, %xmm0 128; XOP-NEXT: retq 129; 130; X86-SSE2-LABEL: var_rotate_v2i64: 131; X86-SSE2: # %bb.0: 132; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,64,0] 133; X86-SSE2-NEXT: psubq %xmm1, %xmm2 134; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 135; X86-SSE2-NEXT: psllq %xmm1, %xmm3 136; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 137; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 138; X86-SSE2-NEXT: psllq %xmm1, %xmm4 139; X86-SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] 140; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 141; X86-SSE2-NEXT: psrlq %xmm2, %xmm1 142; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 143; X86-SSE2-NEXT: psrlq %xmm2, %xmm0 144; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 145; X86-SSE2-NEXT: orpd %xmm4, %xmm0 146; X86-SSE2-NEXT: retl 147 %b64 = sub <2 x i64> <i64 64, i64 64>, %b 148 %shl = shl <2 x i64> %a, %b 149 %lshr = lshr <2 x i64> %a, %b64 150 %or = or <2 x i64> %shl, %lshr 151 ret <2 x i64> %or 152} 153 154define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 155; SSE2-LABEL: var_rotate_v4i32: 156; SSE2: # %bb.0: 157; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 158; SSE2-NEXT: pslld $23, %xmm1 159; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 160; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 161; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 162; SSE2-NEXT: pmuludq %xmm1, %xmm0 163; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 164; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 165; SSE2-NEXT: pmuludq %xmm2, %xmm1 166; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 167; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 168; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 169; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 170; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 171; SSE2-NEXT: por %xmm3, %xmm0 172; SSE2-NEXT: retq 173; 174; SSE41-LABEL: var_rotate_v4i32: 175; SSE41: # %bb.0: 176; 
SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 177; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 178; SSE41-NEXT: pslld $23, %xmm1 179; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 180; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 181; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 182; SSE41-NEXT: pmuludq %xmm2, %xmm3 183; SSE41-NEXT: pmuludq %xmm1, %xmm0 184; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 185; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 186; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 187; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 188; SSE41-NEXT: por %xmm1, %xmm0 189; SSE41-NEXT: retq 190; 191; AVX1-LABEL: var_rotate_v4i32: 192; AVX1: # %bb.0: 193; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 194; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 195; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 196; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 197; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 198; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 199; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 200; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 201; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 202; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 203; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 204; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 205; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 206; AVX1-NEXT: retq 207; 208; AVX2-LABEL: var_rotate_v4i32: 209; AVX2: # %bb.0: 210; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] 211; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 212; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 213; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 214; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 215; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 216; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 217; AVX2-NEXT: retq 218; 219; AVX512F-LABEL: var_rotate_v4i32: 220; AVX512F: # %bb.0: 221; 
AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 222; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 223; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 224; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 225; AVX512F-NEXT: vzeroupper 226; AVX512F-NEXT: retq 227; 228; AVX512VL-LABEL: var_rotate_v4i32: 229; AVX512VL: # %bb.0: 230; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 231; AVX512VL-NEXT: retq 232; 233; AVX512BW-LABEL: var_rotate_v4i32: 234; AVX512BW: # %bb.0: 235; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 236; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 237; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 238; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 239; AVX512BW-NEXT: vzeroupper 240; AVX512BW-NEXT: retq 241; 242; AVX512VLBW-LABEL: var_rotate_v4i32: 243; AVX512VLBW: # %bb.0: 244; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 245; AVX512VLBW-NEXT: retq 246; 247; AVX512VBMI2-LABEL: var_rotate_v4i32: 248; AVX512VBMI2: # %bb.0: 249; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 250; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 251; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 252; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 253; AVX512VBMI2-NEXT: vzeroupper 254; AVX512VBMI2-NEXT: retq 255; 256; AVX512VLVBMI2-LABEL: var_rotate_v4i32: 257; AVX512VLVBMI2: # %bb.0: 258; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0 259; AVX512VLVBMI2-NEXT: retq 260; 261; XOP-LABEL: var_rotate_v4i32: 262; XOP: # %bb.0: 263; XOP-NEXT: vprotd %xmm1, %xmm0, %xmm0 264; XOP-NEXT: retq 265; 266; X86-SSE2-LABEL: var_rotate_v4i32: 267; X86-SSE2: # %bb.0: 268; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 269; X86-SSE2-NEXT: pslld $23, %xmm1 270; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 271; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 272; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 273; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 274; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = 
xmm0[1,3,2,3] 275; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 276; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 277; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 278; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 279; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 280; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 281; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 282; X86-SSE2-NEXT: por %xmm3, %xmm0 283; X86-SSE2-NEXT: retl 284 %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b 285 %shl = shl <4 x i32> %a, %b 286 %lshr = lshr <4 x i32> %a, %b32 287 %or = or <4 x i32> %shl, %lshr 288 ret <4 x i32> %or 289} 290 291define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 292; SSE2-LABEL: var_rotate_v8i16: 293; SSE2: # %bb.0: 294; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 295; SSE2-NEXT: movdqa %xmm1, %xmm2 296; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 297; SSE2-NEXT: pslld $23, %xmm2 298; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 299; SSE2-NEXT: paddd %xmm3, %xmm2 300; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 301; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 302; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 303; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 304; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 305; SSE2-NEXT: pslld $23, %xmm1 306; SSE2-NEXT: paddd %xmm3, %xmm1 307; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 308; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 309; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 310; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 311; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 312; SSE2-NEXT: movdqa %xmm0, %xmm2 313; SSE2-NEXT: pmulhuw %xmm1, %xmm2 314; SSE2-NEXT: pmullw %xmm1, %xmm0 315; SSE2-NEXT: por %xmm2, %xmm0 316; SSE2-NEXT: retq 317; 318; SSE41-LABEL: var_rotate_v8i16: 319; SSE41: # %bb.0: 320; 
SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 321; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 322; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 323; SSE41-NEXT: pslld $23, %xmm1 324; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 325; SSE41-NEXT: paddd %xmm3, %xmm1 326; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 327; SSE41-NEXT: pslld $23, %xmm2 328; SSE41-NEXT: paddd %xmm3, %xmm2 329; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 330; SSE41-NEXT: packusdw %xmm1, %xmm2 331; SSE41-NEXT: movdqa %xmm0, %xmm1 332; SSE41-NEXT: pmulhuw %xmm2, %xmm1 333; SSE41-NEXT: pmullw %xmm2, %xmm0 334; SSE41-NEXT: por %xmm1, %xmm0 335; SSE41-NEXT: retq 336; 337; AVX1-LABEL: var_rotate_v8i16: 338; AVX1: # %bb.0: 339; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 340; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7] 341; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 342; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 343; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 344; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 345; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero 346; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 347; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 348; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 349; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 350; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 351; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 352; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 353; AVX1-NEXT: retq 354; 355; AVX2-LABEL: var_rotate_v8i16: 356; AVX2: # %bb.0: 357; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 358; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 359; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2 360; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 361; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 362; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2 363; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 364; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 365; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 366; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 367; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 368; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 369; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 370; AVX2-NEXT: vzeroupper 371; AVX2-NEXT: retq 372; 373; AVX512F-LABEL: var_rotate_v8i16: 374; AVX512F: # %bb.0: 375; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 376; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 377; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 378; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 379; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 380; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 381; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 382; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 383; AVX512F-NEXT: vpor %ymm0, %ymm2, %ymm0 384; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 385; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 386; AVX512F-NEXT: vzeroupper 387; AVX512F-NEXT: retq 388; 389; AVX512VL-LABEL: var_rotate_v8i16: 390; AVX512VL: # %bb.0: 391; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 392; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 393; AVX512VL-NEXT: 
vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 394; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 395; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 396; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 397; AVX512VL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 398; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 399; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 400; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 401; AVX512VL-NEXT: vzeroupper 402; AVX512VL-NEXT: retq 403; 404; AVX512BW-LABEL: var_rotate_v8i16: 405; AVX512BW: # %bb.0: 406; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 407; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 408; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 409; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 410; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 411; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 412; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 413; AVX512BW-NEXT: vzeroupper 414; AVX512BW-NEXT: retq 415; 416; AVX512VLBW-LABEL: var_rotate_v8i16: 417; AVX512VLBW: # %bb.0: 418; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 419; AVX512VLBW-NEXT: vpsllvw %xmm1, %xmm0, %xmm2 420; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 421; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 422; AVX512VLBW-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 423; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 424; AVX512VLBW-NEXT: retq 425; 426; AVX512VBMI2-LABEL: var_rotate_v8i16: 427; AVX512VBMI2: # %bb.0: 428; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 429; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 430; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 431; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 432; AVX512VBMI2-NEXT: vzeroupper 433; AVX512VBMI2-NEXT: retq 434; 435; 
AVX512VLVBMI2-LABEL: var_rotate_v8i16: 436; AVX512VLVBMI2: # %bb.0: 437; AVX512VLVBMI2-NEXT: vpshldvw %xmm1, %xmm0, %xmm0 438; AVX512VLVBMI2-NEXT: retq 439; 440; XOP-LABEL: var_rotate_v8i16: 441; XOP: # %bb.0: 442; XOP-NEXT: vprotw %xmm1, %xmm0, %xmm0 443; XOP-NEXT: retq 444; 445; X86-SSE2-LABEL: var_rotate_v8i16: 446; X86-SSE2: # %bb.0: 447; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 448; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 449; X86-SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 450; X86-SSE2-NEXT: pslld $23, %xmm2 451; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] 452; X86-SSE2-NEXT: paddd %xmm3, %xmm2 453; X86-SSE2-NEXT: cvttps2dq %xmm2, %xmm2 454; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] 455; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 456; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 457; X86-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] 458; X86-SSE2-NEXT: pslld $23, %xmm1 459; X86-SSE2-NEXT: paddd %xmm3, %xmm1 460; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1 461; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 462; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 463; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 464; X86-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 465; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 466; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2 467; X86-SSE2-NEXT: pmullw %xmm1, %xmm0 468; X86-SSE2-NEXT: por %xmm2, %xmm0 469; X86-SSE2-NEXT: retl 470 %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b 471 %shl = shl <8 x i16> %a, %b 472 %lshr = lshr <8 x i16> %a, %b16 473 %or = or <8 x i16> %shl, %lshr 474 ret <8 x i16> %or 475} 476 477define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 478; SSE2-LABEL: var_rotate_v16i8: 479; SSE2: # %bb.0: 480; SSE2-NEXT: movdqa %xmm0, %xmm2 481; SSE2-NEXT: psllw $5, %xmm1 482; SSE2-NEXT: pxor %xmm0, %xmm0 483; 
SSE2-NEXT: pxor %xmm3, %xmm3 484; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 485; SSE2-NEXT: movdqa %xmm2, %xmm4 486; SSE2-NEXT: psrlw $4, %xmm4 487; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 488; SSE2-NEXT: movdqa %xmm2, %xmm5 489; SSE2-NEXT: psllw $4, %xmm5 490; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 491; SSE2-NEXT: por %xmm4, %xmm5 492; SSE2-NEXT: pand %xmm3, %xmm5 493; SSE2-NEXT: pandn %xmm2, %xmm3 494; SSE2-NEXT: por %xmm5, %xmm3 495; SSE2-NEXT: movdqa %xmm3, %xmm2 496; SSE2-NEXT: psrlw $6, %xmm2 497; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 498; SSE2-NEXT: movdqa %xmm3, %xmm4 499; SSE2-NEXT: psllw $2, %xmm4 500; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 501; SSE2-NEXT: por %xmm2, %xmm4 502; SSE2-NEXT: paddb %xmm1, %xmm1 503; SSE2-NEXT: pxor %xmm2, %xmm2 504; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 505; SSE2-NEXT: pand %xmm2, %xmm4 506; SSE2-NEXT: pandn %xmm3, %xmm2 507; SSE2-NEXT: por %xmm4, %xmm2 508; SSE2-NEXT: movdqa %xmm2, %xmm3 509; SSE2-NEXT: paddb %xmm2, %xmm3 510; SSE2-NEXT: movdqa %xmm2, %xmm4 511; SSE2-NEXT: psrlw $7, %xmm4 512; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 513; SSE2-NEXT: por %xmm3, %xmm4 514; SSE2-NEXT: paddb %xmm1, %xmm1 515; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 516; SSE2-NEXT: pand %xmm0, %xmm4 517; SSE2-NEXT: pandn %xmm2, %xmm0 518; SSE2-NEXT: por %xmm4, %xmm0 519; SSE2-NEXT: retq 520; 521; SSE41-LABEL: var_rotate_v16i8: 522; SSE41: # %bb.0: 523; SSE41-NEXT: movdqa %xmm1, %xmm2 524; SSE41-NEXT: movdqa %xmm0, %xmm1 525; SSE41-NEXT: psrlw $4, %xmm0 526; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 527; SSE41-NEXT: movdqa %xmm1, %xmm3 528; SSE41-NEXT: psllw $4, %xmm3 529; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 530; SSE41-NEXT: por %xmm0, %xmm3 531; SSE41-NEXT: psllw $5, %xmm2 532; SSE41-NEXT: movdqa %xmm2, %xmm0 533; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 534; SSE41-NEXT: movdqa %xmm1, %xmm0 535; SSE41-NEXT: psrlw $6, %xmm0 536; SSE41-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 537; SSE41-NEXT: movdqa %xmm1, %xmm3 538; SSE41-NEXT: psllw $2, %xmm3 539; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 540; SSE41-NEXT: por %xmm0, %xmm3 541; SSE41-NEXT: paddb %xmm2, %xmm2 542; SSE41-NEXT: movdqa %xmm2, %xmm0 543; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 544; SSE41-NEXT: movdqa %xmm1, %xmm0 545; SSE41-NEXT: paddb %xmm1, %xmm0 546; SSE41-NEXT: movdqa %xmm1, %xmm3 547; SSE41-NEXT: psrlw $7, %xmm3 548; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 549; SSE41-NEXT: por %xmm0, %xmm3 550; SSE41-NEXT: paddb %xmm2, %xmm2 551; SSE41-NEXT: movdqa %xmm2, %xmm0 552; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm1 553; SSE41-NEXT: movdqa %xmm1, %xmm0 554; SSE41-NEXT: retq 555; 556; AVX-LABEL: var_rotate_v16i8: 557; AVX: # %bb.0: 558; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 559; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 560; AVX-NEXT: vpsllw $4, %xmm0, %xmm3 561; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 562; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 563; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 564; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 565; AVX-NEXT: vpsrlw $6, %xmm0, %xmm2 566; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 567; AVX-NEXT: vpsllw $2, %xmm0, %xmm3 568; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 569; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2 570; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 571; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 572; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 573; AVX-NEXT: vpsrlw $7, %xmm0, %xmm3 574; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 575; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 576; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 577; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 578; AVX-NEXT: retq 579; 580; AVX512F-LABEL: var_rotate_v16i8: 581; AVX512F: # %bb.0: 582; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 583; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm2 584; AVX512F-NEXT: vpmovzxbd 
{{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 585; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 586; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm1 587; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero 588; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 589; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0 590; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 591; AVX512F-NEXT: vzeroupper 592; AVX512F-NEXT: retq 593; 594; AVX512VL-LABEL: var_rotate_v16i8: 595; AVX512VL: # %bb.0: 596; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 597; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2 598; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero 599; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 600; AVX512VL-NEXT: vpsllvd %zmm1, %zmm0, %zmm1 601; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero 602; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm0, %zmm0 603; AVX512VL-NEXT: vpord %zmm0, %zmm1, %zmm0 604; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 605; AVX512VL-NEXT: vzeroupper 606; AVX512VL-NEXT: retq 607; 608; AVX512BW-LABEL: var_rotate_v16i8: 609; AVX512BW: # %bb.0: 610; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 611; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 612; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 613; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 614; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 615; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 616; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 617; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 618; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 619; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 620; AVX512BW-NEXT: vzeroupper 621; AVX512BW-NEXT: retq 622; 623; AVX512VLBW-LABEL: var_rotate_v16i8: 624; AVX512VLBW: # %bb.0: 625; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 626; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 627; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 628; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 629; AVX512VLBW-NEXT: vpsllvw %ymm1, %ymm0, %ymm1 630; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 631; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0 632; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0 633; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 634; AVX512VLBW-NEXT: vzeroupper 635; 
AVX512VLBW-NEXT: retq 636; 637; AVX512VBMI2-LABEL: var_rotate_v16i8: 638; AVX512VBMI2: # %bb.0: 639; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 640; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2 641; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 642; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 643; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 644; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 645; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 646; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0 647; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 648; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 649; AVX512VBMI2-NEXT: vzeroupper 650; AVX512VBMI2-NEXT: retq 651; 652; AVX512VLVBMI2-LABEL: var_rotate_v16i8: 653; AVX512VLVBMI2: # %bb.0: 654; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 655; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2 656; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 657; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 658; AVX512VLVBMI2-NEXT: vpsllvw %ymm1, %ymm0, %ymm1 659; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 660; AVX512VLVBMI2-NEXT: vpsrlvw %ymm2, %ymm0, %ymm0 661; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0 662; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 663; AVX512VLVBMI2-NEXT: vzeroupper 664; AVX512VLVBMI2-NEXT: retq 665; 666; XOP-LABEL: var_rotate_v16i8: 667; XOP: # %bb.0: 668; XOP-NEXT: vprotb %xmm1, %xmm0, %xmm0 669; XOP-NEXT: retq 670; 671; X86-SSE2-LABEL: var_rotate_v16i8: 672; X86-SSE2: # %bb.0: 673; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 674; X86-SSE2-NEXT: psllw $5, %xmm1 675; X86-SSE2-NEXT: pxor %xmm0, %xmm0 676; X86-SSE2-NEXT: pxor %xmm3, %xmm3 677; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm3 678; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 679; X86-SSE2-NEXT: psrlw $4, %xmm4 680; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 681; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 682; X86-SSE2-NEXT: psllw $4, %xmm5 683; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5 684; X86-SSE2-NEXT: por %xmm4, %xmm5 685; X86-SSE2-NEXT: pand %xmm3, %xmm5 686; X86-SSE2-NEXT: pandn %xmm2, %xmm3 687; X86-SSE2-NEXT: por %xmm5, %xmm3 688; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 689; X86-SSE2-NEXT: psrlw $6, %xmm2 690; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 691; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 692; X86-SSE2-NEXT: psllw $2, %xmm4 693; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 694; X86-SSE2-NEXT: por %xmm2, %xmm4 695; X86-SSE2-NEXT: paddb %xmm1, %xmm1 696; X86-SSE2-NEXT: pxor %xmm2, %xmm2 697; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 698; X86-SSE2-NEXT: pand %xmm2, 
%xmm4 699; X86-SSE2-NEXT: pandn %xmm3, %xmm2 700; X86-SSE2-NEXT: por %xmm4, %xmm2 701; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 702; X86-SSE2-NEXT: paddb %xmm2, %xmm3 703; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 704; X86-SSE2-NEXT: psrlw $7, %xmm4 705; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4 706; X86-SSE2-NEXT: por %xmm3, %xmm4 707; X86-SSE2-NEXT: paddb %xmm1, %xmm1 708; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm0 709; X86-SSE2-NEXT: pand %xmm0, %xmm4 710; X86-SSE2-NEXT: pandn %xmm2, %xmm0 711; X86-SSE2-NEXT: por %xmm4, %xmm0 712; X86-SSE2-NEXT: retl 713 %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b 714 %shl = shl <16 x i8> %a, %b 715 %lshr = lshr <16 x i8> %a, %b8 716 %or = or <16 x i8> %shl, %lshr 717 ret <16 x i8> %or 718} 719 720; 721; Uniform Variable Rotates 722; 723 724define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { 725; SSE-LABEL: splatvar_rotate_v2i64: 726; SSE: # %bb.0: 727; SSE-NEXT: movdqa {{.*#+}} xmm2 = [64,64] 728; SSE-NEXT: psubq %xmm1, %xmm2 729; SSE-NEXT: movdqa %xmm0, %xmm3 730; SSE-NEXT: psllq %xmm1, %xmm3 731; SSE-NEXT: psrlq %xmm2, %xmm0 732; SSE-NEXT: por %xmm3, %xmm0 733; SSE-NEXT: retq 734; 735; AVX-LABEL: splatvar_rotate_v2i64: 736; AVX: # %bb.0: 737; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] 738; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2 739; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1 740; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 741; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 742; AVX-NEXT: retq 743; 744; AVX512F-LABEL: splatvar_rotate_v2i64: 745; AVX512F: # %bb.0: 746; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 747; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 748; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 749; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 750; AVX512F-NEXT: vzeroupper 751; AVX512F-NEXT: retq 752; 753; AVX512VL-LABEL: splatvar_rotate_v2i64: 754; AVX512VL: # %bb.0: 755; AVX512VL-NEXT: vpbroadcastq %xmm1, %xmm1 756; 
AVX512VL-NEXT: vprolvq %xmm1, %xmm0, %xmm0 757; AVX512VL-NEXT: retq 758; 759; AVX512BW-LABEL: splatvar_rotate_v2i64: 760; AVX512BW: # %bb.0: 761; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 762; AVX512BW-NEXT: vpbroadcastq %xmm1, %xmm1 763; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 764; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 765; AVX512BW-NEXT: vzeroupper 766; AVX512BW-NEXT: retq 767; 768; AVX512VLBW-LABEL: splatvar_rotate_v2i64: 769; AVX512VLBW: # %bb.0: 770; AVX512VLBW-NEXT: vpbroadcastq %xmm1, %xmm1 771; AVX512VLBW-NEXT: vprolvq %xmm1, %xmm0, %xmm0 772; AVX512VLBW-NEXT: retq 773; 774; AVX512VBMI2-LABEL: splatvar_rotate_v2i64: 775; AVX512VBMI2: # %bb.0: 776; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 777; AVX512VBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 778; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 779; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 780; AVX512VBMI2-NEXT: vzeroupper 781; AVX512VBMI2-NEXT: retq 782; 783; AVX512VLVBMI2-LABEL: splatvar_rotate_v2i64: 784; AVX512VLVBMI2: # %bb.0: 785; AVX512VLVBMI2-NEXT: vpbroadcastq %xmm1, %xmm1 786; AVX512VLVBMI2-NEXT: vprolvq %xmm1, %xmm0, %xmm0 787; AVX512VLVBMI2-NEXT: retq 788; 789; XOPAVX1-LABEL: splatvar_rotate_v2i64: 790; XOPAVX1: # %bb.0: 791; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 792; XOPAVX1-NEXT: vprotq %xmm1, %xmm0, %xmm0 793; XOPAVX1-NEXT: retq 794; 795; XOPAVX2-LABEL: splatvar_rotate_v2i64: 796; XOPAVX2: # %bb.0: 797; XOPAVX2-NEXT: vpbroadcastq %xmm1, %xmm1 798; XOPAVX2-NEXT: vprotq %xmm1, %xmm0, %xmm0 799; XOPAVX2-NEXT: retq 800; 801; X86-SSE2-LABEL: splatvar_rotate_v2i64: 802; X86-SSE2: # %bb.0: 803; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,0,1] 804; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [64,0,64,0] 805; X86-SSE2-NEXT: psubq %xmm2, %xmm3 806; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 807; X86-SSE2-NEXT: psllq %xmm1, %xmm2 808; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 809; X86-SSE2-NEXT: psrlq %xmm3, %xmm1 810; X86-SSE2-NEXT: pshufd 
{{.*#+}} xmm3 = xmm3[2,3,2,3] 811; X86-SSE2-NEXT: psrlq %xmm3, %xmm0 812; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 813; X86-SSE2-NEXT: orpd %xmm2, %xmm0 814; X86-SSE2-NEXT: retl 815 %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer 816 %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat 817 %shl = shl <2 x i64> %a, %splat 818 %lshr = lshr <2 x i64> %a, %splat64 819 %or = or <2 x i64> %shl, %lshr 820 ret <2 x i64> %or 821} 822 823define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { 824; SSE2-LABEL: splatvar_rotate_v4i32: 825; SSE2: # %bb.0: 826; SSE2-NEXT: movd %xmm1, %eax 827; SSE2-NEXT: andl $31, %eax 828; SSE2-NEXT: movd %eax, %xmm1 829; SSE2-NEXT: movdqa %xmm0, %xmm2 830; SSE2-NEXT: pslld %xmm1, %xmm2 831; SSE2-NEXT: movl $32, %ecx 832; SSE2-NEXT: subl %eax, %ecx 833; SSE2-NEXT: movd %ecx, %xmm1 834; SSE2-NEXT: psrld %xmm1, %xmm0 835; SSE2-NEXT: por %xmm2, %xmm0 836; SSE2-NEXT: retq 837; 838; SSE41-LABEL: splatvar_rotate_v4i32: 839; SSE41: # %bb.0: 840; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 841; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 842; SSE41-NEXT: movdqa %xmm0, %xmm3 843; SSE41-NEXT: pslld %xmm2, %xmm3 844; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32] 845; SSE41-NEXT: psubd %xmm1, %xmm2 846; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero 847; SSE41-NEXT: psrld %xmm1, %xmm0 848; SSE41-NEXT: por %xmm3, %xmm0 849; SSE41-NEXT: retq 850; 851; AVX1-LABEL: splatvar_rotate_v4i32: 852; AVX1: # %bb.0: 853; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 854; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 855; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2 856; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32] 857; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1 858; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 859; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 860; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 861; AVX1-NEXT: retq 
862; 863; AVX2-LABEL: splatvar_rotate_v4i32: 864; AVX2: # %bb.0: 865; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] 866; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 867; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero 868; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2 869; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] 870; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 871; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 872; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 873; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 874; AVX2-NEXT: retq 875; 876; AVX512F-LABEL: splatvar_rotate_v4i32: 877; AVX512F: # %bb.0: 878; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 879; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 880; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 881; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 882; AVX512F-NEXT: vzeroupper 883; AVX512F-NEXT: retq 884; 885; AVX512VL-LABEL: splatvar_rotate_v4i32: 886; AVX512VL: # %bb.0: 887; AVX512VL-NEXT: vpbroadcastd %xmm1, %xmm1 888; AVX512VL-NEXT: vprolvd %xmm1, %xmm0, %xmm0 889; AVX512VL-NEXT: retq 890; 891; AVX512BW-LABEL: splatvar_rotate_v4i32: 892; AVX512BW: # %bb.0: 893; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 894; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 895; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 896; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 897; AVX512BW-NEXT: vzeroupper 898; AVX512BW-NEXT: retq 899; 900; AVX512VLBW-LABEL: splatvar_rotate_v4i32: 901; AVX512VLBW: # %bb.0: 902; AVX512VLBW-NEXT: vpbroadcastd %xmm1, %xmm1 903; AVX512VLBW-NEXT: vprolvd %xmm1, %xmm0, %xmm0 904; AVX512VLBW-NEXT: retq 905; 906; AVX512VBMI2-LABEL: splatvar_rotate_v4i32: 907; AVX512VBMI2: # %bb.0: 908; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 909; AVX512VBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 910; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 911; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 912; AVX512VBMI2-NEXT: vzeroupper 913; 
AVX512VBMI2-NEXT: retq 914; 915; AVX512VLVBMI2-LABEL: splatvar_rotate_v4i32: 916; AVX512VLVBMI2: # %bb.0: 917; AVX512VLVBMI2-NEXT: vpbroadcastd %xmm1, %xmm1 918; AVX512VLVBMI2-NEXT: vprolvd %xmm1, %xmm0, %xmm0 919; AVX512VLVBMI2-NEXT: retq 920; 921; XOPAVX1-LABEL: splatvar_rotate_v4i32: 922; XOPAVX1: # %bb.0: 923; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 924; XOPAVX1-NEXT: vprotd %xmm1, %xmm0, %xmm0 925; XOPAVX1-NEXT: retq 926; 927; XOPAVX2-LABEL: splatvar_rotate_v4i32: 928; XOPAVX2: # %bb.0: 929; XOPAVX2-NEXT: vpbroadcastd %xmm1, %xmm1 930; XOPAVX2-NEXT: vprotd %xmm1, %xmm0, %xmm0 931; XOPAVX2-NEXT: retq 932; 933; X86-SSE2-LABEL: splatvar_rotate_v4i32: 934; X86-SSE2: # %bb.0: 935; X86-SSE2-NEXT: movd %xmm1, %eax 936; X86-SSE2-NEXT: andl $31, %eax 937; X86-SSE2-NEXT: movd %eax, %xmm1 938; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 939; X86-SSE2-NEXT: pslld %xmm1, %xmm2 940; X86-SSE2-NEXT: movl $32, %ecx 941; X86-SSE2-NEXT: subl %eax, %ecx 942; X86-SSE2-NEXT: movd %ecx, %xmm1 943; X86-SSE2-NEXT: psrld %xmm1, %xmm0 944; X86-SSE2-NEXT: por %xmm2, %xmm0 945; X86-SSE2-NEXT: retl 946 %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer 947 %splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat 948 %shl = shl <4 x i32> %a, %splat 949 %lshr = lshr <4 x i32> %a, %splat32 950 %or = or <4 x i32> %shl, %lshr 951 ret <4 x i32> %or 952} 953 954define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 955; SSE2-LABEL: splatvar_rotate_v8i16: 956; SSE2: # %bb.0: 957; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 958; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 959; SSE2-NEXT: psubw %xmm1, %xmm2 960; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 961; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 962; SSE2-NEXT: movdqa %xmm0, %xmm3 963; SSE2-NEXT: psllw %xmm1, 
%xmm3 964; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] 965; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 966; SSE2-NEXT: psrlw %xmm2, %xmm0 967; SSE2-NEXT: por %xmm3, %xmm0 968; SSE2-NEXT: retq 969; 970; SSE41-LABEL: splatvar_rotate_v8i16: 971; SSE41: # %bb.0: 972; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 973; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 974; SSE41-NEXT: movdqa %xmm0, %xmm3 975; SSE41-NEXT: psllw %xmm2, %xmm3 976; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 977; SSE41-NEXT: psubw %xmm1, %xmm2 978; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 979; SSE41-NEXT: psrlw %xmm1, %xmm0 980; SSE41-NEXT: por %xmm3, %xmm0 981; SSE41-NEXT: retq 982; 983; AVX-LABEL: splatvar_rotate_v8i16: 984; AVX: # %bb.0: 985; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 986; AVX-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 987; AVX-NEXT: vpsllw %xmm2, %xmm0, %xmm2 988; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 989; AVX-NEXT: vpsubw %xmm1, %xmm3, %xmm1 990; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 991; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 992; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 993; AVX-NEXT: retq 994; 995; AVX512F-LABEL: splatvar_rotate_v8i16: 996; AVX512F: # %bb.0: 997; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 998; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 999; AVX512F-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1000; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1001; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1002; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1003; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1004; 
AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 1005; AVX512F-NEXT: retq 1006; 1007; AVX512VL-LABEL: splatvar_rotate_v8i16: 1008; AVX512VL: # %bb.0: 1009; AVX512VL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1010; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1011; AVX512VL-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1012; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1013; AVX512VL-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1014; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1015; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1016; AVX512VL-NEXT: vpor %xmm0, %xmm2, %xmm0 1017; AVX512VL-NEXT: retq 1018; 1019; AVX512BW-LABEL: splatvar_rotate_v8i16: 1020; AVX512BW: # %bb.0: 1021; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1022; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1023; AVX512BW-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1024; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1025; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1026; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1027; AVX512BW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1028; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 1029; AVX512BW-NEXT: retq 1030; 1031; AVX512VLBW-LABEL: splatvar_rotate_v8i16: 1032; AVX512VLBW: # %bb.0: 1033; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1034; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1035; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm0, %xmm2 1036; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] 1037; AVX512VLBW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 1038; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 1039; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1040; AVX512VLBW-NEXT: vpor %xmm0, %xmm2, %xmm0 1041; AVX512VLBW-NEXT: retq 1042; 1043; AVX512VBMI2-LABEL: 
splatvar_rotate_v8i16: 1044; AVX512VBMI2: # %bb.0: 1045; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1046; AVX512VBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 1047; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 1048; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1049; AVX512VBMI2-NEXT: vzeroupper 1050; AVX512VBMI2-NEXT: retq 1051; 1052; AVX512VLVBMI2-LABEL: splatvar_rotate_v8i16: 1053; AVX512VLVBMI2: # %bb.0: 1054; AVX512VLVBMI2-NEXT: vpbroadcastw %xmm1, %xmm1 1055; AVX512VLVBMI2-NEXT: vpshldvw %xmm1, %xmm0, %xmm0 1056; AVX512VLVBMI2-NEXT: retq 1057; 1058; XOPAVX1-LABEL: splatvar_rotate_v8i16: 1059; XOPAVX1: # %bb.0: 1060; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1061; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 1062; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 1063; XOPAVX1-NEXT: retq 1064; 1065; XOPAVX2-LABEL: splatvar_rotate_v8i16: 1066; XOPAVX2: # %bb.0: 1067; XOPAVX2-NEXT: vpbroadcastw %xmm1, %xmm1 1068; XOPAVX2-NEXT: vprotw %xmm1, %xmm0, %xmm0 1069; XOPAVX2-NEXT: retq 1070; 1071; X86-SSE2-LABEL: splatvar_rotate_v8i16: 1072; X86-SSE2: # %bb.0: 1073; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1074; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] 1075; X86-SSE2-NEXT: psubw %xmm1, %xmm2 1076; X86-SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] 1077; X86-SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1078; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 1079; X86-SSE2-NEXT: psllw %xmm1, %xmm3 1080; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1] 1081; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1082; X86-SSE2-NEXT: psrlw %xmm2, %xmm0 1083; X86-SSE2-NEXT: por %xmm3, %xmm0 1084; X86-SSE2-NEXT: retl 1085 %splat = shufflevector <8 x 
i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer 1086 %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat 1087 %shl = shl <8 x i16> %a, %splat 1088 %lshr = lshr <8 x i16> %a, %splat16 1089 %or = or <8 x i16> %shl, %lshr 1090 ret <8 x i16> %or 1091} 1092 1093define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { 1094; SSE2-LABEL: splatvar_rotate_v16i8: 1095; SSE2: # %bb.0: 1096; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1097; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1098; SSE2-NEXT: psubb %xmm1, %xmm2 1099; SSE2-NEXT: movdqa %xmm1, %xmm3 1100; SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] 1101; SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1102; SSE2-NEXT: movdqa %xmm0, %xmm1 1103; SSE2-NEXT: psllw %xmm3, %xmm1 1104; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1105; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 1106; SSE2-NEXT: psllw %xmm3, %xmm5 1107; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1108; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7] 1109; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 1110; SSE2-NEXT: pand %xmm3, %xmm1 1111; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] 1112; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1113; SSE2-NEXT: psrlw %xmm2, %xmm0 1114; SSE2-NEXT: psrlw %xmm2, %xmm4 1115; SSE2-NEXT: psrlw $8, %xmm4 1116; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1117; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] 1118; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1119; SSE2-NEXT: pand %xmm0, %xmm2 1120; SSE2-NEXT: por %xmm2, %xmm1 1121; SSE2-NEXT: movdqa %xmm1, %xmm0 1122; SSE2-NEXT: retq 
1123; 1124; SSE41-LABEL: splatvar_rotate_v16i8: 1125; SSE41: # %bb.0: 1126; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1127; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1128; SSE41-NEXT: movdqa %xmm0, %xmm2 1129; SSE41-NEXT: psllw %xmm3, %xmm2 1130; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 1131; SSE41-NEXT: pcmpeqd %xmm5, %xmm5 1132; SSE41-NEXT: psllw %xmm3, %xmm5 1133; SSE41-NEXT: pxor %xmm3, %xmm3 1134; SSE41-NEXT: pshufb %xmm3, %xmm5 1135; SSE41-NEXT: pand %xmm5, %xmm2 1136; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1137; SSE41-NEXT: psubb %xmm1, %xmm3 1138; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero 1139; SSE41-NEXT: psrlw %xmm1, %xmm0 1140; SSE41-NEXT: psrlw %xmm1, %xmm4 1141; SSE41-NEXT: pshufb {{.*#+}} xmm4 = xmm4[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1142; SSE41-NEXT: pand %xmm0, %xmm4 1143; SSE41-NEXT: por %xmm4, %xmm2 1144; SSE41-NEXT: movdqa %xmm2, %xmm0 1145; SSE41-NEXT: retq 1146; 1147; AVX1-LABEL: splatvar_rotate_v16i8: 1148; AVX1: # %bb.0: 1149; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1150; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1151; AVX1-NEXT: vpsllw %xmm2, %xmm0, %xmm3 1152; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 1153; AVX1-NEXT: vpsllw %xmm2, %xmm4, %xmm2 1154; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 1155; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 1156; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 1157; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1158; AVX1-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1159; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1160; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1161; AVX1-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 1162; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = 
xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 1163; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1164; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 1165; AVX1-NEXT: retq 1166; 1167; AVX2-LABEL: splatvar_rotate_v16i8: 1168; AVX2: # %bb.0: 1169; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1170; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1171; AVX2-NEXT: vpsllw %xmm2, %xmm0, %xmm3 1172; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 1173; AVX2-NEXT: vpsllw %xmm2, %xmm4, %xmm2 1174; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 1175; AVX2-NEXT: vpand %xmm2, %xmm3, %xmm2 1176; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1177; AVX2-NEXT: vpsubb %xmm1, %xmm3, %xmm1 1178; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1179; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 1180; AVX2-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 1181; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 1182; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1183; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 1184; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 1185; AVX2-NEXT: retq 1186; 1187; AVX512F-LABEL: splatvar_rotate_v16i8: 1188; AVX512F: # %bb.0: 1189; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1190; AVX512F-NEXT: vpsubb %xmm1, %xmm2, %xmm2 1191; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1192; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1193; AVX512F-NEXT: vpslld %xmm1, %zmm0, %zmm1 1194; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 
= xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1195; AVX512F-NEXT: vpsrld %xmm2, %zmm0, %zmm0 1196; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0 1197; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1198; AVX512F-NEXT: vzeroupper 1199; AVX512F-NEXT: retq 1200; 1201; AVX512VL-LABEL: splatvar_rotate_v16i8: 1202; AVX512VL: # %bb.0: 1203; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1204; AVX512VL-NEXT: vpsubb %xmm1, %xmm2, %xmm2 1205; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1206; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1207; AVX512VL-NEXT: vpslld %xmm1, %zmm0, %zmm1 1208; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1209; AVX512VL-NEXT: vpsrld %xmm2, %zmm0, %zmm0 1210; AVX512VL-NEXT: vpord %zmm0, %zmm1, %zmm0 1211; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 1212; AVX512VL-NEXT: vzeroupper 1213; AVX512VL-NEXT: retq 1214; 1215; AVX512BW-LABEL: splatvar_rotate_v16i8: 1216; AVX512BW: # %bb.0: 1217; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1218; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 1219; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1220; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1221; AVX512BW-NEXT: vpsllw %xmm1, %ymm0, %ymm1 1222; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1223; AVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 1224; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 1225; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1226; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1227; AVX512BW-NEXT: vzeroupper 1228; AVX512BW-NEXT: retq 1229; 1230; AVX512VLBW-LABEL: splatvar_rotate_v16i8: 1231; AVX512VLBW: # %bb.0: 1232; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1233; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 1234; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1235; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1236; AVX512VLBW-NEXT: vpsllw %xmm1, %ymm0, %ymm1 1237; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1238; AVX512VLBW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 1239; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0 1240; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1241; AVX512VLBW-NEXT: vzeroupper 1242; AVX512VLBW-NEXT: retq 1243; 1244; AVX512VBMI2-LABEL: splatvar_rotate_v16i8: 1245; AVX512VBMI2: # %bb.0: 1246; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1247; AVX512VBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2 1248; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1249; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1250; AVX512VBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm1 1251; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1252; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 1253; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0 1254; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 1255; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1256; AVX512VBMI2-NEXT: vzeroupper 1257; AVX512VBMI2-NEXT: retq 1258; 1259; AVX512VLVBMI2-LABEL: splatvar_rotate_v16i8: 1260; AVX512VLVBMI2: # %bb.0: 1261; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1262; AVX512VLVBMI2-NEXT: vpsubb %xmm1, %xmm2, %xmm2 1263; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero 1264; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1265; AVX512VLVBMI2-NEXT: vpsllw %xmm1, %ymm0, %ymm1 1266; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero 1267; AVX512VLVBMI2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 1268; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0 1269; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 1270; AVX512VLVBMI2-NEXT: vzeroupper 1271; AVX512VLVBMI2-NEXT: retq 1272; 1273; XOPAVX1-LABEL: splatvar_rotate_v16i8: 1274; XOPAVX1: # %bb.0: 1275; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1276; XOPAVX1-NEXT: vpshufb 
%xmm2, %xmm1, %xmm1 1277; XOPAVX1-NEXT: vprotb %xmm1, %xmm0, %xmm0 1278; XOPAVX1-NEXT: retq 1279; 1280; XOPAVX2-LABEL: splatvar_rotate_v16i8: 1281; XOPAVX2: # %bb.0: 1282; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 1283; XOPAVX2-NEXT: vprotb %xmm1, %xmm0, %xmm0 1284; XOPAVX2-NEXT: retq 1285; 1286; X86-SSE2-LABEL: splatvar_rotate_v16i8: 1287; X86-SSE2: # %bb.0: 1288; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1289; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 1290; X86-SSE2-NEXT: psubb %xmm1, %xmm2 1291; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 1292; X86-SSE2-NEXT: pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0] 1293; X86-SSE2-NEXT: psrldq {{.*#+}} xmm3 = xmm3[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1294; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1295; X86-SSE2-NEXT: psllw %xmm3, %xmm1 1296; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1297; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm5 1298; X86-SSE2-NEXT: psllw %xmm3, %xmm5 1299; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1300; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,0,0,0,4,5,6,7] 1301; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] 1302; X86-SSE2-NEXT: pand %xmm3, %xmm1 1303; X86-SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0] 1304; X86-SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1305; X86-SSE2-NEXT: psrlw %xmm2, %xmm0 1306; X86-SSE2-NEXT: psrlw %xmm2, %xmm4 1307; X86-SSE2-NEXT: psrlw $8, %xmm4 1308; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1309; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[0,0,0,0,4,5,6,7] 1310; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] 1311; X86-SSE2-NEXT: pand %xmm0, %xmm2 1312; X86-SSE2-NEXT: por %xmm2, %xmm1 1313; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 1314; 
X86-SSE2-NEXT: retl 1315 %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer 1316 %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat 1317 %shl = shl <16 x i8> %a, %splat 1318 %lshr = lshr <16 x i8> %a, %splat8 1319 %or = or <16 x i8> %shl, %lshr 1320 ret <16 x i8> %or 1321} 1322 1323; 1324; Constant Rotates 1325; 1326 1327define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind { 1328; SSE2-LABEL: constant_rotate_v2i64: 1329; SSE2: # %bb.0: 1330; SSE2-NEXT: movdqa %xmm0, %xmm1 1331; SSE2-NEXT: psllq $4, %xmm1 1332; SSE2-NEXT: movdqa %xmm0, %xmm2 1333; SSE2-NEXT: psllq $14, %xmm2 1334; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1335; SSE2-NEXT: movdqa %xmm0, %xmm1 1336; SSE2-NEXT: psrlq $60, %xmm1 1337; SSE2-NEXT: psrlq $50, %xmm0 1338; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1339; SSE2-NEXT: orpd %xmm2, %xmm0 1340; SSE2-NEXT: retq 1341; 1342; SSE41-LABEL: constant_rotate_v2i64: 1343; SSE41: # %bb.0: 1344; SSE41-NEXT: movdqa %xmm0, %xmm1 1345; SSE41-NEXT: psllq $14, %xmm1 1346; SSE41-NEXT: movdqa %xmm0, %xmm2 1347; SSE41-NEXT: psllq $4, %xmm2 1348; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1349; SSE41-NEXT: movdqa %xmm0, %xmm1 1350; SSE41-NEXT: psrlq $50, %xmm1 1351; SSE41-NEXT: psrlq $60, %xmm0 1352; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] 1353; SSE41-NEXT: por %xmm2, %xmm0 1354; SSE41-NEXT: retq 1355; 1356; AVX1-LABEL: constant_rotate_v2i64: 1357; AVX1: # %bb.0: 1358; AVX1-NEXT: vpsllq $14, %xmm0, %xmm1 1359; AVX1-NEXT: vpsllq $4, %xmm0, %xmm2 1360; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] 1361; AVX1-NEXT: vpsrlq $50, %xmm0, %xmm2 1362; AVX1-NEXT: vpsrlq $60, %xmm0, %xmm0 1363; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] 1364; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 1365; AVX1-NEXT: retq 1366; 1367; AVX2-LABEL: constant_rotate_v2i64: 1368; AVX2: # %bb.0: 
1369; AVX2-NEXT: vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1370; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1371; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 1372; AVX2-NEXT: retq 1373; 1374; AVX512F-LABEL: constant_rotate_v2i64: 1375; AVX512F: # %bb.0: 1376; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1377; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1378; AVX512F-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1379; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1380; AVX512F-NEXT: vzeroupper 1381; AVX512F-NEXT: retq 1382; 1383; AVX512VL-LABEL: constant_rotate_v2i64: 1384; AVX512VL: # %bb.0: 1385; AVX512VL-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1386; AVX512VL-NEXT: retq 1387; 1388; AVX512BW-LABEL: constant_rotate_v2i64: 1389; AVX512BW: # %bb.0: 1390; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1391; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1392; AVX512BW-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1393; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1394; AVX512BW-NEXT: vzeroupper 1395; AVX512BW-NEXT: retq 1396; 1397; AVX512VLBW-LABEL: constant_rotate_v2i64: 1398; AVX512VLBW: # %bb.0: 1399; AVX512VLBW-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1400; AVX512VLBW-NEXT: retq 1401; 1402; AVX512VBMI2-LABEL: constant_rotate_v2i64: 1403; AVX512VBMI2: # %bb.0: 1404; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1405; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,14] 1406; AVX512VBMI2-NEXT: vprolvq %zmm1, %zmm0, %zmm0 1407; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1408; AVX512VBMI2-NEXT: vzeroupper 1409; AVX512VBMI2-NEXT: retq 1410; 1411; AVX512VLVBMI2-LABEL: constant_rotate_v2i64: 1412; AVX512VLVBMI2: # %bb.0: 1413; AVX512VLVBMI2-NEXT: vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1414; AVX512VLVBMI2-NEXT: retq 1415; 1416; XOP-LABEL: constant_rotate_v2i64: 1417; XOP: # %bb.0: 1418; XOP-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 
1419; XOP-NEXT: retq 1420; 1421; X86-SSE2-LABEL: constant_rotate_v2i64: 1422; X86-SSE2: # %bb.0: 1423; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1424; X86-SSE2-NEXT: psllq $4, %xmm1 1425; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1426; X86-SSE2-NEXT: psllq $14, %xmm2 1427; X86-SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 1428; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1429; X86-SSE2-NEXT: psrlq $60, %xmm1 1430; X86-SSE2-NEXT: psrlq $50, %xmm0 1431; X86-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1432; X86-SSE2-NEXT: orpd %xmm2, %xmm0 1433; X86-SSE2-NEXT: retl 1434 %shl = shl <2 x i64> %a, <i64 4, i64 14> 1435 %lshr = lshr <2 x i64> %a, <i64 60, i64 50> 1436 %or = or <2 x i64> %shl, %lshr 1437 ret <2 x i64> %or 1438} 1439 1440define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind { 1441; SSE2-LABEL: constant_rotate_v4i32: 1442; SSE2: # %bb.0: 1443; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1444; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1445; SSE2-NEXT: pmuludq %xmm1, %xmm0 1446; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 1447; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1448; SSE2-NEXT: pmuludq %xmm2, %xmm1 1449; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1450; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1451; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1452; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1453; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1454; SSE2-NEXT: por %xmm3, %xmm0 1455; SSE2-NEXT: retq 1456; 1457; SSE41-LABEL: constant_rotate_v4i32: 1458; SSE41: # %bb.0: 1459; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1460; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1461; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1462; SSE41-NEXT: pmuludq %xmm2, %xmm3 1463; SSE41-NEXT: pmuludq %xmm1, %xmm0 1464; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1465; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] 1466; SSE41-NEXT: pshufd {{.*#+}} 
xmm2 = xmm3[0,0,2,2] 1467; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1468; SSE41-NEXT: por %xmm1, %xmm0 1469; SSE41-NEXT: retq 1470; 1471; AVX1-LABEL: constant_rotate_v4i32: 1472; AVX1: # %bb.0: 1473; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,32,64,128] 1474; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1475; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 1476; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 1477; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 1478; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] 1479; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 1480; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] 1481; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 1482; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 1483; AVX1-NEXT: retq 1484; 1485; AVX2-LABEL: constant_rotate_v4i32: 1486; AVX2: # %bb.0: 1487; AVX2-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1488; AVX2-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1489; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1490; AVX2-NEXT: retq 1491; 1492; AVX512F-LABEL: constant_rotate_v4i32: 1493; AVX512F: # %bb.0: 1494; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1495; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1496; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1497; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1498; AVX512F-NEXT: vzeroupper 1499; AVX512F-NEXT: retq 1500; 1501; AVX512VL-LABEL: constant_rotate_v4i32: 1502; AVX512VL: # %bb.0: 1503; AVX512VL-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1504; AVX512VL-NEXT: retq 1505; 1506; AVX512BW-LABEL: constant_rotate_v4i32: 1507; AVX512BW: # %bb.0: 1508; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1509; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1510; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1511; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1512; AVX512BW-NEXT: vzeroupper 1513; AVX512BW-NEXT: retq 
1514; 1515; AVX512VLBW-LABEL: constant_rotate_v4i32: 1516; AVX512VLBW: # %bb.0: 1517; AVX512VLBW-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1518; AVX512VLBW-NEXT: retq 1519; 1520; AVX512VBMI2-LABEL: constant_rotate_v4i32: 1521; AVX512VBMI2: # %bb.0: 1522; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1523; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,6,7] 1524; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 1525; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1526; AVX512VBMI2-NEXT: vzeroupper 1527; AVX512VBMI2-NEXT: retq 1528; 1529; AVX512VLVBMI2-LABEL: constant_rotate_v4i32: 1530; AVX512VLVBMI2: # %bb.0: 1531; AVX512VLVBMI2-NEXT: vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1532; AVX512VLVBMI2-NEXT: retq 1533; 1534; XOP-LABEL: constant_rotate_v4i32: 1535; XOP: # %bb.0: 1536; XOP-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1537; XOP-NEXT: retq 1538; 1539; X86-SSE2-LABEL: constant_rotate_v4i32: 1540; X86-SSE2: # %bb.0: 1541; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,32,64,128] 1542; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 1543; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0 1544; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 1545; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1546; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1 1547; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 1548; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 1549; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1550; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1551; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1552; X86-SSE2-NEXT: por %xmm3, %xmm0 1553; X86-SSE2-NEXT: retl 1554 %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7> 1555 %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25> 1556 %or = or <4 x i32> %shl, %lshr 1557 ret <4 x i32> %or 1558} 1559 1560define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind { 1561; SSE-LABEL: 
constant_rotate_v8i16: 1562; SSE: # %bb.0: 1563; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1564; SSE-NEXT: movdqa %xmm0, %xmm2 1565; SSE-NEXT: pmulhuw %xmm1, %xmm2 1566; SSE-NEXT: pmullw %xmm1, %xmm0 1567; SSE-NEXT: por %xmm2, %xmm0 1568; SSE-NEXT: retq 1569; 1570; AVX-LABEL: constant_rotate_v8i16: 1571; AVX: # %bb.0: 1572; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1573; AVX-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1574; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1575; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 1576; AVX-NEXT: retq 1577; 1578; AVX512F-LABEL: constant_rotate_v8i16: 1579; AVX512F: # %bb.0: 1580; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1581; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1582; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1583; AVX512F-NEXT: vpor %xmm2, %xmm0, %xmm0 1584; AVX512F-NEXT: retq 1585; 1586; AVX512VL-LABEL: constant_rotate_v8i16: 1587; AVX512VL: # %bb.0: 1588; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1589; AVX512VL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 1590; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1591; AVX512VL-NEXT: vpor %xmm2, %xmm0, %xmm0 1592; AVX512VL-NEXT: retq 1593; 1594; AVX512BW-LABEL: constant_rotate_v8i16: 1595; AVX512BW: # %bb.0: 1596; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1597; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1598; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9] 1599; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm2 1600; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 1601; AVX512BW-NEXT: vpor %xmm2, %xmm0, %xmm0 1602; AVX512BW-NEXT: vzeroupper 1603; AVX512BW-NEXT: retq 1604; 1605; AVX512VLBW-LABEL: constant_rotate_v8i16: 1606; AVX512VLBW: # %bb.0: 1607; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1608; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1609; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 1610; AVX512VLBW-NEXT: retq 1611; 1612; AVX512VBMI2-LABEL: 
constant_rotate_v8i16: 1613; AVX512VBMI2: # %bb.0: 1614; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1615; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] 1616; AVX512VBMI2-NEXT: vpshldvw %zmm1, %zmm0, %zmm0 1617; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1618; AVX512VBMI2-NEXT: vzeroupper 1619; AVX512VBMI2-NEXT: retq 1620; 1621; AVX512VLVBMI2-LABEL: constant_rotate_v8i16: 1622; AVX512VLVBMI2: # %bb.0: 1623; AVX512VLVBMI2-NEXT: vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1624; AVX512VLVBMI2-NEXT: retq 1625; 1626; XOP-LABEL: constant_rotate_v8i16: 1627; XOP: # %bb.0: 1628; XOP-NEXT: vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1629; XOP-NEXT: retq 1630; 1631; X86-SSE2-LABEL: constant_rotate_v8i16: 1632; X86-SSE2: # %bb.0: 1633; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] 1634; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1635; X86-SSE2-NEXT: pmulhuw %xmm1, %xmm2 1636; X86-SSE2-NEXT: pmullw %xmm1, %xmm0 1637; X86-SSE2-NEXT: por %xmm2, %xmm0 1638; X86-SSE2-NEXT: retl 1639 %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7> 1640 %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9> 1641 %or = or <8 x i16> %shl, %lshr 1642 ret <8 x i16> %or 1643} 1644 1645define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind { 1646; SSE2-LABEL: constant_rotate_v16i8: 1647; SSE2: # %bb.0: 1648; SSE2-NEXT: pxor %xmm1, %xmm1 1649; SSE2-NEXT: movdqa %xmm0, %xmm2 1650; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1651; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1652; SSE2-NEXT: psrlw $8, %xmm2 1653; SSE2-NEXT: movdqa %xmm0, %xmm3 1654; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1655; 
SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 1656; SSE2-NEXT: psrlw $8, %xmm3 1657; SSE2-NEXT: packuswb %xmm2, %xmm3 1658; SSE2-NEXT: movdqa %xmm0, %xmm1 1659; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1660; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1661; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1662; SSE2-NEXT: pand %xmm2, %xmm1 1663; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1664; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1665; SSE2-NEXT: pand %xmm2, %xmm0 1666; SSE2-NEXT: packuswb %xmm1, %xmm0 1667; SSE2-NEXT: por %xmm3, %xmm0 1668; SSE2-NEXT: retq 1669; 1670; SSE41-LABEL: constant_rotate_v16i8: 1671; SSE41: # %bb.0: 1672; SSE41-NEXT: movdqa %xmm0, %xmm2 1673; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1674; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 1675; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] 1676; SSE41-NEXT: pand %xmm3, %xmm2 1677; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1678; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] 1679; SSE41-NEXT: pmullw %xmm1, %xmm4 1680; SSE41-NEXT: pand %xmm3, %xmm4 1681; SSE41-NEXT: packuswb %xmm2, %xmm4 1682; SSE41-NEXT: pxor %xmm2, %xmm2 1683; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1684; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1685; SSE41-NEXT: psrlw $8, %xmm0 1686; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1687; SSE41-NEXT: psrlw $8, %xmm1 1688; SSE41-NEXT: packuswb %xmm0, %xmm1 1689; SSE41-NEXT: por %xmm4, %xmm1 1690; SSE41-NEXT: movdqa %xmm1, %xmm0 1691; SSE41-NEXT: retq 1692; 1693; AVX1-LABEL: constant_rotate_v16i8: 
1694; AVX1: # %bb.0: 1695; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1696; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 1697; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1698; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1699; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1700; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm4 1701; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 1702; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 1703; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1704; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1705; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1706; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1707; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm2 1708; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1709; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 1710; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 1711; AVX1-NEXT: retq 1712; 1713; AVX2-LABEL: constant_rotate_v16i8: 1714; AVX2: # %bb.0: 1715; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1716; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 1717; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 1718; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1719; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 1720; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1721; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1722; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1723; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1724; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 1725; AVX2-NEXT: 
vzeroupper 1726; AVX2-NEXT: retq 1727; 1728; AVX512F-LABEL: constant_rotate_v16i8: 1729; AVX512F: # %bb.0: 1730; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1731; AVX512F-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 1732; AVX512F-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1733; AVX512F-NEXT: vpord %zmm0, %zmm1, %zmm0 1734; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1735; AVX512F-NEXT: vzeroupper 1736; AVX512F-NEXT: retq 1737; 1738; AVX512VL-LABEL: constant_rotate_v16i8: 1739; AVX512VL: # %bb.0: 1740; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 1741; AVX512VL-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 1742; AVX512VL-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 1743; AVX512VL-NEXT: vpord %zmm0, %zmm1, %zmm0 1744; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 1745; AVX512VL-NEXT: vzeroupper 1746; AVX512VL-NEXT: retq 1747; 1748; AVX512BW-LABEL: constant_rotate_v16i8: 1749; AVX512BW: # %bb.0: 1750; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1751; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1752; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 1753; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1754; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 1755; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0 1756; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1757; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1758; AVX512BW-NEXT: vzeroupper 1759; AVX512BW-NEXT: retq 1760; 1761; AVX512VLBW-LABEL: constant_rotate_v16i8: 1762; AVX512VLBW: # %bb.0: 1763; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1764; AVX512VLBW-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 1765; AVX512VLBW-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1766; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0 1767; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 1768; AVX512VLBW-NEXT: vzeroupper 1769; AVX512VLBW-NEXT: retq 1770; 1771; AVX512VBMI2-LABEL: constant_rotate_v16i8: 1772; AVX512VBMI2: # %bb.0: 1773; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1] 1774; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1775; AVX512VBMI2-NEXT: vpsllvw %zmm1, %zmm0, %zmm1 1776; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,7,6,5,4,3,2,1,0,1,2,3,4,5,6,7] 1777; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0 1778; AVX512VBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0 1779; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0 1780; AVX512VBMI2-NEXT: # kill: def 
$xmm0 killed $xmm0 killed $ymm0 1781; AVX512VBMI2-NEXT: vzeroupper 1782; AVX512VBMI2-NEXT: retq 1783; 1784; AVX512VLVBMI2-LABEL: constant_rotate_v16i8: 1785; AVX512VLVBMI2: # %bb.0: 1786; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1787; AVX512VLVBMI2-NEXT: vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 1788; AVX512VLVBMI2-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1789; AVX512VLVBMI2-NEXT: vpor %ymm0, %ymm1, %ymm0 1790; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0 1791; AVX512VLVBMI2-NEXT: vzeroupper 1792; AVX512VLVBMI2-NEXT: retq 1793; 1794; XOP-LABEL: constant_rotate_v16i8: 1795; XOP: # %bb.0: 1796; XOP-NEXT: vprotb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1797; XOP-NEXT: retq 1798; 1799; X86-SSE2-LABEL: constant_rotate_v16i8: 1800; X86-SSE2: # %bb.0: 1801; X86-SSE2-NEXT: pxor %xmm1, %xmm1 1802; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 1803; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] 1804; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 1805; X86-SSE2-NEXT: psrlw $8, %xmm2 1806; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 1807; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 1808; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 1809; X86-SSE2-NEXT: psrlw $8, %xmm3 1810; X86-SSE2-NEXT: packuswb %xmm2, %xmm3 1811; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1812; X86-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1813; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 1814; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 
1815; X86-SSE2-NEXT: pand %xmm2, %xmm1 1816; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1817; X86-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 1818; X86-SSE2-NEXT: pand %xmm2, %xmm0 1819; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 1820; X86-SSE2-NEXT: por %xmm3, %xmm0 1821; X86-SSE2-NEXT: retl 1822 %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1> 1823 %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7> 1824 %or = or <16 x i8> %shl, %lshr 1825 ret <16 x i8> %or 1826} 1827 1828; 1829; Uniform Constant Rotates 1830; 1831 1832define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind { 1833; SSE-LABEL: splatconstant_rotate_v2i64: 1834; SSE: # %bb.0: 1835; SSE-NEXT: movdqa %xmm0, %xmm1 1836; SSE-NEXT: psllq $14, %xmm1 1837; SSE-NEXT: psrlq $50, %xmm0 1838; SSE-NEXT: por %xmm1, %xmm0 1839; SSE-NEXT: retq 1840; 1841; AVX-LABEL: splatconstant_rotate_v2i64: 1842; AVX: # %bb.0: 1843; AVX-NEXT: vpsllq $14, %xmm0, %xmm1 1844; AVX-NEXT: vpsrlq $50, %xmm0, %xmm0 1845; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 1846; AVX-NEXT: retq 1847; 1848; AVX512F-LABEL: splatconstant_rotate_v2i64: 1849; AVX512F: # %bb.0: 1850; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1851; AVX512F-NEXT: vprolq $14, %zmm0, %zmm0 1852; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1853; AVX512F-NEXT: vzeroupper 1854; AVX512F-NEXT: retq 1855; 1856; AVX512VL-LABEL: splatconstant_rotate_v2i64: 1857; AVX512VL: # %bb.0: 1858; AVX512VL-NEXT: vprolq $14, %xmm0, %xmm0 1859; AVX512VL-NEXT: retq 1860; 1861; AVX512BW-LABEL: splatconstant_rotate_v2i64: 1862; AVX512BW: # %bb.0: 1863; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1864; AVX512BW-NEXT: vprolq $14, %zmm0, %zmm0 1865; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1866; AVX512BW-NEXT: vzeroupper 1867; AVX512BW-NEXT: retq 1868; 1869; 
AVX512VLBW-LABEL: splatconstant_rotate_v2i64: 1870; AVX512VLBW: # %bb.0: 1871; AVX512VLBW-NEXT: vprolq $14, %xmm0, %xmm0 1872; AVX512VLBW-NEXT: retq 1873; 1874; AVX512VBMI2-LABEL: splatconstant_rotate_v2i64: 1875; AVX512VBMI2: # %bb.0: 1876; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1877; AVX512VBMI2-NEXT: vprolq $14, %zmm0, %zmm0 1878; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1879; AVX512VBMI2-NEXT: vzeroupper 1880; AVX512VBMI2-NEXT: retq 1881; 1882; AVX512VLVBMI2-LABEL: splatconstant_rotate_v2i64: 1883; AVX512VLVBMI2: # %bb.0: 1884; AVX512VLVBMI2-NEXT: vprolq $14, %xmm0, %xmm0 1885; AVX512VLVBMI2-NEXT: retq 1886; 1887; XOP-LABEL: splatconstant_rotate_v2i64: 1888; XOP: # %bb.0: 1889; XOP-NEXT: vprotq $14, %xmm0, %xmm0 1890; XOP-NEXT: retq 1891; 1892; X86-SSE2-LABEL: splatconstant_rotate_v2i64: 1893; X86-SSE2: # %bb.0: 1894; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1895; X86-SSE2-NEXT: psllq $14, %xmm1 1896; X86-SSE2-NEXT: psrlq $50, %xmm0 1897; X86-SSE2-NEXT: por %xmm1, %xmm0 1898; X86-SSE2-NEXT: retl 1899 %shl = shl <2 x i64> %a, <i64 14, i64 14> 1900 %lshr = lshr <2 x i64> %a, <i64 50, i64 50> 1901 %or = or <2 x i64> %shl, %lshr 1902 ret <2 x i64> %or 1903} 1904 1905define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind { 1906; SSE-LABEL: splatconstant_rotate_v4i32: 1907; SSE: # %bb.0: 1908; SSE-NEXT: movdqa %xmm0, %xmm1 1909; SSE-NEXT: psrld $28, %xmm1 1910; SSE-NEXT: pslld $4, %xmm0 1911; SSE-NEXT: por %xmm1, %xmm0 1912; SSE-NEXT: retq 1913; 1914; AVX-LABEL: splatconstant_rotate_v4i32: 1915; AVX: # %bb.0: 1916; AVX-NEXT: vpsrld $28, %xmm0, %xmm1 1917; AVX-NEXT: vpslld $4, %xmm0, %xmm0 1918; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1919; AVX-NEXT: retq 1920; 1921; AVX512F-LABEL: splatconstant_rotate_v4i32: 1922; AVX512F: # %bb.0: 1923; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1924; AVX512F-NEXT: vprold $4, %zmm0, %zmm0 1925; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1926; AVX512F-NEXT: 
vzeroupper 1927; AVX512F-NEXT: retq 1928; 1929; AVX512VL-LABEL: splatconstant_rotate_v4i32: 1930; AVX512VL: # %bb.0: 1931; AVX512VL-NEXT: vprold $4, %xmm0, %xmm0 1932; AVX512VL-NEXT: retq 1933; 1934; AVX512BW-LABEL: splatconstant_rotate_v4i32: 1935; AVX512BW: # %bb.0: 1936; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1937; AVX512BW-NEXT: vprold $4, %zmm0, %zmm0 1938; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1939; AVX512BW-NEXT: vzeroupper 1940; AVX512BW-NEXT: retq 1941; 1942; AVX512VLBW-LABEL: splatconstant_rotate_v4i32: 1943; AVX512VLBW: # %bb.0: 1944; AVX512VLBW-NEXT: vprold $4, %xmm0, %xmm0 1945; AVX512VLBW-NEXT: retq 1946; 1947; AVX512VBMI2-LABEL: splatconstant_rotate_v4i32: 1948; AVX512VBMI2: # %bb.0: 1949; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1950; AVX512VBMI2-NEXT: vprold $4, %zmm0, %zmm0 1951; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 1952; AVX512VBMI2-NEXT: vzeroupper 1953; AVX512VBMI2-NEXT: retq 1954; 1955; AVX512VLVBMI2-LABEL: splatconstant_rotate_v4i32: 1956; AVX512VLVBMI2: # %bb.0: 1957; AVX512VLVBMI2-NEXT: vprold $4, %xmm0, %xmm0 1958; AVX512VLVBMI2-NEXT: retq 1959; 1960; XOP-LABEL: splatconstant_rotate_v4i32: 1961; XOP: # %bb.0: 1962; XOP-NEXT: vprotd $4, %xmm0, %xmm0 1963; XOP-NEXT: retq 1964; 1965; X86-SSE2-LABEL: splatconstant_rotate_v4i32: 1966; X86-SSE2: # %bb.0: 1967; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 1968; X86-SSE2-NEXT: psrld $28, %xmm1 1969; X86-SSE2-NEXT: pslld $4, %xmm0 1970; X86-SSE2-NEXT: por %xmm1, %xmm0 1971; X86-SSE2-NEXT: retl 1972 %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4> 1973 %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28> 1974 %or = or <4 x i32> %shl, %lshr 1975 ret <4 x i32> %or 1976} 1977 1978define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind { 1979; SSE-LABEL: splatconstant_rotate_v8i16: 1980; SSE: # %bb.0: 1981; SSE-NEXT: movdqa %xmm0, %xmm1 1982; SSE-NEXT: psrlw $9, %xmm1 1983; SSE-NEXT: psllw $7, %xmm0 1984; 
SSE-NEXT: por %xmm1, %xmm0 1985; SSE-NEXT: retq 1986; 1987; AVX-LABEL: splatconstant_rotate_v8i16: 1988; AVX: # %bb.0: 1989; AVX-NEXT: vpsrlw $9, %xmm0, %xmm1 1990; AVX-NEXT: vpsllw $7, %xmm0, %xmm0 1991; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 1992; AVX-NEXT: retq 1993; 1994; AVX512F-LABEL: splatconstant_rotate_v8i16: 1995; AVX512F: # %bb.0: 1996; AVX512F-NEXT: vpsrlw $9, %xmm0, %xmm1 1997; AVX512F-NEXT: vpsllw $7, %xmm0, %xmm0 1998; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 1999; AVX512F-NEXT: retq 2000; 2001; AVX512VL-LABEL: splatconstant_rotate_v8i16: 2002; AVX512VL: # %bb.0: 2003; AVX512VL-NEXT: vpsrlw $9, %xmm0, %xmm1 2004; AVX512VL-NEXT: vpsllw $7, %xmm0, %xmm0 2005; AVX512VL-NEXT: vpor %xmm1, %xmm0, %xmm0 2006; AVX512VL-NEXT: retq 2007; 2008; AVX512BW-LABEL: splatconstant_rotate_v8i16: 2009; AVX512BW: # %bb.0: 2010; AVX512BW-NEXT: vpsrlw $9, %xmm0, %xmm1 2011; AVX512BW-NEXT: vpsllw $7, %xmm0, %xmm0 2012; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 2013; AVX512BW-NEXT: retq 2014; 2015; AVX512VLBW-LABEL: splatconstant_rotate_v8i16: 2016; AVX512VLBW: # %bb.0: 2017; AVX512VLBW-NEXT: vpsrlw $9, %xmm0, %xmm1 2018; AVX512VLBW-NEXT: vpsllw $7, %xmm0, %xmm0 2019; AVX512VLBW-NEXT: vpor %xmm1, %xmm0, %xmm0 2020; AVX512VLBW-NEXT: retq 2021; 2022; AVX512VBMI2-LABEL: splatconstant_rotate_v8i16: 2023; AVX512VBMI2: # %bb.0: 2024; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 2025; AVX512VBMI2-NEXT: vpshldw $7, %zmm0, %zmm0, %zmm0 2026; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 2027; AVX512VBMI2-NEXT: vzeroupper 2028; AVX512VBMI2-NEXT: retq 2029; 2030; AVX512VLVBMI2-LABEL: splatconstant_rotate_v8i16: 2031; AVX512VLVBMI2: # %bb.0: 2032; AVX512VLVBMI2-NEXT: vpshldw $7, %xmm0, %xmm0, %xmm0 2033; AVX512VLVBMI2-NEXT: retq 2034; 2035; XOP-LABEL: splatconstant_rotate_v8i16: 2036; XOP: # %bb.0: 2037; XOP-NEXT: vprotw $7, %xmm0, %xmm0 2038; XOP-NEXT: retq 2039; 2040; X86-SSE2-LABEL: splatconstant_rotate_v8i16: 2041; X86-SSE2: # %bb.0: 2042; X86-SSE2-NEXT: movdqa 
%xmm0, %xmm1 2043; X86-SSE2-NEXT: psrlw $9, %xmm1 2044; X86-SSE2-NEXT: psllw $7, %xmm0 2045; X86-SSE2-NEXT: por %xmm1, %xmm0 2046; X86-SSE2-NEXT: retl 2047 %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 2048 %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9> 2049 %or = or <8 x i16> %shl, %lshr 2050 ret <8 x i16> %or 2051} 2052 2053define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind { 2054; SSE-LABEL: splatconstant_rotate_v16i8: 2055; SSE: # %bb.0: 2056; SSE-NEXT: movdqa %xmm0, %xmm1 2057; SSE-NEXT: psrlw $4, %xmm1 2058; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2059; SSE-NEXT: psllw $4, %xmm0 2060; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2061; SSE-NEXT: por %xmm1, %xmm0 2062; SSE-NEXT: retq 2063; 2064; AVX-LABEL: splatconstant_rotate_v16i8: 2065; AVX: # %bb.0: 2066; AVX-NEXT: vpsrlw $4, %xmm0, %xmm1 2067; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2068; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 2069; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2070; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 2071; AVX-NEXT: retq 2072; 2073; AVX512F-LABEL: splatconstant_rotate_v16i8: 2074; AVX512F: # %bb.0: 2075; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1 2076; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2077; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 2078; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2079; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 2080; AVX512F-NEXT: retq 2081; 2082; AVX512VL-LABEL: splatconstant_rotate_v16i8: 2083; AVX512VL: # %bb.0: 2084; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1 2085; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 2086; AVX512VL-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 2087; AVX512VL-NEXT: retq 2088; 2089; AVX512BW-LABEL: splatconstant_rotate_v16i8: 2090; AVX512BW: # %bb.0: 2091; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1 2092; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm1, %xmm1 2093; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 2094; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2095; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 2096; AVX512BW-NEXT: retq 2097; 2098; AVX512VLBW-LABEL: splatconstant_rotate_v16i8: 2099; AVX512VLBW: # %bb.0: 2100; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1 2101; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm0 2102; AVX512VLBW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 2103; AVX512VLBW-NEXT: retq 2104; 2105; AVX512VBMI2-LABEL: splatconstant_rotate_v16i8: 2106; AVX512VBMI2: # %bb.0: 2107; AVX512VBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 2108; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2109; AVX512VBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 2110; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2111; AVX512VBMI2-NEXT: vpor %xmm0, %xmm1, %xmm0 2112; AVX512VBMI2-NEXT: retq 2113; 2114; AVX512VLVBMI2-LABEL: splatconstant_rotate_v16i8: 2115; AVX512VLVBMI2: # %bb.0: 2116; AVX512VLVBMI2-NEXT: vpsllw $4, %xmm0, %xmm1 2117; AVX512VLVBMI2-NEXT: vpsrlw $4, %xmm0, %xmm0 2118; AVX512VLVBMI2-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 2119; AVX512VLVBMI2-NEXT: retq 2120; 2121; XOP-LABEL: splatconstant_rotate_v16i8: 2122; XOP: # %bb.0: 2123; XOP-NEXT: vprotb $4, %xmm0, %xmm0 2124; XOP-NEXT: retq 2125; 2126; X86-SSE2-LABEL: splatconstant_rotate_v16i8: 2127; X86-SSE2: # %bb.0: 2128; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 2129; X86-SSE2-NEXT: psrlw $4, %xmm1 2130; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 2131; X86-SSE2-NEXT: psllw $4, %xmm0 2132; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 2133; X86-SSE2-NEXT: por %xmm1, %xmm0 2134; X86-SSE2-NEXT: retl 2135 %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2136 %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4> 2137 %or = or <16 x 
i8> %shl, %lshr
  ret <16 x i8> %or
}

;
; Masked Uniform Constant Rotates
;

; Rotate-left-by-15 idiom on <2 x i64> (shl 15 | lshr 49) with each half
; masked before the OR. The shl result has its low 15 bits clear and both
; lmask constants (65, 33) fit in those bits, so %lmask is always zero and
; only the masked lshr half can contribute to the result.
define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlq $49, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $49, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vprolq $15, %zmm0, %zmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprolq $15, %xmm0, %xmm0
; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vprolq $15, %zmm0, %zmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v2i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprolq $15, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v2i64:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vprolq $15, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v2i64:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vprolq $15, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotq $15, %xmm0, %xmm0
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v2i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    psrlq $49, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <2 x i64> %a, <i64 15, i64 15>
  %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
  %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
  %lmask = and <2 x i64> %shl, <i64 65, i64 33>
  %or = or <2 x i64> %lmask, %rmask
  ret <2 x i64> %or
}

; Rotate-left-by-4 idiom on <4 x i32> (shl 4 | lshr 28) where the two halves
; are masked with different, per-lane constants before being OR'd together.
define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $28, %xmm1
; SSE-NEXT:    pslld $4, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vprold $4, %xmm0, %xmm0
; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v4i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vprold $4, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v4i32:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v4i32:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vprold $4, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v4i32:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrld $28, %xmm1
; X86-SSE2-NEXT:    pslld $4, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
  %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
  %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
  %or = or <4 x i32> %lmask, %rmask
  ret <4 x i32> %or
}

; Rotate-left-by-5 idiom on <8 x i16> (shl 5 | lshr 11) with splat masks:
; 55 on the lshr half and 33 on the shl half.
define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $11, %xmm1
; SSE-NEXT:    psllw $5, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $11, %xmm0, %xmm1
; AVX-NEXT:    vpsllw $5, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $11, %xmm0, %xmm1
; AVX512F-NEXT:    vpsllw $5, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $5, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsrlw $11, %xmm0, %xmm0
; AVX512VL-NEXT:    vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $11, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsllw $5, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $5, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $11, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshldw $5, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldw $5, %xmm0, %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotw $5, %xmm0, %xmm0
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v8i16:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrlw $11, %xmm1
; X86-SSE2-NEXT:    psllw $5, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <8 x i16> %lmask, %rmask
  ret <8 x i16> %or
}

; Rotate-by-4 idiom on <16 x i8> (shl 4 | lshr 4) with splat masks:
; 55 on the lshr half and 33 on the shl half.
define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_rotate_mask_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $4, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psllw $4, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_rotate_mask_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512F-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512F-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VL-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512BW-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VLBW-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VLBW-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512VBMI2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_rotate_mask_v16i8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrlw $4, %xmm1
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    psllw $4, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    retl
  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <16 x i8> %lmask, %rmask
  ret <16 x i8> %or
}

; ORs lshr-by-11 with shl-by-11 of each i32 lane, then keeps only the low 16
; bits of every lane (and with 65535). %y is unused.
; NOTE(review): both shift amounts are 11, so this is not a 16-bit rotate
; despite the name (rot16_trunc pairs lshr 11 with shl 5). Confirm whether
; shl by 5 was intended; the assertions match the IR as written, so they
; must be regenerated if the IR changes.
define <4 x i32> @rot16_demandedbits(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: rot16_demandedbits:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $11, %xmm1
; SSE2-NEXT:    pslld $11, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: rot16_demandedbits:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $11, %xmm1
; SSE41-NEXT:    pslld $11, %xmm0
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: rot16_demandedbits:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $11, %xmm0, %xmm1
; AVX-NEXT:    vpslld $11, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX-NEXT:    retq
;
; AVX512-LABEL: rot16_demandedbits:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $11, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $11, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; AVX512-NEXT:    retq
;
; XOP-LABEL: rot16_demandedbits:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $11, %xmm0, %xmm1
; XOP-NEXT:    vpslld $11, %xmm0, %xmm0
; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: rot16_demandedbits:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrld $11, %xmm1
; X86-SSE2-NEXT:    pslld $11, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    retl
  %t0 = lshr <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %t1 = shl <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %t2 = or <4 x i32> %t0, %t1
  %t3 = and <4 x i32> %t2, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %t3
}

; ORs lshr-by-11 with shl-by-5 of each i32 lane and truncates the result to
; <4 x i16>, so only the low 16 bits of each lane are returned. %y is unused.
define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: rot16_trunc:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrld $11, %xmm1
; SSE2-NEXT:    pslld $5, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: rot16_trunc:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrld $11, %xmm1
; SSE41-NEXT:    pslld $5, %xmm0
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSE41-NEXT:    retq
;
; AVX-LABEL: rot16_trunc:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $11, %xmm0, %xmm1
; AVX-NEXT:    vpslld $5, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT:    retq
;
; AVX512-LABEL: rot16_trunc:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $11, %xmm0, %xmm1
; AVX512-NEXT:    vpslld $5, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT:    retq
;
; XOP-LABEL: rot16_trunc:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $11, %xmm0, %xmm1
; XOP-NEXT:    vpslld $5, %xmm0, %xmm0
; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: rot16_trunc:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psrld $11, %xmm1
; X86-SSE2-NEXT:    pslld $5, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    retl
  %t0 = lshr <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %t1 = shl <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
  %t2 = or <4 x i32> %t0, %t1
  %t3 = trunc <4 x i32> %t2 to <4 x i16>
  ret <4 x i16> %t3
}