1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2 3; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3 4; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 5; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 6; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1 7; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW 8; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST 9; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST 10; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL 11; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2OR512VL,AVX512VL 12 13define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) { 14; SSE-LABEL: shuffle_v4i32_0001: 15; SSE: # %bb.0: 16; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] 17; SSE-NEXT: retq 18; 19; AVX-LABEL: shuffle_v4i32_0001: 20; AVX: # %bb.0: 21; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] 22; AVX-NEXT: retq 23 %shuffle = shufflevector <4 x i32> %a, <4 
x i32> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> 24 ret <4 x i32> %shuffle 25} 26define <4 x i32> @shuffle_v4i32_0020(<4 x i32> %a, <4 x i32> %b) { 27; SSE-LABEL: shuffle_v4i32_0020: 28; SSE: # %bb.0: 29; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,0] 30; SSE-NEXT: retq 31; 32; AVX-LABEL: shuffle_v4i32_0020: 33; AVX: # %bb.0: 34; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] 35; AVX-NEXT: retq 36 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> 37 ret <4 x i32> %shuffle 38} 39define <4 x i32> @shuffle_v4i32_0112(<4 x i32> %a, <4 x i32> %b) { 40; SSE-LABEL: shuffle_v4i32_0112: 41; SSE: # %bb.0: 42; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] 43; SSE-NEXT: retq 44; 45; AVX-LABEL: shuffle_v4i32_0112: 46; AVX: # %bb.0: 47; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] 48; AVX-NEXT: retq 49 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 1, i32 2> 50 ret <4 x i32> %shuffle 51} 52define <4 x i32> @shuffle_v4i32_0300(<4 x i32> %a, <4 x i32> %b) { 53; SSE-LABEL: shuffle_v4i32_0300: 54; SSE: # %bb.0: 55; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,0,0] 56; SSE-NEXT: retq 57; 58; AVX-LABEL: shuffle_v4i32_0300: 59; AVX: # %bb.0: 60; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] 61; AVX-NEXT: retq 62 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> 63 ret <4 x i32> %shuffle 64} 65define <4 x i32> @shuffle_v4i32_1000(<4 x i32> %a, <4 x i32> %b) { 66; SSE-LABEL: shuffle_v4i32_1000: 67; SSE: # %bb.0: 68; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 69; SSE-NEXT: retq 70; 71; AVX-LABEL: shuffle_v4i32_1000: 72; AVX: # %bb.0: 73; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] 74; AVX-NEXT: retq 75 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 76 ret <4 x i32> %shuffle 77} 78define <4 x i32> @shuffle_v4i32_2200(<4 x i32> %a, <4 x i32> %b) { 79; SSE-LABEL: shuffle_v4i32_2200: 80; SSE: # %bb.0: 81; 
SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,0,0] 82; SSE-NEXT: retq 83; 84; AVX-LABEL: shuffle_v4i32_2200: 85; AVX: # %bb.0: 86; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] 87; AVX-NEXT: retq 88 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0> 89 ret <4 x i32> %shuffle 90} 91define <4 x i32> @shuffle_v4i32_3330(<4 x i32> %a, <4 x i32> %b) { 92; SSE-LABEL: shuffle_v4i32_3330: 93; SSE: # %bb.0: 94; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,0] 95; SSE-NEXT: retq 96; 97; AVX-LABEL: shuffle_v4i32_3330: 98; AVX: # %bb.0: 99; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] 100; AVX-NEXT: retq 101 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> 102 ret <4 x i32> %shuffle 103} 104define <4 x i32> @shuffle_v4i32_3210(<4 x i32> %a, <4 x i32> %b) { 105; SSE-LABEL: shuffle_v4i32_3210: 106; SSE: # %bb.0: 107; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] 108; SSE-NEXT: retq 109; 110; AVX-LABEL: shuffle_v4i32_3210: 111; AVX: # %bb.0: 112; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] 113; AVX-NEXT: retq 114 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 115 ret <4 x i32> %shuffle 116} 117 118define <4 x i32> @shuffle_v4i32_2121(<4 x i32> %a, <4 x i32> %b) { 119; SSE-LABEL: shuffle_v4i32_2121: 120; SSE: # %bb.0: 121; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,1] 122; SSE-NEXT: retq 123; 124; AVX-LABEL: shuffle_v4i32_2121: 125; AVX: # %bb.0: 126; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1] 127; AVX-NEXT: retq 128 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 1, i32 2, i32 1> 129 ret <4 x i32> %shuffle 130} 131 132define <4 x float> @shuffle_v4f32_0001(<4 x float> %a, <4 x float> %b) { 133; SSE-LABEL: shuffle_v4f32_0001: 134; SSE: # %bb.0: 135; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,1] 136; SSE-NEXT: retq 137; 138; AVX-LABEL: shuffle_v4f32_0001: 139; AVX: # %bb.0: 140; AVX-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[0,0,0,1] 141; AVX-NEXT: retq 142 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1> 143 ret <4 x float> %shuffle 144} 145define <4 x float> @shuffle_v4f32_0020(<4 x float> %a, <4 x float> %b) { 146; SSE-LABEL: shuffle_v4f32_0020: 147; SSE: # %bb.0: 148; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,0] 149; SSE-NEXT: retq 150; 151; AVX-LABEL: shuffle_v4f32_0020: 152; AVX: # %bb.0: 153; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] 154; AVX-NEXT: retq 155 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0> 156 ret <4 x float> %shuffle 157} 158define <4 x float> @shuffle_v4f32_0300(<4 x float> %a, <4 x float> %b) { 159; SSE-LABEL: shuffle_v4f32_0300: 160; SSE: # %bb.0: 161; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,0,0] 162; SSE-NEXT: retq 163; 164; AVX-LABEL: shuffle_v4f32_0300: 165; AVX: # %bb.0: 166; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] 167; AVX-NEXT: retq 168 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0> 169 ret <4 x float> %shuffle 170} 171define <4 x float> @shuffle_v4f32_1000(<4 x float> %a, <4 x float> %b) { 172; SSE-LABEL: shuffle_v4f32_1000: 173; SSE: # %bb.0: 174; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,0,0] 175; SSE-NEXT: retq 176; 177; AVX-LABEL: shuffle_v4f32_1000: 178; AVX: # %bb.0: 179; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] 180; AVX-NEXT: retq 181 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 182 ret <4 x float> %shuffle 183} 184define <4 x float> @shuffle_v4f32_2200(<4 x float> %a, <4 x float> %b) { 185; SSE-LABEL: shuffle_v4f32_2200: 186; SSE: # %bb.0: 187; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,2,0,0] 188; SSE-NEXT: retq 189; 190; AVX-LABEL: shuffle_v4f32_2200: 191; AVX: # %bb.0: 192; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] 193; AVX-NEXT: retq 194 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> 
<i32 2, i32 2, i32 0, i32 0> 195 ret <4 x float> %shuffle 196} 197define <4 x float> @shuffle_v4f32_3330(<4 x float> %a, <4 x float> %b) { 198; SSE-LABEL: shuffle_v4f32_3330: 199; SSE: # %bb.0: 200; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,0] 201; SSE-NEXT: retq 202; 203; AVX-LABEL: shuffle_v4f32_3330: 204; AVX: # %bb.0: 205; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] 206; AVX-NEXT: retq 207 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 0> 208 ret <4 x float> %shuffle 209} 210define <4 x float> @shuffle_v4f32_3210(<4 x float> %a, <4 x float> %b) { 211; SSE-LABEL: shuffle_v4f32_3210: 212; SSE: # %bb.0: 213; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,1,0] 214; SSE-NEXT: retq 215; 216; AVX-LABEL: shuffle_v4f32_3210: 217; AVX: # %bb.0: 218; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] 219; AVX-NEXT: retq 220 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 221 ret <4 x float> %shuffle 222} 223define <4 x float> @shuffle_v4f32_0011(<4 x float> %a, <4 x float> %b) { 224; SSE-LABEL: shuffle_v4f32_0011: 225; SSE: # %bb.0: 226; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1] 227; SSE-NEXT: retq 228; 229; AVX-LABEL: shuffle_v4f32_0011: 230; AVX: # %bb.0: 231; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] 232; AVX-NEXT: retq 233 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 1, i32 1> 234 ret <4 x float> %shuffle 235} 236define <4 x float> @shuffle_v4f32_2233(<4 x float> %a, <4 x float> %b) { 237; SSE-LABEL: shuffle_v4f32_2233: 238; SSE: # %bb.0: 239; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2,2,3,3] 240; SSE-NEXT: retq 241; 242; AVX-LABEL: shuffle_v4f32_2233: 243; AVX: # %bb.0: 244; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] 245; AVX-NEXT: retq 246 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 2, i32 3, i32 3> 247 ret <4 x float> %shuffle 248} 249define <4 x float> @shuffle_v4f32_0022(<4 
x float> %a, <4 x float> %b) { 250; SSE2-LABEL: shuffle_v4f32_0022: 251; SSE2: # %bb.0: 252; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,2,2] 253; SSE2-NEXT: retq 254; 255; SSE3-LABEL: shuffle_v4f32_0022: 256; SSE3: # %bb.0: 257; SSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 258; SSE3-NEXT: retq 259; 260; SSSE3-LABEL: shuffle_v4f32_0022: 261; SSSE3: # %bb.0: 262; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 263; SSSE3-NEXT: retq 264; 265; SSE41-LABEL: shuffle_v4f32_0022: 266; SSE41: # %bb.0: 267; SSE41-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 268; SSE41-NEXT: retq 269; 270; AVX-LABEL: shuffle_v4f32_0022: 271; AVX: # %bb.0: 272; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 273; AVX-NEXT: retq 274 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 275 ret <4 x float> %shuffle 276} 277define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) { 278; SSE2-LABEL: shuffle_v4f32_1133: 279; SSE2: # %bb.0: 280; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] 281; SSE2-NEXT: retq 282; 283; SSE3-LABEL: shuffle_v4f32_1133: 284; SSE3: # %bb.0: 285; SSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 286; SSE3-NEXT: retq 287; 288; SSSE3-LABEL: shuffle_v4f32_1133: 289; SSSE3: # %bb.0: 290; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 291; SSSE3-NEXT: retq 292; 293; SSE41-LABEL: shuffle_v4f32_1133: 294; SSE41: # %bb.0: 295; SSE41-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 296; SSE41-NEXT: retq 297; 298; AVX-LABEL: shuffle_v4f32_1133: 299; AVX: # %bb.0: 300; AVX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 301; AVX-NEXT: retq 302 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 1, i32 3, i32 3> 303 ret <4 x float> %shuffle 304} 305 306define <4 x float> @shuffle_v4f32_0145(<4 x float> %a, <4 x float> %b) { 307; SSE-LABEL: shuffle_v4f32_0145: 308; SSE: # %bb.0: 309; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 310; SSE-NEXT: retq 311; 312; AVX-LABEL: 
shuffle_v4f32_0145: 313; AVX: # %bb.0: 314; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 315; AVX-NEXT: retq 316 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 317 ret <4 x float> %shuffle 318} 319 320define <4 x float> @shuffle_v4f32_6723(<4 x float> %a, <4 x float> %b) { 321; SSE-LABEL: shuffle_v4f32_6723: 322; SSE: # %bb.0: 323; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 324; SSE-NEXT: retq 325; 326; AVX-LABEL: shuffle_v4f32_6723: 327; AVX: # %bb.0: 328; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 329; AVX-NEXT: retq 330 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 2, i32 3> 331 ret <4 x float> %shuffle 332} 333 334define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) { 335; SSE2-LABEL: shuffle_v4i32_0124: 336; SSE2: # %bb.0: 337; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 338; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 339; SSE2-NEXT: retq 340; 341; SSE3-LABEL: shuffle_v4i32_0124: 342; SSE3: # %bb.0: 343; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 344; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 345; SSE3-NEXT: retq 346; 347; SSSE3-LABEL: shuffle_v4i32_0124: 348; SSSE3: # %bb.0: 349; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 350; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 351; SSSE3-NEXT: retq 352; 353; SSE41-LABEL: shuffle_v4i32_0124: 354; SSE41: # %bb.0: 355; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 356; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 357; SSE41-NEXT: retq 358; 359; AVX1-LABEL: shuffle_v4i32_0124: 360; AVX1: # %bb.0: 361; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] 362; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 363; AVX1-NEXT: retq 364; 365; AVX2-LABEL: shuffle_v4i32_0124: 366; AVX2: # %bb.0: 367; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 368; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 369; AVX2-NEXT: 
retq 370; 371; AVX512VL-LABEL: shuffle_v4i32_0124: 372; AVX512VL: # %bb.0: 373; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,4] 374; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 375; AVX512VL-NEXT: retq 376 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 4> 377 ret <4 x i32> %shuffle 378} 379define <4 x i32> @shuffle_v4i32_0142(<4 x i32> %a, <4 x i32> %b) { 380; SSE2-LABEL: shuffle_v4i32_0142: 381; SSE2: # %bb.0: 382; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 383; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 384; SSE2-NEXT: retq 385; 386; SSE3-LABEL: shuffle_v4i32_0142: 387; SSE3: # %bb.0: 388; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 389; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 390; SSE3-NEXT: retq 391; 392; SSSE3-LABEL: shuffle_v4i32_0142: 393; SSSE3: # %bb.0: 394; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 395; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 396; SSSE3-NEXT: retq 397; 398; SSE41-LABEL: shuffle_v4i32_0142: 399; SSE41: # %bb.0: 400; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 401; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] 402; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] 403; SSE41-NEXT: retq 404; 405; AVX1-LABEL: shuffle_v4i32_0142: 406; AVX1: # %bb.0: 407; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] 408; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] 409; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] 410; AVX1-NEXT: retq 411; 412; AVX2-LABEL: shuffle_v4i32_0142: 413; AVX2: # %bb.0: 414; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 415; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] 416; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] 417; AVX2-NEXT: retq 418; 419; AVX512VL-LABEL: shuffle_v4i32_0142: 420; AVX512VL: # %bb.0: 421; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,2] 422; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 423; AVX512VL-NEXT: retq 424 %shuffle = 
shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2> 425 ret <4 x i32> %shuffle 426} 427define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) { 428; SSE2-LABEL: shuffle_v4i32_0412: 429; SSE2: # %bb.0: 430; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 431; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] 432; SSE2-NEXT: movaps %xmm1, %xmm0 433; SSE2-NEXT: retq 434; 435; SSE3-LABEL: shuffle_v4i32_0412: 436; SSE3: # %bb.0: 437; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 438; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] 439; SSE3-NEXT: movaps %xmm1, %xmm0 440; SSE3-NEXT: retq 441; 442; SSSE3-LABEL: shuffle_v4i32_0412: 443; SSSE3: # %bb.0: 444; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 445; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,2] 446; SSSE3-NEXT: movaps %xmm1, %xmm0 447; SSSE3-NEXT: retq 448; 449; SSE41-LABEL: shuffle_v4i32_0412: 450; SSE41: # %bb.0: 451; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 452; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,2] 453; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 454; SSE41-NEXT: retq 455; 456; AVX1-LABEL: shuffle_v4i32_0412: 457; AVX1: # %bb.0: 458; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] 459; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] 460; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 461; AVX1-NEXT: retq 462; 463; AVX2-LABEL: shuffle_v4i32_0412: 464; AVX2: # %bb.0: 465; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 466; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] 467; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 468; AVX2-NEXT: retq 469; 470; AVX512VL-LABEL: shuffle_v4i32_0412: 471; AVX512VL: # %bb.0: 472; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,1,2] 473; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 474; AVX512VL-NEXT: retq 475 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 2> 476 ret <4 x i32> %shuffle 477} 
478define <4 x i32> @shuffle_v4i32_4012(<4 x i32> %a, <4 x i32> %b) { 479; SSE2-LABEL: shuffle_v4i32_4012: 480; SSE2: # %bb.0: 481; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 482; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 483; SSE2-NEXT: movaps %xmm1, %xmm0 484; SSE2-NEXT: retq 485; 486; SSE3-LABEL: shuffle_v4i32_4012: 487; SSE3: # %bb.0: 488; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 489; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 490; SSE3-NEXT: movaps %xmm1, %xmm0 491; SSE3-NEXT: retq 492; 493; SSSE3-LABEL: shuffle_v4i32_4012: 494; SSSE3: # %bb.0: 495; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 496; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 497; SSSE3-NEXT: movaps %xmm1, %xmm0 498; SSSE3-NEXT: retq 499; 500; SSE41-LABEL: shuffle_v4i32_4012: 501; SSE41: # %bb.0: 502; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,2] 503; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 504; SSE41-NEXT: retq 505; 506; AVX1OR2-LABEL: shuffle_v4i32_4012: 507; AVX1OR2: # %bb.0: 508; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2] 509; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 510; AVX1OR2-NEXT: retq 511; 512; AVX512VL-LABEL: shuffle_v4i32_4012: 513; AVX512VL: # %bb.0: 514; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,2] 515; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 516; AVX512VL-NEXT: retq 517 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 2> 518 ret <4 x i32> %shuffle 519} 520define <4 x i32> @shuffle_v4i32_0145(<4 x i32> %a, <4 x i32> %b) { 521; SSE-LABEL: shuffle_v4i32_0145: 522; SSE: # %bb.0: 523; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 524; SSE-NEXT: retq 525; 526; AVX-LABEL: shuffle_v4i32_0145: 527; AVX: # %bb.0: 528; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] 529; AVX-NEXT: retq 530 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 531 ret <4 x i32> %shuffle 532} 533define <4 
x i32> @shuffle_v4i32_0451(<4 x i32> %a, <4 x i32> %b) { 534; SSE2-LABEL: shuffle_v4i32_0451: 535; SSE2: # %bb.0: 536; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 537; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] 538; SSE2-NEXT: retq 539; 540; SSE3-LABEL: shuffle_v4i32_0451: 541; SSE3: # %bb.0: 542; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 543; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] 544; SSE3-NEXT: retq 545; 546; SSSE3-LABEL: shuffle_v4i32_0451: 547; SSSE3: # %bb.0: 548; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 549; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] 550; SSSE3-NEXT: retq 551; 552; SSE41-LABEL: shuffle_v4i32_0451: 553; SSE41: # %bb.0: 554; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 555; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 556; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] 557; SSE41-NEXT: retq 558; 559; AVX1-LABEL: shuffle_v4i32_0451: 560; AVX1: # %bb.0: 561; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] 562; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 563; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 564; AVX1-NEXT: retq 565; 566; AVX2-LABEL: shuffle_v4i32_0451: 567; AVX2: # %bb.0: 568; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] 569; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 570; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 571; AVX2-NEXT: retq 572; 573; AVX512VL-LABEL: shuffle_v4i32_0451: 574; AVX512VL: # %bb.0: 575; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,5,1] 576; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 577; AVX512VL-NEXT: retq 578 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 5, i32 1> 579 ret <4 x i32> %shuffle 580} 581define <4 x i32> @shuffle_v4i32_4501(<4 x i32> %a, <4 x i32> %b) { 582; SSE-LABEL: shuffle_v4i32_4501: 583; SSE: # %bb.0: 584; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 585; SSE-NEXT: 
movaps %xmm1, %xmm0 586; SSE-NEXT: retq 587; 588; AVX-LABEL: shuffle_v4i32_4501: 589; AVX: # %bb.0: 590; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 591; AVX-NEXT: retq 592 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1> 593 ret <4 x i32> %shuffle 594} 595define <4 x i32> @shuffle_v4i32_4015(<4 x i32> %a, <4 x i32> %b) { 596; SSE2-LABEL: shuffle_v4i32_4015: 597; SSE2: # %bb.0: 598; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 599; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] 600; SSE2-NEXT: retq 601; 602; SSE3-LABEL: shuffle_v4i32_4015: 603; SSE3: # %bb.0: 604; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 605; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] 606; SSE3-NEXT: retq 607; 608; SSSE3-LABEL: shuffle_v4i32_4015: 609; SSSE3: # %bb.0: 610; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 611; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3] 612; SSSE3-NEXT: retq 613; 614; SSE41-LABEL: shuffle_v4i32_4015: 615; SSE41: # %bb.0: 616; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 617; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] 618; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5],xmm1[6,7] 619; SSE41-NEXT: retq 620; 621; AVX1-LABEL: shuffle_v4i32_4015: 622; AVX1: # %bb.0: 623; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] 624; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] 625; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 626; AVX1-NEXT: retq 627; 628; AVX2-LABEL: shuffle_v4i32_4015: 629; AVX2: # %bb.0: 630; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] 631; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] 632; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] 633; AVX2-NEXT: retq 634; 635; AVX512VL-LABEL: shuffle_v4i32_4015: 636; AVX512VL: # %bb.0: 637; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,1,5] 638; AVX512VL-NEXT: vpermt2d %xmm1, %xmm2, %xmm0 639; AVX512VL-NEXT: retq 640 
%shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 1, i32 5> 641 ret <4 x i32> %shuffle 642} 643 644define <4 x float> @shuffle_v4f32_4zzz(<4 x float> %a) { 645; SSE2-LABEL: shuffle_v4f32_4zzz: 646; SSE2: # %bb.0: 647; SSE2-NEXT: xorps %xmm1, %xmm1 648; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 649; SSE2-NEXT: movaps %xmm1, %xmm0 650; SSE2-NEXT: retq 651; 652; SSE3-LABEL: shuffle_v4f32_4zzz: 653; SSE3: # %bb.0: 654; SSE3-NEXT: xorps %xmm1, %xmm1 655; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 656; SSE3-NEXT: movaps %xmm1, %xmm0 657; SSE3-NEXT: retq 658; 659; SSSE3-LABEL: shuffle_v4f32_4zzz: 660; SSSE3: # %bb.0: 661; SSSE3-NEXT: xorps %xmm1, %xmm1 662; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 663; SSSE3-NEXT: movaps %xmm1, %xmm0 664; SSSE3-NEXT: retq 665; 666; SSE41-LABEL: shuffle_v4f32_4zzz: 667; SSE41: # %bb.0: 668; SSE41-NEXT: xorps %xmm1, %xmm1 669; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 670; SSE41-NEXT: retq 671; 672; AVX-LABEL: shuffle_v4f32_4zzz: 673; AVX: # %bb.0: 674; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 675; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 676; AVX-NEXT: retq 677 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 678 ret <4 x float> %shuffle 679} 680 681define <4 x float> @shuffle_v4f32_z4zz(<4 x float> %a) { 682; SSE2-LABEL: shuffle_v4f32_z4zz: 683; SSE2: # %bb.0: 684; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 685; SSE2-NEXT: xorps %xmm1, %xmm1 686; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 687; SSE2-NEXT: retq 688; 689; SSE3-LABEL: shuffle_v4f32_z4zz: 690; SSE3: # %bb.0: 691; SSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 692; SSE3-NEXT: xorps %xmm1, %xmm1 693; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 694; SSE3-NEXT: retq 695; 696; SSSE3-LABEL: shuffle_v4f32_z4zz: 697; SSSE3: # %bb.0: 698; SSSE3-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 699; SSSE3-NEXT: xorps %xmm1, 
%xmm1 700; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 701; SSSE3-NEXT: retq 702; 703; SSE41-LABEL: shuffle_v4f32_z4zz: 704; SSE41: # %bb.0: 705; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero 706; SSE41-NEXT: retq 707; 708; AVX-LABEL: shuffle_v4f32_z4zz: 709; AVX: # %bb.0: 710; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero 711; AVX-NEXT: retq 712 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> 713 ret <4 x float> %shuffle 714} 715 716define <4 x float> @shuffle_v4f32_zz4z(<4 x float> %a) { 717; SSE2-LABEL: shuffle_v4f32_zz4z: 718; SSE2: # %bb.0: 719; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero 720; SSE2-NEXT: pxor %xmm0, %xmm0 721; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 722; SSE2-NEXT: retq 723; 724; SSE3-LABEL: shuffle_v4f32_zz4z: 725; SSE3: # %bb.0: 726; SSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero 727; SSE3-NEXT: pxor %xmm0, %xmm0 728; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 729; SSE3-NEXT: retq 730; 731; SSSE3-LABEL: shuffle_v4f32_zz4z: 732; SSSE3: # %bb.0: 733; SSSE3-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero 734; SSSE3-NEXT: pxor %xmm0, %xmm0 735; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 736; SSSE3-NEXT: retq 737; 738; SSE41-LABEL: shuffle_v4f32_zz4z: 739; SSE41: # %bb.0: 740; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero 741; SSE41-NEXT: retq 742; 743; AVX-LABEL: shuffle_v4f32_zz4z: 744; AVX: # %bb.0: 745; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[0],zero 746; AVX-NEXT: retq 747 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> 748 ret <4 x float> %shuffle 749} 750 751define <4 x float> @shuffle_v4f32_zuu4(<4 x float> %a) { 752; SSE2-LABEL: shuffle_v4f32_zuu4: 753; SSE2: # %bb.0: 754; SSE2-NEXT: xorps %xmm1, %xmm1 755; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 756; SSE2-NEXT: movaps %xmm1, %xmm0 757; SSE2-NEXT: retq 758; 759; 
SSE3-LABEL: shuffle_v4f32_zuu4: 760; SSE3: # %bb.0: 761; SSE3-NEXT: xorps %xmm1, %xmm1 762; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 763; SSE3-NEXT: movaps %xmm1, %xmm0 764; SSE3-NEXT: retq 765; 766; SSSE3-LABEL: shuffle_v4f32_zuu4: 767; SSSE3: # %bb.0: 768; SSSE3-NEXT: xorps %xmm1, %xmm1 769; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0] 770; SSSE3-NEXT: movaps %xmm1, %xmm0 771; SSSE3-NEXT: retq 772; 773; SSE41-LABEL: shuffle_v4f32_zuu4: 774; SSE41: # %bb.0: 775; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0] 776; SSE41-NEXT: retq 777; 778; AVX-LABEL: shuffle_v4f32_zuu4: 779; AVX: # %bb.0: 780; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,zero,xmm0[0] 781; AVX-NEXT: retq 782 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> 783 ret <4 x float> %shuffle 784} 785 786define <4 x float> @shuffle_v4f32_zzz7(<4 x float> %a) { 787; SSE2-LABEL: shuffle_v4f32_zzz7: 788; SSE2: # %bb.0: 789; SSE2-NEXT: xorps %xmm1, %xmm1 790; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 791; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 792; SSE2-NEXT: movaps %xmm1, %xmm0 793; SSE2-NEXT: retq 794; 795; SSE3-LABEL: shuffle_v4f32_zzz7: 796; SSE3: # %bb.0: 797; SSE3-NEXT: xorps %xmm1, %xmm1 798; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 799; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 800; SSE3-NEXT: movaps %xmm1, %xmm0 801; SSE3-NEXT: retq 802; 803; SSSE3-LABEL: shuffle_v4f32_zzz7: 804; SSSE3: # %bb.0: 805; SSSE3-NEXT: xorps %xmm1, %xmm1 806; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0] 807; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] 808; SSSE3-NEXT: movaps %xmm1, %xmm0 809; SSSE3-NEXT: retq 810; 811; SSE41-LABEL: shuffle_v4f32_zzz7: 812; SSE41: # %bb.0: 813; SSE41-NEXT: xorps %xmm1, %xmm1 814; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 815; SSE41-NEXT: retq 816; 817; AVX-LABEL: shuffle_v4f32_zzz7: 818; AVX: # %bb.0: 
819; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 820; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] 821; AVX-NEXT: retq 822 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 823 ret <4 x float> %shuffle 824} 825 826define <4 x float> @shuffle_v4f32_z6zz(<4 x float> %a) { 827; SSE2-LABEL: shuffle_v4f32_z6zz: 828; SSE2: # %bb.0: 829; SSE2-NEXT: xorps %xmm1, %xmm1 830; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 831; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 832; SSE2-NEXT: retq 833; 834; SSE3-LABEL: shuffle_v4f32_z6zz: 835; SSE3: # %bb.0: 836; SSE3-NEXT: xorps %xmm1, %xmm1 837; SSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 838; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 839; SSE3-NEXT: retq 840; 841; SSSE3-LABEL: shuffle_v4f32_z6zz: 842; SSSE3: # %bb.0: 843; SSSE3-NEXT: xorps %xmm1, %xmm1 844; SSSE3-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 845; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 846; SSSE3-NEXT: retq 847; 848; SSE41-LABEL: shuffle_v4f32_z6zz: 849; SSE41: # %bb.0: 850; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero 851; SSE41-NEXT: retq 852; 853; AVX-LABEL: shuffle_v4f32_z6zz: 854; AVX: # %bb.0: 855; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm0[2],zero,zero 856; AVX-NEXT: retq 857 %shuffle = shufflevector <4 x float> zeroinitializer, <4 x float> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> 858 ret <4 x float> %shuffle 859} 860 861define <4 x float> @shuffle_v4f32_0z23(<4 x float> %a) { 862; SSE2-LABEL: shuffle_v4f32_0z23: 863; SSE2: # %bb.0: 864; SSE2-NEXT: xorps %xmm1, %xmm1 865; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 866; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 867; SSE2-NEXT: movaps %xmm1, %xmm0 868; SSE2-NEXT: retq 869; 870; SSE3-LABEL: shuffle_v4f32_0z23: 871; SSE3: # %bb.0: 872; SSE3-NEXT: xorps %xmm1, %xmm1 873; SSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 874; SSE3-NEXT: shufps {{.*#+}} xmm1 = 
xmm1[2,0],xmm0[2,3] 875; SSE3-NEXT: movaps %xmm1, %xmm0 876; SSE3-NEXT: retq 877; 878; SSSE3-LABEL: shuffle_v4f32_0z23: 879; SSSE3: # %bb.0: 880; SSSE3-NEXT: xorps %xmm1, %xmm1 881; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 882; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 883; SSSE3-NEXT: movaps %xmm1, %xmm0 884; SSSE3-NEXT: retq 885; 886; SSE41-LABEL: shuffle_v4f32_0z23: 887; SSE41: # %bb.0: 888; SSE41-NEXT: xorps %xmm1, %xmm1 889; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 890; SSE41-NEXT: retq 891; 892; AVX-LABEL: shuffle_v4f32_0z23: 893; AVX: # %bb.0: 894; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 895; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 896; AVX-NEXT: retq 897 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3> 898 ret <4 x float> %shuffle 899} 900 901define <4 x float> @shuffle_v4f32_01z3(<4 x float> %a) { 902; SSE2-LABEL: shuffle_v4f32_01z3: 903; SSE2: # %bb.0: 904; SSE2-NEXT: xorps %xmm1, %xmm1 905; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] 906; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 907; SSE2-NEXT: retq 908; 909; SSE3-LABEL: shuffle_v4f32_01z3: 910; SSE3: # %bb.0: 911; SSE3-NEXT: xorps %xmm1, %xmm1 912; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] 913; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 914; SSE3-NEXT: retq 915; 916; SSSE3-LABEL: shuffle_v4f32_01z3: 917; SSSE3: # %bb.0: 918; SSSE3-NEXT: xorps %xmm1, %xmm1 919; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] 920; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 921; SSSE3-NEXT: retq 922; 923; SSE41-LABEL: shuffle_v4f32_01z3: 924; SSE41: # %bb.0: 925; SSE41-NEXT: xorps %xmm1, %xmm1 926; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] 927; SSE41-NEXT: retq 928; 929; AVX-LABEL: shuffle_v4f32_01z3: 930; AVX: # %bb.0: 931; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 932; AVX-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0,1],xmm1[2],xmm0[3] 933; AVX-NEXT: retq 934 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 935 ret <4 x float> %shuffle 936} 937 938define <4 x float> @shuffle_v4f32_012z(<4 x float> %a) { 939; SSE2-LABEL: shuffle_v4f32_012z: 940; SSE2: # %bb.0: 941; SSE2-NEXT: xorps %xmm1, %xmm1 942; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] 943; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 944; SSE2-NEXT: retq 945; 946; SSE3-LABEL: shuffle_v4f32_012z: 947; SSE3: # %bb.0: 948; SSE3-NEXT: xorps %xmm1, %xmm1 949; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] 950; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 951; SSE3-NEXT: retq 952; 953; SSSE3-LABEL: shuffle_v4f32_012z: 954; SSSE3: # %bb.0: 955; SSSE3-NEXT: xorps %xmm1, %xmm1 956; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0] 957; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 958; SSSE3-NEXT: retq 959; 960; SSE41-LABEL: shuffle_v4f32_012z: 961; SSE41: # %bb.0: 962; SSE41-NEXT: xorps %xmm1, %xmm1 963; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 964; SSE41-NEXT: retq 965; 966; AVX-LABEL: shuffle_v4f32_012z: 967; AVX: # %bb.0: 968; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 969; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] 970; AVX-NEXT: retq 971 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 972 ret <4 x float> %shuffle 973} 974 975define <4 x float> @shuffle_v4f32_0zz3(<4 x float> %a) { 976; SSE2-LABEL: shuffle_v4f32_0zz3: 977; SSE2: # %bb.0: 978; SSE2-NEXT: xorps %xmm1, %xmm1 979; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 980; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 981; SSE2-NEXT: retq 982; 983; SSE3-LABEL: shuffle_v4f32_0zz3: 984; SSE3: # %bb.0: 985; SSE3-NEXT: xorps %xmm1, %xmm1 986; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 987; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 988; SSE3-NEXT: retq 
989; 990; SSSE3-LABEL: shuffle_v4f32_0zz3: 991; SSSE3: # %bb.0: 992; SSSE3-NEXT: xorps %xmm1, %xmm1 993; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[1,2] 994; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] 995; SSSE3-NEXT: retq 996; 997; SSE41-LABEL: shuffle_v4f32_0zz3: 998; SSE41: # %bb.0: 999; SSE41-NEXT: xorps %xmm1, %xmm1 1000; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1001; SSE41-NEXT: retq 1002; 1003; AVX-LABEL: shuffle_v4f32_0zz3: 1004; AVX: # %bb.0: 1005; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1006; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] 1007; AVX-NEXT: retq 1008 %shuffle = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3> 1009 ret <4 x float> %shuffle 1010} 1011 1012define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) { 1013; SSE2-LABEL: shuffle_v4f32_0z2z: 1014; SSE2: # %bb.0: 1015; SSE2-NEXT: xorps %xmm1, %xmm1 1016; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] 1017; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 1018; SSE2-NEXT: retq 1019; 1020; SSE3-LABEL: shuffle_v4f32_0z2z: 1021; SSE3: # %bb.0: 1022; SSE3-NEXT: xorps %xmm1, %xmm1 1023; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] 1024; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 1025; SSE3-NEXT: retq 1026; 1027; SSSE3-LABEL: shuffle_v4f32_0z2z: 1028; SSSE3: # %bb.0: 1029; SSSE3-NEXT: xorps %xmm1, %xmm1 1030; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0] 1031; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 1032; SSSE3-NEXT: retq 1033; 1034; SSE41-LABEL: shuffle_v4f32_0z2z: 1035; SSE41: # %bb.0: 1036; SSE41-NEXT: xorps %xmm1, %xmm1 1037; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 1038; SSE41-NEXT: retq 1039; 1040; AVX-LABEL: shuffle_v4f32_0z2z: 1041; AVX: # %bb.0: 1042; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1043; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 1044; AVX-NEXT: retq 1045 %shuffle = shufflevector <4 x float> %v, <4 x 
float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 2, i32 4> 1046 ret <4 x float> %shuffle 1047} 1048 1049define <4 x float> @shuffle_v4f32_u051(<4 x float> %a, <4 x float> %b) { 1050; SSE-LABEL: shuffle_v4f32_u051: 1051; SSE: # %bb.0: 1052; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1053; SSE-NEXT: movaps %xmm1, %xmm0 1054; SSE-NEXT: retq 1055; 1056; AVX-LABEL: shuffle_v4f32_u051: 1057; AVX: # %bb.0: 1058; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1059; AVX-NEXT: retq 1060 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 undef, i32 0, i32 5, i32 1> 1061 ret <4 x float> %shuffle 1062} 1063 1064define <4 x float> @shuffle_v4f32_0zz4(<4 x float> %a, <4 x float> %b) { 1065; SSE2-LABEL: shuffle_v4f32_0zz4: 1066; SSE2: # %bb.0: 1067; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero 1068; SSE2-NEXT: pxor %xmm1, %xmm1 1069; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] 1070; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1071; SSE2-NEXT: movaps %xmm1, %xmm0 1072; SSE2-NEXT: retq 1073; 1074; SSE3-LABEL: shuffle_v4f32_0zz4: 1075; SSE3: # %bb.0: 1076; SSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero 1077; SSE3-NEXT: pxor %xmm1, %xmm1 1078; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] 1079; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1080; SSE3-NEXT: movaps %xmm1, %xmm0 1081; SSE3-NEXT: retq 1082; 1083; SSSE3-LABEL: shuffle_v4f32_0zz4: 1084; SSSE3: # %bb.0: 1085; SSSE3-NEXT: movq {{.*#+}} xmm2 = xmm1[0],zero 1086; SSSE3-NEXT: pxor %xmm1, %xmm1 1087; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0] 1088; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1089; SSSE3-NEXT: movaps %xmm1, %xmm0 1090; SSSE3-NEXT: retq 1091; 1092; SSE41-LABEL: shuffle_v4f32_0zz4: 1093; SSE41: # %bb.0: 1094; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0] 1095; SSE41-NEXT: retq 1096; 1097; AVX-LABEL: shuffle_v4f32_0zz4: 1098; AVX: # 
%bb.0: 1099; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0] 1100; AVX-NEXT: retq 1101 %shuffle = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 6, i32 0> 1102 %shuffle1 = shufflevector <4 x float> %a, <4 x float> %shuffle, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1103 ret <4 x float> %shuffle1 1104} 1105 1106define <4 x float> @shuffle_v4f32_0zz6(<4 x float> %a, <4 x float> %b) { 1107; SSE2-LABEL: shuffle_v4f32_0zz6: 1108; SSE2: # %bb.0: 1109; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2] 1110; SSE2-NEXT: xorps %xmm1, %xmm1 1111; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3] 1112; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 1113; SSE2-NEXT: movaps %xmm1, %xmm0 1114; SSE2-NEXT: retq 1115; 1116; SSE3-LABEL: shuffle_v4f32_0zz6: 1117; SSE3: # %bb.0: 1118; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2] 1119; SSE3-NEXT: xorps %xmm1, %xmm1 1120; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3] 1121; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 1122; SSE3-NEXT: movaps %xmm1, %xmm0 1123; SSE3-NEXT: retq 1124; 1125; SSSE3-LABEL: shuffle_v4f32_0zz6: 1126; SSSE3: # %bb.0: 1127; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,2] 1128; SSSE3-NEXT: xorps %xmm1, %xmm1 1129; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[0,3] 1130; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0,1,3] 1131; SSSE3-NEXT: movaps %xmm1, %xmm0 1132; SSSE3-NEXT: retq 1133; 1134; SSE41-LABEL: shuffle_v4f32_0zz6: 1135; SSE41: # %bb.0: 1136; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2] 1137; SSE41-NEXT: retq 1138; 1139; AVX-LABEL: shuffle_v4f32_0zz6: 1140; AVX: # %bb.0: 1141; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2] 1142; AVX-NEXT: retq 1143 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6> 1144 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7> 1145 ret <4 x 
float> %shuffle1 1146} 1147 1148define <4 x float> @shuffle_v4f32_0z24(<4 x float> %a, <4 x float> %b) { 1149; SSE2-LABEL: shuffle_v4f32_0z24: 1150; SSE2: # %bb.0: 1151; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 1152; SSE2-NEXT: xorps %xmm2, %xmm2 1153; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1154; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] 1155; SSE2-NEXT: movaps %xmm2, %xmm0 1156; SSE2-NEXT: retq 1157; 1158; SSE3-LABEL: shuffle_v4f32_0z24: 1159; SSE3: # %bb.0: 1160; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 1161; SSE3-NEXT: xorps %xmm2, %xmm2 1162; SSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1163; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] 1164; SSE3-NEXT: movaps %xmm2, %xmm0 1165; SSE3-NEXT: retq 1166; 1167; SSSE3-LABEL: shuffle_v4f32_0z24: 1168; SSSE3: # %bb.0: 1169; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 1170; SSSE3-NEXT: xorps %xmm2, %xmm2 1171; SSSE3-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] 1172; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] 1173; SSSE3-NEXT: movaps %xmm2, %xmm0 1174; SSSE3-NEXT: retq 1175; 1176; SSE41-LABEL: shuffle_v4f32_0z24: 1177; SSE41: # %bb.0: 1178; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0] 1179; SSE41-NEXT: retq 1180; 1181; AVX-LABEL: shuffle_v4f32_0z24: 1182; AVX: # %bb.0: 1183; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0] 1184; AVX-NEXT: retq 1185 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4> 1186 %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1187 ret <4 x float> %shuffle1 1188} 1189 1190define <4 x i32> @shuffle_v4i32_4zzz(<4 x i32> %a) { 1191; SSE2-LABEL: shuffle_v4i32_4zzz: 1192; SSE2: # %bb.0: 1193; SSE2-NEXT: xorps %xmm1, %xmm1 1194; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1195; SSE2-NEXT: movaps %xmm1, %xmm0 1196; SSE2-NEXT: retq 1197; 1198; SSE3-LABEL: 
shuffle_v4i32_4zzz: 1199; SSE3: # %bb.0: 1200; SSE3-NEXT: xorps %xmm1, %xmm1 1201; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1202; SSE3-NEXT: movaps %xmm1, %xmm0 1203; SSE3-NEXT: retq 1204; 1205; SSSE3-LABEL: shuffle_v4i32_4zzz: 1206; SSSE3: # %bb.0: 1207; SSSE3-NEXT: xorps %xmm1, %xmm1 1208; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1209; SSSE3-NEXT: movaps %xmm1, %xmm0 1210; SSSE3-NEXT: retq 1211; 1212; SSE41-LABEL: shuffle_v4i32_4zzz: 1213; SSE41: # %bb.0: 1214; SSE41-NEXT: xorps %xmm1, %xmm1 1215; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1216; SSE41-NEXT: retq 1217; 1218; AVX-LABEL: shuffle_v4i32_4zzz: 1219; AVX: # %bb.0: 1220; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1221; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1222; AVX-NEXT: retq 1223 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1224 ret <4 x i32> %shuffle 1225} 1226 1227define <4 x i32> @shuffle_v4i32_z4zz(<4 x i32> %a) { 1228; SSE2-LABEL: shuffle_v4i32_z4zz: 1229; SSE2: # %bb.0: 1230; SSE2-NEXT: xorps %xmm1, %xmm1 1231; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1232; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] 1233; SSE2-NEXT: retq 1234; 1235; SSE3-LABEL: shuffle_v4i32_z4zz: 1236; SSE3: # %bb.0: 1237; SSE3-NEXT: xorps %xmm1, %xmm1 1238; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1239; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] 1240; SSE3-NEXT: retq 1241; 1242; SSSE3-LABEL: shuffle_v4i32_z4zz: 1243; SSSE3: # %bb.0: 1244; SSSE3-NEXT: xorps %xmm1, %xmm1 1245; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1246; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] 1247; SSSE3-NEXT: retq 1248; 1249; SSE41-LABEL: shuffle_v4i32_z4zz: 1250; SSE41: # %bb.0: 1251; SSE41-NEXT: pxor %xmm1, %xmm1 1252; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1253; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] 1254; SSE41-NEXT: retq 1255; 1256; AVX1-LABEL: 
shuffle_v4i32_z4zz: 1257; AVX1: # %bb.0: 1258; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1259; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1260; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] 1261; AVX1-NEXT: retq 1262; 1263; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz: 1264; AVX2-SLOW: # %bb.0: 1265; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 1266; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1267; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] 1268; AVX2-SLOW-NEXT: retq 1269; 1270; AVX2-FAST-LABEL: shuffle_v4i32_z4zz: 1271; AVX2-FAST: # %bb.0: 1272; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero 1273; AVX2-FAST-NEXT: retq 1274; 1275; AVX512VL-LABEL: shuffle_v4i32_z4zz: 1276; AVX512VL: # %bb.0: 1277; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero 1278; AVX512VL-NEXT: retq 1279 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 2, i32 4, i32 3, i32 0> 1280 ret <4 x i32> %shuffle 1281} 1282 1283define <4 x i32> @shuffle_v4i32_zz4z(<4 x i32> %a) { 1284; SSE2-LABEL: shuffle_v4i32_zz4z: 1285; SSE2: # %bb.0: 1286; SSE2-NEXT: xorps %xmm1, %xmm1 1287; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1288; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] 1289; SSE2-NEXT: retq 1290; 1291; SSE3-LABEL: shuffle_v4i32_zz4z: 1292; SSE3: # %bb.0: 1293; SSE3-NEXT: xorps %xmm1, %xmm1 1294; SSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1295; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] 1296; SSE3-NEXT: retq 1297; 1298; SSSE3-LABEL: shuffle_v4i32_zz4z: 1299; SSSE3: # %bb.0: 1300; SSSE3-NEXT: xorps %xmm1, %xmm1 1301; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1302; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] 1303; SSSE3-NEXT: retq 1304; 1305; SSE41-LABEL: shuffle_v4i32_zz4z: 1306; SSE41: # %bb.0: 1307; SSE41-NEXT: pxor %xmm1, %xmm1 1308; SSE41-NEXT: pblendw {{.*#+}} 
xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1309; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] 1310; SSE41-NEXT: retq 1311; 1312; AVX1-LABEL: shuffle_v4i32_zz4z: 1313; AVX1: # %bb.0: 1314; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1315; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1316; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] 1317; AVX1-NEXT: retq 1318; 1319; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z: 1320; AVX2-SLOW: # %bb.0: 1321; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 1322; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1323; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] 1324; AVX2-SLOW-NEXT: retq 1325; 1326; AVX2-FAST-LABEL: shuffle_v4i32_zz4z: 1327; AVX2-FAST: # %bb.0: 1328; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 1329; AVX2-FAST-NEXT: retq 1330; 1331; AVX512VL-LABEL: shuffle_v4i32_zz4z: 1332; AVX512VL: # %bb.0: 1333; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 1334; AVX512VL-NEXT: retq 1335 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 0, i32 4, i32 0> 1336 ret <4 x i32> %shuffle 1337} 1338 1339define <4 x i32> @shuffle_v4i32_zuu4(<4 x i32> %a) { 1340; SSE-LABEL: shuffle_v4i32_zuu4: 1341; SSE: # %bb.0: 1342; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] 1343; SSE-NEXT: retq 1344; 1345; AVX-LABEL: shuffle_v4i32_zuu4: 1346; AVX: # %bb.0: 1347; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] 1348; AVX-NEXT: retq 1349 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 undef, i32 undef, i32 4> 1350 ret <4 x i32> %shuffle 1351} 1352 1353define <4 x i32> @shuffle_v4i32_z6zz(<4 x i32> %a) { 1354; SSE2-LABEL: shuffle_v4i32_z6zz: 1355; SSE2: # %bb.0: 1356; SSE2-NEXT: psrldq {{.*#+}} xmm0 = 
xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 1357; SSE2-NEXT: xorps %xmm1, %xmm1 1358; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1359; SSE2-NEXT: retq 1360; 1361; SSE3-LABEL: shuffle_v4i32_z6zz: 1362; SSE3: # %bb.0: 1363; SSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 1364; SSE3-NEXT: xorps %xmm1, %xmm1 1365; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1366; SSE3-NEXT: retq 1367; 1368; SSSE3-LABEL: shuffle_v4i32_z6zz: 1369; SSSE3: # %bb.0: 1370; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero 1371; SSSE3-NEXT: xorps %xmm1, %xmm1 1372; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1373; SSSE3-NEXT: retq 1374; 1375; SSE41-LABEL: shuffle_v4i32_z6zz: 1376; SSE41: # %bb.0: 1377; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] 1378; SSE41-NEXT: pxor %xmm0, %xmm0 1379; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1380; SSE41-NEXT: retq 1381; 1382; AVX1-LABEL: shuffle_v4i32_z6zz: 1383; AVX1: # %bb.0: 1384; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] 1385; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 1386; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1387; AVX1-NEXT: retq 1388; 1389; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz: 1390; AVX2-SLOW: # %bb.0: 1391; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] 1392; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 1393; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1394; AVX2-SLOW-NEXT: retq 1395; 1396; AVX2-FAST-LABEL: shuffle_v4i32_z6zz: 1397; AVX2-FAST: # %bb.0: 1398; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero 1399; AVX2-FAST-NEXT: retq 1400; 1401; AVX512VL-LABEL: shuffle_v4i32_z6zz: 1402; AVX512VL: # %bb.0: 1403; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero 1404; 
AVX512VL-NEXT: retq 1405 %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> <i32 0, i32 6, i32 2, i32 3> 1406 ret <4 x i32> %shuffle 1407} 1408 1409define <4 x i32> @shuffle_v4i32_7012(<4 x i32> %a, <4 x i32> %b) { 1410; SSE2-LABEL: shuffle_v4i32_7012: 1411; SSE2: # %bb.0: 1412; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0] 1413; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 1414; SSE2-NEXT: movaps %xmm1, %xmm0 1415; SSE2-NEXT: retq 1416; 1417; SSE3-LABEL: shuffle_v4i32_7012: 1418; SSE3: # %bb.0: 1419; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[0,0] 1420; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,2] 1421; SSE3-NEXT: movaps %xmm1, %xmm0 1422; SSE3-NEXT: retq 1423; 1424; SSSE3-LABEL: shuffle_v4i32_7012: 1425; SSSE3: # %bb.0: 1426; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] 1427; SSSE3-NEXT: retq 1428; 1429; SSE41-LABEL: shuffle_v4i32_7012: 1430; SSE41: # %bb.0: 1431; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] 1432; SSE41-NEXT: retq 1433; 1434; AVX-LABEL: shuffle_v4i32_7012: 1435; AVX: # %bb.0: 1436; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11] 1437; AVX-NEXT: retq 1438 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 0, i32 1, i32 2> 1439 ret <4 x i32> %shuffle 1440} 1441 1442define <4 x i32> @shuffle_v4i32_6701(<4 x i32> %a, <4 x i32> %b) { 1443; SSE2-LABEL: shuffle_v4i32_6701: 1444; SSE2: # %bb.0: 1445; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] 1446; SSE2-NEXT: movaps %xmm1, %xmm0 1447; SSE2-NEXT: retq 1448; 1449; SSE3-LABEL: shuffle_v4i32_6701: 1450; SSE3: # %bb.0: 1451; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1] 1452; SSE3-NEXT: movaps %xmm1, %xmm0 1453; SSE3-NEXT: retq 1454; 1455; SSSE3-LABEL: shuffle_v4i32_6701: 1456; SSSE3: # %bb.0: 1457; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] 1458; 
SSSE3-NEXT: retq 1459; 1460; SSE41-LABEL: shuffle_v4i32_6701: 1461; SSE41: # %bb.0: 1462; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] 1463; SSE41-NEXT: retq 1464; 1465; AVX-LABEL: shuffle_v4i32_6701: 1466; AVX: # %bb.0: 1467; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] 1468; AVX-NEXT: retq 1469 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1470 ret <4 x i32> %shuffle 1471} 1472 1473define <4 x i32> @shuffle_v4i32_5670(<4 x i32> %a, <4 x i32> %b) { 1474; SSE2-LABEL: shuffle_v4i32_5670: 1475; SSE2: # %bb.0: 1476; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1477; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0] 1478; SSE2-NEXT: movaps %xmm1, %xmm0 1479; SSE2-NEXT: retq 1480; 1481; SSE3-LABEL: shuffle_v4i32_5670: 1482; SSE3: # %bb.0: 1483; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1484; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,2],xmm0[2,0] 1485; SSE3-NEXT: movaps %xmm1, %xmm0 1486; SSE3-NEXT: retq 1487; 1488; SSSE3-LABEL: shuffle_v4i32_5670: 1489; SSSE3: # %bb.0: 1490; SSSE3-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] 1491; SSSE3-NEXT: retq 1492; 1493; SSE41-LABEL: shuffle_v4i32_5670: 1494; SSE41: # %bb.0: 1495; SSE41-NEXT: palignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] 1496; SSE41-NEXT: retq 1497; 1498; AVX-LABEL: shuffle_v4i32_5670: 1499; AVX: # %bb.0: 1500; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3] 1501; AVX-NEXT: retq 1502 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 5, i32 6, i32 7, i32 0> 1503 ret <4 x i32> %shuffle 1504} 1505 1506define <4 x i32> @shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b) { 1507; SSE2-LABEL: shuffle_v4i32_1234: 1508; SSE2: # %bb.0: 1509; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 1510; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] 1511; 
SSE2-NEXT: retq 1512; 1513; SSE3-LABEL: shuffle_v4i32_1234: 1514; SSE3: # %bb.0: 1515; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 1516; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[2,0] 1517; SSE3-NEXT: retq 1518; 1519; SSSE3-LABEL: shuffle_v4i32_1234: 1520; SSSE3: # %bb.0: 1521; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] 1522; SSSE3-NEXT: movdqa %xmm1, %xmm0 1523; SSSE3-NEXT: retq 1524; 1525; SSE41-LABEL: shuffle_v4i32_1234: 1526; SSE41: # %bb.0: 1527; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] 1528; SSE41-NEXT: movdqa %xmm1, %xmm0 1529; SSE41-NEXT: retq 1530; 1531; AVX-LABEL: shuffle_v4i32_1234: 1532; AVX: # %bb.0: 1533; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3] 1534; AVX-NEXT: retq 1535 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4> 1536 ret <4 x i32> %shuffle 1537} 1538 1539define <4 x i32> @shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b) { 1540; SSE2-LABEL: shuffle_v4i32_2345: 1541; SSE2: # %bb.0: 1542; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] 1543; SSE2-NEXT: retq 1544; 1545; SSE3-LABEL: shuffle_v4i32_2345: 1546; SSE3: # %bb.0: 1547; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] 1548; SSE3-NEXT: retq 1549; 1550; SSSE3-LABEL: shuffle_v4i32_2345: 1551; SSSE3: # %bb.0: 1552; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] 1553; SSSE3-NEXT: movdqa %xmm1, %xmm0 1554; SSSE3-NEXT: retq 1555; 1556; SSE41-LABEL: shuffle_v4i32_2345: 1557; SSE41: # %bb.0: 1558; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] 1559; SSE41-NEXT: movdqa %xmm1, %xmm0 1560; SSE41-NEXT: retq 1561; 1562; AVX-LABEL: shuffle_v4i32_2345: 1563; AVX: # %bb.0: 1564; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] 1565; AVX-NEXT: retq 1566 %shuffle = shufflevector <4 x i32> %a, <4 x 
i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 1567 ret <4 x i32> %shuffle 1568} 1569 1570; PR22391 1571define <4 x i32> @shuffle_v4i32_2456(<4 x i32> %a, <4 x i32> %b) { 1572; SSE2-LABEL: shuffle_v4i32_2456: 1573; SSE2: # %bb.0: 1574; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] 1575; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] 1576; SSE2-NEXT: retq 1577; 1578; SSE3-LABEL: shuffle_v4i32_2456: 1579; SSE3: # %bb.0: 1580; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1] 1581; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] 1582; SSE3-NEXT: retq 1583; 1584; SSSE3-LABEL: shuffle_v4i32_2456: 1585; SSSE3: # %bb.0: 1586; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1587; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1588; SSSE3-NEXT: movdqa %xmm1, %xmm0 1589; SSSE3-NEXT: retq 1590; 1591; SSE41-LABEL: shuffle_v4i32_2456: 1592; SSE41: # %bb.0: 1593; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1594; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1595; SSE41-NEXT: movdqa %xmm1, %xmm0 1596; SSE41-NEXT: retq 1597; 1598; AVX1OR2-LABEL: shuffle_v4i32_2456: 1599; AVX1OR2: # %bb.0: 1600; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1601; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1602; AVX1OR2-NEXT: retq 1603; 1604; AVX512VL-LABEL: shuffle_v4i32_2456: 1605; AVX512VL: # %bb.0: 1606; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,0,1,2] 1607; AVX512VL-NEXT: vpermi2d %xmm0, %xmm1, %xmm2 1608; AVX512VL-NEXT: vmovdqa %xmm2, %xmm0 1609; AVX512VL-NEXT: retq 1610 %s1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 2> 1611 %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> 1612 ret <4 x i32> %s2 1613} 1614 1615define <4 x i32> @shuffle_v4i32_40u1(<4 x i32> %a, <4 x i32> %b) { 1616; SSE-LABEL: shuffle_v4i32_40u1: 1617; SSE: # %bb.0: 1618; SSE-NEXT: unpcklps 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1619; SSE-NEXT: movaps %xmm1, %xmm0 1620; SSE-NEXT: retq 1621; 1622; AVX-LABEL: shuffle_v4i32_40u1: 1623; AVX: # %bb.0: 1624; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1625; AVX-NEXT: retq 1626 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 0, i32 undef, i32 1> 1627 ret <4 x i32> %shuffle 1628} 1629 1630define <4 x i32> @shuffle_v4i32_3456(<4 x i32> %a, <4 x i32> %b) { 1631; SSE2-LABEL: shuffle_v4i32_3456: 1632; SSE2: # %bb.0: 1633; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] 1634; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] 1635; SSE2-NEXT: retq 1636; 1637; SSE3-LABEL: shuffle_v4i32_3456: 1638; SSE3: # %bb.0: 1639; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0] 1640; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,2] 1641; SSE3-NEXT: retq 1642; 1643; SSSE3-LABEL: shuffle_v4i32_3456: 1644; SSSE3: # %bb.0: 1645; SSSE3-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1646; SSSE3-NEXT: movdqa %xmm1, %xmm0 1647; SSSE3-NEXT: retq 1648; 1649; SSE41-LABEL: shuffle_v4i32_3456: 1650; SSE41: # %bb.0: 1651; SSE41-NEXT: palignr {{.*#+}} xmm1 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1652; SSE41-NEXT: movdqa %xmm1, %xmm0 1653; SSE41-NEXT: retq 1654; 1655; AVX-LABEL: shuffle_v4i32_3456: 1656; AVX: # %bb.0: 1657; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11] 1658; AVX-NEXT: retq 1659 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6> 1660 ret <4 x i32> %shuffle 1661} 1662 1663define <4 x i32> @shuffle_v4i32_0u1u(<4 x i32> %a, <4 x i32> %b) { 1664; SSE2-LABEL: shuffle_v4i32_0u1u: 1665; SSE2: # %bb.0: 1666; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 1667; SSE2-NEXT: retq 1668; 1669; SSE3-LABEL: shuffle_v4i32_0u1u: 1670; SSE3: # %bb.0: 1671; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 1672; SSE3-NEXT: retq 1673; 1674; 
SSSE3-LABEL: shuffle_v4i32_0u1u: 1675; SSSE3: # %bb.0: 1676; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 1677; SSSE3-NEXT: retq 1678; 1679; SSE41-LABEL: shuffle_v4i32_0u1u: 1680; SSE41: # %bb.0: 1681; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1682; SSE41-NEXT: retq 1683; 1684; AVX-LABEL: shuffle_v4i32_0u1u: 1685; AVX: # %bb.0: 1686; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1687; AVX-NEXT: retq 1688 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef> 1689 ret <4 x i32> %shuffle 1690} 1691 1692define <4 x i32> @shuffle_v4i32_0z1z(<4 x i32> %a) { 1693; SSE2-LABEL: shuffle_v4i32_0z1z: 1694; SSE2: # %bb.0: 1695; SSE2-NEXT: xorps %xmm1, %xmm1 1696; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1697; SSE2-NEXT: retq 1698; 1699; SSE3-LABEL: shuffle_v4i32_0z1z: 1700; SSE3: # %bb.0: 1701; SSE3-NEXT: xorps %xmm1, %xmm1 1702; SSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1703; SSE3-NEXT: retq 1704; 1705; SSSE3-LABEL: shuffle_v4i32_0z1z: 1706; SSSE3: # %bb.0: 1707; SSSE3-NEXT: xorps %xmm1, %xmm1 1708; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1709; SSSE3-NEXT: retq 1710; 1711; SSE41-LABEL: shuffle_v4i32_0z1z: 1712; SSE41: # %bb.0: 1713; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1714; SSE41-NEXT: retq 1715; 1716; AVX-LABEL: shuffle_v4i32_0z1z: 1717; AVX: # %bb.0: 1718; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 1719; AVX-NEXT: retq 1720 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1721 ret <4 x i32> %shuffle 1722} 1723 1724define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) { 1725; SSE-LABEL: shuffle_v4i32_01zu: 1726; SSE: # %bb.0: 1727; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero 1728; SSE-NEXT: retq 1729; 1730; AVX-LABEL: shuffle_v4i32_01zu: 1731; AVX: # %bb.0: 1732; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero 1733; 
AVX-NEXT: retq 1734 %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 7, i32 undef> 1735 ret <4 x i32> %shuffle 1736}

; Mask <0,4,2,3> against a zero vector: element 1 is zeroed, elements 0, 2 and 3 of %a pass through.
define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_0z23:
; SSE2:       # %bb.0:
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_0z23:
; SSE3:       # %bb.0:
; SSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_0z23:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_0z23:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_0z23:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x i32> %shuffle
}

; Mask <0,1,4,3> against a zero vector: element 2 is zeroed, the other elements of %a pass through.
define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_01z3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_01z3:
; SSE3:       # %bb.0:
; SSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_01z3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_01z3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_01z3:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %shuffle
}

; Mask <0,1,2,7> against a zero vector: element 3 is zeroed, elements 0-2 of %a pass through.
define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_012z:
; SSE2:       # %bb.0:
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_012z:
; SSE3:       # %bb.0:
; SSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_012z:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_012z:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_012z:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i32> %shuffle
}

; Mask <0,4,4,3> against a zero vector: elements 1 and 2 are zeroed, elements 0 and 3 of %a pass through.
define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
; SSE2-LABEL: shuffle_v4i32_0zz3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_v4i32_0zz3:
; SSE3:       # %bb.0:
; SSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_v4i32_0zz3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_v4i32_0zz3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_0zz3:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
  ret <4 x i32> %shuffle
}

; A <1,5,0,4> shuffle of %a/%b followed by a swap of the two 64-bit halves (via a <2 x double> bitcast).
; The two steps compose to <a0,b0,a1,b1>; the checks expect one unpack instruction.
define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_bitcast_0415:
; SSE:       # %bb.0:
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_bitcast_0415:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
  %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
  %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
  ret <4 x i32> %bitcast32
}

; Duplicates the low elements of %b as <b0,b0,b1,b1>, then (viewed as <2 x double>) joins its low half
; with the low half of %a, giving <b0,b0,a0,a1>.
define <4 x float> @shuffle_v4f32_bitcast_4401(<4 x float> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4f32_bitcast_4401:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_bitcast_4401:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %2 = bitcast <4 x i32> %1 to <2 x double>
  %3 = bitcast <4 x float> %a to <2 x double>
  %4 = shufflevector <2 x double> %2, <2 x double> %3, <2 x i32> <i32 0, i32 2>
  %5 = bitcast <2 x double> %4 to <4 x float>
  ret <4 x float> %5
}

; Two chained shuffles (<0,0,1,1> of %a, then <1,0,4,5> with the bitcast %b) that compose to <a0,a0,b0,b1>.
define <4 x float> @shuffle_v4f32_bitcast_0045(<4 x float> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4f32_bitcast_0045:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4f32_bitcast_0045:
; AVX:       # %bb.0:
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
  %2 = bitcast <4 x i32> %b to <4 x float>
  %3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 1, i32 0, i32 4, i32 5>
  ret <4 x float> %3
}

; Element select written as and/or with complementary per-element masks: the result is <b0,a1,a2,b3>.
define <4 x float> @mask_v4f32_4127(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: mask_v4f32_4127:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: mask_v4f32_4127:
; SSE3:       # %bb.0:
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: mask_v4f32_4127:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[1,2]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mask_v4f32_4127:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: mask_v4f32_4127:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a to <4 x i32>
  %2 = bitcast <4 x float> %b to <4 x i32>
  %3 = and <4 x i32> %1, <i32 0, i32 -1, i32 -1, i32 0>
  %4 = and <4 x i32> %2, <i32 -1, i32 0, i32 0, i32 -1>
  %5 = or <4 x i32> %4, %3
  %6 = bitcast <4 x i32> %5 to <4 x float>
  ret <4 x float> %6
}

; Same and/or select idea, but with masks expressed on <2 x i64>: -4294967296 (0xFFFFFFFF00000000) keeps
; only the top 32 bits of the high qword of %a, and the complementary mask keeps the rest of %b.
; In 32-bit elements the result is <b0,b1,b2,a3>.
define <4 x float> @mask_v4f32_0127(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: mask_v4f32_0127:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: mask_v4f32_0127:
; SSE3:       # %bb.0:
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: mask_v4f32_0127:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mask_v4f32_0127:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: mask_v4f32_0127:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a to <2 x i64>
  %2 = bitcast <4 x float> %b to <2 x i64>
  %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
  %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
  %5 = or <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <4 x float>
  ret <4 x float> %6
}

; Integer-typed twin of mask_v4f32_0127: the same <2 x i64> and/or masks applied to <4 x i32> inputs,
; giving <b0,b1,b2,a3>.
define <4 x i32> @mask_v4i32_0127(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: mask_v4i32_0127:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: mask_v4i32_0127:
; SSE3:       # %bb.0:
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: mask_v4i32_0127:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mask_v4i32_0127:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: mask_v4i32_0127:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX-NEXT:    retq
  %1 = bitcast <4 x i32> %a to <2 x i64>
  %2 = bitcast <4 x i32> %b to <2 x i64>
  %3 = and <2 x i64> %1, <i64 0, i64 -4294967296>
  %4 = and <2 x i64> %2, <i64 -1, i64 4294967295>
  %5 = or <2 x i64> %4, %3
  %6 = bitcast <2 x i64> %5 to <4 x i32>
  ret <4 x i32> %6
}

; Loads a <2 x float> (align 1) and repeats the pair with mask <0,1,0,1>, giving <x0,x1,x0,x1>.
define <4 x float> @broadcast_v4f32_0101_from_v2f32(<2 x float>* %x) {
; SSE2-LABEL: broadcast_v4f32_0101_from_v2f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: broadcast_v4f32_0101_from_v2f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: broadcast_v4f32_0101_from_v2f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = mem[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: broadcast_v4f32_0101_from_v2f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; AVX-NEXT:    retq
  %1 = load <2 x float>, <2 x float>* %x, align 1
  %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %2
}

; Extracts element 3 of %a1 and inserts it into element 0 of %a0, giving <a1[3],a0[1],a0[2],a0[3]>.
define <4 x i32> @extract3_insert0_v4i32_7123(<4 x i32> %a0, <4 x i32> %a1) {
; SSE2-LABEL: extract3_insert0_v4i32_7123:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: extract3_insert0_v4i32_7123:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: extract3_insert0_v4i32_7123:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSSE3-NEXT:    movd %xmm1, %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: extract3_insert0_v4i32_7123:
; SSE41:       # %bb.0:
; SSE41-NEXT:    extractps $3, %xmm1, %eax
; SSE41-NEXT:    pinsrd $0, %eax, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: extract3_insert0_v4i32_7123:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractps $3, %xmm1, %eax
; AVX-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = extractelement <4 x i32> %a1, i32 3
  %2 = insertelement <4 x i32> %a0, i32 %1, i32 0
  ret <4 x i32> %2
}

; Extracts element 3 of %a1 and inserts it into element 3 of %a0, giving <a0[0],a0[1],a0[2],a1[3]>.
define <4 x i32> @extract3_insert3_v4i32_0127(<4 x i32> %a0, <4 x i32> %a1) {
; SSE2-LABEL: extract3_insert3_v4i32_0127:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: extract3_insert3_v4i32_0127:
; SSE3:       # %bb.0:
; SSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: extract3_insert3_v4i32_0127:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: extract3_insert3_v4i32_0127:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: extract3_insert3_v4i32_0127:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX-NEXT:    retq
  %1 = extractelement <4 x i32> %a1, i32 3
  %2 = insertelement <4 x i32> %a0, i32 %1, i32 3
  ret <4 x i32> %2
}

; Puts the scalar i32 argument in element 0 and takes elements 1-3 from a zero vector (mask <0,5,6,7>).
define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
; SSE-LABEL: insert_reg_and_zero_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %edi, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_reg_and_zero_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd %edi, %xmm0
; AVX-NEXT:    retq
  %v = insertelement <4 x i32> undef, i32 %a, i32 0
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i32> %shuffle
}

; Same as insert_reg_and_zero_v4i32, but the scalar is loaded from %ptr.
define <4 x i32> @insert_mem_and_zero_v4i32(i32* %ptr) {
; SSE-LABEL: insert_mem_and_zero_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_mem_and_zero_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %a = load i32, i32* %ptr
  %v = insertelement <4 x i32> undef, i32 %a, i32 0
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i32> %shuffle
}

; Puts the scalar float argument in element 0 and takes elements 1-3 from a zero vector (mask <0,5,6,7>).
define <4 x float> @insert_reg_and_zero_v4f32(float %a) {
; SSE2-LABEL: insert_reg_and_zero_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm1, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE3-LABEL: insert_reg_and_zero_v4f32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    xorps %xmm1, %xmm1
; SSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE3-NEXT:    movaps %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: insert_reg_and_zero_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    xorps %xmm1, %xmm1
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: insert_reg_and_zero_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    xorps %xmm1, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: insert_reg_and_zero_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %v = insertelement <4 x float> undef, float %a, i32 0
  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuffle
}

; Same as insert_reg_and_zero_v4f32, but the scalar is loaded from %ptr.
define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
; SSE-LABEL: insert_mem_and_zero_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_mem_and_zero_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    retq
  %a = load float, float* %ptr
  %v = insertelement <4 x float> undef, float %a, i32 0
  %shuffle = shufflevector <4 x float> %v, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuffle
}

; Bitcasts the i64 argument to <2 x i32> and places it in the low two elements; the high two elements
; come from %b (mask <0,1,6,7>).
define <4 x i32> @insert_reg_lo_v4i32(i64 %a, <4 x i32> %b) {
; SSE2-LABEL: insert_reg_lo_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: insert_reg_lo_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %rdi, %xmm1
; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: insert_reg_lo_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %xmm1
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: insert_reg_lo_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: insert_reg_lo_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %rdi, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: insert_reg_lo_v4i32:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vmovq %rdi, %xmm1
; AVX2OR512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2OR512VL-NEXT:    retq
  %a.cast = bitcast i64 %a to <2 x i32>
  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %shuffle
}

; Same as insert_reg_lo_v4i32, but the low <2 x i32> pair is loaded from %ptr.
define <4 x i32> @insert_mem_lo_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
; SSE2-LABEL: insert_mem_lo_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: insert_mem_lo_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: insert_mem_lo_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: insert_mem_lo_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: insert_mem_lo_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %a = load <2 x i32>, <2 x i32>* %ptr
  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %shuffle
}

; Mask <4,5,0,1>: the low two elements come from %b[0..1] and the high two from the i64 argument
; (bitcast to <2 x i32>).
define <4 x i32> @insert_reg_hi_v4i32(i64 %a, <4 x i32> %b) {
; SSE-LABEL: insert_reg_hi_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %xmm1
; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_reg_hi_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq %rdi, %xmm1
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %a.cast = bitcast i64 %a to <2 x i32>
  %v = shufflevector <2 x i32> %a.cast, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i32> %shuffle
}

; Same as insert_reg_hi_v4i32, but the high <2 x i32> pair is loaded from %ptr.
define <4 x i32> @insert_mem_hi_v4i32(<2 x i32>* %ptr, <4 x i32> %b) {
; SSE-LABEL: insert_mem_hi_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_mem_hi_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %a = load <2 x i32>, <2 x i32>* %ptr
  %v = shufflevector <2 x i32> %a, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle = shufflevector <4 x i32> %v, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i32> %shuffle
}

; Bitcasts the double argument to <2 x float> and places it in the low two elements; the high two
; elements come from %b (mask <0,1,6,7>).
define <4 x float> @insert_reg_lo_v4f32(double %a, <4 x float> %b) {
; SSE2-LABEL: insert_reg_lo_v4f32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: insert_reg_lo_v4f32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: insert_reg_lo_v4f32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: insert_reg_lo_v4f32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: insert_reg_lo_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %a.cast = bitcast double %a to <2 x float>
  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x float> %shuffle
}

; Same as insert_reg_lo_v4f32, but the low <2 x float> pair is loaded from %ptr.
define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE-LABEL: insert_mem_lo_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_mem_lo_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %a = load <2 x float>, <2 x float>* %ptr
  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x float> %shuffle
}

; Mask <4,5,0,1>: the low two elements come from %b[0..1] and the high two from the double argument
; (bitcast to <2 x float>).
define <4 x float> @insert_reg_hi_v4f32(double %a, <4 x float> %b) {
; SSE-LABEL: insert_reg_hi_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_reg_hi_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    retq
  %a.cast = bitcast double %a to <2 x float>
  %v = shufflevector <2 x float> %a.cast, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x float> %shuffle
}

; Same as insert_reg_hi_v4f32, but the high <2 x float> pair is loaded from %ptr.
define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE-LABEL: insert_mem_hi_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_mem_hi_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT:    retq
  %a = load <2 x float>, <2 x float>* %ptr
  %v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %shuffle = shufflevector <4 x float> %v, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x float> %shuffle
}

; PR21137
; Loads a <4 x float> from %ptr and reverses its element order (mask <3,2,1,0>).
define <4 x float> @shuffle_mem_v4f32_3210(<4 x float>* %ptr) {
; SSE-LABEL: shuffle_mem_v4f32_3210:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_mem_v4f32_3210:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[3,2,1,0]
; AVX-NEXT:    retq
  %a = load <4 x float>, <4 x float>* %ptr
  %shuffle = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %shuffle
}

; Loads an i32 from %ptr, inserts it at element 0 and splats that element to all four lanes.
define <4 x i32> @insert_dup_mem_v4i32(i32* %ptr) {
; SSE-LABEL: insert_dup_mem_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_dup_mem_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
; AVX-NEXT:    retq
  %tmp = load i32, i32* %ptr, align 4
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %tmp2
}

; PR41249
; The loaded <2 x float> has two uses: it is interleaved with zeros as <x0, 0.0, x1, 0.0> and stored
; to %p1, and element 0 is splatted to form the return value.
define <4 x float> @shuffle_mem_pmovzx_v4f32(<2 x float>* %p0, <4 x float>* %p1) {
; SSE-LABEL: shuffle_mem_pmovzx_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT:    movaps %xmm2, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_mem_pmovzx_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vmovaps %xmm1, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: shuffle_mem_pmovzx_v4f32:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX2OR512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX2OR512VL-NEXT:    vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX2OR512VL-NEXT:    vmovaps %xmm1, (%rsi)
; AVX2OR512VL-NEXT:    retq
  %1 = load <2 x float>, <2 x float>* %p0
  %2 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
  %3 = shufflevector <4 x float> %2, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %4 = shufflevector <2 x float> %1, <2 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %3, <4 x float>* %p1
  ret <4 x float> %4
}

;
; Shuffle to logical bit shifts
;

; Mask <4,0,4,undef> against a zero vector gives <0,a0,0,undef>; the checks expect a 64-bit left shift by 32.
define <4 x i32> @shuffle_v4i32_z0zX(<4 x i32> %a) {
; SSE-LABEL: shuffle_v4i32_z0zX:
; SSE:       # %bb.0:
; SSE-NEXT:    psllq $32, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_z0zX:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllq $32, %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 4, i32 undef>
  ret <4 x i32> %shuffle
}

; Mask <1,4,3,4> against a zero vector gives <a1,0,a3,0>; the checks expect a 64-bit right shift by 32.
define <4 x i32> @shuffle_v4i32_1z3z(<4 x i32> %a) {
; SSE-LABEL: shuffle_v4i32_1z3z:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlq $32, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_1z3z:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 4, i32 3, i32 4>
  ret <4 x i32> %shuffle
}

; %b is loaded with align 1; mask <0,1,4,5> joins the low halves of %a and %b as <a0,a1,b0,b1>.
define <4 x float> @shuffle_mem_v4f32_0145(<4 x float> %a, <4 x float>* %pb) {
; SSE-LABEL: shuffle_mem_v4f32_0145:
; SSE:       # %bb.0:
; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_mem_v4f32_0145:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT:    retq
  %b = load <4 x float>, <4 x float>* %pb, align 1
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %shuffle
}

; %b is loaded with align 1; mask <4,5,2,3> replaces the low half of %a with the low half of %b,
; giving <b0,b1,a2,a3>.
define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) {
; SSE2-LABEL: shuffle_mem_v4f32_4523:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE2-NEXT:    retq
;
; SSE3-LABEL: shuffle_mem_v4f32_4523:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: shuffle_mem_v4f32_4523:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: shuffle_mem_v4f32_4523:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movups (%rdi), %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: shuffle_mem_v4f32_4523:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %b = load <4 x float>, <4 x float>* %pb, align 1
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle
}

; The loaded vector is the FIRST shuffle operand and %a0 the second, so mask <0,6,2,4> yields
; <mem[0],a0[2],mem[2],a0[0]>.
define <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) {
; SSE-LABEL: shuffle_mem_v4f32_0624:
; SSE:       # %bb.0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: shuffle_mem_v4f32_0624:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2]
; AVX1OR2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1]
; AVX1OR2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovaps (%rdi), %xmm2
; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [0,6,2,4]
; AVX512VL-NEXT:    vpermi2ps %xmm0, %xmm2, %xmm1
; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %a1
  %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4>
  ret <4 x float> %2
}

; The loaded vector is the FIRST shuffle operand and %a0 the second, so mask <4,7,6,0> yields
; <a0[0],a0[3],a0[2],mem[0]>.
define <4 x float> @shuffle_mem_v4f32_4760(<4 x float> %a0, <4 x float>* %a1) {
; SSE-LABEL: shuffle_mem_v4f32_4760:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],mem[0,0]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
; SSE-NEXT:    retq
;
; AVX1OR2-LABEL: shuffle_mem_v4f32_4760:
; AVX1OR2:       # %bb.0:
; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm1 = xmm0[2,0],mem[0,0]
; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,2]
; AVX1OR2-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_mem_v4f32_4760:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [0,3,2,4]
; AVX512VL-NEXT:    vpermt2ps (%rdi), %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %1 = load <4 x float>, <4 x float>* %a1
  %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 4, i32 7, i32 6, i32 0>
  ret <4 x float> %2
}