1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1 3; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX2 4; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512 5; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512 6; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X86,X86-AVX512 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=X64,X64-AVX512 12 13; 14; Subvector Load + Broadcast 15; 16 17define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind { 18; X86-LABEL: test_broadcast_2f64_4f64: 19; X86: # %bb.0: 20; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 21; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 22; X86-NEXT: retl 23; 24; X64-LABEL: test_broadcast_2f64_4f64: 25; X64: # %bb.0: 26; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 27; X64-NEXT: retq 28 %1 = load <2 x double>, <2 x double> *%p 29 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 30 ret <4 x double> %2 31} 32 33define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind { 34; X86-AVX-LABEL: test_broadcast_2f64_8f64: 35; X86-AVX: # 
%bb.0: 36; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 37; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 38; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 39; X86-AVX-NEXT: retl 40; 41; X86-AVX512-LABEL: test_broadcast_2f64_8f64: 42; X86-AVX512: # %bb.0: 43; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 44; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 45; X86-AVX512-NEXT: retl 46; 47; X64-AVX-LABEL: test_broadcast_2f64_8f64: 48; X64-AVX: # %bb.0: 49; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 50; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 51; X64-AVX-NEXT: retq 52; 53; X64-AVX512-LABEL: test_broadcast_2f64_8f64: 54; X64-AVX512: # %bb.0: 55; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 56; X64-AVX512-NEXT: retq 57 %1 = load <2 x double>, <2 x double> *%p 58 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> 59 ret <8 x double> %2 60} 61 62define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind { 63; X86-AVX-LABEL: test_broadcast_4f64_8f64: 64; X86-AVX: # %bb.0: 65; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 66; X86-AVX-NEXT: vmovaps (%eax), %ymm0 67; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 68; X86-AVX-NEXT: retl 69; 70; X86-AVX512-LABEL: test_broadcast_4f64_8f64: 71; X86-AVX512: # %bb.0: 72; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 73; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 74; X86-AVX512-NEXT: retl 75; 76; X64-AVX-LABEL: test_broadcast_4f64_8f64: 77; X64-AVX: # %bb.0: 78; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 79; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 80; X64-AVX-NEXT: retq 81; 82; X64-AVX512-LABEL: test_broadcast_4f64_8f64: 83; X64-AVX512: # %bb.0: 84; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 85; X64-AVX512-NEXT: retq 86 %1 = load <4 x double>, <4 x double> *%p 87 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 
1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 88 ret <8 x double> %2 89} 90 91define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind { 92; X86-AVX-LABEL: test_broadcast_2i64_4i64: 93; X86-AVX: # %bb.0: 94; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 95; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 96; X86-AVX-NEXT: retl 97; 98; X86-AVX512-LABEL: test_broadcast_2i64_4i64: 99; X86-AVX512: # %bb.0: 100; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 101; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 102; X86-AVX512-NEXT: retl 103; 104; X64-AVX-LABEL: test_broadcast_2i64_4i64: 105; X64-AVX: # %bb.0: 106; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 107; X64-AVX-NEXT: retq 108; 109; X64-AVX512-LABEL: test_broadcast_2i64_4i64: 110; X64-AVX512: # %bb.0: 111; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 112; X64-AVX512-NEXT: retq 113 %1 = load <2 x i64>, <2 x i64> *%p 114 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 115 ret <4 x i64> %2 116} 117 118define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind { 119; X86-AVX-LABEL: test_broadcast_2i64_8i64: 120; X86-AVX: # %bb.0: 121; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 122; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 123; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 124; X86-AVX-NEXT: retl 125; 126; X86-AVX512-LABEL: test_broadcast_2i64_8i64: 127; X86-AVX512: # %bb.0: 128; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 129; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 130; X86-AVX512-NEXT: retl 131; 132; X64-AVX-LABEL: test_broadcast_2i64_8i64: 133; X64-AVX: # %bb.0: 134; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 135; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 136; X64-AVX-NEXT: retq 137; 138; X64-AVX512-LABEL: test_broadcast_2i64_8i64: 139; X64-AVX512: # %bb.0: 140; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 141; 
X64-AVX512-NEXT: retq 142 %1 = load <2 x i64>, <2 x i64> *%p 143 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> 144 ret <8 x i64> %2 145} 146 147define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind { 148; X86-AVX-LABEL: test_broadcast_4i64_8i64: 149; X86-AVX: # %bb.0: 150; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 151; X86-AVX-NEXT: vmovaps (%eax), %ymm0 152; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 153; X86-AVX-NEXT: retl 154; 155; X86-AVX512-LABEL: test_broadcast_4i64_8i64: 156; X86-AVX512: # %bb.0: 157; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 158; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 159; X86-AVX512-NEXT: retl 160; 161; X64-AVX-LABEL: test_broadcast_4i64_8i64: 162; X64-AVX: # %bb.0: 163; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 164; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 165; X64-AVX-NEXT: retq 166; 167; X64-AVX512-LABEL: test_broadcast_4i64_8i64: 168; X64-AVX512: # %bb.0: 169; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 170; X64-AVX512-NEXT: retq 171 %1 = load <4 x i64>, <4 x i64> *%p 172 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 173 ret <8 x i64> %2 174} 175 176define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind { 177; X86-LABEL: test_broadcast_4f32_8f32: 178; X86: # %bb.0: 179; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 180; X86-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 181; X86-NEXT: retl 182; 183; X64-LABEL: test_broadcast_4f32_8f32: 184; X64: # %bb.0: 185; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 186; X64-NEXT: retq 187 %1 = load <4 x float>, <4 x float> *%p 188 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 189 ret <8 x float> %2 190} 191 192define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind { 193; X86-AVX-LABEL: 
test_broadcast_4f32_16f32: 194; X86-AVX: # %bb.0: 195; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 196; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 197; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 198; X86-AVX-NEXT: retl 199; 200; X86-AVX512-LABEL: test_broadcast_4f32_16f32: 201; X86-AVX512: # %bb.0: 202; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 203; X86-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 204; X86-AVX512-NEXT: retl 205; 206; X64-AVX-LABEL: test_broadcast_4f32_16f32: 207; X64-AVX: # %bb.0: 208; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 209; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 210; X64-AVX-NEXT: retq 211; 212; X64-AVX512-LABEL: test_broadcast_4f32_16f32: 213; X64-AVX512: # %bb.0: 214; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 215; X64-AVX512-NEXT: retq 216 %1 = load <4 x float>, <4 x float> *%p 217 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 218 ret <16 x float> %2 219} 220 221define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind { 222; X86-AVX-LABEL: test_broadcast_8f32_16f32: 223; X86-AVX: # %bb.0: 224; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 225; X86-AVX-NEXT: vmovaps (%eax), %ymm0 226; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 227; X86-AVX-NEXT: retl 228; 229; X86-AVX512-LABEL: test_broadcast_8f32_16f32: 230; X86-AVX512: # %bb.0: 231; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 232; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 233; X86-AVX512-NEXT: retl 234; 235; X64-AVX-LABEL: test_broadcast_8f32_16f32: 236; X64-AVX: # %bb.0: 237; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 238; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 239; X64-AVX-NEXT: retq 240; 241; X64-AVX512-LABEL: test_broadcast_8f32_16f32: 242; X64-AVX512: # %bb.0: 243; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = 
mem[0,1,2,3,0,1,2,3] 244; X64-AVX512-NEXT: retq 245 %1 = load <8 x float>, <8 x float> *%p 246 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 247 ret <16 x float> %2 248} 249 250define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind { 251; X86-AVX-LABEL: test_broadcast_4i32_8i32: 252; X86-AVX: # %bb.0: 253; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 254; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 255; X86-AVX-NEXT: retl 256; 257; X86-AVX512-LABEL: test_broadcast_4i32_8i32: 258; X86-AVX512: # %bb.0: 259; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 260; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 261; X86-AVX512-NEXT: retl 262; 263; X64-AVX-LABEL: test_broadcast_4i32_8i32: 264; X64-AVX: # %bb.0: 265; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 266; X64-AVX-NEXT: retq 267; 268; X64-AVX512-LABEL: test_broadcast_4i32_8i32: 269; X64-AVX512: # %bb.0: 270; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 271; X64-AVX512-NEXT: retq 272 %1 = load <4 x i32>, <4 x i32> *%p 273 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 274 ret <8 x i32> %2 275} 276 277define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind { 278; X86-AVX-LABEL: test_broadcast_4i32_16i32: 279; X86-AVX: # %bb.0: 280; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 281; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 282; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 283; X86-AVX-NEXT: retl 284; 285; X86-AVX512-LABEL: test_broadcast_4i32_16i32: 286; X86-AVX512: # %bb.0: 287; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 288; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 289; X86-AVX512-NEXT: retl 290; 291; X64-AVX-LABEL: test_broadcast_4i32_16i32: 292; X64-AVX: # %bb.0: 293; X64-AVX-NEXT: 
vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 294; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 295; X64-AVX-NEXT: retq 296; 297; X64-AVX512-LABEL: test_broadcast_4i32_16i32: 298; X64-AVX512: # %bb.0: 299; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 300; X64-AVX512-NEXT: retq 301 %1 = load <4 x i32>, <4 x i32> *%p 302 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 303 ret <16 x i32> %2 304} 305 306define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind { 307; X86-AVX-LABEL: test_broadcast_8i32_16i32: 308; X86-AVX: # %bb.0: 309; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 310; X86-AVX-NEXT: vmovaps (%eax), %ymm0 311; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 312; X86-AVX-NEXT: retl 313; 314; X86-AVX512-LABEL: test_broadcast_8i32_16i32: 315; X86-AVX512: # %bb.0: 316; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 317; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 318; X86-AVX512-NEXT: retl 319; 320; X64-AVX-LABEL: test_broadcast_8i32_16i32: 321; X64-AVX: # %bb.0: 322; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 323; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 324; X64-AVX-NEXT: retq 325; 326; X64-AVX512-LABEL: test_broadcast_8i32_16i32: 327; X64-AVX512: # %bb.0: 328; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 329; X64-AVX512-NEXT: retq 330 %1 = load <8 x i32>, <8 x i32> *%p 331 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 332 ret <16 x i32> %2 333} 334 335define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind { 336; X86-AVX-LABEL: test_broadcast_8i16_16i16: 337; X86-AVX: # %bb.0: 338; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 339; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 340; X86-AVX-NEXT: retl 341; 342; 
X86-AVX512-LABEL: test_broadcast_8i16_16i16: 343; X86-AVX512: # %bb.0: 344; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 345; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 346; X86-AVX512-NEXT: retl 347; 348; X64-AVX-LABEL: test_broadcast_8i16_16i16: 349; X64-AVX: # %bb.0: 350; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 351; X64-AVX-NEXT: retq 352; 353; X64-AVX512-LABEL: test_broadcast_8i16_16i16: 354; X64-AVX512: # %bb.0: 355; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 356; X64-AVX512-NEXT: retq 357 %1 = load <8 x i16>, <8 x i16> *%p 358 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 359 ret <16 x i16> %2 360} 361 362define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind { 363; X86-AVX-LABEL: test_broadcast_8i16_32i16: 364; X86-AVX: # %bb.0: 365; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 366; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 367; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 368; X86-AVX-NEXT: retl 369; 370; X86-AVX512-LABEL: test_broadcast_8i16_32i16: 371; X86-AVX512: # %bb.0: 372; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 373; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 374; X86-AVX512-NEXT: retl 375; 376; X64-AVX-LABEL: test_broadcast_8i16_32i16: 377; X64-AVX: # %bb.0: 378; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 379; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 380; X64-AVX-NEXT: retq 381; 382; X64-AVX512-LABEL: test_broadcast_8i16_32i16: 383; X64-AVX512: # %bb.0: 384; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 385; X64-AVX512-NEXT: retq 386 %1 = load <8 x i16>, <8 x i16> *%p 387 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, 
i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 388 ret <32 x i16> %2 389} 390 391define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind { 392; X86-AVX-LABEL: test_broadcast_16i16_32i16: 393; X86-AVX: # %bb.0: 394; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 395; X86-AVX-NEXT: vmovaps (%eax), %ymm0 396; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 397; X86-AVX-NEXT: retl 398; 399; X86-AVX512-LABEL: test_broadcast_16i16_32i16: 400; X86-AVX512: # %bb.0: 401; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 402; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 403; X86-AVX512-NEXT: retl 404; 405; X64-AVX-LABEL: test_broadcast_16i16_32i16: 406; X64-AVX: # %bb.0: 407; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 408; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 409; X64-AVX-NEXT: retq 410; 411; X64-AVX512-LABEL: test_broadcast_16i16_32i16: 412; X64-AVX512: # %bb.0: 413; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 414; X64-AVX512-NEXT: retq 415 %1 = load <16 x i16>, <16 x i16> *%p 416 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 417 ret <32 x i16> %2 418} 419 420define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind { 421; X86-AVX-LABEL: test_broadcast_16i8_32i8: 422; X86-AVX: # %bb.0: 423; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 424; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 425; X86-AVX-NEXT: retl 426; 427; X86-AVX512-LABEL: test_broadcast_16i8_32i8: 428; X86-AVX512: # %bb.0: 429; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 430; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 431; X86-AVX512-NEXT: retl 432; 433; X64-AVX-LABEL: test_broadcast_16i8_32i8: 434; X64-AVX: # %bb.0: 435; X64-AVX-NEXT: 
vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 436; X64-AVX-NEXT: retq 437; 438; X64-AVX512-LABEL: test_broadcast_16i8_32i8: 439; X64-AVX512: # %bb.0: 440; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 441; X64-AVX512-NEXT: retq 442 %1 = load <16 x i8>, <16 x i8> *%p 443 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 444 ret <32 x i8> %2 445} 446 447define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind { 448; X86-AVX-LABEL: test_broadcast_16i8_64i8: 449; X86-AVX: # %bb.0: 450; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 451; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 452; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 453; X86-AVX-NEXT: retl 454; 455; X86-AVX512-LABEL: test_broadcast_16i8_64i8: 456; X86-AVX512: # %bb.0: 457; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 458; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 459; X86-AVX512-NEXT: retl 460; 461; X64-AVX-LABEL: test_broadcast_16i8_64i8: 462; X64-AVX: # %bb.0: 463; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 464; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 465; X64-AVX-NEXT: retq 466; 467; X64-AVX512-LABEL: test_broadcast_16i8_64i8: 468; X64-AVX512: # %bb.0: 469; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 470; X64-AVX512-NEXT: retq 471 %1 = load <16 x i8>, <16 x i8> *%p 472 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 
11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 473 ret <64 x i8> %2 474} 475 476define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind { 477; X86-AVX-LABEL: test_broadcast_32i8_64i8: 478; X86-AVX: # %bb.0: 479; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 480; X86-AVX-NEXT: vmovaps (%eax), %ymm0 481; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 482; X86-AVX-NEXT: retl 483; 484; X86-AVX512-LABEL: test_broadcast_32i8_64i8: 485; X86-AVX512: # %bb.0: 486; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 487; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 488; X86-AVX512-NEXT: retl 489; 490; X64-AVX-LABEL: test_broadcast_32i8_64i8: 491; X64-AVX: # %bb.0: 492; X64-AVX-NEXT: vmovaps (%rdi), %ymm0 493; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 494; X64-AVX-NEXT: retq 495; 496; X64-AVX512-LABEL: test_broadcast_32i8_64i8: 497; X64-AVX512: # %bb.0: 498; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3] 499; X64-AVX512-NEXT: retq 500 %1 = load <32 x i8>, <32 x i8> *%p 501 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 502 ret <64 x i8> %2 503} 504 505; 506; Subvector Load + Broadcast + Store 507; 508 509define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) { 510; X86-AVX-LABEL: test_broadcast_2f64_4f64_reuse: 511; X86-AVX: # %bb.0: 512; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 513; X86-AVX-NEXT: movl 
{{[0-9]+}}(%esp), %ecx 514; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 515; X86-AVX-NEXT: vmovaps %xmm0, (%eax) 516; X86-AVX-NEXT: retl 517; 518; X86-AVX512-LABEL: test_broadcast_2f64_4f64_reuse: 519; X86-AVX512: # %bb.0: 520; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 521; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 522; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 523; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) 524; X86-AVX512-NEXT: retl 525; 526; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse: 527; X64-AVX: # %bb.0: 528; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 529; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) 530; X64-AVX-NEXT: retq 531; 532; X64-AVX512-LABEL: test_broadcast_2f64_4f64_reuse: 533; X64-AVX512: # %bb.0: 534; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 535; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) 536; X64-AVX512-NEXT: retq 537 %1 = load <2 x double>, <2 x double>* %p0 538 store <2 x double> %1, <2 x double>* %p1 539 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 540 ret <4 x double> %2 541} 542 543define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) { 544; X86-AVX-LABEL: test_broadcast_2i64_4i64_reuse: 545; X86-AVX: # %bb.0: 546; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 547; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 548; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 549; X86-AVX-NEXT: vmovaps %xmm0, (%eax) 550; X86-AVX-NEXT: retl 551; 552; X86-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: 553; X86-AVX512: # %bb.0: 554; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 555; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 556; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 557; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) 558; X86-AVX512-NEXT: retl 559; 560; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse: 561; X64-AVX: # %bb.0: 562; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 563; X64-AVX-NEXT: 
vmovaps %xmm0, (%rsi) 564; X64-AVX-NEXT: retq 565; 566; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse: 567; X64-AVX512: # %bb.0: 568; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 569; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) 570; X64-AVX512-NEXT: retq 571 %1 = load <2 x i64>, <2 x i64>* %p0 572 store <2 x i64> %1, <2 x i64>* %p1 573 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 574 ret <4 x i64> %2 575} 576 577define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) { 578; X86-AVX-LABEL: test_broadcast_4f32_8f32_reuse: 579; X86-AVX: # %bb.0: 580; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 581; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 582; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 583; X86-AVX-NEXT: vmovaps %xmm0, (%eax) 584; X86-AVX-NEXT: retl 585; 586; X86-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: 587; X86-AVX512: # %bb.0: 588; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 589; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 590; X86-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 591; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) 592; X86-AVX512-NEXT: retl 593; 594; X64-AVX-LABEL: test_broadcast_4f32_8f32_reuse: 595; X64-AVX: # %bb.0: 596; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 597; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) 598; X64-AVX-NEXT: retq 599; 600; X64-AVX512-LABEL: test_broadcast_4f32_8f32_reuse: 601; X64-AVX512: # %bb.0: 602; X64-AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 603; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) 604; X64-AVX512-NEXT: retq 605 %1 = load <4 x float>, <4 x float>* %p0 606 store <4 x float> %1, <4 x float>* %p1 607 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 608 ret <8 x float> %2 609} 610 611define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) { 612; X86-AVX-LABEL: test_broadcast_4i32_8i32_reuse: 
613; X86-AVX: # %bb.0: 614; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 615; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 616; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 617; X86-AVX-NEXT: vmovaps %xmm0, (%eax) 618; X86-AVX-NEXT: retl 619; 620; X86-AVX512-LABEL: test_broadcast_4i32_8i32_reuse: 621; X86-AVX512: # %bb.0: 622; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 623; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 624; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 625; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) 626; X86-AVX512-NEXT: retl 627; 628; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse: 629; X64-AVX: # %bb.0: 630; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 631; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) 632; X64-AVX-NEXT: retq 633; 634; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse: 635; X64-AVX512: # %bb.0: 636; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 637; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) 638; X64-AVX512-NEXT: retq 639 %1 = load <4 x i32>, <4 x i32>* %p0 640 store <4 x i32> %1, <4 x i32>* %p1 641 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 642 ret <8 x i32> %2 643} 644 645define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind { 646; X86-AVX-LABEL: test_broadcast_8i16_16i16_reuse: 647; X86-AVX: # %bb.0: 648; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 649; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 650; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 651; X86-AVX-NEXT: vmovaps %xmm0, (%eax) 652; X86-AVX-NEXT: retl 653; 654; X86-AVX512-LABEL: test_broadcast_8i16_16i16_reuse: 655; X86-AVX512: # %bb.0: 656; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 657; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 658; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 659; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) 660; X86-AVX512-NEXT: retl 661; 662; X64-AVX-LABEL: 
test_broadcast_8i16_16i16_reuse: 663; X64-AVX: # %bb.0: 664; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 665; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) 666; X64-AVX-NEXT: retq 667; 668; X64-AVX512-LABEL: test_broadcast_8i16_16i16_reuse: 669; X64-AVX512: # %bb.0: 670; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 671; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) 672; X64-AVX512-NEXT: retq 673 %1 = load <8 x i16>, <8 x i16> *%p0 674 store <8 x i16> %1, <8 x i16>* %p1 675 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 676 ret <16 x i16> %2 677} 678 679define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind { 680; X86-AVX-LABEL: test_broadcast_16i8_32i8_reuse: 681; X86-AVX: # %bb.0: 682; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 683; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 684; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 685; X86-AVX-NEXT: vmovaps %xmm0, (%eax) 686; X86-AVX-NEXT: retl 687; 688; X86-AVX512-LABEL: test_broadcast_16i8_32i8_reuse: 689; X86-AVX512: # %bb.0: 690; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 691; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 692; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 693; X86-AVX512-NEXT: vmovdqa %xmm0, (%eax) 694; X86-AVX512-NEXT: retl 695; 696; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse: 697; X64-AVX: # %bb.0: 698; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 699; X64-AVX-NEXT: vmovaps %xmm0, (%rsi) 700; X64-AVX-NEXT: retq 701; 702; X64-AVX512-LABEL: test_broadcast_16i8_32i8_reuse: 703; X64-AVX512: # %bb.0: 704; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 705; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi) 706; X64-AVX512-NEXT: retq 707 %1 = load <16 x i8>, <16 x i8> *%p0 708 store <16 x i8> %1, <16 x i8>* %p1 709 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 
1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 710 ret <32 x i8> %2 711} 712 713; 714; Subvector Load + Broadcast with Separate Store 715; 716 717define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) { 718; X86-AVX-LABEL: test_broadcast_4i32_8i32_chain: 719; X86-AVX: # %bb.0: 720; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 721; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 722; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 723; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 724; X86-AVX-NEXT: vmovaps %xmm1, (%eax) 725; X86-AVX-NEXT: retl 726; 727; X86-AVX512-LABEL: test_broadcast_4i32_8i32_chain: 728; X86-AVX512: # %bb.0: 729; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 730; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 731; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 732; X86-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 733; X86-AVX512-NEXT: vmovaps %xmm1, (%eax) 734; X86-AVX512-NEXT: retl 735; 736; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain: 737; X64-AVX: # %bb.0: 738; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 739; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 740; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) 741; X64-AVX-NEXT: retq 742; 743; X64-AVX512-LABEL: test_broadcast_4i32_8i32_chain: 744; X64-AVX512: # %bb.0: 745; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 746; X64-AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] 747; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi) 748; X64-AVX512-NEXT: retq 749 %1 = load <4 x i32>, <4 x i32>* %p0 750 store <4 x float> zeroinitializer, <4 x float>* %p1 751 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 752 ret <8 x i32> %2 753} 754 755define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) { 756; 
X86-AVX-LABEL: test_broadcast_4i32_16i32_chain: 757; X86-AVX: # %bb.0: 758; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 759; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 760; X86-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 761; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 762; X86-AVX-NEXT: vmovaps %xmm1, (%eax) 763; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 764; X86-AVX-NEXT: retl 765; 766; X86-AVX512-LABEL: test_broadcast_4i32_16i32_chain: 767; X86-AVX512: # %bb.0: 768; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax 769; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx 770; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 771; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 772; X86-AVX512-NEXT: vmovaps %xmm1, (%eax) 773; X86-AVX512-NEXT: retl 774; 775; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain: 776; X64-AVX: # %bb.0: 777; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 778; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] 779; X64-AVX-NEXT: vmovaps %xmm1, (%rsi) 780; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 781; X64-AVX-NEXT: retq 782; 783; X64-AVX512-LABEL: test_broadcast_4i32_16i32_chain: 784; X64-AVX512: # %bb.0: 785; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 786; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 787; X64-AVX512-NEXT: vmovaps %xmm1, (%rsi) 788; X64-AVX512-NEXT: retq 789 %1 = load <4 x i32>, <4 x i32>* %p0 790 store <4 x float> zeroinitializer, <4 x float>* %p1 791 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 792 ret <16 x i32> %2 793} 794 795; 796; subvector Load with multiple uses + broadcast 797; Fallback to the broadcast should be done 798; 799 800@ga4 = dso_local global <4 x i64> zeroinitializer, align 8 801@gb4 = dso_local global <8 x i64> zeroinitializer, align 8 802 803define dso_local void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> 
%b) { 804; X86-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: 805; X86-AVX1: # %bb.0: # %entry 806; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,0,2,0] 807; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 808; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 809; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,0,4,0] 810; X86-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 811; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,0,2,0,3,0,4,0] 812; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 813; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7 814; X86-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 815; X86-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 816; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 817; X86-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5 818; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 819; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 820; X86-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 821; X86-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 822; X86-AVX1-NEXT: vmovdqu %xmm0, ga4+16 823; X86-AVX1-NEXT: vmovdqu %xmm4, ga4 824; X86-AVX1-NEXT: vmovups %ymm2, gb4+32 825; X86-AVX1-NEXT: vmovups %ymm1, gb4 826; X86-AVX1-NEXT: vzeroupper 827; X86-AVX1-NEXT: retl 828; 829; X86-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: 830; X86-AVX2: # %bb.0: # %entry 831; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0] 832; X86-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 833; X86-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 834; X86-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 835; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 836; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 837; X86-AVX2-NEXT: vmovdqu %ymm0, ga4 838; X86-AVX2-NEXT: vmovdqu %ymm2, gb4+32 839; X86-AVX2-NEXT: vmovdqu %ymm1, gb4 840; X86-AVX2-NEXT: vzeroupper 841; X86-AVX2-NEXT: retl 842; 843; X86-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: 844; X86-AVX512: # %bb.0: # %entry 845; X86-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0] 846; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 847; X86-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0 848; X86-AVX512-NEXT: 
vpaddq %zmm2, %zmm1, %zmm1 849; X86-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 850; X86-AVX512-NEXT: vmovdqu %ymm0, ga4 851; X86-AVX512-NEXT: vmovdqu64 %zmm1, gb4 852; X86-AVX512-NEXT: vzeroupper 853; X86-AVX512-NEXT: retl 854; 855; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64: 856; X64-AVX1: # %bb.0: # %entry 857; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2] 858; X64-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm4 859; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 860; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [3,4] 861; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0 862; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm6 = [1,2,3,4] 863; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 864; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm7 865; X64-AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2 866; X64-AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 867; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 868; X64-AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5 869; X64-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 870; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 871; X64-AVX1-NEXT: vandps %ymm6, %ymm1, %ymm1 872; X64-AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 873; X64-AVX1-NEXT: vmovdqu %xmm0, ga4+16(%rip) 874; X64-AVX1-NEXT: vmovdqu %xmm4, ga4(%rip) 875; X64-AVX1-NEXT: vmovups %ymm2, gb4+32(%rip) 876; X64-AVX1-NEXT: vmovups %ymm1, gb4(%rip) 877; X64-AVX1-NEXT: vzeroupper 878; X64-AVX1-NEXT: retq 879; 880; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64: 881; X64-AVX2: # %bb.0: # %entry 882; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4] 883; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 884; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 885; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 886; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 887; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 888; X64-AVX2-NEXT: vmovdqu %ymm0, ga4(%rip) 889; X64-AVX2-NEXT: vmovdqu %ymm2, gb4+32(%rip) 890; X64-AVX2-NEXT: vmovdqu %ymm1, gb4(%rip) 891; X64-AVX2-NEXT: vzeroupper 892; X64-AVX2-NEXT: retq 893; 894; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64: 895; 
X64-AVX512: # %bb.0: # %entry 896; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [1,2,3,4,1,2,3,4] 897; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 898; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0 899; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 900; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 901; X64-AVX512-NEXT: vmovdqu %ymm0, ga4(%rip) 902; X64-AVX512-NEXT: vmovdqu64 %zmm1, gb4(%rip) 903; X64-AVX512-NEXT: vzeroupper 904; X64-AVX512-NEXT: retq 905entry: 906 %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4> 907 %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4> 908 %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4> 909 store <4 x i64> %0, <4 x i64>* @ga4, align 8 910 store <8 x i64> %2, <8 x i64>* @gb4, align 8 911 ret void 912} 913 914 915@ga2 = dso_local global <4 x double> zeroinitializer, align 8 916@gb2 = dso_local global <8 x double> zeroinitializer, align 8 917 918define dso_local void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) { 919; X86-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: 920; X86-AVX: # %bb.0: # %entry 921; X86-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 922; X86-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 923; X86-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 924; X86-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 925; X86-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 926; X86-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 927; X86-AVX-NEXT: vmovupd %ymm0, ga2 928; X86-AVX-NEXT: vmovupd %ymm2, gb2+32 929; X86-AVX-NEXT: vmovupd %ymm1, gb2 930; X86-AVX-NEXT: vzeroupper 931; X86-AVX-NEXT: retl 932; 933; X86-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: 934; X86-AVX512: # %bb.0: # %entry 935; X86-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0] 936; X86-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 937; X86-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 938; X86-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 939; X86-AVX512-NEXT: vdivpd 
%zmm2, %zmm1, %zmm1 940; X86-AVX512-NEXT: vmovupd %ymm0, ga2 941; X86-AVX512-NEXT: vmovupd %zmm1, gb2 942; X86-AVX512-NEXT: vzeroupper 943; X86-AVX512-NEXT: retl 944; 945; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64: 946; X64-AVX: # %bb.0: # %entry 947; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] 948; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0 949; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2 950; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1 951; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1 952; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2 953; X64-AVX-NEXT: vmovupd %ymm0, ga2(%rip) 954; X64-AVX-NEXT: vmovupd %ymm2, gb2+32(%rip) 955; X64-AVX-NEXT: vmovupd %ymm1, gb2(%rip) 956; X64-AVX-NEXT: vzeroupper 957; X64-AVX-NEXT: retq 958; 959; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64: 960; X64-AVX512: # %bb.0: # %entry 961; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0] 962; X64-AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] 963; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0 964; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1 965; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1 966; X64-AVX512-NEXT: vmovupd %ymm0, ga2(%rip) 967; X64-AVX512-NEXT: vmovupd %zmm1, gb2(%rip) 968; X64-AVX512-NEXT: vzeroupper 969; X64-AVX512-NEXT: retq 970entry: 971 %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0> 972 %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0> 973 %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0> 974 store <4 x double> %0, <4 x double>* @ga2, align 8 975 store <8 x double> %2, <8 x double>* @gb2, align 8 976 ret void 977} 978 979@ha4 = dso_local global <4 x i32> zeroinitializer, align 8 980@hb4 = dso_local global <8 x i32> zeroinitializer, align 8 981@hc4 = dso_local global <16 x i32> zeroinitializer, align 8 982 983define dso_local 
void @fallback_broadcast_v4i32_v8i32_v16i32(<4 x i32> %a, <8 x i32> %b, <16 x i32> %c) nounwind { 984; X86-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: 985; X86-AVX1: # %bb.0: # %entry 986; X86-AVX1-NEXT: pushl %ebp 987; X86-AVX1-NEXT: movl %esp, %ebp 988; X86-AVX1-NEXT: andl $-32, %esp 989; X86-AVX1-NEXT: subl $32, %esp 990; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4] 991; X86-AVX1-NEXT: # ymm3 = mem[0,1,0,1] 992; X86-AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 993; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 994; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 995; X86-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 996; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 997; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 998; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 999; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 1000; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 1001; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 1002; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm4 1003; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm5 1004; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 1005; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 1006; X86-AVX1-NEXT: vandps %ymm3, %ymm4, %ymm3 1007; X86-AVX1-NEXT: vmovdqu %xmm0, ha4 1008; X86-AVX1-NEXT: vmovups %ymm1, hb4 1009; X86-AVX1-NEXT: vmovups %ymm3, hc4+32 1010; X86-AVX1-NEXT: vmovups %ymm2, hc4 1011; X86-AVX1-NEXT: movl %ebp, %esp 1012; X86-AVX1-NEXT: popl %ebp 1013; X86-AVX1-NEXT: vzeroupper 1014; X86-AVX1-NEXT: retl 1015; 1016; X86-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: 1017; X86-AVX2: # %bb.0: # %entry 1018; X86-AVX2-NEXT: pushl %ebp 1019; X86-AVX2-NEXT: movl %esp, %ebp 1020; X86-AVX2-NEXT: andl $-32, %esp 1021; X86-AVX2-NEXT: subl $32, %esp 1022; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4] 1023; X86-AVX2-NEXT: # ymm3 = mem[0,1,0,1] 1024; X86-AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0 1025; X86-AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 1026; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 1027; X86-AVX2-NEXT: 
vpaddd %ymm3, %ymm2, %ymm2 1028; X86-AVX2-NEXT: vpaddd 8(%ebp), %ymm3, %ymm4 1029; X86-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 1030; X86-AVX2-NEXT: vpand %ymm3, %ymm4, %ymm3 1031; X86-AVX2-NEXT: vmovdqu %xmm0, ha4 1032; X86-AVX2-NEXT: vmovdqu %ymm1, hb4 1033; X86-AVX2-NEXT: vmovdqu %ymm3, hc4+32 1034; X86-AVX2-NEXT: vmovdqu %ymm2, hc4 1035; X86-AVX2-NEXT: movl %ebp, %esp 1036; X86-AVX2-NEXT: popl %ebp 1037; X86-AVX2-NEXT: vzeroupper 1038; X86-AVX2-NEXT: retl 1039; 1040; X86-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: 1041; X86-AVX512: # %bb.0: # %entry 1042; X86-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4] 1043; X86-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1044; X86-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0 1045; X86-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1 1046; X86-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1 1047; X86-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2 1048; X86-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2 1049; X86-AVX512-NEXT: vmovdqu %xmm0, ha4 1050; X86-AVX512-NEXT: vmovdqu %ymm1, hb4 1051; X86-AVX512-NEXT: vmovdqu64 %zmm2, hc4 1052; X86-AVX512-NEXT: vzeroupper 1053; X86-AVX512-NEXT: retl 1054; 1055; X64-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: 1056; X64-AVX1: # %bb.0: # %entry 1057; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4] 1058; X64-AVX1-NEXT: # ymm4 = mem[0,1,0,1] 1059; X64-AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 1060; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 1061; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 1062; X64-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 1063; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 1064; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 1065; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 1066; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 1067; X64-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 1068; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 1069; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 1070; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 1071; X64-AVX1-NEXT: 
vpaddd %xmm4, %xmm2, %xmm2 1072; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 1073; X64-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 1074; X64-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 1075; X64-AVX1-NEXT: vmovdqu %xmm0, ha4(%rip) 1076; X64-AVX1-NEXT: vmovups %ymm1, hb4(%rip) 1077; X64-AVX1-NEXT: vmovups %ymm3, hc4+32(%rip) 1078; X64-AVX1-NEXT: vmovups %ymm2, hc4(%rip) 1079; X64-AVX1-NEXT: vzeroupper 1080; X64-AVX1-NEXT: retq 1081; 1082; X64-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: 1083; X64-AVX2: # %bb.0: # %entry 1084; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4] 1085; X64-AVX2-NEXT: # ymm4 = mem[0,1,0,1] 1086; X64-AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm0 1087; X64-AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 1088; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 1089; X64-AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 1090; X64-AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 1091; X64-AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 1092; X64-AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 1093; X64-AVX2-NEXT: vmovdqu %xmm0, ha4(%rip) 1094; X64-AVX2-NEXT: vmovdqu %ymm1, hb4(%rip) 1095; X64-AVX2-NEXT: vmovdqu %ymm3, hc4+32(%rip) 1096; X64-AVX2-NEXT: vmovdqu %ymm2, hc4(%rip) 1097; X64-AVX2-NEXT: vzeroupper 1098; X64-AVX2-NEXT: retq 1099; 1100; X64-AVX512-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: 1101; X64-AVX512: # %bb.0: # %entry 1102; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [1,2,3,4,1,2,3,4,1,2,3,4,1,2,3,4] 1103; X64-AVX512-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] 1104; X64-AVX512-NEXT: vpaddd %xmm3, %xmm0, %xmm0 1105; X64-AVX512-NEXT: vpaddd %ymm3, %ymm1, %ymm1 1106; X64-AVX512-NEXT: vpand %ymm3, %ymm1, %ymm1 1107; X64-AVX512-NEXT: vpaddd %zmm3, %zmm2, %zmm2 1108; X64-AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm2 1109; X64-AVX512-NEXT: vmovdqu %xmm0, ha4(%rip) 1110; X64-AVX512-NEXT: vmovdqu %ymm1, hb4(%rip) 1111; X64-AVX512-NEXT: vmovdqu64 %zmm2, hc4(%rip) 1112; X64-AVX512-NEXT: vzeroupper 1113; X64-AVX512-NEXT: retq 1114entry: 1115 %0 = add <4 x i32> %a, <i32 1, i32 2, i32 3, 
i32 4> 1116 %1 = add <8 x i32> %b, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4> 1117 %2 = and <8 x i32> %1, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4> 1118 %3 = add <16 x i32> %c, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4> 1119 %4 = and <16 x i32> %3, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4> 1120 store <4 x i32> %0, <4 x i32>* @ha4, align 8 1121 store <8 x i32> %2, <8 x i32>* @hb4, align 8 1122 store <16 x i32> %4, <16 x i32>* @hc4, align 8 1123 ret void 1124} 1125 1126; 1127; Subvector Broadcast from register 1128; 1129 1130define <4 x double> @reg_broadcast_2f64_4f64(<2 x double> %a0) nounwind { 1131; X86-LABEL: reg_broadcast_2f64_4f64: 1132; X86: # %bb.0: 1133; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1134; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1135; X86-NEXT: retl 1136; 1137; X64-LABEL: reg_broadcast_2f64_4f64: 1138; X64: # %bb.0: 1139; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1140; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1141; X64-NEXT: retq 1142 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 1143 ret <4 x double> %1 1144} 1145 1146define <8 x double> @reg_broadcast_2f64_8f64(<2 x double> %a0) nounwind { 1147; X86-AVX-LABEL: reg_broadcast_2f64_8f64: 1148; X86-AVX: # %bb.0: 1149; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1150; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1151; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1152; X86-AVX-NEXT: retl 1153; 1154; X86-AVX512-LABEL: reg_broadcast_2f64_8f64: 1155; X86-AVX512: # %bb.0: 1156; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1157; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1158; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1159; X86-AVX512-NEXT: retl 1160; 1161; X64-AVX-LABEL: reg_broadcast_2f64_8f64: 1162; X64-AVX: # %bb.0: 
1163; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1164; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1165; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1166; X64-AVX-NEXT: retq 1167; 1168; X64-AVX512-LABEL: reg_broadcast_2f64_8f64: 1169; X64-AVX512: # %bb.0: 1170; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1171; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1172; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1173; X64-AVX512-NEXT: retq 1174 %1 = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> 1175 ret <8 x double> %1 1176} 1177 1178define <8 x double> @reg_broadcast_4f64_8f64(<4 x double> %a0) nounwind { 1179; X86-AVX-LABEL: reg_broadcast_4f64_8f64: 1180; X86-AVX: # %bb.0: 1181; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1182; X86-AVX-NEXT: retl 1183; 1184; X86-AVX512-LABEL: reg_broadcast_4f64_8f64: 1185; X86-AVX512: # %bb.0: 1186; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1187; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1188; X86-AVX512-NEXT: retl 1189; 1190; X64-AVX-LABEL: reg_broadcast_4f64_8f64: 1191; X64-AVX: # %bb.0: 1192; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1193; X64-AVX-NEXT: retq 1194; 1195; X64-AVX512-LABEL: reg_broadcast_4f64_8f64: 1196; X64-AVX512: # %bb.0: 1197; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1198; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1199; X64-AVX512-NEXT: retq 1200 %1 = shufflevector <4 x double> %a0, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1201 ret <8 x double> %1 1202} 1203 1204define <4 x i64> @reg_broadcast_2i64_4i64(<2 x i64> %a0) nounwind { 1205; X86-LABEL: reg_broadcast_2i64_4i64: 1206; X86: # %bb.0: 1207; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1208; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1209; X86-NEXT: retl 1210; 1211; X64-LABEL: reg_broadcast_2i64_4i64: 1212; X64: # %bb.0: 1213; X64-NEXT: # kill: def $xmm0 
killed $xmm0 def $ymm0 1214; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1215; X64-NEXT: retq 1216 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> 1217 ret <4 x i64> %1 1218} 1219 1220define <8 x i64> @reg_broadcast_2i64_8i64(<2 x i64> %a0) nounwind { 1221; X86-AVX-LABEL: reg_broadcast_2i64_8i64: 1222; X86-AVX: # %bb.0: 1223; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1224; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1225; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1226; X86-AVX-NEXT: retl 1227; 1228; X86-AVX512-LABEL: reg_broadcast_2i64_8i64: 1229; X86-AVX512: # %bb.0: 1230; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1231; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1232; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1233; X86-AVX512-NEXT: retl 1234; 1235; X64-AVX-LABEL: reg_broadcast_2i64_8i64: 1236; X64-AVX: # %bb.0: 1237; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1238; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1239; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1240; X64-AVX-NEXT: retq 1241; 1242; X64-AVX512-LABEL: reg_broadcast_2i64_8i64: 1243; X64-AVX512: # %bb.0: 1244; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1245; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1246; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1247; X64-AVX512-NEXT: retq 1248 %1 = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1> 1249 ret <8 x i64> %1 1250} 1251 1252define <8 x i64> @reg_broadcast_4i64_8i64(<4 x i64> %a0) nounwind { 1253; X86-AVX-LABEL: reg_broadcast_4i64_8i64: 1254; X86-AVX: # %bb.0: 1255; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1256; X86-AVX-NEXT: retl 1257; 1258; X86-AVX512-LABEL: reg_broadcast_4i64_8i64: 1259; X86-AVX512: # %bb.0: 1260; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1261; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1262; X86-AVX512-NEXT: retl 1263; 
1264; X64-AVX-LABEL: reg_broadcast_4i64_8i64: 1265; X64-AVX: # %bb.0: 1266; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1267; X64-AVX-NEXT: retq 1268; 1269; X64-AVX512-LABEL: reg_broadcast_4i64_8i64: 1270; X64-AVX512: # %bb.0: 1271; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1272; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1273; X64-AVX512-NEXT: retq 1274 %1 = shufflevector <4 x i64> %a0, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1275 ret <8 x i64> %1 1276} 1277 1278define <8 x float> @reg_broadcast_4f32_8f32(<4 x float> %a0) nounwind { 1279; X86-LABEL: reg_broadcast_4f32_8f32: 1280; X86: # %bb.0: 1281; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1282; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1283; X86-NEXT: retl 1284; 1285; X64-LABEL: reg_broadcast_4f32_8f32: 1286; X64: # %bb.0: 1287; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1288; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1289; X64-NEXT: retq 1290 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1291 ret <8 x float> %1 1292} 1293 1294define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind { 1295; X86-AVX-LABEL: reg_broadcast_4f32_16f32: 1296; X86-AVX: # %bb.0: 1297; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1298; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1299; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1300; X86-AVX-NEXT: retl 1301; 1302; X86-AVX512-LABEL: reg_broadcast_4f32_16f32: 1303; X86-AVX512: # %bb.0: 1304; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1305; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1306; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1307; X86-AVX512-NEXT: retl 1308; 1309; X64-AVX-LABEL: reg_broadcast_4f32_16f32: 1310; X64-AVX: # %bb.0: 1311; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1312; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1313; X64-AVX-NEXT: 
vmovaps %ymm0, %ymm1 1314; X64-AVX-NEXT: retq 1315; 1316; X64-AVX512-LABEL: reg_broadcast_4f32_16f32: 1317; X64-AVX512: # %bb.0: 1318; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1319; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1320; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1321; X64-AVX512-NEXT: retq 1322 %1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1323 ret <16 x float> %1 1324} 1325 1326define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind { 1327; X86-AVX-LABEL: reg_broadcast_8f32_16f32: 1328; X86-AVX: # %bb.0: 1329; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1330; X86-AVX-NEXT: retl 1331; 1332; X86-AVX512-LABEL: reg_broadcast_8f32_16f32: 1333; X86-AVX512: # %bb.0: 1334; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1335; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1336; X86-AVX512-NEXT: retl 1337; 1338; X64-AVX-LABEL: reg_broadcast_8f32_16f32: 1339; X64-AVX: # %bb.0: 1340; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1341; X64-AVX-NEXT: retq 1342; 1343; X64-AVX512-LABEL: reg_broadcast_8f32_16f32: 1344; X64-AVX512: # %bb.0: 1345; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1346; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1347; X64-AVX512-NEXT: retq 1348 %1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1349 ret <16 x float> %1 1350} 1351 1352define <8 x i32> @reg_broadcast_4i32_8i32(<4 x i32> %a0) nounwind { 1353; X86-LABEL: reg_broadcast_4i32_8i32: 1354; X86: # %bb.0: 1355; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1356; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1357; X86-NEXT: retl 1358; 1359; X64-LABEL: reg_broadcast_4i32_8i32: 1360; X64: # %bb.0: 1361; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 
1362; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1363; X64-NEXT: retq 1364 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1365 ret <8 x i32> %1 1366} 1367 1368define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind { 1369; X86-AVX-LABEL: reg_broadcast_4i32_16i32: 1370; X86-AVX: # %bb.0: 1371; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1372; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1373; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1374; X86-AVX-NEXT: retl 1375; 1376; X86-AVX512-LABEL: reg_broadcast_4i32_16i32: 1377; X86-AVX512: # %bb.0: 1378; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1379; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1380; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1381; X86-AVX512-NEXT: retl 1382; 1383; X64-AVX-LABEL: reg_broadcast_4i32_16i32: 1384; X64-AVX: # %bb.0: 1385; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1386; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1387; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1388; X64-AVX-NEXT: retq 1389; 1390; X64-AVX512-LABEL: reg_broadcast_4i32_16i32: 1391; X64-AVX512: # %bb.0: 1392; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1393; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1394; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1395; X64-AVX512-NEXT: retq 1396 %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> 1397 ret <16 x i32> %1 1398} 1399 1400define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind { 1401; X86-AVX-LABEL: reg_broadcast_8i32_16i32: 1402; X86-AVX: # %bb.0: 1403; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1404; X86-AVX-NEXT: retl 1405; 1406; X86-AVX512-LABEL: reg_broadcast_8i32_16i32: 1407; X86-AVX512: # %bb.0: 1408; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1409; X86-AVX512-NEXT: 
vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1410; X86-AVX512-NEXT: retl 1411; 1412; X64-AVX-LABEL: reg_broadcast_8i32_16i32: 1413; X64-AVX: # %bb.0: 1414; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1415; X64-AVX-NEXT: retq 1416; 1417; X64-AVX512-LABEL: reg_broadcast_8i32_16i32: 1418; X64-AVX512: # %bb.0: 1419; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1420; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1421; X64-AVX512-NEXT: retq 1422 %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1423 ret <16 x i32> %1 1424} 1425 1426define <16 x i16> @reg_broadcast_8i16_16i16(<8 x i16> %a0) nounwind { 1427; X86-LABEL: reg_broadcast_8i16_16i16: 1428; X86: # %bb.0: 1429; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1430; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1431; X86-NEXT: retl 1432; 1433; X64-LABEL: reg_broadcast_8i16_16i16: 1434; X64: # %bb.0: 1435; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1436; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1437; X64-NEXT: retq 1438 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1439 ret <16 x i16> %1 1440} 1441 1442define <32 x i16> @reg_broadcast_8i16_32i16(<8 x i16> %a0) nounwind { 1443; X86-AVX-LABEL: reg_broadcast_8i16_32i16: 1444; X86-AVX: # %bb.0: 1445; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1446; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1447; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1448; X86-AVX-NEXT: retl 1449; 1450; X86-AVX512-LABEL: reg_broadcast_8i16_32i16: 1451; X86-AVX512: # %bb.0: 1452; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1453; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1454; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1455; X86-AVX512-NEXT: retl 1456; 1457; X64-AVX-LABEL: 
reg_broadcast_8i16_32i16: 1458; X64-AVX: # %bb.0: 1459; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1460; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1461; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1462; X64-AVX-NEXT: retq 1463; 1464; X64-AVX512-LABEL: reg_broadcast_8i16_32i16: 1465; X64-AVX512: # %bb.0: 1466; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1467; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1468; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1469; X64-AVX512-NEXT: retq 1470 %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1471 ret <32 x i16> %1 1472} 1473 1474define <32 x i16> @reg_broadcast_16i16_32i16(<16 x i16> %a0) nounwind { 1475; X86-AVX-LABEL: reg_broadcast_16i16_32i16: 1476; X86-AVX: # %bb.0: 1477; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1478; X86-AVX-NEXT: retl 1479; 1480; X86-AVX512-LABEL: reg_broadcast_16i16_32i16: 1481; X86-AVX512: # %bb.0: 1482; X86-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1483; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1484; X86-AVX512-NEXT: retl 1485; 1486; X64-AVX-LABEL: reg_broadcast_16i16_32i16: 1487; X64-AVX: # %bb.0: 1488; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1489; X64-AVX-NEXT: retq 1490; 1491; X64-AVX512-LABEL: reg_broadcast_16i16_32i16: 1492; X64-AVX512: # %bb.0: 1493; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1494; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1495; X64-AVX512-NEXT: retq 1496 %1 = shufflevector <16 x i16> %a0, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1497 ret 
<32 x i16> %1 1498} 1499 1500define <32 x i8> @reg_broadcast_16i8_32i8(<16 x i8> %a0) nounwind { 1501; X86-LABEL: reg_broadcast_16i8_32i8: 1502; X86: # %bb.0: 1503; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1504; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1505; X86-NEXT: retl 1506; 1507; X64-LABEL: reg_broadcast_16i8_32i8: 1508; X64: # %bb.0: 1509; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1510; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1511; X64-NEXT: retq 1512 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1513 ret <32 x i8> %1 1514} 1515 1516define <64 x i8> @reg_broadcast_16i8_64i8(<16 x i8> %a0) nounwind { 1517; X86-AVX-LABEL: reg_broadcast_16i8_64i8: 1518; X86-AVX: # %bb.0: 1519; X86-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1520; X86-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1521; X86-AVX-NEXT: vmovaps %ymm0, %ymm1 1522; X86-AVX-NEXT: retl 1523; 1524; X86-AVX512-LABEL: reg_broadcast_16i8_64i8: 1525; X86-AVX512: # %bb.0: 1526; X86-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1527; X86-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1528; X86-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1529; X86-AVX512-NEXT: retl 1530; 1531; X64-AVX-LABEL: reg_broadcast_16i8_64i8: 1532; X64-AVX: # %bb.0: 1533; X64-AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1534; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1535; X64-AVX-NEXT: vmovaps %ymm0, %ymm1 1536; X64-AVX-NEXT: retq 1537; 1538; X64-AVX512-LABEL: reg_broadcast_16i8_64i8: 1539; X64-AVX512: # %bb.0: 1540; X64-AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1541; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 1542; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 1543; X64-AVX512-NEXT: retq 1544 
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <64 x i8> %1
}

; Splat a 256-bit register to 512 bits: AVX targets model the v64i8 result as a
; ymm pair (plain register copy); AVX512 widens ymm0 and duplicates the low
; 256 bits into the upper half with vinsertf64x4.
define <64 x i8> @reg_broadcast_32i8_64i8(<32 x i8> %a0) nounwind {
; X86-AVX-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X86-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: reg_broadcast_32i8_64i8:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT:    retq
  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <64 x i8> %1
}

;
; PR34394
;

; Broadcasting a loaded <2 x i32> pair: the 64-bit element pair is splatted as
; a single f64 lane via vmovddup (xmm) below, or vbroadcastsd (ymm/zmm) in the
; wider variants that follow.
define <4 x i32> @test_2xi32_to_4xi32_mem(<2 x i32>* %vp) {
; X86-LABEL: test_2xi32_to_4xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_4xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
; X64-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i32> %res
}

define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) {
; X86-LABEL: test_2xi32_to_8xi32_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_2xi32_to_8xi32_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i32> %res
}

define <16 x i32> @test_2xi32_to_16xi32_mem(<2 x i32>* %vp) {
; X86-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastsd (%eax), %zmm0
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_2xi32_to_16xi32_mem:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastsd (%rdi), %zmm0
; X64-AVX512-NEXT:    retq
  %vec = load <2 x i32>, <2 x i32>* %vp
  %res = shufflevector <2 x i32> %vec, <2 x i32> undef, <16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <16 x i32> %res
}

;
; PR34041
;

; Mask lane 0 is undef (index 1 of a <2 x double> with only lane 0 defined),
; so the whole shuffle legally lowers to a plain vbroadcastsd of the loaded
; scalar.
define <4 x double> @broadcast_v4f64_f64_u000(double* %p) {
; X86-LABEL: broadcast_v4f64_f64_u000:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_f64_u000:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %s = load double, double* %p
  %vec = insertelement <2 x double> undef, double %s, i32 0
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
  ret <4 x double> %res
}

; Shuffle + select: only the loaded element surviving the select matters, so
; the load is inserted into the high 128 bits and blended with %default.
define <4 x double> @broadcast_v4f64_v2f64_4u61(<2 x double>* %vp, <4 x double> %default) {
; X86-LABEL: broadcast_v4f64_v2f64_4u61:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vinsertf128 $1, (%eax), %ymm0, %ymm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v4f64_v2f64_4u61:
; X64:       # %bb.0:
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; X64-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %shuf = shufflevector <2 x double> %vec, <2 x double> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 1>
  %res = select <4 x i1> <i1 0, i1 1, i1 0, i1 1>, <4 x double> %shuf, <4 x double> %default
  ret <4 x double> %res
}

define <8 x float> @broadcast_v8f32_v2f32_u1uu0uEu(<2 x float>* %vp, <8 x float> %default) {
; X86-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm1
; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
; X86-NEXT:    retl
;
; X64-LABEL: broadcast_v8f32_v2f32_u1uu0uEu:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm1
; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
; X64-NEXT:    retq
  %vec = load <2 x float>, <2 x float>* %vp
  %shuf = shufflevector <2 x float> %vec, <2 x float> undef, <8 x i32> <i32 undef, i32 1, i32 undef, i32 undef, i32 0, i32 2, i32 3, i32 undef>
  %res = select <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1>, <8 x float> %shuf, <8 x float> %default
  ret <8 x float> %res
}

; Masks containing undef/out-of-range-of-defined lanes still collapse to a
; full 128-bit subvector broadcast: vbroadcastf128 (AVX) / vbroadcastf32x4
; (AVX512).
define <8 x double> @broadcast_v8f64_v2f64_u1u10101(<2 x double>* %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_u1u10101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 3, i32 1, i32 undef, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}

define <8 x double> @broadcast_v8f64_v2f64_0uuu0101(<2 x double>* %vp) {
; X86-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X86-AVX-NEXT:    retl
;
; X86-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X86-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: broadcast_v8f64_v2f64_0uuu0101:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
  %vec = load <2 x double>, <2 x double>* %vp
  %res = shufflevector <2 x double> %vec, <2 x double> undef, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %res
}