; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL
;
; Just one 32-bit run to make sure we do reasonable things.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X86-AVX512F

; Each function builds a 512-bit vector from several loads off %ptr. The
; function names appear to follow merge_<result type>_<loaded type>_<pattern>,
; with one pattern character per result lane (hex digit = index of the loaded
; element, 'u' = undef lane, 'z' = zero lane). The comment above each function
; states the layout that its IR actually builds.

; Lanes 0-3 are <2 x double> elements 1 and 2 of %ptr, lanes 4-5 are undef,
; lanes 6-7 are element 4.
define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_2f64_12u4:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 16(%rdi), %ymm0
; ALL-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_2f64_12u4:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 16(%eax), %ymm0
; X86-AVX512F-NEXT:    vinsertf128 $1, 64(%eax), %ymm0, %ymm1
; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1
  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
  %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 4
  %val0 = load <2 x double>, <2 x double>* %ptr0
  %val1 = load <2 x double>, <2 x double>* %ptr1
  %val3 = load <2 x double>, <2 x double>* %ptr3
  %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res23 = shufflevector <2 x double> undef, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

; Lanes 0-3 are <2 x double> elements 2 and 3 of %ptr, lanes 4-5 are zero,
; lanes 6-7 are element 5.
define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_2f64_23z5:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 32(%rdi), %ymm0
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vinsertf128 $1, 80(%rdi), %ymm1, %ymm1
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_2f64_23z5:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 32(%eax), %ymm0
; X86-AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512F-NEXT:    vinsertf128 $1, 80(%eax), %ymm1, %ymm1
; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3
  %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 5
  %val0 = load <2 x double>, <2 x double>* %ptr0
  %val1 = load <2 x double>, <2 x double>* %ptr1
  %val3 = load <2 x double>, <2 x double>* %ptr3
  %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res23 = shufflevector <2 x double> zeroinitializer, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

; Low half is zero, high half is <4 x double> element 2 of %ptr.
define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_4f64_z2:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_4f64_z2:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-AVX512F-NEXT:    vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr1 = getelementptr inbounds <4 x double>, <4 x double>* %ptr, i64 2
  %val1 = load <4 x double>, <4 x double>* %ptr1
  %res = shufflevector <4 x double> zeroinitializer, <4 x double> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

; Lanes 0, 1 and 7 are doubles 2, 3 and 9 of %ptr; every other lane is undef.
define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_23uuuuu9:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 16(%rdi), %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 16(%eax), %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %val7 = load double, double* %ptr7
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
  %res7 = insertelement <8 x double> %res1, double %val7, i32 7
  ret <8 x double> %res7
}

; Lanes 0-1 are doubles 1 and 2 of %ptr, lanes 2, 3, 6 and 7 are zero,
; lanes 4-5 are undef.
define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_12zzuuzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 8(%rdi), %xmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 8(%eax), %xmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 2
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
  %res2 = insertelement <8 x double> %res1, double 0.0, i32 2
  %res3 = insertelement <8 x double> %res2, double 0.0, i32 3
  %res6 = insertelement <8 x double> %res3, double 0.0, i32 6
  %res7 = insertelement <8 x double> %res6, double 0.0, i32 7
  ret <8 x double> %res7
}

; Lanes 0, 2, 4 and 7 are doubles 1, 3, 5 and 8 of %ptr, lane 5 is zero,
; the remaining lanes are undef.
define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
; X86-AVX512F-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
  %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr4 = getelementptr inbounds double, double* %ptr, i64 5
  %ptr7 = getelementptr inbounds double, double* %ptr, i64 8
  %val0 = load double, double* %ptr0
  %val2 = load double, double* %ptr2
  %val4 = load double, double* %ptr4
  %val7 = load double, double* %ptr7
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res2 = insertelement <8 x double> %res0, double %val2, i32 2
  %res4 = insertelement <8 x double> %res2, double %val4, i32 4
  %res5 = insertelement <8 x double> %res4, double 0.0, i32 5
  %res7 = insertelement <8 x double> %res5, double %val7, i32 7
  ret <8 x double> %res7
}

; Low half is zero, high half is <4 x i64> element 3 of %ptr.
define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_4i64_z3:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vinsertf64x4 $1, 96(%rdi), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8i64_4i64_z3:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X86-AVX512F-NEXT:    vinsertf64x4 $1, 96(%eax), %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr1 = getelementptr inbounds <4 x i64>, <4 x i64>* %ptr, i64 3
  %val1 = load <4 x i64>, <4 x i64>* %ptr1
  %res = shufflevector <4 x i64> zeroinitializer, <4 x i64> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}

; Lanes 0, 1 and 4 are i64 elements 5, 6 and 9 of %ptr, lanes 2, 3, 6 and 7
; are zero, lane 5 is undef.
define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_i64_56zz9uzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 40(%rdi), %xmm0
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 40(%eax), %xmm0
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 5
  %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 6
  %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 9
  %val0 = load i64, i64* %ptr0
  %val1 = load i64, i64* %ptr1
  %val4 = load i64, i64* %ptr4
  %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
  %res1 = insertelement <8 x i64> %res0, i64 %val1, i32 1
  %res2 = insertelement <8 x i64> %res1, i64 0, i32 2
  %res3 = insertelement <8 x i64> %res2, i64 0, i32 3
  %res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4
  %res6 = insertelement <8 x i64> %res4, i64 0, i32 6
  %res7 = insertelement <8 x i64> %res6, i64 0, i32 7
  ret <8 x i64> %res7
}

; Lanes 0, 2, 4 and 7 are i64 elements 1, 3, 5 and 8 of %ptr, lane 5 is zero,
; the remaining lanes are undef.
define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
  %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
  %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5
  %ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8
  %val0 = load i64, i64* %ptr0
  %val2 = load i64, i64* %ptr2
  %val4 = load i64, i64* %ptr4
  %val7 = load i64, i64* %ptr7
  %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
  %res2 = insertelement <8 x i64> %res0, i64 %val2, i32 2
  %res4 = insertelement <8 x i64> %res2, i64 %val4, i32 4
  %res5 = insertelement <8 x i64> %res4, i64 0, i32 5
  %res7 = insertelement <8 x i64> %res5, i64 %val7, i32 7
  ret <8 x i64> %res7
}

; Lanes 0-1 are floats 8 and 9 of %ptr, lanes 2-4 and 15 are zero, the
; remaining lanes are undef.
define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 8
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 9
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res1 = insertelement <16 x float> %res0, float %val1, i32 1
  %res2 = insertelement <16 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <16 x float> %res2, float 0.0, i32 3
  %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
  %resF = insertelement <16 x float> %res4, float 0.0, i32 15
  ret <16 x float> %resF
}

; Lanes 0, 1 and 3 are floats 4, 5 and 7 of %ptr; every other lane is undef.
define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 16(%rdi), %xmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 16(%eax), %xmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 7
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val3 = load float, float* %ptr3
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res1 = insertelement <16 x float> %res0, float %val1, i32 1
  %res3 = insertelement <16 x float> %res1, float %val3, i32 3
  ret <16 x float> %res3
}

; Lanes 0, 3, 12, 14 and 15 are the floats at the same indices of %ptr;
; every other lane is undef.
define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups (%rdi), %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups (%eax), %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
  %ptrC = getelementptr inbounds float, float* %ptr, i64 12
  %ptrE = getelementptr inbounds float, float* %ptr, i64 14
  %ptrF = getelementptr inbounds float, float* %ptr, i64 15
  %val0 = load float, float* %ptr0
  %val3 = load float, float* %ptr3
  %valC = load float, float* %ptrC
  %valE = load float, float* %ptrE
  %valF = load float, float* %ptrF
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res3 = insertelement <16 x float> %res0, float %val3, i32 3
  %resC = insertelement <16 x float> %res3, float %valC, i32 12
  %resE = insertelement <16 x float> %resC, float %valE, i32 14
  %resF = insertelement <16 x float> %resE, float %valF, i32 15
  ret <16 x float> %resF
}

; Lanes 0, 3, 12, 14 and 15 are the floats at the same indices of %ptr,
; lanes 4, 5 and 13 are zero, the remaining lanes are undef.
; NOTE(review) - the function name has 'z' at lane 11 and 'u' at lane 13, but
; the IR zeroes lane 13 and leaves lane 11 undef (the shuffle mask in the
; checks agrees with the IR). Confirm which was intended before changing
; either, since the checks would have to be regenerated.
define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups (%rdi), %zmm1
; ALL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ALL-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; ALL-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups (%eax), %zmm1
; X86-AVX512F-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X86-AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; X86-AVX512F-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
  %ptrC = getelementptr inbounds float, float* %ptr, i64 12
  %ptrE = getelementptr inbounds float, float* %ptr, i64 14
  %ptrF = getelementptr inbounds float, float* %ptr, i64 15
  %val0 = load float, float* %ptr0
  %val3 = load float, float* %ptr3
  %valC = load float, float* %ptrC
  %valE = load float, float* %ptrE
  %valF = load float, float* %ptrF
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res3 = insertelement <16 x float> %res0, float %val3, i32 3
  %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
  %res5 = insertelement <16 x float> %res4, float 0.0, i32 5
  %resC = insertelement <16 x float> %res5, float %valC, i32 12
  %resD = insertelement <16 x float> %resC, float 0.0, i32 13
  %resE = insertelement <16 x float> %resD, float %valE, i32 14
  %resF = insertelement <16 x float> %resE, float %valF, i32 15
  ret <16 x float> %resF
}

; Lanes 0-1 are i32 elements 1 and 2 of %ptr, lanes 2-4 and 15 are zero, the
; remaining lanes are undef.
define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
  %res2 = insertelement <16 x i32> %res1, i32 0, i32 2
  %res3 = insertelement <16 x i32> %res2, i32 0, i32 3
  %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
  %resF = insertelement <16 x i32> %res4, i32 0, i32 15
  ret <16 x i32> %resF
}

; Lanes 0, 1 and 3 are i32 elements 2, 3 and 5 of %ptr; every other lane is
; undef.
define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 8(%rdi), %xmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups 8(%eax), %xmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %val3 = load i32, i32* %ptr3
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <16 x i32> %res1, i32 %val3, i32 3
  ret <16 x i32> %res3
}

; Lanes 0, 3, 12, 14 and 15 are the i32 elements at the same indices of %ptr;
; every other lane is undef.
define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups (%rdi), %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovups (%eax), %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
  %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
  %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
  %val0 = load i32, i32* %ptr0
  %val3 = load i32, i32* %ptr3
  %valC = load i32, i32* %ptrC
  %valE = load i32, i32* %ptrE
  %valF = load i32, i32* %ptrF
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
  %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
  %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
  ret <16 x i32> %resF
}

; Lanes 0, 3, 12, 14 and 15 are the i32 elements at the same indices of %ptr,
; lanes 4, 5 and 13 are zero, the remaining lanes are undef.
; NOTE(review) - the function name has 'z' at lane 11 and 'u' at lane 13, but
; the IR zeroes lane 13 and leaves lane 11 undef. Confirm which was intended
; before changing either, since the checks would have to be regenerated.
define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 (%rdi), %zmm0
; ALL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0
; X86-AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
  %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
  %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
  %val0 = load i32, i32* %ptr0
  %val3 = load i32, i32* %ptr3
  %valC = load i32, i32* %ptrC
  %valE = load i32, i32* %ptrE
  %valF = load i32, i32* %ptrF
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
  %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
  %res5 = insertelement <16 x i32> %res4, i32 0, i32 5
  %resC = insertelement <16 x i32> %res5, i32 %valC, i32 12
  %resD = insertelement <16 x i32> %resC, i32 0, i32 13
  %resE = insertelement <16 x i32> %resD, i32 %valE, i32 14
  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
  ret <16 x i32> %resF
}

; Lanes 0, 1 and 3 are i16 elements 1, 2 and 4 of %ptr, lanes 30-31 are zero,
; the remaining lanes are undef.
define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 4
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %val3 = load i16, i16* %ptr3
  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
  %res30 = insertelement <32 x i16> %res3, i16 0, i16 30
  %res31 = insertelement <32 x i16> %res30, i16 0, i16 31
  ret <32 x i16> %res31
}

; Lanes 0, 1 and 3 are i16 elements 4, 5 and 7 of %ptr; every other lane is
; undef.
define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %val3 = load i16, i16* %ptr3
  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
  ret <32 x i16> %res3
}

; Lanes 0-1 are i16 elements 2 and 3 of %ptr, lanes 3 and 14-17 are zero, the
; remaining lanes are undef.
define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <32 x i16> %res1, i16 0, i16 3
  %resE = insertelement <32 x i16> %res3, i16 0, i16 14
  %resF = insertelement <32 x i16> %resE, i16 0, i16 15
  %resG = insertelement <32 x i16> %resF, i16 0, i16 16
  %resH = insertelement <32 x i16> %resG, i16 0, i16 17
  ret <32 x i16> %resH
}

; Lanes 0, 1, 3 and 7 are bytes 1, 2, 4 and 8 of %ptr, lanes 14-17 and 63 are
; zero, the remaining lanes are undef.
define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
  %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 8
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %val7 = load i8, i8* %ptr7
  %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
  %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
  %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
  %res7 = insertelement <64 x i8> %res3, i8 %val7, i8 7
  %res14 = insertelement <64 x i8> %res7, i8 0, i8 14
  %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
  %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
  %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
  %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
  ret <64 x i8> %res63
}

; Lanes 0, 1 and 3 are bytes 1, 2 and 4 of %ptr, lanes 14-17 and 63 are zero,
; the remaining lanes are undef.
define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
  %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
  %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
  %res14 = insertelement <64 x i8> %res3, i8 0, i8 14
  %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
  %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
  %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
  %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
  ret <64 x i8> %res63
}

;
; consecutive loads including any/all volatiles may not be combined
;

; Lanes 0, 1 and 7 are doubles 2, 3 and 9 of %ptr, with the load feeding lane 0
; marked volatile. The checks expect separate loads here, not the single
; 512-bit load expected for the non-volatile merge_8f64_f64_23uuuuu9.
define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_23uuuuu9_volatile:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; ALL-NEXT:    vbroadcastsd 72(%rdi), %ymm1
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_8f64_f64_23uuuuu9_volatile:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-AVX512F-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X86-AVX512F-NEXT:    vbroadcastsd 72(%eax), %ymm1
; X86-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
  %val0 = load volatile double, double* %ptr0
  %val1 = load double, double* %ptr1
  %val7 = load double, double* %ptr7
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
  %res7 = insertelement <8 x double> %res1, double %val7, i32 7
  ret <8 x double> %res7
}

; Lanes 0, 3, 12, 14 and 15 are the i32 elements at the same indices of %ptr,
; and all five loads are volatile. The checks expect one scalar load or insert
; per element.
define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    vpinsrd $3, 12(%rdi), %xmm0, %xmm0
; ALL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT:    vpinsrd $2, 56(%rdi), %xmm1, %xmm1
; ALL-NEXT:    vpinsrd $3, 60(%rdi), %xmm1, %xmm1
; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X86-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
; X86-AVX512F:       # %bb.0:
; X86-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX512F-NEXT:    vpinsrd $3, 12(%eax), %xmm0, %xmm0
; X86-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX512F-NEXT:    vpinsrd $2, 56(%eax), %xmm1, %xmm1
; X86-AVX512F-NEXT:    vpinsrd $3, 60(%eax), %xmm1, %xmm1
; X86-AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; X86-AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X86-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
  %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
  %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
  %val0 = load volatile i32, i32* %ptr0
  %val3 = load volatile i32, i32* %ptr3
  %valC = load volatile i32, i32* %ptrC
  %valE = load volatile i32, i32* %ptrE
  %valF = load volatile i32, i32* %ptrF
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
  %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
  %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
  ret <16 x i32> %resF
}