; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
;
; Just one 32-bit run to make sure we do reasonable things.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32-AVX512F

; Every function below assembles a 512-bit vector out of smaller loads taken
; at fixed element offsets from %ptr, optionally mixed with undef and zero
; lanes, and pins the instruction sequence llc is expected to emit for it.
; The tail of each function name sketches the lane layout: a digit
; (hexadecimal in the 16-lane tests) is the element index loaded from %ptr,
; 'u' marks an undef lane and 'z' marks a zero lane.

; <2 x double> elements 1 and 2 fill the low 256 bits, the third 128-bit chunk
; is undef and element 4 fills the top chunk. Expected: one 256-bit load of
; elements 1-2 plus a separate 128-bit insert of element 4.
define <8 x double> @merge_8f64_2f64_12u4(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_2f64_12u4:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 16(%rdi), %ymm0
; ALL-NEXT:    vinsertf128 $1, 64(%rdi), %ymm0, %ymm1
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8f64_2f64_12u4:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups 16(%eax), %ymm0
; X32-AVX512F-NEXT:    vinsertf128 $1, 64(%eax), %ymm0, %ymm1
; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1
  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
  %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 4
  %val0 = load <2 x double>, <2 x double>* %ptr0
  %val1 = load <2 x double>, <2 x double>* %ptr1
  %val3 = load <2 x double>, <2 x double>* %ptr3
  %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res23 = shufflevector <2 x double> undef, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

; Same shape using elements 2, 3 and 5, but the third 128-bit chunk is zero
; instead of undef, so the upper half is expected to be built on a zeroed
; register.
define <8 x double> @merge_8f64_2f64_23z5(<2 x double>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_2f64_23z5:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 32(%rdi), %ymm0
; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT:    vinsertf128 $1, 80(%rdi), %ymm1, %ymm1
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8f64_2f64_23z5:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups 32(%eax), %ymm0
; X32-AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT:    vinsertf128 $1, 80(%eax), %ymm1, %ymm1
; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2
  %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 3
  %ptr3 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 5
  %val0 = load <2 x double>, <2 x double>* %ptr0
  %val1 = load <2 x double>, <2 x double>* %ptr1
  %val3 = load <2 x double>, <2 x double>* %ptr3
  %res01 = shufflevector <2 x double> %val0, <2 x double> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res23 = shufflevector <2 x double> zeroinitializer, <2 x double> %val3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = shufflevector <4 x double> %res01, <4 x double> %res23, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

; Low 256 bits are zero, high 256 bits are <4 x double> element 2 (byte
; offset 64). Expected: a zeroed register with the load folded into the insert.
define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_4f64_z2:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vinsertf64x4 $1, 64(%rdi), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8f64_4f64_z2:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-AVX512F-NEXT:    vinsertf64x4 $1, 64(%eax), %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr1 = getelementptr inbounds <4 x double>, <4 x double>* %ptr, i64 2
  %val1 = load <4 x double>, <4 x double>* %ptr1
  %res = shufflevector <4 x double> zeroinitializer, <4 x double> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x double> %res
}

; Scalar doubles 2, 3 and 9 land in lanes 0, 1 and 7; every other lane is
; undef. Expected: a single 512-bit load starting at byte offset 16.
define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_23uuuuu9:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 16(%rdi), %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups 16(%eax), %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %val7 = load double, double* %ptr7
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
  %res7 = insertelement <8 x double> %res1, double %val7, i32 7
  ret <8 x double> %res7
}

; Doubles 1 and 2 in lanes 0-1, zeros in lanes 2, 3, 6 and 7, lanes 4-5 left
; undef. Expected: nothing but a 128-bit load from byte offset 8.
define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_12zzuuzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 8(%rdi), %xmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_12zzuuzz:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups 8(%eax), %xmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 2
  %val0 = load double, double* %ptr0
  %val1 = load double, double* %ptr1
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
  %res2 = insertelement <8 x double> %res1, double 0.0, i32 2
  %res3 = insertelement <8 x double> %res2, double 0.0, i32 3
  %res6 = insertelement <8 x double> %res3, double 0.0, i32 6
  %res7 = insertelement <8 x double> %res6, double 0.0, i32 7
  ret <8 x double> %res7
}

; Doubles 1, 3, 5 and 8 in lanes 0, 2, 4 and 7 with an explicit zero in lane 5.
; Expected: one 512-bit load from byte offset 8, then an AND against a
; constant-pool mask to clear the zero lane.
define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
; X32-AVX512F-NEXT:    vpandq {{\.LCPI.*}}, %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
  %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr4 = getelementptr inbounds double, double* %ptr, i64 5
  %ptr7 = getelementptr inbounds double, double* %ptr, i64 8
  %val0 = load double, double* %ptr0
  %val2 = load double, double* %ptr2
  %val4 = load double, double* %ptr4
  %val7 = load double, double* %ptr7
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res2 = insertelement <8 x double> %res0, double %val2, i32 2
  %res4 = insertelement <8 x double> %res2, double %val4, i32 4
  %res5 = insertelement <8 x double> %res4, double 0.0, i32 5
  %res7 = insertelement <8 x double> %res5, double %val7, i32 7
  ret <8 x double> %res7
}

; Integer counterpart of the zero/load concatenation: low 256 bits zero, high
; 256 bits are <4 x i64> element 3 (byte offset 96).
define <8 x i64> @merge_8i64_4i64_z3(<4 x i64>* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_4i64_z3:
; ALL:       # %bb.0:
; ALL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; ALL-NEXT:    vinsertf64x4 $1, 96(%rdi), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8i64_4i64_z3:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; X32-AVX512F-NEXT:    vinsertf64x4 $1, 96(%eax), %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr1 = getelementptr inbounds <4 x i64>, <4 x i64>* %ptr, i64 3
  %val1 = load <4 x i64>, <4 x i64>* %ptr1
  %res = shufflevector <4 x i64> zeroinitializer, <4 x i64> %val1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i64> %res
}

; i64 elements 5 and 6 in lanes 0-1, zeros in lanes 2-3, element 9 in lane 4,
; lane 5 undef, zeros in lanes 6-7. Expected: a 128-bit load for the low pair,
; a 64-bit scalar load for lane 4, and one 256-bit insert joining the halves.
define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_i64_56zz9uzz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 40(%rdi), %xmm0
; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8i64_i64_56zz9uzz:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups 40(%eax), %xmm0
; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 5
  %ptr1 = getelementptr inbounds i64, i64* %ptr, i64 6
  %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 9
  %val0 = load i64, i64* %ptr0
  %val1 = load i64, i64* %ptr1
  %val4 = load i64, i64* %ptr4
  %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
  %res1 = insertelement <8 x i64> %res0, i64 %val1, i32 1
  %res2 = insertelement <8 x i64> %res1, i64 0, i32 2
  %res3 = insertelement <8 x i64> %res2, i64 0, i32 3
  %res4 = insertelement <8 x i64> %res3, i64 %val4, i32 4
  %res6 = insertelement <8 x i64> %res4, i64 0, i32 6
  %res7 = insertelement <8 x i64> %res6, i64 0, i32 7
  ret <8 x i64> %res7
}

; i64 version of the 1u3u5zu8 layout: elements 1, 3, 5 and 8 in lanes 0, 2, 4
; and 7, zero in lane 5. Expected: 512-bit load plus a constant-pool AND mask.
define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
; ALL-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
; X32-AVX512F-NEXT:    vpandq {{\.LCPI.*}}, %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
  %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
  %ptr4 = getelementptr inbounds i64, i64* %ptr, i64 5
  %ptr7 = getelementptr inbounds i64, i64* %ptr, i64 8
  %val0 = load i64, i64* %ptr0
  %val2 = load i64, i64* %ptr2
  %val4 = load i64, i64* %ptr4
  %val7 = load i64, i64* %ptr7
  %res0 = insertelement <8 x i64> undef, i64 %val0, i32 0
  %res2 = insertelement <8 x i64> %res0, i64 %val2, i32 2
  %res4 = insertelement <8 x i64> %res2, i64 %val4, i32 4
  %res5 = insertelement <8 x i64> %res4, i64 0, i32 5
  %res7 = insertelement <8 x i64> %res5, i64 %val7, i32 7
  ret <8 x i64> %res7
}

; Floats 8 and 9 in lanes 0-1, zeros in lanes 2, 3, 4 and 15, the rest undef.
; Expected: a single 64-bit vmovsd load.
define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 8
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 9
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res1 = insertelement <16 x float> %res0, float %val1, i32 1
  %res2 = insertelement <16 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <16 x float> %res2, float 0.0, i32 3
  %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
  %resF = insertelement <16 x float> %res4, float 0.0, i32 15
  ret <16 x float> %resF
}

; Floats 4, 5 and 7 in lanes 0, 1 and 3; all other lanes undef.
; Expected: a single 128-bit load from byte offset 16.
define <16 x float> @merge_16f32_f32_45u7uuuuuuuuuuuu(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 16(%rdi), %xmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_16f32_f32_45u7uuuuuuuuuuuu:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups 16(%eax), %xmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
  %ptr1 = getelementptr inbounds float, float* %ptr, i64 5
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 7
  %val0 = load float, float* %ptr0
  %val1 = load float, float* %ptr1
  %val3 = load float, float* %ptr3
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res1 = insertelement <16 x float> %res0, float %val1, i32 1
  %res3 = insertelement <16 x float> %res1, float %val3, i32 3
  ret <16 x float> %res3
}

; Floats 0, 3, 12, 14 and 15 placed in the lanes of the same index; the other
; lanes are undef. Expected: a single 512-bit load from %ptr.
define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups (%rdi), %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_16f32_f32_0uu3uuuuuuuuCuEF:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups (%eax), %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
  %ptrC = getelementptr inbounds float, float* %ptr, i64 12
  %ptrE = getelementptr inbounds float, float* %ptr, i64 14
  %ptrF = getelementptr inbounds float, float* %ptr, i64 15
  %val0 = load float, float* %ptr0
  %val3 = load float, float* %ptr3
  %valC = load float, float* %ptrC
  %valE = load float, float* %ptrE
  %valF = load float, float* %ptrF
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res3 = insertelement <16 x float> %res0, float %val3, i32 3
  %resC = insertelement <16 x float> %res3, float %valC, i32 12
  %resE = insertelement <16 x float> %resC, float %valE, i32 14
  %resF = insertelement <16 x float> %resE, float %valF, i32 15
  ret <16 x float> %resF
}

; Same loads as the previous test with additional zeros in lanes 4, 5 and 13.
; Expected: a 512-bit load combined with a zero vector through a two-source
; permute (vpermi2ps) whose index vector is checked explicitly.
define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups (%rdi), %zmm1
; ALL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; ALL-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; ALL-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups (%eax), %zmm1
; X32-AVX512F-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; X32-AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
; X32-AVX512F-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
  %ptr3 = getelementptr inbounds float, float* %ptr, i64 3
  %ptrC = getelementptr inbounds float, float* %ptr, i64 12
  %ptrE = getelementptr inbounds float, float* %ptr, i64 14
  %ptrF = getelementptr inbounds float, float* %ptr, i64 15
  %val0 = load float, float* %ptr0
  %val3 = load float, float* %ptr3
  %valC = load float, float* %ptrC
  %valE = load float, float* %ptrE
  %valF = load float, float* %ptrF
  %res0 = insertelement <16 x float> undef, float %val0, i32 0
  %res3 = insertelement <16 x float> %res0, float %val3, i32 3
  %res4 = insertelement <16 x float> %res3, float 0.0, i32 4
  %res5 = insertelement <16 x float> %res4, float 0.0, i32 5
  %resC = insertelement <16 x float> %res5, float %valC, i32 12
  %resD = insertelement <16 x float> %resC, float 0.0, i32 13
  %resE = insertelement <16 x float> %resD, float %valE, i32 14
  %resF = insertelement <16 x float> %resE, float %valF, i32 15
  ret <16 x float> %resF
}

; i32 elements 1 and 2 in lanes 0-1, zeros in lanes 2, 3, 4 and 15, the rest
; undef. Expected: a single 64-bit vmovsd load.
define <16 x i32> @merge_16i32_i32_12zzzuuuuuuuuuuuz(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_12zzzuuuuuuuuuuuz:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 1
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 2
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
  %res2 = insertelement <16 x i32> %res1, i32 0, i32 2
  %res3 = insertelement <16 x i32> %res2, i32 0, i32 3
  %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
  %resF = insertelement <16 x i32> %res4, i32 0, i32 15
  ret <16 x i32> %resF
}

; i32 elements 2, 3 and 5 in lanes 0, 1 and 3; all other lanes undef.
; Expected: a single 128-bit load from byte offset 8.
define <16 x i32> @merge_16i32_i32_23u5uuuuuuuuuuuu(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups 8(%rdi), %xmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_23u5uuuuuuuuuuuu:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups 8(%eax), %xmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 2
  %ptr1 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 5
  %val0 = load i32, i32* %ptr0
  %val1 = load i32, i32* %ptr1
  %val3 = load i32, i32* %ptr3
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res1 = insertelement <16 x i32> %res0, i32 %val1, i32 1
  %res3 = insertelement <16 x i32> %res1, i32 %val3, i32 3
  ret <16 x i32> %res3
}

; i32 elements 0, 3, 12, 14 and 15 placed in the lanes of the same index; the
; other lanes are undef. Expected: a single 512-bit load from %ptr.
define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovups (%rdi), %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovups (%eax), %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
  %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
  %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
  %val0 = load i32, i32* %ptr0
  %val3 = load i32, i32* %ptr3
  %valC = load i32, i32* %ptrC
  %valE = load i32, i32* %ptrE
  %valF = load i32, i32* %ptrF
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
  %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
  %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
  ret <16 x i32> %resF
}

; i32 elements 0, 3, 12, 14 and 15 in the lanes of the same index, with
; explicit zeros in lanes 4, 5 and 13. Expected: a 512-bit load followed by an
; AND against a constant-pool mask that clears the zero lanes.
define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovdqu64 (%rdi), %zmm0
; ALL-NEXT:    vpandd {{.*}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0
; X32-AVX512F-NEXT:    vpandd {{\.LCPI.*}}, %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
  %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
  %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
  %val0 = load i32, i32* %ptr0
  %val3 = load i32, i32* %ptr3
  %valC = load i32, i32* %ptrC
  %valE = load i32, i32* %ptrE
  %valF = load i32, i32* %ptrF
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
  %res4 = insertelement <16 x i32> %res3, i32 0, i32 4
  %res5 = insertelement <16 x i32> %res4, i32 0, i32 5
  %resC = insertelement <16 x i32> %res5, i32 %valC, i32 12
  %resD = insertelement <16 x i32> %resC, i32 0, i32 13
  %resE = insertelement <16 x i32> %resD, i32 %valE, i32 14
  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
  ret <16 x i32> %resF
}

; i16 elements 1, 2 and 4 in lanes 0, 1 and 3, zeros in lanes 30 and 31, the
; rest undef. Expected: a 64-bit vmovsd load. The two runs without avx512bw
; additionally expect a vmovaps %ymm0, %ymm0; the avx512bw run does not, which
; is why this test uses per-feature check prefixes.
define <32 x i16> @merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz(i16* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT:    vmovaps %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_32i16_i16_12u4uuuuuuuuuuuuuuuuuuuuuuuuuuzz:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 1
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 2
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 4
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %val3 = load i16, i16* %ptr3
  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
  %res30 = insertelement <32 x i16> %res3, i16 0, i16 30
  %res31 = insertelement <32 x i16> %res30, i16 0, i16 31
  ret <32 x i16> %res31
}

; i16 elements 4, 5 and 7 in lanes 0, 1 and 3 with every other lane undef.
; Expected in all three runs: just a 64-bit vmovsd load.
define <32 x i16> @merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_32i16_i16_45u7uuuuuuuuuuuuuuuuuuuuuuuuuuuu:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
  %ptr3 = getelementptr inbounds i16, i16* %ptr, i64 7
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %val3 = load i16, i16* %ptr3
  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <32 x i16> %res1, i16 %val3, i16 3
  ret <32 x i16> %res3
}

; i16 elements 2 and 3 in lanes 0-1, zeros in lanes 3 and 14-17, the rest
; undef. Expected: a 32-bit vmovss load; the runs without avx512bw also expect
; a trailing vmovaps %ymm0, %ymm0.
define <32 x i16> @merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu(i16* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-NEXT:    vmovaps %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_32i16_i16_23uzuuuuuuuuuuzzzzuuuuuuuuuuuuuu:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 2
  %ptr1 = getelementptr inbounds i16, i16* %ptr, i64 3
  %val0 = load i16, i16* %ptr0
  %val1 = load i16, i16* %ptr1
  %res0 = insertelement <32 x i16> undef, i16 %val0, i16 0
  %res1 = insertelement <32 x i16> %res0, i16 %val1, i16 1
  %res3 = insertelement <32 x i16> %res1, i16 0, i16 3
  %resE = insertelement <32 x i16> %res3, i16 0, i16 14
  %resF = insertelement <32 x i16> %resE, i16 0, i16 15
  %resG = insertelement <32 x i16> %resF, i16 0, i16 16
  %resH = insertelement <32 x i16> %resG, i16 0, i16 17
  ret <32 x i16> %resH
}

; i8 elements 1, 2, 4 and 8 in lanes 0, 1, 3 and 7, zeros in lanes 14-17 and
; 63, the rest undef. Expected: a 64-bit vmovsd load; the runs without
; avx512bw also expect a trailing vmovaps %ymm0, %ymm0.
define <64 x i8> @merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512F-NEXT:    vmovaps %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuu8uuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
  %ptr7 = getelementptr inbounds i8, i8* %ptr, i64 8
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %val7 = load i8, i8* %ptr7
  %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
  %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
  %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
  %res7 = insertelement <64 x i8> %res3, i8 %val7, i8 7
  %res14 = insertelement <64 x i8> %res7, i8 0, i8 14
  %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
  %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
  %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
  %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
  ret <64 x i8> %res63
}

; i8 elements 1, 2 and 4 in lanes 0, 1 and 3, zeros in lanes 14-17 and 63, the
; rest undef. Expected: a 32-bit vmovss load; the runs without avx512bw also
; expect a trailing vmovaps %ymm0, %ymm0.
define <64 x i8> @merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz(i8* %ptr) nounwind uwtable noinline ssp {
; AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512F-NEXT:    vmovaps %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BW-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_64i8_i8_12u4uuuuuuuuuuzzzzuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuz:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i8, i8* %ptr, i64 1
  %ptr1 = getelementptr inbounds i8, i8* %ptr, i64 2
  %ptr3 = getelementptr inbounds i8, i8* %ptr, i64 4
  %val0 = load i8, i8* %ptr0
  %val1 = load i8, i8* %ptr1
  %val3 = load i8, i8* %ptr3
  %res0 = insertelement <64 x i8> undef, i8 %val0, i8 0
  %res1 = insertelement <64 x i8> %res0, i8 %val1, i8 1
  %res3 = insertelement <64 x i8> %res1, i8 %val3, i8 3
  %res14 = insertelement <64 x i8> %res3, i8 0, i8 14
  %res15 = insertelement <64 x i8> %res14, i8 0, i8 15
  %res16 = insertelement <64 x i8> %res15, i8 0, i8 16
  %res17 = insertelement <64 x i8> %res16, i8 0, i8 17
  %res63 = insertelement <64 x i8> %res17, i8 0, i8 63
  ret <64 x i8> %res63
}

;
; Consecutive loads must not be combined into a wider load when any (or all)
; of them are volatile.
;

; Same lane layout as merge_8f64_f64_23uuuuu9, but the load feeding lane 0 is
; volatile. Expected: separate narrow loads (vmovsd, vmovhps, vbroadcastsd)
; instead of one 512-bit load.
define <8 x double> @merge_8f64_f64_23uuuuu9_volatile(double* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_8f64_f64_23uuuuu9_volatile:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; ALL-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; ALL-NEXT:    vbroadcastsd 72(%rdi), %ymm1
; ALL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9_volatile:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X32-AVX512F-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; X32-AVX512F-NEXT:    vbroadcastsd 72(%eax), %ymm1
; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
  %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
  %ptr7 = getelementptr inbounds double, double* %ptr, i64 9
  %val0 = load volatile double, double* %ptr0
  %val1 = load double, double* %ptr1
  %val7 = load double, double* %ptr7
  %res0 = insertelement <8 x double> undef, double %val0, i32 0
  %res1 = insertelement <8 x double> %res0, double %val1, i32 1
  %res7 = insertelement <8 x double> %res1, double %val7, i32 7
  ret <8 x double> %res7
}

; Same lane layout as merge_16i32_i32_0uu3uuuuuuuuCuEF, but all five loads are
; volatile. Expected: each element is loaded on its own (vmovd / vpinsrd) and
; the pieces are then stitched together with vector inserts.
define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile(i32* %ptr) nounwind uwtable noinline ssp {
; ALL-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
; ALL:       # %bb.0:
; ALL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; ALL-NEXT:    vpinsrd $3, 12(%rdi), %xmm0, %xmm0
; ALL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; ALL-NEXT:    vpinsrd $2, 56(%rdi), %xmm1, %xmm1
; ALL-NEXT:    vpinsrd $3, 60(%rdi), %xmm1, %xmm1
; ALL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; ALL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT:    retq
;
; X32-AVX512F-LABEL: merge_16i32_i32_0uu3uuuuuuuuCuEF_volatile:
; X32-AVX512F:       # %bb.0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-AVX512F-NEXT:    vpinsrd $3, 12(%eax), %xmm0, %xmm0
; X32-AVX512F-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-AVX512F-NEXT:    vpinsrd $2, 56(%eax), %xmm1, %xmm1
; X32-AVX512F-NEXT:    vpinsrd $3, 60(%eax), %xmm1, %xmm1
; X32-AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
; X32-AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
  %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
  %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
  %ptrC = getelementptr inbounds i32, i32* %ptr, i64 12
  %ptrE = getelementptr inbounds i32, i32* %ptr, i64 14
  %ptrF = getelementptr inbounds i32, i32* %ptr, i64 15
  %val0 = load volatile i32, i32* %ptr0
  %val3 = load volatile i32, i32* %ptr3
  %valC = load volatile i32, i32* %ptrC
  %valE = load volatile i32, i32* %ptrE
  %valF = load volatile i32, i32* %ptrF
  %res0 = insertelement <16 x i32> undef, i32 %val0, i32 0
  %res3 = insertelement <16 x i32> %res0, i32 %val3, i32 3
  %resC = insertelement <16 x i32> %res3, i32 %valC, i32 12
  %resE = insertelement <16 x i32> %resC, i32 %valE, i32 14
  %resF = insertelement <16 x i32> %resE, i32 %valF, i32 15
  ret <16 x i32> %resF
}