; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -disable-peephole | FileCheck %s --check-prefixes=ALL,AVX2

; Swap the two 128-bit halves of %a (mask 4-7,0-3); %b is not selected by the mask.
define <8 x float> @shuffle_v8f32_45670123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45670123:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_45670123:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Same half swap with both operands loaded from memory.
; Marked readonly rather than readnone because the body loads through %pa/%pb.
define <8 x float> @shuffle_v8f32_45670123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: shuffle_v8f32_45670123_mem:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 $35, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_45670123_mem:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd $78, (%rdi), %ymm0 # ymm0 = mem[2,3,0,1]
; AVX2-NEXT: retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Low half of %a combined with high half of %b (mask 0-3,12-15).
define <8 x float> @shuffle_v8f32_0123cdef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_0123cdef:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

; Low half of %a duplicated into both halves (mask 0-3,0-3).
define <8 x float> @shuffle_v8f32_01230123(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_01230123:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_01230123:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; Low-half duplication with operands loaded from memory.
; Marked readonly rather than readnone because the body loads through %pa/%pb.
define <8 x float> @shuffle_v8f32_01230123_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp {
; ALL-LABEL: shuffle_v8f32_01230123_mem:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vbroadcastf128 (%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
; ALL-NEXT: retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %shuffle
}

; High half of %a duplicated into both halves (mask 4-7,4-7).
define <8 x float> @shuffle_v8f32_45674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_45674567:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_45674567:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; High-half duplication with operands loaded from memory.
; Marked readonly rather than readnone because the body loads through %pa/%pb.
define <8 x float> @shuffle_v8f32_45674567_mem(<8 x float>* %pa, <8 x float>* %pb) nounwind uwtable readonly ssp {
; ALL-LABEL: shuffle_v8f32_45674567_mem:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vbroadcastf128 16(%rdi), %ymm0 # ymm0 = mem[0,1,0,1]
; ALL-NEXT: retq
entry:
  %a = load <8 x float>, <8 x float>* %pa
  %b = load <8 x float>, <8 x float>* %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Byte-vector form: bytes 16-31 of %a duplicated into both halves.
define <32 x i8> @shuffle_v32i8_2323(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_2323:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

; Same byte shuffle, but fed by an integer add so the shuffle follows integer code.
define <32 x i8> @shuffle_v32i8_2323_domain(<32 x i8> %a, <32 x i8> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v32i8_2323_domain:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v32i8_2323_domain:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
  ; add forces execution domain
  %a2 = add <32 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  %shuffle = shufflevector <32 x i8> %a2, <32 x i8> %b, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <32 x i8> %shuffle
}

; High half of %b followed by low half of %a (mask 6,7,0,1).
define <4 x i64> @shuffle_v4i64_6701(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v4i64_6701:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

; Same 6,7,0,1 shuffle with %a first passed through an integer add.
define <4 x i64> @shuffle_v4i64_6701_domain(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v4i64_6701_domain:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_6701_domain:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX2-NEXT: retq
entry:
  ; add forces execution domain
  %a2 = add <4 x i64> %a, <i64 1, i64 1, i64 1, i64 1>
  %shuffle = shufflevector <4 x i64> %a2, <4 x i64> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i64> %shuffle
}

; Mask u,5,u,7,12-15 with %a first passed through an integer add.
define <8 x i32> @shuffle_v8i32_u5u7cdef(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8i32_u5u7cdef:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8i32_u5u7cdef:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: retq
entry:
  ; add forces execution domain
  %a2 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %shuffle = shufflevector <8 x i32> %a2, <8 x i32> %b, <8 x i32> <i32 undef, i32 5, i32 undef, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i32> %shuffle
}

; Low half of %b followed by low half of (%a + 1) (mask 16-23,0-7).
define <16 x i16> @shuffle_v16i16_4501(<16 x i16> %a, <16 x i16> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v16i16_4501:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_4501:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpsubw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
entry:
  ; add forces execution domain
  %a2 = add <16 x i16> %a, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %a2, <16 x i16> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

; Memory-operand form of the 16-23,0-7 shuffle.
; Marked readonly rather than readnone because the body loads through %a/%b.
define <16 x i16> @shuffle_v16i16_4501_mem(<16 x i16>* %a, <16 x i16>* %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: shuffle_v16i16_4501_mem:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vperm2f128 $2, (%rsi), %ymm0, %ymm0 # ymm0 = mem[0,1],ymm0[0,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i16_4501_mem:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 $2, (%rsi), %ymm0, %ymm0 # ymm0 = mem[0,1],ymm0[0,1]
; AVX2-NEXT: retq
entry:
  %c = load <16 x i16>, <16 x i16>* %a
  %d = load <16 x i16>, <16 x i16>* %b
  %c2 = add <16 x i16> %c, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %shuffle = shufflevector <16 x i16> %c2, <16 x i16> %d, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %shuffle
}

;;;; Cases with undef indices mixed in the mask

; Mask u,u,6,7,u,9,u,11.
define <8 x float> @shuffle_v8f32_uu67u9ub(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67u9ub:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 9, i32 undef, i32 11>
  ret <8 x float> %shuffle
}

; Mask u,u,6,7,u,u,6,7 (only elements 6 and 7 of %a are demanded).
define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_uu67uu67:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_uu67uu67:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Mask u,u,6,7,u,u,10,11.
define <8 x float> @shuffle_v8f32_uu67uuab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuab:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 10, i32 11>
  ret <8 x float> %shuffle
}

; Mask u,u,6,7,u,u,14,15.
define <8 x float> @shuffle_v8f32_uu67uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67uuef:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

; Mask u,u,6,7,4,5,6,7.
define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_uu674567:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_uu674567:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Mask u,u,6,7,8,9,10,11.
define <8 x float> @shuffle_v8f32_uu6789ab(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu6789ab:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %shuffle
}

; Mask 4,5,6,7,u,u,6,7.
define <8 x float> @shuffle_v8f32_4567uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; AVX1-LABEL: shuffle_v8f32_4567uu67:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v8f32_4567uu67:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
  ret <8 x float> %shuffle
}

; Mask 4,5,6,7,u,u,14,15.
define <8 x float> @shuffle_v8f32_4567uuef(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_4567uuef:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 14, i32 15>
  ret <8 x float> %shuffle
}

;;;; Cases we must not select a lone vperm2f128 (an extra in-lane permute is needed)

; Mask u,u,6,7,u,12,u,15: element 12 lands in slot 5, so the checked sequence
; follows the vperm2f128 with a vpermilps.
define <8 x float> @shuffle_v8f32_uu67ucuf(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
; ALL-LABEL: shuffle_v8f32_uu67ucuf:
; ALL: # %bb.0: # %entry
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7]
; ALL-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 12, i32 undef, i32 15>
  ret <8 x float> %shuffle
}

;; Test zero mask generation.
;; PR22984: https://llvm.org/bugs/show_bug.cgi?id=22984
;; Prefer xor+blend (vxorps+vblendps in the checks that follow) over vperm2f128
;; because that has better performance, unless building for optsize where we
;; should still use vperm2f128.
; zz01: zero constant in the low half, low half of %a in the high half (mask 4,5,0,1).
define <4 x double> @shuffle_v4f64_zz01(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz01:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
; Same shuffle as zz01 with the optsize attribute.
define <4 x double> @shuffle_v4f64_zz01_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz01_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

; zz23: zero constant in the low half, high half of %a kept in place (mask 4,5,2,3).
define <4 x double> @shuffle_v4f64_zz23(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz23:
; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
; Same shuffle as zz23 with the optsize attribute; the checks expect vperm2f128 here.
define <4 x double> @shuffle_v4f64_zz23_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz23_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
; Same shuffle as zz23, annotated with !prof !14 (function_entry_count 0) instead of optsize.
define <4 x double> @shuffle_v4f64_zz23_pgso(<4 x double> %a) !prof !14 {
; ALL-LABEL: shuffle_v4f64_zz23_pgso:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %s
}
; zz45: the zero constant is the first operand here; result is zero low half,
; low half of %a in the high half (mask 0,1,4,5).
define <4 x double> @shuffle_v4f64_zz45(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz45:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
; Same shuffle as zz45 with the optsize attribute.
define <4 x double> @shuffle_v4f64_zz45_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz45_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

; zz67: zero low half, high half of %a kept in place (mask 0,1,6,7 with zero as first operand).
define <4 x double> @shuffle_v4f64_zz67(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_zz67:
; ALL: # %bb.0:
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
; Same shuffle as zz67 with the optsize attribute; the checks expect vperm2f128 here.
define <4 x double> @shuffle_v4f64_zz67_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_zz67_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
; Same shuffle as zz67, annotated with !prof !14 (function_entry_count 0) instead of optsize.
define <4 x double> @shuffle_v4f64_zz67_pgso(<4 x double> %a) !prof !14 {
; ALL-LABEL: shuffle_v4f64_zz67_pgso:
; ALL: # %bb.0:
; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[2,3]
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x double> %s
}
; 01zz: low half of %a kept, high half taken from the zero constant (mask 0,1,4,5).
define <4 x double> @shuffle_v4f64_01zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_01zz:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}
; Same shuffle as 01zz with the optsize attribute; identical expected output.
define <4 x double> @shuffle_v4f64_01zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_01zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %s
}

; 23zz: high half of %a moved to the low half, zero constant in the high half (mask 2,3,4,5).
define <4 x double> @shuffle_v4f64_23zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_23zz:
; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}
; Same shuffle as 23zz with the optsize attribute; identical expected output.
define <4 x double> @shuffle_v4f64_23zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_23zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> %a, <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x double> %s
}

; 45zz: operands swapped relative to 01zz (zero constant first); mask 4,5,0,1
; again yields the low half of %a followed by zeros.
define <4 x double> @shuffle_v4f64_45zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_45zz:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}
; Same shuffle as 45zz with the optsize attribute; identical expected output.
define <4 x double> @shuffle_v4f64_45zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_45zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x double> %s
}

; 67zz: operands swapped relative to 23zz (zero constant first); mask 6,7,0,1
; yields the high half of %a followed by zeros.
define <4 x double> @shuffle_v4f64_67zz(<4 x double> %a) {
; ALL-LABEL: shuffle_v4f64_67zz:
; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}
; Same shuffle as 67zz with the optsize attribute; identical expected output.
define <4 x double> @shuffle_v4f64_67zz_optsize(<4 x double> %a) optsize {
; ALL-LABEL: shuffle_v4f64_67zz_optsize:
; ALL: # %bb.0:
; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
; ALL-NEXT: retq
  %s = shufflevector <4 x double> <double 0.0, double 0.0, double undef, double undef>, <4 x double> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x double> %s
}

;; With AVX2 select the integer version of the instruction. Use an add to force the domain selection.
; Integer 67zz: high half of %a followed by zeros (mask 6,7,0,1 with the zero
; constant as first operand), then added to %b.
define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
; AVX1-LABEL: shuffle_v4i64_67zz:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_67zz:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
  %s = shufflevector <4 x i64> <i64 0, i64 0, i64 undef, i64 undef>, <4 x i64> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  %c = add <4 x i64> %b, %s
  ret <4 x i64> %c
}

;;; Memory folding cases
;;; The functions in this section load through a pointer argument, so they are
;;; marked readonly rather than readnone.

; High half of the loaded %a followed by low half of %b, then fadd 1.0.
define <4 x double> @ld0_hi0_lo1_4f64(<4 x double> * %pa, <4 x double> %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld0_hi0_lo1_4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_4f64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %a = load <4 x double>, <4 x double> * %pa
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

; High half of %a followed by high half of the loaded %b, then fadd 1.0.
define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, <4 x double> * %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld1_hi0_hi1_4f64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_4f64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %b = load <4 x double>, <4 x double> * %pb
  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = fadd <4 x double> %shuffle, <double 1.0, double 1.0, double 1.0, double 1.0>
  ret <4 x double> %res
}

; <8 x float> form of ld0_hi0_lo1 (mask 4-7,8-11).
define <8 x float> @ld0_hi0_lo1_8f32(<8 x float> * %pa, <8 x float> %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld0_hi0_lo1_8f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_8f32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %a = load <8 x float>, <8 x float> * %pa
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

; <8 x float> form of ld1_hi0_hi1 (mask 4-7,12-15).
define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, <8 x float> * %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld1_hi0_hi1_8f32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_8f32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %b = load <8 x float>, <8 x float> * %pb
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = fadd <8 x float> %shuffle, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
  ret <8 x float> %res
}

; <4 x i64> form of ld0_hi0_lo1, followed by an integer add of <1,2,3,4>.
define <4 x i64> @ld0_hi0_lo1_4i64(<4 x i64> * %pa, <4 x i64> %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld0_hi0_lo1_4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_4i64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %a = load <4 x i64>, <4 x i64> * %pa
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

; <4 x i64> form of ld1_hi0_hi1, followed by an integer add of <1,2,3,4>.
define <4 x i64> @ld1_hi0_hi1_4i64(<4 x i64> %a, <4 x i64> * %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld1_hi0_hi1_4i64:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_4i64:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %b = load <4 x i64>, <4 x i64> * %pb
  %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %res = add <4 x i64> %shuffle, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %res
}

; <8 x i32> form of ld0_hi0_lo1, followed by an integer add of <1,2,3,4,1,2,3,4>.
define <8 x i32> @ld0_hi0_lo1_8i32(<8 x i32> * %pa, <8 x i32> %b) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld0_hi0_lo1_8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddd 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld0_hi0_lo1_8i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1]
; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %a = load <8 x i32>, <8 x i32> * %pa
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

; <8 x i32> form of ld1_hi0_hi1, followed by an integer add of <1,2,3,4,1,2,3,4>.
define <8 x i32> @ld1_hi0_hi1_8i32(<8 x i32> %a, <8 x i32> * %pb) nounwind uwtable readonly ssp {
; AVX1-LABEL: ld1_hi0_hi1_8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,3,4]
; AVX1-NEXT: vpaddd 16(%rdi), %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ld1_hi0_hi1_8i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vperm2i128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3]
; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
entry:
  %b = load <8 x i32>, <8 x i32> * %pb
  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %res = add <8 x i32> %shuffle, <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
  ret <8 x i32> %res
}

; PR50053: loads one <4 x i64> and the two following <2 x i64> vectors from %1,
; widens the narrow vectors, and stores two 256-bit shuffles to %0 and %0+1.
define void @PR50053(<4 x i64>* nocapture %0, <4 x i64>* nocapture readonly %1) {
; ALL-LABEL: PR50053:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps (%rsi), %ymm0
; ALL-NEXT: vmovaps 32(%rsi), %xmm1
; ALL-NEXT: vmovaps 48(%rsi), %xmm2
; ALL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[0,1],ymm1[0,1]
; ALL-NEXT: vmovaps %ymm1, (%rdi)
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; ALL-NEXT: vmovaps %ymm0, 32(%rdi)
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
  %3 = load <4 x i64>, <4 x i64>* %1, align 32
  %4 = getelementptr inbounds <4 x i64>, <4 x i64>* %1, i64 1
  %5 = bitcast <4 x i64>* %4 to <2 x i64>*
  %6 = load <2 x i64>, <2 x i64>* %5, align 16
  %7 = getelementptr inbounds <2 x i64>, <2 x i64>* %5, i64 1
  %8 = load <2 x i64>, <2 x i64>* %7, align 16
  %9 = shufflevector <2 x i64> %6, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %10 = shufflevector <4 x i64> %3, <4 x i64> %9, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  store <4 x i64> %10, <4 x i64>* %0, align 32
  %11 = shufflevector <2 x i64> %8, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %12 = shufflevector <4 x i64> %11, <4 x i64> %3, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  %13 = getelementptr inbounds <4 x i64>, <4 x i64>* %0, i64 1
  store <4 x i64> %12, <4 x i64>* %13, align 32
  ret void
}

; Module-level profile summary; !14 is the entry count referenced by the *_pgso functions.
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"ProfileSummary", !1}
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
!2 = !{!"ProfileFormat", !"InstrProf"}
!3 = !{!"TotalCount", i64 10000}
!4 = !{!"MaxCount", i64 10}
!5 = !{!"MaxInternalCount", i64 1}
!6 = !{!"MaxFunctionCount", i64 1000}
!7 = !{!"NumCounts", i64 3}
!8 = !{!"NumFunctions", i64 3}
!9 = !{!"DetailedSummary", !10}
!10 = !{!11, !12, !13}
!11 = !{i32 10000, i64 100, i32 1}
!12 = !{i32 999000, i64 100, i32 1}
!13 = !{i32 999999, i64 1, i32 2}
!14 = !{!"function_entry_count", i64 0}