1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86 3; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64 4 5; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vl-builtins.c 6 7define <4 x float> @test_mm_mask_cvtepi32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) { 8; X86-LABEL: test_mm_mask_cvtepi32_ps: 9; X86: # %bb.0: # %entry 10; X86-NEXT: movb {{[0-9]+}}(%esp), %al 11; X86-NEXT: kmovw %eax, %k1 12; X86-NEXT: vcvtdq2ps %xmm1, %xmm0 {%k1} 13; X86-NEXT: retl 14; 15; X64-LABEL: test_mm_mask_cvtepi32_ps: 16; X64: # %bb.0: # %entry 17; X64-NEXT: kmovw %edi, %k1 18; X64-NEXT: vcvtdq2ps %xmm1, %xmm0 {%k1} 19; X64-NEXT: retq 20entry: 21 %0 = bitcast <2 x i64> %__A to <4 x i32> 22 %conv.i.i = sitofp <4 x i32> %0 to <4 x float> 23 %1 = bitcast i8 %__U to <8 x i1> 24 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 25 %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W 26 ret <4 x float> %2 27} 28 29define <4 x float> @test_mm_maskz_cvtepi32_ps(i8 zeroext %__U, <2 x i64> %__A) { 30; X86-LABEL: test_mm_maskz_cvtepi32_ps: 31; X86: # %bb.0: # %entry 32; X86-NEXT: movb {{[0-9]+}}(%esp), %al 33; X86-NEXT: kmovw %eax, %k1 34; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 {%k1} {z} 35; X86-NEXT: retl 36; 37; X64-LABEL: test_mm_maskz_cvtepi32_ps: 38; X64: # %bb.0: # %entry 39; X64-NEXT: kmovw %edi, %k1 40; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 {%k1} {z} 41; X64-NEXT: retq 42entry: 43 %0 = bitcast <2 x i64> %__A to <4 x i32> 44 %conv.i.i = sitofp <4 x i32> %0 to <4 x float> 45 %1 = bitcast i8 %__U to <8 x i1> 46 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 47 %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 
x float> zeroinitializer 48 ret <4 x float> %2 49} 50 51define <8 x float> @test_mm256_mask_cvtepi32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) { 52; X86-LABEL: test_mm256_mask_cvtepi32_ps: 53; X86: # %bb.0: # %entry 54; X86-NEXT: movb {{[0-9]+}}(%esp), %al 55; X86-NEXT: kmovw %eax, %k1 56; X86-NEXT: vcvtdq2ps %ymm1, %ymm0 {%k1} 57; X86-NEXT: retl 58; 59; X64-LABEL: test_mm256_mask_cvtepi32_ps: 60; X64: # %bb.0: # %entry 61; X64-NEXT: kmovw %edi, %k1 62; X64-NEXT: vcvtdq2ps %ymm1, %ymm0 {%k1} 63; X64-NEXT: retq 64entry: 65 %0 = bitcast <4 x i64> %__A to <8 x i32> 66 %conv.i.i = sitofp <8 x i32> %0 to <8 x float> 67 %1 = bitcast i8 %__U to <8 x i1> 68 %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W 69 ret <8 x float> %2 70} 71 72define <8 x float> @test_mm256_maskz_cvtepi32_ps(i8 zeroext %__U, <4 x i64> %__A) { 73; X86-LABEL: test_mm256_maskz_cvtepi32_ps: 74; X86: # %bb.0: # %entry 75; X86-NEXT: movb {{[0-9]+}}(%esp), %al 76; X86-NEXT: kmovw %eax, %k1 77; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 {%k1} {z} 78; X86-NEXT: retl 79; 80; X64-LABEL: test_mm256_maskz_cvtepi32_ps: 81; X64: # %bb.0: # %entry 82; X64-NEXT: kmovw %edi, %k1 83; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 {%k1} {z} 84; X64-NEXT: retq 85entry: 86 %0 = bitcast <4 x i64> %__A to <8 x i32> 87 %conv.i.i = sitofp <8 x i32> %0 to <8 x float> 88 %1 = bitcast i8 %__U to <8 x i1> 89 %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer 90 ret <8 x float> %2 91} 92 93define <2 x i64> @test_mm_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) { 94; X86-LABEL: test_mm_mask_cvtpd_epi32: 95; X86: # %bb.0: # %entry 96; X86-NEXT: movb {{[0-9]+}}(%esp), %al 97; X86-NEXT: kmovw %eax, %k1 98; X86-NEXT: vcvtpd2dq %xmm1, %xmm0 {%k1} 99; X86-NEXT: retl 100; 101; X64-LABEL: test_mm_mask_cvtpd_epi32: 102; X64: # %bb.0: # %entry 103; X64-NEXT: kmovw %edi, %k1 104; X64-NEXT: vcvtpd2dq %xmm1, %xmm0 {%k1} 105; X64-NEXT: retq 106entry: 107 %0 = bitcast <2 x i64> %__W to <4 x 
i32> 108 %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8 109 %2 = bitcast <4 x i32> %1 to <2 x i64> 110 ret <2 x i64> %2 111} 112 113define <2 x i64> @test_mm_maskz_cvtpd_epi32(i8 zeroext %__U, <2 x double> %__A) { 114; X86-LABEL: test_mm_maskz_cvtpd_epi32: 115; X86: # %bb.0: # %entry 116; X86-NEXT: movb {{[0-9]+}}(%esp), %al 117; X86-NEXT: kmovw %eax, %k1 118; X86-NEXT: vcvtpd2dq %xmm0, %xmm0 {%k1} {z} 119; X86-NEXT: retl 120; 121; X64-LABEL: test_mm_maskz_cvtpd_epi32: 122; X64: # %bb.0: # %entry 123; X64-NEXT: kmovw %edi, %k1 124; X64-NEXT: vcvtpd2dq %xmm0, %xmm0 {%k1} {z} 125; X64-NEXT: retq 126entry: 127 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8 128 %1 = bitcast <4 x i32> %0 to <2 x i64> 129 ret <2 x i64> %1 130} 131 132define <2 x i64> @test_mm256_mask_cvtpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) { 133; X86-LABEL: test_mm256_mask_cvtpd_epi32: 134; X86: # %bb.0: # %entry 135; X86-NEXT: movb {{[0-9]+}}(%esp), %al 136; X86-NEXT: kmovw %eax, %k1 137; X86-NEXT: vcvtpd2dq %ymm1, %xmm0 {%k1} 138; X86-NEXT: vzeroupper 139; X86-NEXT: retl 140; 141; X64-LABEL: test_mm256_mask_cvtpd_epi32: 142; X64: # %bb.0: # %entry 143; X64-NEXT: kmovw %edi, %k1 144; X64-NEXT: vcvtpd2dq %ymm1, %xmm0 {%k1} 145; X64-NEXT: vzeroupper 146; X64-NEXT: retq 147entry: 148 %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8 149 %1 = bitcast <2 x i64> %__W to <4 x i32> 150 %2 = bitcast i8 %__U to <8 x i1> 151 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 152 %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1 153 %4 = bitcast <4 x i32> %3 to <2 x i64> 154 ret <2 x i64> %4 155} 156 157define <2 x i64> @test_mm256_maskz_cvtpd_epi32(i8 zeroext %__U, <4 x double> %__A) { 158; X86-LABEL: test_mm256_maskz_cvtpd_epi32: 159; X86: # %bb.0: # %entry 160; X86-NEXT: movb 
{{[0-9]+}}(%esp), %al 161; X86-NEXT: kmovw %eax, %k1 162; X86-NEXT: vcvtpd2dq %ymm0, %xmm0 {%k1} {z} 163; X86-NEXT: vzeroupper 164; X86-NEXT: retl 165; 166; X64-LABEL: test_mm256_maskz_cvtpd_epi32: 167; X64: # %bb.0: # %entry 168; X64-NEXT: kmovw %edi, %k1 169; X64-NEXT: vcvtpd2dq %ymm0, %xmm0 {%k1} {z} 170; X64-NEXT: vzeroupper 171; X64-NEXT: retq 172entry: 173 %0 = tail call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %__A) #8 174 %1 = bitcast i8 %__U to <8 x i1> 175 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 176 %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer 177 %3 = bitcast <4 x i32> %2 to <2 x i64> 178 ret <2 x i64> %3 179} 180 181define <4 x float> @test_mm_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <2 x double> %__A) { 182; X86-LABEL: test_mm_mask_cvtpd_ps: 183; X86: # %bb.0: # %entry 184; X86-NEXT: movb {{[0-9]+}}(%esp), %al 185; X86-NEXT: kmovw %eax, %k1 186; X86-NEXT: vcvtpd2ps %xmm1, %xmm0 {%k1} 187; X86-NEXT: retl 188; 189; X64-LABEL: test_mm_mask_cvtpd_ps: 190; X64: # %bb.0: # %entry 191; X64-NEXT: kmovw %edi, %k1 192; X64-NEXT: vcvtpd2ps %xmm1, %xmm0 {%k1} 193; X64-NEXT: retq 194entry: 195 %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> %__W, i8 %__U) #8 196 ret <4 x float> %0 197} 198 199define <4 x float> @test_mm_maskz_cvtpd_ps(i8 zeroext %__U, <2 x double> %__A) { 200; X86-LABEL: test_mm_maskz_cvtpd_ps: 201; X86: # %bb.0: # %entry 202; X86-NEXT: movb {{[0-9]+}}(%esp), %al 203; X86-NEXT: kmovw %eax, %k1 204; X86-NEXT: vcvtpd2ps %xmm0, %xmm0 {%k1} {z} 205; X86-NEXT: retl 206; 207; X64-LABEL: test_mm_maskz_cvtpd_ps: 208; X64: # %bb.0: # %entry 209; X64-NEXT: kmovw %edi, %k1 210; X64-NEXT: vcvtpd2ps %xmm0, %xmm0 {%k1} {z} 211; X64-NEXT: retq 212entry: 213 %0 = tail call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %__A, <4 x float> zeroinitializer, i8 %__U) #8 214 ret <4 x float> %0 215} 216 217define <4 x 
float> @test_mm256_mask_cvtpd_ps(<4 x float> %__W, i8 zeroext %__U, <4 x double> %__A) { 218; X86-LABEL: test_mm256_mask_cvtpd_ps: 219; X86: # %bb.0: # %entry 220; X86-NEXT: movb {{[0-9]+}}(%esp), %al 221; X86-NEXT: kmovw %eax, %k1 222; X86-NEXT: vcvtpd2ps %ymm1, %xmm0 {%k1} 223; X86-NEXT: vzeroupper 224; X86-NEXT: retl 225; 226; X64-LABEL: test_mm256_mask_cvtpd_ps: 227; X64: # %bb.0: # %entry 228; X64-NEXT: kmovw %edi, %k1 229; X64-NEXT: vcvtpd2ps %ymm1, %xmm0 {%k1} 230; X64-NEXT: vzeroupper 231; X64-NEXT: retq 232entry: 233 %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8 234 %1 = bitcast i8 %__U to <8 x i1> 235 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 236 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W 237 ret <4 x float> %2 238} 239 240define <4 x float> @test_mm256_maskz_cvtpd_ps(i8 zeroext %__U, <4 x double> %__A) { 241; X86-LABEL: test_mm256_maskz_cvtpd_ps: 242; X86: # %bb.0: # %entry 243; X86-NEXT: movb {{[0-9]+}}(%esp), %al 244; X86-NEXT: kmovw %eax, %k1 245; X86-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} 246; X86-NEXT: vzeroupper 247; X86-NEXT: retl 248; 249; X64-LABEL: test_mm256_maskz_cvtpd_ps: 250; X64: # %bb.0: # %entry 251; X64-NEXT: kmovw %edi, %k1 252; X64-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} 253; X64-NEXT: vzeroupper 254; X64-NEXT: retq 255entry: 256 %0 = tail call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %__A) #8 257 %1 = bitcast i8 %__U to <8 x i1> 258 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 259 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 260 ret <4 x float> %2 261} 262 263define <2 x i64> @test_mm_cvtpd_epu32(<2 x double> %__A) { 264; CHECK-LABEL: test_mm_cvtpd_epu32: 265; CHECK: # %bb.0: # %entry 266; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 267; CHECK-NEXT: ret{{[l|q]}} 268entry: 269 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x 
double> %__A, <4 x i32> zeroinitializer, i8 -1) #8 270 %1 = bitcast <4 x i32> %0 to <2 x i64> 271 ret <2 x i64> %1 272} 273 274define <2 x i64> @test_mm_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) { 275; X86-LABEL: test_mm_mask_cvtpd_epu32: 276; X86: # %bb.0: # %entry 277; X86-NEXT: movb {{[0-9]+}}(%esp), %al 278; X86-NEXT: kmovw %eax, %k1 279; X86-NEXT: vcvtpd2udq %xmm1, %xmm0 {%k1} 280; X86-NEXT: retl 281; 282; X64-LABEL: test_mm_mask_cvtpd_epu32: 283; X64: # %bb.0: # %entry 284; X64-NEXT: kmovw %edi, %k1 285; X64-NEXT: vcvtpd2udq %xmm1, %xmm0 {%k1} 286; X64-NEXT: retq 287entry: 288 %0 = bitcast <2 x i64> %__W to <4 x i32> 289 %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8 290 %2 = bitcast <4 x i32> %1 to <2 x i64> 291 ret <2 x i64> %2 292} 293 294define <2 x i64> @test_mm_maskz_cvtpd_epu32(i8 zeroext %__U, <2 x double> %__A) { 295; X86-LABEL: test_mm_maskz_cvtpd_epu32: 296; X86: # %bb.0: # %entry 297; X86-NEXT: movb {{[0-9]+}}(%esp), %al 298; X86-NEXT: kmovw %eax, %k1 299; X86-NEXT: vcvtpd2udq %xmm0, %xmm0 {%k1} {z} 300; X86-NEXT: retl 301; 302; X64-LABEL: test_mm_maskz_cvtpd_epu32: 303; X64: # %bb.0: # %entry 304; X64-NEXT: kmovw %edi, %k1 305; X64-NEXT: vcvtpd2udq %xmm0, %xmm0 {%k1} {z} 306; X64-NEXT: retq 307entry: 308 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8 309 %1 = bitcast <4 x i32> %0 to <2 x i64> 310 ret <2 x i64> %1 311} 312 313define <2 x i64> @test_mm256_cvtpd_epu32(<4 x double> %__A) { 314; CHECK-LABEL: test_mm256_cvtpd_epu32: 315; CHECK: # %bb.0: # %entry 316; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0 317; CHECK-NEXT: vzeroupper 318; CHECK-NEXT: ret{{[l|q]}} 319entry: 320 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8 321 %1 = bitcast <4 x i32> %0 to <2 x i64> 322 ret <2 x i64> %1 323} 324 325define <2 x i64> 
@test_mm256_mask_cvtpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) { 326; X86-LABEL: test_mm256_mask_cvtpd_epu32: 327; X86: # %bb.0: # %entry 328; X86-NEXT: movb {{[0-9]+}}(%esp), %al 329; X86-NEXT: kmovw %eax, %k1 330; X86-NEXT: vcvtpd2udq %ymm1, %xmm0 {%k1} 331; X86-NEXT: vzeroupper 332; X86-NEXT: retl 333; 334; X64-LABEL: test_mm256_mask_cvtpd_epu32: 335; X64: # %bb.0: # %entry 336; X64-NEXT: kmovw %edi, %k1 337; X64-NEXT: vcvtpd2udq %ymm1, %xmm0 {%k1} 338; X64-NEXT: vzeroupper 339; X64-NEXT: retq 340entry: 341 %0 = bitcast <2 x i64> %__W to <4 x i32> 342 %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8 343 %2 = bitcast <4 x i32> %1 to <2 x i64> 344 ret <2 x i64> %2 345} 346 347define <2 x i64> @test_mm256_maskz_cvtpd_epu32(i8 zeroext %__U, <4 x double> %__A) { 348; X86-LABEL: test_mm256_maskz_cvtpd_epu32: 349; X86: # %bb.0: # %entry 350; X86-NEXT: movb {{[0-9]+}}(%esp), %al 351; X86-NEXT: kmovw %eax, %k1 352; X86-NEXT: vcvtpd2udq %ymm0, %xmm0 {%k1} {z} 353; X86-NEXT: vzeroupper 354; X86-NEXT: retl 355; 356; X64-LABEL: test_mm256_maskz_cvtpd_epu32: 357; X64: # %bb.0: # %entry 358; X64-NEXT: kmovw %edi, %k1 359; X64-NEXT: vcvtpd2udq %ymm0, %xmm0 {%k1} {z} 360; X64-NEXT: vzeroupper 361; X64-NEXT: retq 362entry: 363 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8 364 %1 = bitcast <4 x i32> %0 to <2 x i64> 365 ret <2 x i64> %1 366} 367 368define <4 x float> @test_mm_mask_cvtph_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) { 369; X86-LABEL: test_mm_mask_cvtph_ps: 370; X86: # %bb.0: # %entry 371; X86-NEXT: movb {{[0-9]+}}(%esp), %al 372; X86-NEXT: kmovw %eax, %k1 373; X86-NEXT: vcvtph2ps %xmm1, %xmm0 {%k1} 374; X86-NEXT: retl 375; 376; X64-LABEL: test_mm_mask_cvtph_ps: 377; X64: # %bb.0: # %entry 378; X64-NEXT: kmovw %edi, %k1 379; X64-NEXT: vcvtph2ps %xmm1, %xmm0 {%k1} 380; X64-NEXT: retq 381entry: 382 %0 = bitcast <2 x 
i64> %__A to <8 x i16> 383 %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 384 %2 = bitcast <4 x i16> %1 to <4 x half> 385 %3 = bitcast i8 %__U to <8 x i1> 386 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 387 %5 = fpext <4 x half> %2 to <4 x float> 388 %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> %__W 389 ret <4 x float> %6 390} 391 392define <4 x float> @test_mm_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) { 393; X86-LABEL: test_mm_maskz_cvtph_ps: 394; X86: # %bb.0: # %entry 395; X86-NEXT: movb {{[0-9]+}}(%esp), %al 396; X86-NEXT: kmovw %eax, %k1 397; X86-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} 398; X86-NEXT: retl 399; 400; X64-LABEL: test_mm_maskz_cvtph_ps: 401; X64: # %bb.0: # %entry 402; X64-NEXT: kmovw %edi, %k1 403; X64-NEXT: vcvtph2ps %xmm0, %xmm0 {%k1} {z} 404; X64-NEXT: retq 405entry: 406 %0 = bitcast <2 x i64> %__A to <8 x i16> 407 %1 = shufflevector <8 x i16> %0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 408 %2 = bitcast <4 x i16> %1 to <4 x half> 409 %3 = bitcast i8 %__U to <8 x i1> 410 %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 411 %5 = fpext <4 x half> %2 to <4 x float> 412 %6 = select <4 x i1> %4, <4 x float> %5, <4 x float> zeroinitializer 413 ret <4 x float> %6 414} 415 416define <8 x float> @test_mm256_mask_cvtph_ps(<8 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) { 417; X86-LABEL: test_mm256_mask_cvtph_ps: 418; X86: # %bb.0: # %entry 419; X86-NEXT: movb {{[0-9]+}}(%esp), %al 420; X86-NEXT: kmovw %eax, %k1 421; X86-NEXT: vcvtph2ps %xmm1, %ymm0 {%k1} 422; X86-NEXT: retl 423; 424; X64-LABEL: test_mm256_mask_cvtph_ps: 425; X64: # %bb.0: # %entry 426; X64-NEXT: kmovw %edi, %k1 427; X64-NEXT: vcvtph2ps %xmm1, %ymm0 {%k1} 428; X64-NEXT: retq 429entry: 430 %0 = bitcast <2 x i64> %__A to <8 x i16> 431 %1 = bitcast <8 x i16> %0 to <8 x half> 432 %2 = bitcast i8 %__U to <8 x i1> 433 %3 = fpext <8 x half> %1 
to <8 x float> 434 %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> %__W 435 ret <8 x float> %4 436} 437 438define <8 x float> @test_mm256_maskz_cvtph_ps(i8 zeroext %__U, <2 x i64> %__A) { 439; X86-LABEL: test_mm256_maskz_cvtph_ps: 440; X86: # %bb.0: # %entry 441; X86-NEXT: movb {{[0-9]+}}(%esp), %al 442; X86-NEXT: kmovw %eax, %k1 443; X86-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} 444; X86-NEXT: retl 445; 446; X64-LABEL: test_mm256_maskz_cvtph_ps: 447; X64: # %bb.0: # %entry 448; X64-NEXT: kmovw %edi, %k1 449; X64-NEXT: vcvtph2ps %xmm0, %ymm0 {%k1} {z} 450; X64-NEXT: retq 451entry: 452 %0 = bitcast <2 x i64> %__A to <8 x i16> 453 %1 = bitcast <8 x i16> %0 to <8 x half> 454 %2 = bitcast i8 %__U to <8 x i1> 455 %3 = fpext <8 x half> %1 to <8 x float> 456 %4 = select <8 x i1> %2, <8 x float> %3, <8 x float> zeroinitializer 457 ret <8 x float> %4 458} 459 460define <2 x i64> @test_mm_mask_cvtps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) { 461; X86-LABEL: test_mm_mask_cvtps_epi32: 462; X86: # %bb.0: # %entry 463; X86-NEXT: movb {{[0-9]+}}(%esp), %al 464; X86-NEXT: kmovw %eax, %k1 465; X86-NEXT: vcvtps2dq %xmm1, %xmm0 {%k1} 466; X86-NEXT: retl 467; 468; X64-LABEL: test_mm_mask_cvtps_epi32: 469; X64: # %bb.0: # %entry 470; X64-NEXT: kmovw %edi, %k1 471; X64-NEXT: vcvtps2dq %xmm1, %xmm0 {%k1} 472; X64-NEXT: retq 473entry: 474 %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8 475 %1 = bitcast <2 x i64> %__W to <4 x i32> 476 %2 = bitcast i8 %__U to <8 x i1> 477 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 478 %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1 479 %4 = bitcast <4 x i32> %3 to <2 x i64> 480 ret <2 x i64> %4 481} 482 483define <2 x i64> @test_mm_maskz_cvtps_epi32(i8 zeroext %__U, <4 x float> %__A) { 484; X86-LABEL: test_mm_maskz_cvtps_epi32: 485; X86: # %bb.0: # %entry 486; X86-NEXT: movb {{[0-9]+}}(%esp), %al 487; X86-NEXT: kmovw %eax, %k1 488; X86-NEXT: vcvtps2dq 
%xmm0, %xmm0 {%k1} {z} 489; X86-NEXT: retl 490; 491; X64-LABEL: test_mm_maskz_cvtps_epi32: 492; X64: # %bb.0: # %entry 493; X64-NEXT: kmovw %edi, %k1 494; X64-NEXT: vcvtps2dq %xmm0, %xmm0 {%k1} {z} 495; X64-NEXT: retq 496entry: 497 %0 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %__A) #8 498 %1 = bitcast i8 %__U to <8 x i1> 499 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 500 %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer 501 %3 = bitcast <4 x i32> %2 to <2 x i64> 502 ret <2 x i64> %3 503} 504 505define <4 x i64> @test_mm256_mask_cvtps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) { 506; X86-LABEL: test_mm256_mask_cvtps_epi32: 507; X86: # %bb.0: # %entry 508; X86-NEXT: movb {{[0-9]+}}(%esp), %al 509; X86-NEXT: kmovw %eax, %k1 510; X86-NEXT: vcvtps2dq %ymm1, %ymm0 {%k1} 511; X86-NEXT: retl 512; 513; X64-LABEL: test_mm256_mask_cvtps_epi32: 514; X64: # %bb.0: # %entry 515; X64-NEXT: kmovw %edi, %k1 516; X64-NEXT: vcvtps2dq %ymm1, %ymm0 {%k1} 517; X64-NEXT: retq 518entry: 519 %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8 520 %1 = bitcast <4 x i64> %__W to <8 x i32> 521 %2 = bitcast i8 %__U to <8 x i1> 522 %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1 523 %4 = bitcast <8 x i32> %3 to <4 x i64> 524 ret <4 x i64> %4 525} 526 527define <4 x i64> @test_mm256_maskz_cvtps_epi32(i8 zeroext %__U, <8 x float> %__A) { 528; X86-LABEL: test_mm256_maskz_cvtps_epi32: 529; X86: # %bb.0: # %entry 530; X86-NEXT: movb {{[0-9]+}}(%esp), %al 531; X86-NEXT: kmovw %eax, %k1 532; X86-NEXT: vcvtps2dq %ymm0, %ymm0 {%k1} {z} 533; X86-NEXT: retl 534; 535; X64-LABEL: test_mm256_maskz_cvtps_epi32: 536; X64: # %bb.0: # %entry 537; X64-NEXT: kmovw %edi, %k1 538; X64-NEXT: vcvtps2dq %ymm0, %ymm0 {%k1} {z} 539; X64-NEXT: retq 540entry: 541 %0 = tail call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %__A) #8 542 %1 = bitcast i8 %__U to <8 x i1> 543 %2 = select <8 x 
i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer 544 %3 = bitcast <8 x i32> %2 to <4 x i64> 545 ret <4 x i64> %3 546} 547 548define <2 x double> @test_mm_mask_cvtps_pd(<2 x double> %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 { 549; X86-LABEL: test_mm_mask_cvtps_pd: 550; X86: # %bb.0: # %entry 551; X86-NEXT: movb {{[0-9]+}}(%esp), %al 552; X86-NEXT: kmovw %eax, %k1 553; X86-NEXT: vcvtps2pd %xmm1, %xmm0 {%k1} 554; X86-NEXT: retl 555; 556; X64-LABEL: test_mm_mask_cvtps_pd: 557; X64: # %bb.0: # %entry 558; X64-NEXT: kmovw %edi, %k1 559; X64-NEXT: vcvtps2pd %xmm1, %xmm0 {%k1} 560; X64-NEXT: retq 561entry: 562 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1> 563 %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double> 564 %0 = bitcast i8 %__U to <8 x i1> 565 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 566 %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W 567 ret <2 x double> %1 568} 569 570define <2 x double> @test_mm_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 { 571; X86-LABEL: test_mm_maskz_cvtps_pd: 572; X86: # %bb.0: # %entry 573; X86-NEXT: movb {{[0-9]+}}(%esp), %al 574; X86-NEXT: kmovw %eax, %k1 575; X86-NEXT: vcvtps2pd %xmm0, %xmm0 {%k1} {z} 576; X86-NEXT: retl 577; 578; X64-LABEL: test_mm_maskz_cvtps_pd: 579; X64: # %bb.0: # %entry 580; X64-NEXT: kmovw %edi, %k1 581; X64-NEXT: vcvtps2pd %xmm0, %xmm0 {%k1} {z} 582; X64-NEXT: retq 583entry: 584 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <2 x i32> <i32 0, i32 1> 585 %conv.i.i = fpext <2 x float> %shuffle.i.i to <2 x double> 586 %0 = bitcast i8 %__U to <8 x i1> 587 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 588 %1 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer 589 ret <2 x double> %1 590} 591 592define <4 x double> @test_mm256_mask_cvtps_pd(<4 x double> %__W, i8 zeroext 
%__U, <4 x float> %__A) local_unnamed_addr #0 { 593; X86-LABEL: test_mm256_mask_cvtps_pd: 594; X86: # %bb.0: # %entry 595; X86-NEXT: movb {{[0-9]+}}(%esp), %al 596; X86-NEXT: kmovw %eax, %k1 597; X86-NEXT: vcvtps2pd %xmm1, %ymm0 {%k1} 598; X86-NEXT: retl 599; 600; X64-LABEL: test_mm256_mask_cvtps_pd: 601; X64: # %bb.0: # %entry 602; X64-NEXT: kmovw %edi, %k1 603; X64-NEXT: vcvtps2pd %xmm1, %ymm0 {%k1} 604; X64-NEXT: retq 605entry: 606 %conv.i.i = fpext <4 x float> %__A to <4 x double> 607 %0 = bitcast i8 %__U to <8 x i1> 608 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 609 %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W 610 ret <4 x double> %1 611} 612 613define <4 x double> @test_mm256_maskz_cvtps_pd(i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #0 { 614; X86-LABEL: test_mm256_maskz_cvtps_pd: 615; X86: # %bb.0: # %entry 616; X86-NEXT: movb {{[0-9]+}}(%esp), %al 617; X86-NEXT: kmovw %eax, %k1 618; X86-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} 619; X86-NEXT: retl 620; 621; X64-LABEL: test_mm256_maskz_cvtps_pd: 622; X64: # %bb.0: # %entry 623; X64-NEXT: kmovw %edi, %k1 624; X64-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} 625; X64-NEXT: retq 626entry: 627 %conv.i.i = fpext <4 x float> %__A to <4 x double> 628 %0 = bitcast i8 %__U to <8 x i1> 629 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 630 %1 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer 631 ret <4 x double> %1 632} 633 634define <2 x i64> @test_mm_cvtps_epu32(<4 x float> %__A) { 635; CHECK-LABEL: test_mm_cvtps_epu32: 636; CHECK: # %bb.0: # %entry 637; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0 638; CHECK-NEXT: ret{{[l|q]}} 639entry: 640 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8 641 %1 = bitcast <4 x i32> %0 to <2 x i64> 642 ret <2 x i64> %1 643} 644 645define <2 x i64> 
@test_mm_mask_cvtps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) { 646; X86-LABEL: test_mm_mask_cvtps_epu32: 647; X86: # %bb.0: # %entry 648; X86-NEXT: movb {{[0-9]+}}(%esp), %al 649; X86-NEXT: kmovw %eax, %k1 650; X86-NEXT: vcvtps2udq %xmm1, %xmm0 {%k1} 651; X86-NEXT: retl 652; 653; X64-LABEL: test_mm_mask_cvtps_epu32: 654; X64: # %bb.0: # %entry 655; X64-NEXT: kmovw %edi, %k1 656; X64-NEXT: vcvtps2udq %xmm1, %xmm0 {%k1} 657; X64-NEXT: retq 658entry: 659 %0 = bitcast <2 x i64> %__W to <4 x i32> 660 %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8 661 %2 = bitcast <4 x i32> %1 to <2 x i64> 662 ret <2 x i64> %2 663} 664 665define <2 x i64> @test_mm_maskz_cvtps_epu32(i8 zeroext %__U, <4 x float> %__A) { 666; X86-LABEL: test_mm_maskz_cvtps_epu32: 667; X86: # %bb.0: # %entry 668; X86-NEXT: movb {{[0-9]+}}(%esp), %al 669; X86-NEXT: kmovw %eax, %k1 670; X86-NEXT: vcvtps2udq %xmm0, %xmm0 {%k1} {z} 671; X86-NEXT: retl 672; 673; X64-LABEL: test_mm_maskz_cvtps_epu32: 674; X64: # %bb.0: # %entry 675; X64-NEXT: kmovw %edi, %k1 676; X64-NEXT: vcvtps2udq %xmm0, %xmm0 {%k1} {z} 677; X64-NEXT: retq 678entry: 679 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8 680 %1 = bitcast <4 x i32> %0 to <2 x i64> 681 ret <2 x i64> %1 682} 683 684define <4 x i64> @test_mm256_cvtps_epu32(<8 x float> %__A) { 685; CHECK-LABEL: test_mm256_cvtps_epu32: 686; CHECK: # %bb.0: # %entry 687; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0 688; CHECK-NEXT: ret{{[l|q]}} 689entry: 690 %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8 691 %1 = bitcast <8 x i32> %0 to <4 x i64> 692 ret <4 x i64> %1 693} 694 695define <4 x i64> @test_mm256_mask_cvtps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) { 696; X86-LABEL: test_mm256_mask_cvtps_epu32: 697; X86: # %bb.0: # %entry 698; X86-NEXT: movb {{[0-9]+}}(%esp), %al 
699; X86-NEXT: kmovw %eax, %k1 700; X86-NEXT: vcvtps2udq %ymm1, %ymm0 {%k1} 701; X86-NEXT: retl 702; 703; X64-LABEL: test_mm256_mask_cvtps_epu32: 704; X64: # %bb.0: # %entry 705; X64-NEXT: kmovw %edi, %k1 706; X64-NEXT: vcvtps2udq %ymm1, %ymm0 {%k1} 707; X64-NEXT: retq 708entry: 709 %0 = bitcast <4 x i64> %__W to <8 x i32> 710 %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8 711 %2 = bitcast <8 x i32> %1 to <4 x i64> 712 ret <4 x i64> %2 713} 714 715define <4 x i64> @test_mm256_maskz_cvtps_epu32(i8 zeroext %__U, <8 x float> %__A) { 716; X86-LABEL: test_mm256_maskz_cvtps_epu32: 717; X86: # %bb.0: # %entry 718; X86-NEXT: movb {{[0-9]+}}(%esp), %al 719; X86-NEXT: kmovw %eax, %k1 720; X86-NEXT: vcvtps2udq %ymm0, %ymm0 {%k1} {z} 721; X86-NEXT: retl 722; 723; X64-LABEL: test_mm256_maskz_cvtps_epu32: 724; X64: # %bb.0: # %entry 725; X64-NEXT: kmovw %edi, %k1 726; X64-NEXT: vcvtps2udq %ymm0, %ymm0 {%k1} {z} 727; X64-NEXT: retq 728entry: 729 %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8 730 %1 = bitcast <8 x i32> %0 to <4 x i64> 731 ret <4 x i64> %1 732} 733 734define <2 x i64> @test_mm_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) { 735; X86-LABEL: test_mm_mask_cvttpd_epi32: 736; X86: # %bb.0: # %entry 737; X86-NEXT: movb {{[0-9]+}}(%esp), %al 738; X86-NEXT: kmovw %eax, %k1 739; X86-NEXT: vcvttpd2dq %xmm1, %xmm0 {%k1} 740; X86-NEXT: retl 741; 742; X64-LABEL: test_mm_mask_cvttpd_epi32: 743; X64: # %bb.0: # %entry 744; X64-NEXT: kmovw %edi, %k1 745; X64-NEXT: vcvttpd2dq %xmm1, %xmm0 {%k1} 746; X64-NEXT: retq 747entry: 748 %0 = bitcast <2 x i64> %__W to <4 x i32> 749 %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8 750 %2 = bitcast <4 x i32> %1 to <2 x i64> 751 ret <2 x i64> %2 752} 753 754define <2 x i64> @test_mm_maskz_cvttpd_epi32(i8 zeroext %__U, <2 x double> %__A) { 
755; X86-LABEL: test_mm_maskz_cvttpd_epi32: 756; X86: # %bb.0: # %entry 757; X86-NEXT: movb {{[0-9]+}}(%esp), %al 758; X86-NEXT: kmovw %eax, %k1 759; X86-NEXT: vcvttpd2dq %xmm0, %xmm0 {%k1} {z} 760; X86-NEXT: retl 761; 762; X64-LABEL: test_mm_maskz_cvttpd_epi32: 763; X64: # %bb.0: # %entry 764; X64-NEXT: kmovw %edi, %k1 765; X64-NEXT: vcvttpd2dq %xmm0, %xmm0 {%k1} {z} 766; X64-NEXT: retq 767entry: 768 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8 769 %1 = bitcast <4 x i32> %0 to <2 x i64> 770 ret <2 x i64> %1 771} 772 773define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) { 774; X86-LABEL: test_mm256_mask_cvttpd_epi32: 775; X86: # %bb.0: # %entry 776; X86-NEXT: movb {{[0-9]+}}(%esp), %al 777; X86-NEXT: kmovw %eax, %k1 778; X86-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1} 779; X86-NEXT: vzeroupper 780; X86-NEXT: retl 781; 782; X64-LABEL: test_mm256_mask_cvttpd_epi32: 783; X64: # %bb.0: # %entry 784; X64-NEXT: kmovw %edi, %k1 785; X64-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1} 786; X64-NEXT: vzeroupper 787; X64-NEXT: retq 788entry: 789 %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8 790 %1 = bitcast <2 x i64> %__W to <4 x i32> 791 %2 = bitcast i8 %__U to <8 x i1> 792 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 793 %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1 794 %4 = bitcast <4 x i32> %3 to <2 x i64> 795 ret <2 x i64> %4 796} 797 798define <2 x i64> @test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %__A) { 799; X86-LABEL: test_mm256_maskz_cvttpd_epi32: 800; X86: # %bb.0: # %entry 801; X86-NEXT: movb {{[0-9]+}}(%esp), %al 802; X86-NEXT: kmovw %eax, %k1 803; X86-NEXT: vcvttpd2dq %ymm0, %xmm0 {%k1} {z} 804; X86-NEXT: vzeroupper 805; X86-NEXT: retl 806; 807; X64-LABEL: test_mm256_maskz_cvttpd_epi32: 808; X64: # %bb.0: # %entry 809; X64-NEXT: kmovw %edi, %k1 810; 
X64-NEXT: vcvttpd2dq %ymm0, %xmm0 {%k1} {z} 811; X64-NEXT: vzeroupper 812; X64-NEXT: retq 813entry: 814 %0 = tail call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %__A) #8 815 %1 = bitcast i8 %__U to <8 x i1> 816 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 817 %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer 818 %3 = bitcast <4 x i32> %2 to <2 x i64> 819 ret <2 x i64> %3 820} 821 822define <2 x i64> @test_mm_cvttpd_epu32(<2 x double> %__A) { 823; CHECK-LABEL: test_mm_cvttpd_epu32: 824; CHECK: # %bb.0: # %entry 825; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 826; CHECK-NEXT: ret{{[l|q]}} 827entry: 828 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8 829 %1 = bitcast <4 x i32> %0 to <2 x i64> 830 ret <2 x i64> %1 831} 832 833define <2 x i64> @test_mm_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <2 x double> %__A) { 834; X86-LABEL: test_mm_mask_cvttpd_epu32: 835; X86: # %bb.0: # %entry 836; X86-NEXT: movb {{[0-9]+}}(%esp), %al 837; X86-NEXT: kmovw %eax, %k1 838; X86-NEXT: vcvttpd2udq %xmm1, %xmm0 {%k1} 839; X86-NEXT: retl 840; 841; X64-LABEL: test_mm_mask_cvttpd_epu32: 842; X64: # %bb.0: # %entry 843; X64-NEXT: kmovw %edi, %k1 844; X64-NEXT: vcvttpd2udq %xmm1, %xmm0 {%k1} 845; X64-NEXT: retq 846entry: 847 %0 = bitcast <2 x i64> %__W to <4 x i32> 848 %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> %0, i8 %__U) #8 849 %2 = bitcast <4 x i32> %1 to <2 x i64> 850 ret <2 x i64> %2 851} 852 853define <2 x i64> @test_mm_maskz_cvttpd_epu32(i8 zeroext %__U, <2 x double> %__A) { 854; X86-LABEL: test_mm_maskz_cvttpd_epu32: 855; X86: # %bb.0: # %entry 856; X86-NEXT: movb {{[0-9]+}}(%esp), %al 857; X86-NEXT: kmovw %eax, %k1 858; X86-NEXT: vcvttpd2udq %xmm0, %xmm0 {%k1} {z} 859; X86-NEXT: retl 860; 861; X64-LABEL: test_mm_maskz_cvttpd_epu32: 862; X64: # %bb.0: # %entry 863; X64-NEXT: kmovw 
%edi, %k1 864; X64-NEXT: vcvttpd2udq %xmm0, %xmm0 {%k1} {z} 865; X64-NEXT: retq 866entry: 867 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8 868 %1 = bitcast <4 x i32> %0 to <2 x i64> 869 ret <2 x i64> %1 870} 871 872define <2 x i64> @test_mm256_cvttpd_epu32(<4 x double> %__A) { 873; CHECK-LABEL: test_mm256_cvttpd_epu32: 874; CHECK: # %bb.0: # %entry 875; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0 876; CHECK-NEXT: vzeroupper 877; CHECK-NEXT: ret{{[l|q]}} 878entry: 879 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 -1) #8 880 %1 = bitcast <4 x i32> %0 to <2 x i64> 881 ret <2 x i64> %1 882} 883 884define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x double> %__A) { 885; X86-LABEL: test_mm256_mask_cvttpd_epu32: 886; X86: # %bb.0: # %entry 887; X86-NEXT: movb {{[0-9]+}}(%esp), %al 888; X86-NEXT: kmovw %eax, %k1 889; X86-NEXT: vcvttpd2udq %ymm1, %xmm0 {%k1} 890; X86-NEXT: vzeroupper 891; X86-NEXT: retl 892; 893; X64-LABEL: test_mm256_mask_cvttpd_epu32: 894; X64: # %bb.0: # %entry 895; X64-NEXT: kmovw %edi, %k1 896; X64-NEXT: vcvttpd2udq %ymm1, %xmm0 {%k1} 897; X64-NEXT: vzeroupper 898; X64-NEXT: retq 899entry: 900 %0 = bitcast <2 x i64> %__W to <4 x i32> 901 %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> %0, i8 %__U) #8 902 %2 = bitcast <4 x i32> %1 to <2 x i64> 903 ret <2 x i64> %2 904} 905 906define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %__A) { 907; X86-LABEL: test_mm256_maskz_cvttpd_epu32: 908; X86: # %bb.0: # %entry 909; X86-NEXT: movb {{[0-9]+}}(%esp), %al 910; X86-NEXT: kmovw %eax, %k1 911; X86-NEXT: vcvttpd2udq %ymm0, %xmm0 {%k1} {z} 912; X86-NEXT: vzeroupper 913; X86-NEXT: retl 914; 915; X64-LABEL: test_mm256_maskz_cvttpd_epu32: 916; X64: # %bb.0: # %entry 917; X64-NEXT: kmovw %edi, %k1 918; X64-NEXT: vcvttpd2udq %ymm0, %xmm0 
{%k1} {z} 919; X64-NEXT: vzeroupper 920; X64-NEXT: retq 921entry: 922 %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %__A, <4 x i32> zeroinitializer, i8 %__U) #8 923 %1 = bitcast <4 x i32> %0 to <2 x i64> 924 ret <2 x i64> %1 925} 926 927define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) { 928; X86-LABEL: test_mm_mask_cvttps_epi32: 929; X86: # %bb.0: # %entry 930; X86-NEXT: movb {{[0-9]+}}(%esp), %al 931; X86-NEXT: kmovw %eax, %k1 932; X86-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1} 933; X86-NEXT: retl 934; 935; X64-LABEL: test_mm_mask_cvttps_epi32: 936; X64: # %bb.0: # %entry 937; X64-NEXT: kmovw %edi, %k1 938; X64-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1} 939; X64-NEXT: retq 940entry: 941 %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8 942 %1 = bitcast <2 x i64> %__W to <4 x i32> 943 %2 = bitcast i8 %__U to <8 x i1> 944 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 945 %3 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> %1 946 %4 = bitcast <4 x i32> %3 to <2 x i64> 947 ret <2 x i64> %4 948} 949 950define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) { 951; X86-LABEL: test_mm_maskz_cvttps_epi32: 952; X86: # %bb.0: # %entry 953; X86-NEXT: movb {{[0-9]+}}(%esp), %al 954; X86-NEXT: kmovw %eax, %k1 955; X86-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z} 956; X86-NEXT: retl 957; 958; X64-LABEL: test_mm_maskz_cvttps_epi32: 959; X64: # %bb.0: # %entry 960; X64-NEXT: kmovw %edi, %k1 961; X64-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z} 962; X64-NEXT: retq 963entry: 964 %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8 965 %1 = bitcast i8 %__U to <8 x i1> 966 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 967 %2 = select <4 x i1> %extract.i, <4 x i32> %0, <4 x i32> zeroinitializer 968 %3 = bitcast <4 x i32> %2 to <2 x i64> 969 ret <2 x i64> %3 970} 971 
; NOTE(review): The CHECK/X86/X64 assertion comments in the functions below are
; autogenerated by utils/update_llc_test_checks.py. Do not hand-edit them;
; rerun the script to regenerate after any IR change.

; Masked 256-bit truncating float -> signed i32 conversion (vcvttps2dq).
define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvttps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

; Truncating float -> unsigned i32 conversions (vcvttps2udq), via the
; AVX-512 masked intrinsic with an all-ones / partial / zero mask.
define <2 x i64> @test_mm_cvttps_epu32(<4 x float> %__A) {
; CHECK-LABEL: test_mm_cvttps_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__W to <4 x i32>
  %1 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> %0, i8 %__U) #8
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <4 x i32> %0 to <2 x i64>
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_cvttps_epu32(<8 x float> %__A) {
; CHECK-LABEL: test_mm256_cvttps_epu32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 -1) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__W to <8 x i32>
  %1 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> %0, i8 %__U) #8
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_maskz_cvttps_epu32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvttps_epu32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8
  %1 = bitcast <8 x i32> %0 to <4 x i64>
  ret <4 x i64> %1
}

; Unsigned i32 -> double conversions (vcvtudq2pd). The 128-bit form widens
; only the low two i32 lanes (see the <2 x i32> shufflevector).
define <2 x double> @test_mm_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm_cvtepu32_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i = uitofp <2 x i32> %shuffle.i to <2 x double>
  ret <2 x double> %conv.i
}

define <2 x double> @test_mm_mask_cvtepu32_pd(<2 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_mask_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> %__W
  ret <2 x double> %2
}

define <2 x double> @test_mm_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm_maskz_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %conv.i.i = uitofp <2 x i32> %shuffle.i.i to <2 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = select <2 x i1> %extract.i, <2 x double> %conv.i.i, <2 x double> zeroinitializer
  ret <2 x double> %2
}

define <4 x double> @test_mm256_cvtepu32_pd(<2 x i64> %__A) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm256_cvtepu32_pd:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x double>
  ret <4 x double> %conv.i
}

define <4 x double> @test_mm256_mask_cvtepu32_pd(<4 x double> %__W, i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_mask_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> %__W
  ret <4 x double> %2
}

define <4 x double> @test_mm256_maskz_cvtepu32_pd(i8 zeroext %__U, <2 x i64> %__A) local_unnamed_addr #0 {
; X86-LABEL: test_mm256_maskz_cvtepu32_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2pd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x double>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x double> %conv.i.i, <4 x double> zeroinitializer
  ret <4 x double> %2
}

; Unsigned i32 -> float conversions (vcvtudq2ps).
define <4 x float> @test_mm_cvtepu32_ps(<2 x i64> %__A) {
; CHECK-LABEL: test_mm_cvtepu32_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i = uitofp <4 x i32> %0 to <4 x float>
  ret <4 x float> %conv.i
}

define <4 x float> @test_mm_mask_cvtepu32_ps(<4 x float> %__W, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> %__W
  ret <4 x float> %2
}

define <4 x float> @test_mm_maskz_cvtepu32_ps(i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %conv.i.i = uitofp <4 x i32> %0 to <4 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x float> %conv.i.i, <4 x float> zeroinitializer
  ret <4 x float> %2
}

define <8 x float> @test_mm256_cvtepu32_ps(<4 x i64> %__A) {
; CHECK-LABEL: test_mm256_cvtepu32_ps:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i = uitofp <8 x i32> %0 to <8 x float>
  ret <8 x float> %conv.i
}

define <8 x float> @test_mm256_mask_cvtepu32_ps(<8 x float> %__W, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %ymm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %ymm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> %__W
  ret <8 x float> %2
}

define <8 x float> @test_mm256_maskz_cvtepu32_ps(i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_cvtepu32_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_cvtepu32_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vcvtudq2ps %ymm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__A to <8 x i32>
  %conv.i.i = uitofp <8 x i32> %0 to <8 x float>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x float> %conv.i.i, <8 x float> zeroinitializer
  ret <8 x float> %2
}

; 128-bit-lane shuffles (shuffle_f32x4/f64x2/i32x4/i64x2). The unmasked
; forms lower to vperm2f128/vperm2i128; the masked forms use vshuf*x*.
define <8 x float> @test_mm256_shuffle_f32x4(<8 x float> %__A, <8 x float> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %shuffle
}

define <8 x float> @test_mm256_mask_shuffle_f32x4(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> %__W
  ret <8 x float> %1
}

define <8 x float> @test_mm256_maskz_shuffle_f32x4(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <8 x float> %__A, <8 x float> %__B, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x float> %shuffle, <8 x float> zeroinitializer
  ret <8 x float> %1
}

define <4 x double> @test_mm256_shuffle_f64x2(<4 x double> %__A, <4 x double> %__B) {
; CHECK-LABEL: test_mm256_shuffle_f64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x double> %shuffle
}

define <4 x double> @test_mm256_mask_shuffle_f64x2(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_shuffle_f64x2(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_f64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_f64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshuff64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x double> %shuffle, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x i64> @test_mm256_shuffle_i32x4(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i32x4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i32x4(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} = ymm1[4,5,6,7],ymm2[4,5,6,7]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast <4 x i64> %__W to <8 x i32>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  ret <4 x i64> %4
}

define <4 x i64> @test_mm256_maskz_shuffle_i32x4(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_maskz_shuffle_i32x4:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_shuffle_i32x4:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi32x4 {{.*#+}} ymm0 {%k1} {z} = ymm0[4,5,6,7],ymm1[4,5,6,7]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast <4 x i64> %shuffle to <8 x i32>
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> %0, <8 x i32> zeroinitializer
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_shuffle_i64x2(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_shuffle_i64x2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %shuffle
}

define <4 x i64> @test_mm256_mask_shuffle_i64x2(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_shuffle_i64x2:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_shuffle_i64x2:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} = ymm1[2,3],ymm2[2,3]
; X64-NEXT: retq
entry:
  %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> %__W
  ret <4 x i64> %1
}
1523 1524define <4 x i64> @test_mm256_maskz_shuffle_i64x2(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1525; X86-LABEL: test_mm256_maskz_shuffle_i64x2: 1526; X86: # %bb.0: # %entry 1527; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1528; X86-NEXT: kmovw %eax, %k1 1529; X86-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] 1530; X86-NEXT: retl 1531; 1532; X64-LABEL: test_mm256_maskz_shuffle_i64x2: 1533; X64: # %bb.0: # %entry 1534; X64-NEXT: kmovw %edi, %k1 1535; X64-NEXT: vshufi64x2 {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3],ymm1[2,3] 1536; X64-NEXT: retq 1537entry: 1538 %shuffle = shufflevector <4 x i64> %__A, <4 x i64> %__B, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1539 %0 = bitcast i8 %__U to <8 x i1> 1540 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1541 %1 = select <4 x i1> %extract, <4 x i64> %shuffle, <4 x i64> zeroinitializer 1542 ret <4 x i64> %1 1543} 1544 1545define zeroext i8 @test_mm_test_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) { 1546; CHECK-LABEL: test_mm_test_epi32_mask: 1547; CHECK: # %bb.0: # %entry 1548; CHECK-NEXT: vptestmd %xmm0, %xmm1, %k0 1549; CHECK-NEXT: kmovw %k0, %eax 1550; CHECK-NEXT: movzbl %al, %eax 1551; CHECK-NEXT: ret{{[l|q]}} 1552entry: 1553 %and.i.i = and <2 x i64> %__B, %__A 1554 %0 = bitcast <2 x i64> %and.i.i to <4 x i32> 1555 %1 = icmp ne <4 x i32> %0, zeroinitializer 1556 %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1557 %3 = bitcast <8 x i1> %2 to i8 1558 ret i8 %3 1559} 1560 1561define zeroext i8 @test_mm_mask_test_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 1562; X86-LABEL: test_mm_mask_test_epi32_mask: 1563; X86: # %bb.0: # %entry 1564; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1565; X86-NEXT: kmovw %eax, %k1 1566; X86-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} 1567; X86-NEXT: kmovw %k0, %eax 1568; X86-NEXT: movzbl %al, %eax 1569; X86-NEXT: retl 1570; 1571; X64-LABEL: 
test_mm_mask_test_epi32_mask: 1572; X64: # %bb.0: # %entry 1573; X64-NEXT: kmovw %edi, %k1 1574; X64-NEXT: vptestmd %xmm0, %xmm1, %k0 {%k1} 1575; X64-NEXT: kmovw %k0, %eax 1576; X64-NEXT: movzbl %al, %eax 1577; X64-NEXT: retq 1578entry: 1579 %and.i.i = and <2 x i64> %__B, %__A 1580 %0 = bitcast <2 x i64> %and.i.i to <4 x i32> 1581 %1 = icmp ne <4 x i32> %0, zeroinitializer 1582 %2 = bitcast i8 %__U to <8 x i1> 1583 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1584 %3 = and <4 x i1> %1, %extract.i 1585 %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1586 %5 = bitcast <8 x i1> %4 to i8 1587 ret i8 %5 1588} 1589 1590define zeroext i8 @test_mm256_test_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) { 1591; CHECK-LABEL: test_mm256_test_epi32_mask: 1592; CHECK: # %bb.0: # %entry 1593; CHECK-NEXT: vptestmd %ymm0, %ymm1, %k0 1594; CHECK-NEXT: kmovw %k0, %eax 1595; CHECK-NEXT: movzbl %al, %eax 1596; CHECK-NEXT: vzeroupper 1597; CHECK-NEXT: ret{{[l|q]}} 1598entry: 1599 %and.i.i = and <4 x i64> %__B, %__A 1600 %0 = bitcast <4 x i64> %and.i.i to <8 x i32> 1601 %1 = icmp ne <8 x i32> %0, zeroinitializer 1602 %2 = bitcast <8 x i1> %1 to i8 1603 ret i8 %2 1604} 1605 1606define zeroext i8 @test_mm256_mask_test_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 1607; X86-LABEL: test_mm256_mask_test_epi32_mask: 1608; X86: # %bb.0: # %entry 1609; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1610; X86-NEXT: kmovw %eax, %k1 1611; X86-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} 1612; X86-NEXT: kmovw %k0, %eax 1613; X86-NEXT: movzbl %al, %eax 1614; X86-NEXT: vzeroupper 1615; X86-NEXT: retl 1616; 1617; X64-LABEL: test_mm256_mask_test_epi32_mask: 1618; X64: # %bb.0: # %entry 1619; X64-NEXT: kmovw %edi, %k1 1620; X64-NEXT: vptestmd %ymm0, %ymm1, %k0 {%k1} 1621; X64-NEXT: kmovw %k0, %eax 1622; X64-NEXT: movzbl %al, %eax 1623; X64-NEXT: vzeroupper 1624; X64-NEXT: retq 
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp ne <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm_test_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_test_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestmq %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm_mask_test_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp ne <2 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = and <2 x i1> %0, %extract.i
  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm256_test_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_test_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestmq %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_test_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_test_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_test_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp ne <4 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = and <4 x i1> %0, %extract.i
  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm_testn_epi32_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = shufflevector <4 x i1> %1, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i8 @test_mm_mask_testn_epi32_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_testn_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_testn_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmd %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = bitcast <2 x i64> %and.i.i to <4 x i32>
  %1 = icmp eq <4 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = and <4 x i1> %1, %extract.i
  %4 = shufflevector <4 x i1> %3, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <8 x i1> %4 to i8
  ret i8 %5
}

define zeroext i8 @test_mm256_testn_epi32_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_testn_epi32_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmd %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_testn_epi32_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_testn_epi32_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_testn_epi32_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmd %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = bitcast <4 x i64> %and.i.i to <8 x i32>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = and <8 x i1> %1, %2
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm_testn_epi64_mask(<2 x i64> %__A, <2 x i64> %__B) {
; CHECK-LABEL: test_mm_testn_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmq %xmm0, %xmm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <2 x i1> %0, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm_mask_testn_epi64_mask(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) {
; X86-LABEL: test_mm_mask_testn_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_testn_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmq %xmm0, %xmm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: retq
entry:
  %and.i.i = and <2 x i64> %__B, %__A
  %0 = icmp eq <2 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %2 = and <2 x i1> %0, %extract.i
  %3 = shufflevector <2 x i1> %2, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define zeroext i8 @test_mm256_testn_epi64_mask(<4 x i64> %__A, <4 x i64> %__B) {
; CHECK-LABEL: test_mm256_testn_epi64_mask:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vptestnmq %ymm0, %ymm1, %k0
; CHECK-NEXT: kmovw %k0, %eax
; CHECK-NEXT: movzbl %al, %eax
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
  %1 = shufflevector <4 x i1> %0, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <8 x i1> %1 to i8
  ret i8 %2
}

define zeroext i8 @test_mm256_mask_testn_epi64_mask(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
; X86-LABEL: test_mm256_mask_testn_epi64_mask:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X86-NEXT: kmovw %k0, %eax
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_testn_epi64_mask:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vptestnmq %ymm0, %ymm1, %k0 {%k1}
; X64-NEXT: kmovw %k0, %eax
; X64-NEXT: movzbl %al, %eax
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
  %and.i.i = and <4 x i64> %__B, %__A
  %0 = icmp eq <4 x i64> %and.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = and <4 x i1> %0, %extract.i
  %3 = shufflevector <4 x i1> %2, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <8 x i1> %3 to i8
  ret i8 %4
}

define <2 x i64> @test_mm_mask_set1_epi32(<2 x i64> %__O, i8 zeroext %__M) {
; X86-LABEL: test_mm_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__O to <4 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> %0
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <2 x i64> @test_mm_maskz_set1_epi32(i8 zeroext %__M) {
; X86-LABEL: test_mm_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i32> <i32 5, i32 5, i32 5, i32 5>, <4 x i32> zeroinitializer
  %2 = bitcast <4 x i32> %1 to <2 x i64>
  ret <2 x i64> %2
}

define <4 x i64> @test_mm256_mask_set1_epi32(<4 x i64> %__O, i8 zeroext %__M) {
; X86-LABEL: test_mm256_mask_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <4 x i64> %__O to <8 x i32>
  %1 = bitcast i8 %__M to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> %0
  %3 = bitcast <8 x i32> %2 to <4 x i64>
  ret <4 x i64> %3
}

define <4 x i64> @test_mm256_maskz_set1_epi32(i8 zeroext %__M) {
; X86-LABEL: test_mm256_maskz_set1_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_set1_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>, <8 x i32> zeroinitializer
  %2 = bitcast <8 x i32> %1 to <4 x i64>
  ret <4 x i64> %2
}

define <2 x i64> @test_mm_mask_set1_epi64(<2 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
  %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> %__O
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i.i = insertelement <2 x i64> undef, i64 %__A, i32 0
  %vecinit1.i.i.i = shufflevector <2 x i64> %vecinit.i.i.i, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %vecinit1.i.i.i, <2 x i64> zeroinitializer
  ret <2 x i64> %1
}


define <4 x i64> @test_mm256_mask_set1_epi64(<4 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm256_mask_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> %__O
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm256_maskz_set1_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_set1_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %rsi, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %vecinit.i.i = insertelement <4 x i64> undef, i64 %__A, i32 0
  %vecinit3.i.i = shufflevector <4 x i64> %vecinit.i.i, <4 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %vecinit3.i.i, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res1 = bitcast <4 x i32> %res0 to <2 x i64>
  ret <2 x i64> %res1
}

define <2 x i64> @test_mm_mask_broadcastd_epi32(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = bitcast <2 x i64> %__O to <4 x i32>
  %2 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> %1
  %4 = bitcast <4 x i32> %3 to <2 x i64>
  ret <2 x i64> %4
}

define <2 x i64> @test_mm_maskz_broadcastd_epi32(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastd_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastd_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %0 = bitcast <2 x i64> %__A to <4 x i32>
  %shuffle.i.i = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer
  %1 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %2 = select <4 x i1> %extract.i, <4 x i32> %shuffle.i.i, <4 x i32> zeroinitializer
  %3 = bitcast <4 x i32> %2 to <2 x i64>
  ret <2 x i64> %3
}

define <4 x i64> @test_mm256_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = bitcast <8 x i32> %res0 to <4 x i64>
  ret <4 x i64> %res1
}

define <4 x i64> @test_mm256_mask_broadcastd_epi32(<4 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm256_mask_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i32> %res0, <8 x i32> %arg0
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <4 x i64> @test_mm256_maskz_broadcastd_epi32(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastd_epi32:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastd_epi32:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i32> %res0, <8 x i32> zeroinitializer
  %res2 = bitcast <8 x i32> %res1 to <4 x i64>
  ret <4 x i64> %res2
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <2 x i64> @test_mm_mask_broadcastq_epi64(<2 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> %__O
  ret <2 x i64> %1
}

define <2 x i64> @test_mm_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm_maskz_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x i64> %shuffle.i.i, <2 x i64> zeroinitializer
  ret <2 x i64> %1
}

define <4 x i64> @test_mm256_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_mask_broadcastq_epi64(<4 x i64> %__O, i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_mask_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> %__O
  ret <4 x i64> %1
}

define <4 x i64> @test_mm256_maskz_broadcastq_epi64(i8 zeroext %__M, <2 x i64> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastq_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastq_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x i64> %__A, <2 x i64> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x i64> %shuffle.i.i, <4 x i64> zeroinitializer
  ret <4 x i64> %1
}

define <4 x double> @test_mm256_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_broadcastsd_pd(<4 x double> %__O, i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_mask_broadcastsd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastsd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__O
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_broadcastsd_pd(i8 zeroext %__M, <2 x double> %__A) {
; X86-LABEL: test_mm256_maskz_broadcastsd_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastsd_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %xmm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_broadcastss_ps(<4 x float> %__O, i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_broadcastss_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_broadcastss_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %xmm0 {%k1}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__O
  ret <4 x float> %1
}

define <4 x float> @test_mm_maskz_broadcastss_ps(i8 zeroext %__M, <4 x float> %__A) {
; X86-LABEL: test_mm_maskz_broadcastss_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_broadcastss_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer
  ret <4 x float> %1
}

define <8 x float> @test_mm256_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vbroadcastss %xmm0, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <8 x float> @test_mm256_mask_broadcastss_ps(<8 x float> %a0, i8 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm256_mask_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm1, %ymm0 {%k1}
; X64-NEXT: retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0
  ret <8 x float> %res1
}

define <8 x float> @test_mm256_maskz_broadcastss_ps(i8 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm256_maskz_broadcastss_ps:
; X86: # %bb.0:
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_broadcastss_ps:
; X64: # %bb.0:
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z}
; X64-NEXT: retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer
  ret <8 x float> %res1
}

define <2 x double> @test_mm_movddup_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_movddup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <2 x double> @test_mm_mask_movedup_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} = xmm1[0,0]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> %__W
  ret <2 x double> %1
}

define <2 x double> @test_mm_maskz_movedup_pd(i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_maskz_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <2 x i32> zeroinitializer
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %1 = select <2 x i1> %extract.i, <2 x double> %shuffle.i.i, <2 x double> zeroinitializer
  ret <2 x double> %1
}

define <4 x double> @test_mm256_movddup_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_movddup_pd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x double> %res
}

define <4 x double> @test_mm256_mask_movedup_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> %__W
  ret <4 x double> %1
}

define <4 x double> @test_mm256_maskz_movedup_pd(i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_maskz_movedup_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_movedup_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovddup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x double> %__A, <4 x double> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %0 = bitcast i8 %__U to <8 x i1>
  %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = select <4 x i1> %extract.i, <4 x double> %shuffle.i.i, <4 x double> zeroinitializer
  ret <4 x double> %1
}

define <4 x float> @test_mm_movehdup_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_movehdup_ps:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_mask_movehdup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_movehdup_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_movehdup_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} = xmm1[1,1,3,3]
; X64-NEXT: retq
entry:
  %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float>
undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> 2573 %0 = bitcast i8 %__U to <8 x i1> 2574 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2575 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W 2576 ret <4 x float> %1 2577} 2578 2579define <4 x float> @test_mm_maskz_movehdup_ps(i8 zeroext %__U, <4 x float> %__A) { 2580; X86-LABEL: test_mm_maskz_movehdup_ps: 2581; X86: # %bb.0: # %entry 2582; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2583; X86-NEXT: kmovw %eax, %k1 2584; X86-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3] 2585; X86-NEXT: retl 2586; 2587; X64-LABEL: test_mm_maskz_movehdup_ps: 2588; X64: # %bb.0: # %entry 2589; X64-NEXT: kmovw %edi, %k1 2590; X64-NEXT: vmovshdup {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,3,3] 2591; X64-NEXT: retq 2592entry: 2593 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3> 2594 %0 = bitcast i8 %__U to <8 x i1> 2595 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2596 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer 2597 ret <4 x float> %1 2598} 2599 2600define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) { 2601; CHECK-LABEL: test_mm256_movehdup_ps: 2602; CHECK: # %bb.0: 2603; CHECK-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] 2604; CHECK-NEXT: ret{{[l|q]}} 2605 %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> 2606 ret <8 x float> %res 2607} 2608 2609define <8 x float> @test_mm256_mask_movehdup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) { 2610; X86-LABEL: test_mm256_mask_movehdup_ps: 2611; X86: # %bb.0: 2612; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2613; X86-NEXT: kmovw %eax, %k1 2614; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7] 2615; X86-NEXT: retl 2616; 2617; X64-LABEL: test_mm256_mask_movehdup_ps: 2618; X64: # %bb.0: 
2619; X64-NEXT: kmovw %edi, %k1 2620; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} = ymm1[1,1,3,3,5,5,7,7] 2621; X64-NEXT: retq 2622 %arg1 = bitcast i8 %a1 to <8 x i1> 2623 %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> 2624 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0 2625 ret <8 x float> %res1 2626} 2627 2628define <8 x float> @test_mm256_maskz_movehdup_ps(i8 %a0, <8 x float> %a1) { 2629; X86-LABEL: test_mm256_maskz_movehdup_ps: 2630; X86: # %bb.0: 2631; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2632; X86-NEXT: kmovw %eax, %k1 2633; X86-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] 2634; X86-NEXT: retl 2635; 2636; X64-LABEL: test_mm256_maskz_movehdup_ps: 2637; X64: # %bb.0: 2638; X64-NEXT: kmovw %edi, %k1 2639; X64-NEXT: vmovshdup {{.*#+}} ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] 2640; X64-NEXT: retq 2641 %arg0 = bitcast i8 %a0 to <8 x i1> 2642 %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7> 2643 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer 2644 ret <8 x float> %res1 2645} 2646 2647define <4 x float> @test_mm_moveldup_ps(<4 x float> %a0) { 2648; CHECK-LABEL: test_mm_moveldup_ps: 2649; CHECK: # %bb.0: 2650; CHECK-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] 2651; CHECK-NEXT: ret{{[l|q]}} 2652 %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 2653 ret <4 x float> %res 2654} 2655 2656define <4 x float> @test_mm_mask_moveldup_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) { 2657; X86-LABEL: test_mm_mask_moveldup_ps: 2658; X86: # %bb.0: # %entry 2659; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2660; X86-NEXT: kmovw %eax, %k1 2661; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2] 2662; X86-NEXT: retl 2663; 2664; X64-LABEL: test_mm_mask_moveldup_ps: 2665; X64: # %bb.0: # %entry 2666; X64-NEXT: kmovw 
%edi, %k1 2667; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} = xmm1[0,0,2,2] 2668; X64-NEXT: retq 2669entry: 2670 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 2671 %0 = bitcast i8 %__U to <8 x i1> 2672 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2673 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> %__W 2674 ret <4 x float> %1 2675} 2676 2677define <4 x float> @test_mm_maskz_moveldup_ps(i8 zeroext %__U, <4 x float> %__A) { 2678; X86-LABEL: test_mm_maskz_moveldup_ps: 2679; X86: # %bb.0: # %entry 2680; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2681; X86-NEXT: kmovw %eax, %k1 2682; X86-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] 2683; X86-NEXT: retl 2684; 2685; X64-LABEL: test_mm_maskz_moveldup_ps: 2686; X64: # %bb.0: # %entry 2687; X64-NEXT: kmovw %edi, %k1 2688; X64-NEXT: vmovsldup {{.*#+}} xmm0 {%k1} {z} = xmm0[0,0,2,2] 2689; X64-NEXT: retq 2690entry: 2691 %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2> 2692 %0 = bitcast i8 %__U to <8 x i1> 2693 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2694 %1 = select <4 x i1> %extract.i, <4 x float> %shuffle.i.i, <4 x float> zeroinitializer 2695 ret <4 x float> %1 2696} 2697 2698define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) { 2699; CHECK-LABEL: test_mm256_moveldup_ps: 2700; CHECK: # %bb.0: 2701; CHECK-NEXT: vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] 2702; CHECK-NEXT: ret{{[l|q]}} 2703 %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 2704 ret <8 x float> %res 2705} 2706 2707define <8 x float> @test_mm256_mask_moveldup_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2) { 2708; X86-LABEL: test_mm256_mask_moveldup_ps: 2709; X86: # %bb.0: 2710; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2711; X86-NEXT: kmovw 
%eax, %k1 2712; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6] 2713; X86-NEXT: retl 2714; 2715; X64-LABEL: test_mm256_mask_moveldup_ps: 2716; X64: # %bb.0: 2717; X64-NEXT: kmovw %edi, %k1 2718; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} = ymm1[0,0,2,2,4,4,6,6] 2719; X64-NEXT: retq 2720 %arg1 = bitcast i8 %a1 to <8 x i1> 2721 %res0 = shufflevector <8 x float> %a2, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 2722 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0 2723 ret <8 x float> %res1 2724} 2725 2726define <8 x float> @test_mm256_maskz_moveldup_ps(i8 %a0, <8 x float> %a1) { 2727; X86-LABEL: test_mm256_maskz_moveldup_ps: 2728; X86: # %bb.0: 2729; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2730; X86-NEXT: kmovw %eax, %k1 2731; X86-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] 2732; X86-NEXT: retl 2733; 2734; X64-LABEL: test_mm256_maskz_moveldup_ps: 2735; X64: # %bb.0: 2736; X64-NEXT: kmovw %edi, %k1 2737; X64-NEXT: vmovsldup {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] 2738; X64-NEXT: retq 2739 %arg0 = bitcast i8 %a0 to <8 x i1> 2740 %res0 = shufflevector <8 x float> %a1, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 2741 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer 2742 ret <8 x float> %res1 2743} 2744 2745define <4 x i64> @test_mm256_permutex_epi64(<4 x i64> %a0) { 2746; CHECK-LABEL: test_mm256_permutex_epi64: 2747; CHECK: # %bb.0: 2748; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0] 2749; CHECK-NEXT: ret{{[l|q]}} 2750 %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0> 2751 ret <4 x i64> %res 2752} 2753 2754define <4 x i64> @test_mm256_mask_permutex_epi64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X) { 2755; X86-LABEL: test_mm256_mask_permutex_epi64: 2756; X86: # %bb.0: # %entry 2757; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2758; X86-NEXT: kmovw %eax, %k1 2759; 
X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0] 2760; X86-NEXT: retl 2761; 2762; X64-LABEL: test_mm256_mask_permutex_epi64: 2763; X64: # %bb.0: # %entry 2764; X64-NEXT: kmovw %edi, %k1 2765; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm1[3,0,0,0] 2766; X64-NEXT: retq 2767entry: 2768 %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0> 2769 %0 = bitcast i8 %__M to <8 x i1> 2770 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2771 %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> %__W 2772 ret <4 x i64> %1 2773} 2774 2775define <4 x i64> @test_mm256_maskz_permutex_epi64(i8 zeroext %__M, <4 x i64> %__X) { 2776; X86-LABEL: test_mm256_maskz_permutex_epi64: 2777; X86: # %bb.0: # %entry 2778; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2779; X86-NEXT: kmovw %eax, %k1 2780; X86-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] 2781; X86-NEXT: retl 2782; 2783; X64-LABEL: test_mm256_maskz_permutex_epi64: 2784; X64: # %bb.0: # %entry 2785; X64-NEXT: kmovw %edi, %k1 2786; X64-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] 2787; X64-NEXT: retq 2788entry: 2789 %perm = shufflevector <4 x i64> %__X, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0> 2790 %0 = bitcast i8 %__M to <8 x i1> 2791 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2792 %1 = select <4 x i1> %extract, <4 x i64> %perm, <4 x i64> zeroinitializer 2793 ret <4 x i64> %1 2794} 2795 2796define <4 x double> @test_mm256_permutex_pd(<4 x double> %a0) { 2797; CHECK-LABEL: test_mm256_permutex_pd: 2798; CHECK: # %bb.0: 2799; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,0,0,0] 2800; CHECK-NEXT: ret{{[l|q]}} 2801 %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0> 2802 ret <4 x double> %res 2803} 2804 2805define <4 x double> @test_mm256_mask_permutex_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__X) { 2806; X86-LABEL: 
test_mm256_mask_permutex_pd: 2807; X86: # %bb.0: # %entry 2808; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2809; X86-NEXT: kmovw %eax, %k1 2810; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] 2811; X86-NEXT: retl 2812; 2813; X64-LABEL: test_mm256_mask_permutex_pd: 2814; X64: # %bb.0: # %entry 2815; X64-NEXT: kmovw %edi, %k1 2816; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm1[1,0,0,0] 2817; X64-NEXT: retq 2818entry: 2819 %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 2820 %0 = bitcast i8 %__U to <8 x i1> 2821 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2822 %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> %__W 2823 ret <4 x double> %1 2824} 2825 2826define <4 x double> @test_mm256_maskz_permutex_pd(i8 zeroext %__U, <4 x double> %__X) { 2827; X86-LABEL: test_mm256_maskz_permutex_pd: 2828; X86: # %bb.0: # %entry 2829; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2830; X86-NEXT: kmovw %eax, %k1 2831; X86-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] 2832; X86-NEXT: retl 2833; 2834; X64-LABEL: test_mm256_maskz_permutex_pd: 2835; X64: # %bb.0: # %entry 2836; X64-NEXT: kmovw %edi, %k1 2837; X64-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,0,0,0] 2838; X64-NEXT: retq 2839entry: 2840 %perm = shufflevector <4 x double> %__X, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0> 2841 %0 = bitcast i8 %__U to <8 x i1> 2842 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2843 %1 = select <4 x i1> %extract, <4 x double> %perm, <4 x double> zeroinitializer 2844 ret <4 x double> %1 2845} 2846 2847define <2 x double> @test_mm_shuffle_pd(<2 x double> %a0, <2 x double> %a1) { 2848; CHECK-LABEL: test_mm_shuffle_pd: 2849; CHECK: # %bb.0: 2850; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 2851; CHECK-NEXT: ret{{[l|q]}} 2852 %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3> 2853 ret 
<2 x double> %res 2854} 2855 2856define <2 x double> @test_mm_mask_shuffle_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2857; X86-LABEL: test_mm_mask_shuffle_pd: 2858; X86: # %bb.0: # %entry 2859; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2860; X86-NEXT: kmovw %eax, %k1 2861; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1] 2862; X86-NEXT: retl 2863; 2864; X64-LABEL: test_mm_mask_shuffle_pd: 2865; X64: # %bb.0: # %entry 2866; X64-NEXT: kmovw %edi, %k1 2867; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} = xmm1[1],xmm2[1] 2868; X64-NEXT: retq 2869entry: 2870 %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3> 2871 %0 = bitcast i8 %__U to <8 x i1> 2872 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 2873 %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> %__W 2874 ret <2 x double> %1 2875} 2876 2877define <2 x double> @test_mm_maskz_shuffle_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2878; X86-LABEL: test_mm_maskz_shuffle_pd: 2879; X86: # %bb.0: # %entry 2880; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2881; X86-NEXT: kmovw %eax, %k1 2882; X86-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] 2883; X86-NEXT: retl 2884; 2885; X64-LABEL: test_mm_maskz_shuffle_pd: 2886; X64: # %bb.0: # %entry 2887; X64-NEXT: kmovw %edi, %k1 2888; X64-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] 2889; X64-NEXT: retq 2890entry: 2891 %shufp = shufflevector <2 x double> %__A, <2 x double> %__B, <2 x i32> <i32 1, i32 3> 2892 %0 = bitcast i8 %__U to <8 x i1> 2893 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 2894 %1 = select <2 x i1> %extract, <2 x double> %shufp, <2 x double> zeroinitializer 2895 ret <2 x double> %1 2896} 2897 2898define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) { 2899; CHECK-LABEL: test_mm256_shuffle_pd: 2900; CHECK: # %bb.0: 2901; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = 
ymm0[1],ymm1[1],ymm0[2],ymm1[2] 2902; CHECK-NEXT: ret{{[l|q]}} 2903 %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 2, i32 6> 2904 ret <4 x double> %res 2905} 2906 2907define <4 x double> @test_mm256_mask_shuffle_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { 2908; X86-LABEL: test_mm256_mask_shuffle_pd: 2909; X86: # %bb.0: # %entry 2910; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2911; X86-NEXT: kmovw %eax, %k1 2912; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2] 2913; X86-NEXT: retl 2914; 2915; X64-LABEL: test_mm256_mask_shuffle_pd: 2916; X64: # %bb.0: # %entry 2917; X64-NEXT: kmovw %edi, %k1 2918; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} = ymm1[1],ymm2[1],ymm1[2],ymm2[2] 2919; X64-NEXT: retq 2920entry: 2921 %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6> 2922 %0 = bitcast i8 %__U to <8 x i1> 2923 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2924 %1 = select <4 x i1> %extract, <4 x double> %shufp, <4 x double> %__W 2925 ret <4 x double> %1 2926} 2927 2928define <4 x double> @test_mm256_maskz_shuffle_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B) { 2929; X86-LABEL: test_mm256_maskz_shuffle_pd: 2930; X86: # %bb.0: # %entry 2931; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2932; X86-NEXT: kmovw %eax, %k1 2933; X86-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2] 2934; X86-NEXT: retl 2935; 2936; X64-LABEL: test_mm256_maskz_shuffle_pd: 2937; X64: # %bb.0: # %entry 2938; X64-NEXT: kmovw %edi, %k1 2939; X64-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[2],ymm1[2] 2940; X64-NEXT: retq 2941entry: 2942 %shufp = shufflevector <4 x double> %__A, <4 x double> %__B, <4 x i32> <i32 1, i32 5, i32 2, i32 6> 2943 %0 = bitcast i8 %__U to <8 x i1> 2944 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2945 %1 = select 
<4 x i1> %extract, <4 x double> %shufp, <4 x double> zeroinitializer 2946 ret <4 x double> %1 2947} 2948 2949define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) { 2950; CHECK-LABEL: test_mm_shuffle_ps: 2951; CHECK: # %bb.0: 2952; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0] 2953; CHECK-NEXT: ret{{[l|q]}} 2954 %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 4> 2955 ret <4 x float> %res 2956} 2957 2958define <4 x float> @test_mm_mask_shuffle_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2959; X86-LABEL: test_mm_mask_shuffle_ps: 2960; X86: # %bb.0: # %entry 2961; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2962; X86-NEXT: kmovw %eax, %k1 2963; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0] 2964; X86-NEXT: retl 2965; 2966; X64-LABEL: test_mm_mask_shuffle_ps: 2967; X64: # %bb.0: # %entry 2968; X64-NEXT: kmovw %edi, %k1 2969; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} = xmm1[0,1],xmm2[0,0] 2970; X64-NEXT: retq 2971entry: 2972 %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, <4 x i32> <i32 0, i32 1, i32 4, i32 4> 2973 %0 = bitcast i8 %__U to <8 x i1> 2974 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2975 %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> %__W 2976 ret <4 x float> %1 2977} 2978 2979define <4 x float> @test_mm_maskz_shuffle_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2980; X86-LABEL: test_mm_maskz_shuffle_ps: 2981; X86: # %bb.0: # %entry 2982; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2983; X86-NEXT: kmovw %eax, %k1 2984; X86-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0] 2985; X86-NEXT: retl 2986; 2987; X64-LABEL: test_mm_maskz_shuffle_ps: 2988; X64: # %bb.0: # %entry 2989; X64-NEXT: kmovw %edi, %k1 2990; X64-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1],xmm1[0,0] 2991; X64-NEXT: retq 2992entry: 2993 %shufp = shufflevector <4 x float> %__A, <4 x float> %__B, 
<4 x i32> <i32 0, i32 1, i32 4, i32 4> 2994 %0 = bitcast i8 %__U to <8 x i1> 2995 %extract = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 2996 %1 = select <4 x i1> %extract, <4 x float> %shufp, <4 x float> zeroinitializer 2997 ret <4 x float> %1 2998} 2999 3000define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) { 3001; CHECK-LABEL: test_mm256_shuffle_ps: 3002; CHECK: # %bb.0: 3003; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4] 3004; CHECK-NEXT: ret{{[l|q]}} 3005 %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12> 3006 ret <8 x float> %res 3007} 3008 3009define <8 x float> @test_mm256_mask_shuffle_ps(<8 x float> %a0, i8 %a1, <8 x float> %a2, <8 x float> %a3) { 3010; X86-LABEL: test_mm256_mask_shuffle_ps: 3011; X86: # %bb.0: 3012; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3013; X86-NEXT: kmovw %eax, %k1 3014; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4] 3015; X86-NEXT: retl 3016; 3017; X64-LABEL: test_mm256_mask_shuffle_ps: 3018; X64: # %bb.0: 3019; X64-NEXT: kmovw %edi, %k1 3020; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} = ymm1[0,1],ymm2[0,0],ymm1[4,5],ymm2[4,4] 3021; X64-NEXT: retq 3022 %arg1 = bitcast i8 %a1 to <8 x i1> 3023 %res0 = shufflevector <8 x float> %a2, <8 x float> %a3, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12> 3024 %res1 = select <8 x i1> %arg1, <8 x float> %res0, <8 x float> %a0 3025 ret <8 x float> %res1 3026} 3027 3028define <8 x float> @test_mm256_maskz_shuffle_ps(i8 %a0, <8 x float> %a1, <8 x float> %a2) { 3029; X86-LABEL: test_mm256_maskz_shuffle_ps: 3030; X86: # %bb.0: 3031; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3032; X86-NEXT: kmovw %eax, %k1 3033; X86-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4] 3034; X86-NEXT: retl 3035; 3036; X64-LABEL: test_mm256_maskz_shuffle_ps: 3037; X64: # %bb.0: 3038; 
X64-NEXT: kmovw %edi, %k1 3039; X64-NEXT: vshufps {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1],ymm1[0,0],ymm0[4,5],ymm1[4,4] 3040; X64-NEXT: retq 3041 %arg0 = bitcast i8 %a0 to <8 x i1> 3042 %res0 = shufflevector <8 x float> %a1, <8 x float> %a2, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12> 3043 %res1 = select <8 x i1> %arg0, <8 x float> %res0, <8 x float> zeroinitializer 3044 ret <8 x float> %res1 3045} 3046 3047define <4 x i64> @test_mm256_mask_mul_epi32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind { 3048; X86-LABEL: test_mm256_mask_mul_epi32: 3049; X86: # %bb.0: # %entry 3050; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3051; X86-NEXT: kmovw %eax, %k1 3052; X86-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1} 3053; X86-NEXT: retl 3054; 3055; X64-LABEL: test_mm256_mask_mul_epi32: 3056; X64: # %bb.0: # %entry 3057; X64-NEXT: kmovw %edi, %k1 3058; X64-NEXT: vpmuldq %ymm1, %ymm2, %ymm0 {%k1} 3059; X64-NEXT: retq 3060entry: 3061 %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32> 3062 %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32> 3063 %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32> 3064 %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32> 3065 %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1 3066 %tmp5 = bitcast i8 %__M to <8 x i1> 3067 %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3068 %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> %__W 3069 ret <4 x i64> %tmp6 3070} 3071 3072define <4 x i64> @test_mm256_maskz_mul_epi32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind { 3073; X86-LABEL: test_mm256_maskz_mul_epi32: 3074; X86: # %bb.0: 3075; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3076; X86-NEXT: kmovw %eax, %k1 3077; X86-NEXT: vpmuldq %ymm0, %ymm1, %ymm0 {%k1} {z} 3078; X86-NEXT: retl 3079; 3080; X64-LABEL: test_mm256_maskz_mul_epi32: 3081; X64: # %bb.0: 3082; X64-NEXT: kmovw %edi, %k1 3083; X64-NEXT: vpmuldq %ymm0, 
%ymm1, %ymm0 {%k1} {z} 3084; X64-NEXT: retq 3085 %tmp = shl <4 x i64> %__X, <i64 32, i64 32, i64 32, i64 32> 3086 %tmp1 = ashr exact <4 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32> 3087 %tmp2 = shl <4 x i64> %__Y, <i64 32, i64 32, i64 32, i64 32> 3088 %tmp3 = ashr exact <4 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32> 3089 %tmp4 = mul nsw <4 x i64> %tmp3, %tmp1 3090 %tmp5 = bitcast i8 %__M to <8 x i1> 3091 %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3092 %tmp6 = select <4 x i1> %extract.i, <4 x i64> %tmp4, <4 x i64> zeroinitializer 3093 ret <4 x i64> %tmp6 3094} 3095 3096define <2 x i64> @test_mm_mask_mul_epi32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind { 3097; X86-LABEL: test_mm_mask_mul_epi32: 3098; X86: # %bb.0: 3099; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3100; X86-NEXT: kmovw %eax, %k1 3101; X86-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1} 3102; X86-NEXT: retl 3103; 3104; X64-LABEL: test_mm_mask_mul_epi32: 3105; X64: # %bb.0: 3106; X64-NEXT: kmovw %edi, %k1 3107; X64-NEXT: vpmuldq %xmm1, %xmm2, %xmm0 {%k1} 3108; X64-NEXT: retq 3109 %tmp = shl <2 x i64> %__X, <i64 32, i64 32> 3110 %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32> 3111 %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32> 3112 %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32> 3113 %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1 3114 %tmp5 = bitcast i8 %__M to <8 x i1> 3115 %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3116 %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> %__W 3117 ret <2 x i64> %tmp6 3118} 3119 3120define <2 x i64> @test_mm_maskz_mul_epi32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind { 3121; X86-LABEL: test_mm_maskz_mul_epi32: 3122; X86: # %bb.0: 3123; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3124; X86-NEXT: kmovw %eax, %k1 3125; X86-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z} 3126; X86-NEXT: retl 3127; 3128; X64-LABEL: test_mm_maskz_mul_epi32: 3129; 
X64: # %bb.0: 3130; X64-NEXT: kmovw %edi, %k1 3131; X64-NEXT: vpmuldq %xmm0, %xmm1, %xmm0 {%k1} {z} 3132; X64-NEXT: retq 3133 %tmp = shl <2 x i64> %__X, <i64 32, i64 32> 3134 %tmp1 = ashr exact <2 x i64> %tmp, <i64 32, i64 32> 3135 %tmp2 = shl <2 x i64> %__Y, <i64 32, i64 32> 3136 %tmp3 = ashr exact <2 x i64> %tmp2, <i64 32, i64 32> 3137 %tmp4 = mul nsw <2 x i64> %tmp3, %tmp1 3138 %tmp5 = bitcast i8 %__M to <8 x i1> 3139 %extract.i = shufflevector <8 x i1> %tmp5, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3140 %tmp6 = select <2 x i1> %extract.i, <2 x i64> %tmp4, <2 x i64> zeroinitializer 3141 ret <2 x i64> %tmp6 3142} 3143 3144define <4 x i64> @test_mm256_mask_mul_epu32(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind { 3145; X86-LABEL: test_mm256_mask_mul_epu32: 3146; X86: # %bb.0: # %entry 3147; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3148; X86-NEXT: kmovw %eax, %k1 3149; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1} 3150; X86-NEXT: retl 3151; 3152; X64-LABEL: test_mm256_mask_mul_epu32: 3153; X64: # %bb.0: # %entry 3154; X64-NEXT: kmovw %edi, %k1 3155; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm0 {%k1} 3156; X64-NEXT: retq 3157entry: 3158 %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 3159 %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 3160 %tmp2 = mul nuw <4 x i64> %tmp1, %tmp 3161 %tmp3 = bitcast i8 %__M to <8 x i1> 3162 %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3163 %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> %__W 3164 ret <4 x i64> %tmp4 3165} 3166 3167define <4 x i64> @test_mm256_maskz_mul_epu32(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) nounwind { 3168; X86-LABEL: test_mm256_maskz_mul_epu32: 3169; X86: # %bb.0: # %entry 3170; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3171; X86-NEXT: kmovw %eax, %k1 3172; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z} 3173; X86-NEXT: retl 3174; 
3175; X64-LABEL: test_mm256_maskz_mul_epu32: 3176; X64: # %bb.0: # %entry 3177; X64-NEXT: kmovw %edi, %k1 3178; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 {%k1} {z} 3179; X64-NEXT: retq 3180entry: 3181 %tmp = and <4 x i64> %__X, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 3182 %tmp1 = and <4 x i64> %__Y, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 3183 %tmp2 = mul nuw <4 x i64> %tmp1, %tmp 3184 %tmp3 = bitcast i8 %__M to <8 x i1> 3185 %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3186 %tmp4 = select <4 x i1> %extract.i, <4 x i64> %tmp2, <4 x i64> zeroinitializer 3187 ret <4 x i64> %tmp4 3188} 3189 3190define <2 x i64> @test_mm_mask_mul_epu32(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind { 3191; X86-LABEL: test_mm_mask_mul_epu32: 3192; X86: # %bb.0: # %entry 3193; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3194; X86-NEXT: kmovw %eax, %k1 3195; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1} 3196; X86-NEXT: retl 3197; 3198; X64-LABEL: test_mm_mask_mul_epu32: 3199; X64: # %bb.0: # %entry 3200; X64-NEXT: kmovw %edi, %k1 3201; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm0 {%k1} 3202; X64-NEXT: retq 3203entry: 3204 %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295> 3205 %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295> 3206 %tmp2 = mul nuw <2 x i64> %tmp1, %tmp 3207 %tmp3 = bitcast i8 %__M to <8 x i1> 3208 %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3209 %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> %__W 3210 ret <2 x i64> %tmp4 3211} 3212 3213define <2 x i64> @test_mm_maskz_mul_epu32(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) nounwind { 3214; X86-LABEL: test_mm_maskz_mul_epu32: 3215; X86: # %bb.0: # %entry 3216; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3217; X86-NEXT: kmovw %eax, %k1 3218; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z} 3219; X86-NEXT: retl 3220; 3221; X64-LABEL: 
test_mm_maskz_mul_epu32: 3222; X64: # %bb.0: # %entry 3223; X64-NEXT: kmovw %edi, %k1 3224; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 {%k1} {z} 3225; X64-NEXT: retq 3226entry: 3227 %tmp = and <2 x i64> %__X, <i64 4294967295, i64 4294967295> 3228 %tmp1 = and <2 x i64> %__Y, <i64 4294967295, i64 4294967295> 3229 %tmp2 = mul nuw <2 x i64> %tmp1, %tmp 3230 %tmp3 = bitcast i8 %__M to <8 x i1> 3231 %extract.i = shufflevector <8 x i1> %tmp3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3232 %tmp4 = select <2 x i1> %extract.i, <2 x i64> %tmp2, <2 x i64> zeroinitializer 3233 ret <2 x i64> %tmp4 3234} 3235 3236define <2 x i64> @test_mm_cvtepi32_epi8(<2 x i64> %__A) { 3237; CHECK-LABEL: test_mm_cvtepi32_epi8: 3238; CHECK: # %bb.0: # %entry 3239; CHECK-NEXT: vpmovdb %xmm0, %xmm0 3240; CHECK-NEXT: ret{{[l|q]}} 3241entry: 3242 %0 = bitcast <2 x i64> %__A to <4 x i32> 3243 %conv.i = trunc <4 x i32> %0 to <4 x i8> 3244 %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> 3245 %1 = bitcast <16 x i8> %shuf.i to <2 x i64> 3246 ret <2 x i64> %1 3247} 3248 3249define <2 x i64> @test_mm_cvtepi32_epi16(<2 x i64> %__A) { 3250; CHECK-LABEL: test_mm_cvtepi32_epi16: 3251; CHECK: # %bb.0: # %entry 3252; CHECK-NEXT: vpmovdw %xmm0, %xmm0 3253; CHECK-NEXT: ret{{[l|q]}} 3254entry: 3255 %0 = bitcast <2 x i64> %__A to <4 x i32> 3256 %conv.i = trunc <4 x i32> %0 to <4 x i16> 3257 %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3258 %1 = bitcast <8 x i16> %shuf.i to <2 x i64> 3259 ret <2 x i64> %1 3260} 3261 3262define <2 x i64> @test_mm_cvtepi64_epi8(<2 x i64> %__A) { 3263; CHECK-LABEL: test_mm_cvtepi64_epi8: 3264; CHECK: # %bb.0: # %entry 3265; CHECK-NEXT: vpmovqb %xmm0, %xmm0 3266; CHECK-NEXT: ret{{[l|q]}} 3267entry: 3268 %conv.i = trunc <2 x i64> %__A to <2 x i8> 3269 %shuf.i = 
shufflevector <2 x i8> %conv.i, <2 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 3270 %0 = bitcast <16 x i8> %shuf.i to <2 x i64> 3271 ret <2 x i64> %0 3272} 3273 3274define <2 x i64> @test_mm_cvtepi64_epi16(<2 x i64> %__A) { 3275; CHECK-LABEL: test_mm_cvtepi64_epi16: 3276; CHECK: # %bb.0: # %entry 3277; CHECK-NEXT: vpmovqw %xmm0, %xmm0 3278; CHECK-NEXT: ret{{[l|q]}} 3279entry: 3280 %conv.i = trunc <2 x i64> %__A to <2 x i16> 3281 %shuf.i = shufflevector <2 x i16> %conv.i, <2 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 3282 %0 = bitcast <8 x i16> %shuf.i to <2 x i64> 3283 ret <2 x i64> %0 3284} 3285 3286define <2 x i64> @test_mm_cvtepi64_epi32(<2 x i64> %__A) { 3287; CHECK-LABEL: test_mm_cvtepi64_epi32: 3288; CHECK: # %bb.0: # %entry 3289; CHECK-NEXT: vpmovqd %xmm0, %xmm0 3290; CHECK-NEXT: ret{{[l|q]}} 3291entry: 3292 %conv.i = trunc <2 x i64> %__A to <2 x i32> 3293 %shuf.i = shufflevector <2 x i32> %conv.i, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3294 %0 = bitcast <4 x i32> %shuf.i to <2 x i64> 3295 ret <2 x i64> %0 3296} 3297 3298define <2 x i64> @test_mm256_cvtepi32_epi16(<4 x i64> %__A) local_unnamed_addr #0 { 3299; CHECK-LABEL: test_mm256_cvtepi32_epi16: 3300; CHECK: # %bb.0: # %entry 3301; CHECK-NEXT: vpmovdw %ymm0, %xmm0 3302; CHECK-NEXT: vzeroupper 3303; CHECK-NEXT: ret{{[l|q]}} 3304entry: 3305 %0 = bitcast <4 x i64> %__A to <8 x i32> 3306 %conv.i = trunc <8 x i32> %0 to <8 x i16> 3307 %1 = bitcast <8 x i16> %conv.i to <2 x i64> 3308 ret <2 x i64> %1 3309} 3310 3311define <2 x i64> @test_mm256_mask_cvtepi32_epi16(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) { 3312; X86-LABEL: test_mm256_mask_cvtepi32_epi16: 3313; X86: # %bb.0: # %entry 3314; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3315; X86-NEXT: kmovw %eax, %k1 3316; X86-NEXT: vpmovdw %ymm1, %xmm0 {%k1} 3317; X86-NEXT: vzeroupper 
3318; X86-NEXT: retl 3319; 3320; X64-LABEL: test_mm256_mask_cvtepi32_epi16: 3321; X64: # %bb.0: # %entry 3322; X64-NEXT: kmovw %edi, %k1 3323; X64-NEXT: vpmovdw %ymm1, %xmm0 {%k1} 3324; X64-NEXT: vzeroupper 3325; X64-NEXT: retq 3326entry: 3327 %0 = bitcast <4 x i64> %__A to <8 x i32> 3328 %1 = bitcast <2 x i64> %__O to <8 x i16> 3329 %2 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> %1, i8 %__M) 3330 %3 = bitcast <8 x i16> %2 to <2 x i64> 3331 ret <2 x i64> %3 3332} 3333 3334define <2 x i64> @test_mm256_maskz_cvtepi32_epi16(i8 zeroext %__M, <4 x i64> %__A) { 3335; X86-LABEL: test_mm256_maskz_cvtepi32_epi16: 3336; X86: # %bb.0: # %entry 3337; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3338; X86-NEXT: kmovw %eax, %k1 3339; X86-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z} 3340; X86-NEXT: vzeroupper 3341; X86-NEXT: retl 3342; 3343; X64-LABEL: test_mm256_maskz_cvtepi32_epi16: 3344; X64: # %bb.0: # %entry 3345; X64-NEXT: kmovw %edi, %k1 3346; X64-NEXT: vpmovdw %ymm0, %xmm0 {%k1} {z} 3347; X64-NEXT: vzeroupper 3348; X64-NEXT: retq 3349entry: 3350 %0 = bitcast <4 x i64> %__A to <8 x i32> 3351 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %0, <8 x i16> zeroinitializer, i8 %__M) 3352 %2 = bitcast <8 x i16> %1 to <2 x i64> 3353 ret <2 x i64> %2 3354} 3355 3356define <2 x i64> @test_mm256_cvtepi64_epi32(<4 x i64> %__A) local_unnamed_addr #0 { 3357; CHECK-LABEL: test_mm256_cvtepi64_epi32: 3358; CHECK: # %bb.0: # %entry 3359; CHECK-NEXT: vpmovqd %ymm0, %xmm0 3360; CHECK-NEXT: vzeroupper 3361; CHECK-NEXT: ret{{[l|q]}} 3362entry: 3363 %conv.i = trunc <4 x i64> %__A to <4 x i32> 3364 %0 = bitcast <4 x i32> %conv.i to <2 x i64> 3365 ret <2 x i64> %0 3366} 3367 3368define <2 x i64> @test_mm256_mask_cvtepi64_epi32(<2 x i64> %__O, i8 zeroext %__M, <4 x i64> %__A) { 3369; X86-LABEL: test_mm256_mask_cvtepi64_epi32: 3370; X86: # %bb.0: # %entry 3371; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3372; X86-NEXT: kmovw %eax, %k1 3373; X86-NEXT: vpmovqd 
%ymm1, %xmm0 {%k1} 3374; X86-NEXT: vzeroupper 3375; X86-NEXT: retl 3376; 3377; X64-LABEL: test_mm256_mask_cvtepi64_epi32: 3378; X64: # %bb.0: # %entry 3379; X64-NEXT: kmovw %edi, %k1 3380; X64-NEXT: vpmovqd %ymm1, %xmm0 {%k1} 3381; X64-NEXT: vzeroupper 3382; X64-NEXT: retq 3383entry: 3384 %conv.i.i = trunc <4 x i64> %__A to <4 x i32> 3385 %0 = bitcast <2 x i64> %__O to <4 x i32> 3386 %1 = bitcast i8 %__M to <8 x i1> 3387 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3388 %2 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> %0 3389 %3 = bitcast <4 x i32> %2 to <2 x i64> 3390 ret <2 x i64> %3 3391} 3392 3393define <2 x i64> @test_mm256_maskz_cvtepi64_epi32(i8 zeroext %__M, <4 x i64> %__A) { 3394; X86-LABEL: test_mm256_maskz_cvtepi64_epi32: 3395; X86: # %bb.0: # %entry 3396; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3397; X86-NEXT: kmovw %eax, %k1 3398; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} 3399; X86-NEXT: vzeroupper 3400; X86-NEXT: retl 3401; 3402; X64-LABEL: test_mm256_maskz_cvtepi64_epi32: 3403; X64: # %bb.0: # %entry 3404; X64-NEXT: kmovw %edi, %k1 3405; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} 3406; X64-NEXT: vzeroupper 3407; X64-NEXT: retq 3408entry: 3409 %conv.i.i = trunc <4 x i64> %__A to <4 x i32> 3410 %0 = bitcast i8 %__M to <8 x i1> 3411 %extract.i = shufflevector <8 x i1> %0, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3412 %1 = select <4 x i1> %extract.i, <4 x i32> %conv.i.i, <4 x i32> zeroinitializer 3413 %2 = bitcast <4 x i32> %1 to <2 x i64> 3414 ret <2 x i64> %2 3415} 3416 3417define <2 x i64> @test_mm256_cvtepi64_epi8(<4 x i64> %__A) { 3418; CHECK-LABEL: test_mm256_cvtepi64_epi8: 3419; CHECK: # %bb.0: # %entry 3420; CHECK-NEXT: vpmovqb %ymm0, %xmm0 3421; CHECK-NEXT: vzeroupper 3422; CHECK-NEXT: ret{{[l|q]}} 3423entry: 3424 %conv.i = trunc <4 x i64> %__A to <4 x i8> 3425 %shuf.i = shufflevector <4 x i8> %conv.i, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, 
i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> 3426 %0 = bitcast <16 x i8> %shuf.i to <2 x i64> 3427 ret <2 x i64> %0 3428} 3429 3430define <2 x i64> @test_mm256_cvtepi64_epi16(<4 x i64> %__A) { 3431; CHECK-LABEL: test_mm256_cvtepi64_epi16: 3432; CHECK: # %bb.0: # %entry 3433; CHECK-NEXT: vpmovqw %ymm0, %xmm0 3434; CHECK-NEXT: vzeroupper 3435; CHECK-NEXT: ret{{[l|q]}} 3436entry: 3437 %conv.i = trunc <4 x i64> %__A to <4 x i16> 3438 %shuf.i = shufflevector <4 x i16> %conv.i, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3439 %0 = bitcast <8 x i16> %shuf.i to <2 x i64> 3440 ret <2 x i64> %0 3441} 3442 3443define <2 x i64> @test_mm256_cvtepi32_epi8(<4 x i64> %__A) { 3444; CHECK-LABEL: test_mm256_cvtepi32_epi8: 3445; CHECK: # %bb.0: # %entry 3446; CHECK-NEXT: vpmovdb %ymm0, %xmm0 3447; CHECK-NEXT: vzeroupper 3448; CHECK-NEXT: ret{{[l|q]}} 3449entry: 3450 %0 = bitcast <4 x i64> %__A to <8 x i32> 3451 %conv.i = trunc <8 x i32> %0 to <8 x i8> 3452 %shuf.i = shufflevector <8 x i8> %conv.i, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3453 %1 = bitcast <16 x i8> %shuf.i to <2 x i64> 3454 ret <2 x i64> %1 3455} 3456 3457define <2 x i64> @test_mm_ternarylogic_epi32(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { 3458; CHECK-LABEL: test_mm_ternarylogic_epi32: 3459; CHECK: # %bb.0: # %entry 3460; CHECK-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 3461; CHECK-NEXT: ret{{[l|q]}} 3462entry: 3463 %0 = bitcast <2 x i64> %__A to <4 x i32> 3464 %1 = bitcast <2 x i64> %__B to <4 x i32> 3465 %2 = bitcast <2 x i64> %__C to <4 x i32> 3466 %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4) 3467 %4 = bitcast <4 x i32> %3 to <2 x i64> 3468 ret <2 x i64> %4 3469} 3470 3471declare <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, 
i32) #2 3472 3473define <2 x i64> @test_mm_mask_ternarylogic_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) { 3474; X86-LABEL: test_mm_mask_ternarylogic_epi32: 3475; X86: # %bb.0: # %entry 3476; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3477; X86-NEXT: kmovw %eax, %k1 3478; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} 3479; X86-NEXT: retl 3480; 3481; X64-LABEL: test_mm_mask_ternarylogic_epi32: 3482; X64: # %bb.0: # %entry 3483; X64-NEXT: kmovw %edi, %k1 3484; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} 3485; X64-NEXT: retq 3486entry: 3487 %0 = bitcast <2 x i64> %__A to <4 x i32> 3488 %1 = bitcast <2 x i64> %__B to <4 x i32> 3489 %2 = bitcast <2 x i64> %__C to <4 x i32> 3490 %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4) 3491 %4 = bitcast i8 %__U to <8 x i1> 3492 %extract = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3493 %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> %0 3494 %6 = bitcast <4 x i32> %5 to <2 x i64> 3495 ret <2 x i64> %6 3496} 3497 3498define <2 x i64> @test_mm_maskz_ternarylogic_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { 3499; X86-LABEL: test_mm_maskz_ternarylogic_epi32: 3500; X86: # %bb.0: # %entry 3501; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3502; X86-NEXT: kmovw %eax, %k1 3503; X86-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z} 3504; X86-NEXT: retl 3505; 3506; X64-LABEL: test_mm_maskz_ternarylogic_epi32: 3507; X64: # %bb.0: # %entry 3508; X64-NEXT: kmovw %edi, %k1 3509; X64-NEXT: vpternlogd $4, %xmm2, %xmm1, %xmm0 {%k1} {z} 3510; X64-NEXT: retq 3511entry: 3512 %0 = bitcast <2 x i64> %__A to <4 x i32> 3513 %1 = bitcast <2 x i64> %__B to <4 x i32> 3514 %2 = bitcast <2 x i64> %__C to <4 x i32> 3515 %3 = tail call <4 x i32> @llvm.x86.avx512.pternlog.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2, i32 4) 3516 %4 = bitcast i8 %__U to <8 x i1> 3517 %extract = shufflevector <8 x i1> %4, <8 
x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3518 %5 = select <4 x i1> %extract, <4 x i32> %3, <4 x i32> zeroinitializer 3519 %6 = bitcast <4 x i32> %5 to <2 x i64> 3520 ret <2 x i64> %6 3521} 3522 3523define <4 x i64> @test_mm256_ternarylogic_epi32(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) { 3524; CHECK-LABEL: test_mm256_ternarylogic_epi32: 3525; CHECK: # %bb.0: # %entry 3526; CHECK-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 3527; CHECK-NEXT: ret{{[l|q]}} 3528entry: 3529 %0 = bitcast <4 x i64> %__A to <8 x i32> 3530 %1 = bitcast <4 x i64> %__B to <8 x i32> 3531 %2 = bitcast <4 x i64> %__C to <8 x i32> 3532 %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4) 3533 %4 = bitcast <8 x i32> %3 to <4 x i64> 3534 ret <4 x i64> %4 3535} 3536 3537declare <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i32) #2 3538 3539define <4 x i64> @test_mm256_mask_ternarylogic_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) { 3540; X86-LABEL: test_mm256_mask_ternarylogic_epi32: 3541; X86: # %bb.0: # %entry 3542; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3543; X86-NEXT: kmovw %eax, %k1 3544; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} 3545; X86-NEXT: retl 3546; 3547; X64-LABEL: test_mm256_mask_ternarylogic_epi32: 3548; X64: # %bb.0: # %entry 3549; X64-NEXT: kmovw %edi, %k1 3550; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} 3551; X64-NEXT: retq 3552entry: 3553 %0 = bitcast <4 x i64> %__A to <8 x i32> 3554 %1 = bitcast <4 x i64> %__B to <8 x i32> 3555 %2 = bitcast <4 x i64> %__C to <8 x i32> 3556 %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4) 3557 %4 = bitcast i8 %__U to <8 x i1> 3558 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0 3559 %6 = bitcast <8 x i32> %5 to <4 x i64> 3560 ret <4 x i64> %6 3561} 3562 3563define <4 x i64> @test_mm256_maskz_ternarylogic_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> 
%__B, <4 x i64> %__C) { 3564; X86-LABEL: test_mm256_maskz_ternarylogic_epi32: 3565; X86: # %bb.0: # %entry 3566; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3567; X86-NEXT: kmovw %eax, %k1 3568; X86-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z} 3569; X86-NEXT: retl 3570; 3571; X64-LABEL: test_mm256_maskz_ternarylogic_epi32: 3572; X64: # %bb.0: # %entry 3573; X64-NEXT: kmovw %edi, %k1 3574; X64-NEXT: vpternlogd $4, %ymm2, %ymm1, %ymm0 {%k1} {z} 3575; X64-NEXT: retq 3576entry: 3577 %0 = bitcast <4 x i64> %__A to <8 x i32> 3578 %1 = bitcast <4 x i64> %__B to <8 x i32> 3579 %2 = bitcast <4 x i64> %__C to <8 x i32> 3580 %3 = tail call <8 x i32> @llvm.x86.avx512.pternlog.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2, i32 4) 3581 %4 = bitcast i8 %__U to <8 x i1> 3582 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer 3583 %6 = bitcast <8 x i32> %5 to <4 x i64> 3584 ret <4 x i64> %6 3585} 3586 3587define <2 x i64> @test_mm_ternarylogic_epi64(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { 3588; CHECK-LABEL: test_mm_ternarylogic_epi64: 3589; CHECK: # %bb.0: # %entry 3590; CHECK-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 3591; CHECK-NEXT: ret{{[l|q]}} 3592entry: 3593 %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4) 3594 ret <2 x i64> %0 3595} 3596 3597declare <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i32) #2 3598 3599define <2 x i64> @test_mm_mask_ternarylogic_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__B, <2 x i64> %__C) { 3600; X86-LABEL: test_mm_mask_ternarylogic_epi64: 3601; X86: # %bb.0: # %entry 3602; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3603; X86-NEXT: kmovw %eax, %k1 3604; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} 3605; X86-NEXT: retl 3606; 3607; X64-LABEL: test_mm_mask_ternarylogic_epi64: 3608; X64: # %bb.0: # %entry 3609; X64-NEXT: kmovw %edi, %k1 3610; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} 3611; X64-NEXT: retq 3612entry: 3613 
%0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4) 3614 %1 = bitcast i8 %__U to <8 x i1> 3615 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3616 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__A 3617 ret <2 x i64> %2 3618} 3619 3620define <2 x i64> @test_mm_maskz_ternarylogic_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C) { 3621; X86-LABEL: test_mm_maskz_ternarylogic_epi64: 3622; X86: # %bb.0: # %entry 3623; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3624; X86-NEXT: kmovw %eax, %k1 3625; X86-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z} 3626; X86-NEXT: retl 3627; 3628; X64-LABEL: test_mm_maskz_ternarylogic_epi64: 3629; X64: # %bb.0: # %entry 3630; X64-NEXT: kmovw %edi, %k1 3631; X64-NEXT: vpternlogq $4, %xmm2, %xmm1, %xmm0 {%k1} {z} 3632; X64-NEXT: retq 3633entry: 3634 %0 = tail call <2 x i64> @llvm.x86.avx512.pternlog.q.128(<2 x i64> %__A, <2 x i64> %__B, <2 x i64> %__C, i32 4) 3635 %1 = bitcast i8 %__U to <8 x i1> 3636 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3637 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer 3638 ret <2 x i64> %2 3639} 3640 3641define <4 x i64> @test_mm256_ternarylogic_epi64(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) { 3642; CHECK-LABEL: test_mm256_ternarylogic_epi64: 3643; CHECK: # %bb.0: # %entry 3644; CHECK-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 3645; CHECK-NEXT: ret{{[l|q]}} 3646entry: 3647 %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4) 3648 ret <4 x i64> %0 3649} 3650 3651declare <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i32) #2 3652 3653define <4 x i64> @test_mm256_mask_ternarylogic_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__B, <4 x i64> %__C) { 3654; X86-LABEL: test_mm256_mask_ternarylogic_epi64: 3655; X86: # %bb.0: # %entry 3656; X86-NEXT: 
movb {{[0-9]+}}(%esp), %al 3657; X86-NEXT: kmovw %eax, %k1 3658; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} 3659; X86-NEXT: retl 3660; 3661; X64-LABEL: test_mm256_mask_ternarylogic_epi64: 3662; X64: # %bb.0: # %entry 3663; X64-NEXT: kmovw %edi, %k1 3664; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} 3665; X64-NEXT: retq 3666entry: 3667 %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4) 3668 %1 = bitcast i8 %__U to <8 x i1> 3669 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3670 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__A 3671 ret <4 x i64> %2 3672} 3673 3674define <4 x i64> @test_mm256_maskz_ternarylogic_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C) { 3675; X86-LABEL: test_mm256_maskz_ternarylogic_epi64: 3676; X86: # %bb.0: # %entry 3677; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3678; X86-NEXT: kmovw %eax, %k1 3679; X86-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z} 3680; X86-NEXT: retl 3681; 3682; X64-LABEL: test_mm256_maskz_ternarylogic_epi64: 3683; X64: # %bb.0: # %entry 3684; X64-NEXT: kmovw %edi, %k1 3685; X64-NEXT: vpternlogq $4, %ymm2, %ymm1, %ymm0 {%k1} {z} 3686; X64-NEXT: retq 3687entry: 3688 %0 = tail call <4 x i64> @llvm.x86.avx512.pternlog.q.256(<4 x i64> %__A, <4 x i64> %__B, <4 x i64> %__C, i32 4) 3689 %1 = bitcast i8 %__U to <8 x i1> 3690 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3691 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer 3692 ret <4 x i64> %2 3693} 3694 3695define <2 x i64> @test_mm_mask2_permutex2var_epi32(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) { 3696; X86-LABEL: test_mm_mask2_permutex2var_epi32: 3697; X86: # %bb.0: # %entry 3698; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3699; X86-NEXT: kmovw %eax, %k1 3700; X86-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} 3701; X86-NEXT: vmovdqa %xmm1, 
%xmm0 3702; X86-NEXT: retl 3703; 3704; X64-LABEL: test_mm_mask2_permutex2var_epi32: 3705; X64: # %bb.0: # %entry 3706; X64-NEXT: kmovw %edi, %k1 3707; X64-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} 3708; X64-NEXT: vmovdqa %xmm1, %xmm0 3709; X64-NEXT: retq 3710entry: 3711 %0 = bitcast <2 x i64> %__A to <4 x i32> 3712 %1 = bitcast <2 x i64> %__I to <4 x i32> 3713 %2 = bitcast <2 x i64> %__B to <4 x i32> 3714 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) 3715 %4 = bitcast i8 %__U to <8 x i1> 3716 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3717 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %1 3718 %6 = bitcast <4 x i32> %5 to <2 x i64> 3719 ret <2 x i64> %6 3720} 3721 3722define <4 x i64> @test_mm256_mask2_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) { 3723; X86-LABEL: test_mm256_mask2_permutex2var_epi32: 3724; X86: # %bb.0: # %entry 3725; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3726; X86-NEXT: kmovw %eax, %k1 3727; X86-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} 3728; X86-NEXT: vmovdqa %ymm1, %ymm0 3729; X86-NEXT: retl 3730; 3731; X64-LABEL: test_mm256_mask2_permutex2var_epi32: 3732; X64: # %bb.0: # %entry 3733; X64-NEXT: kmovw %edi, %k1 3734; X64-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} 3735; X64-NEXT: vmovdqa %ymm1, %ymm0 3736; X64-NEXT: retq 3737entry: 3738 %0 = bitcast <4 x i64> %__A to <8 x i32> 3739 %1 = bitcast <4 x i64> %__I to <8 x i32> 3740 %2 = bitcast <4 x i64> %__B to <8 x i32> 3741 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2) 3742 %4 = bitcast i8 %__U to <8 x i1> 3743 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %1 3744 %6 = bitcast <8 x i32> %5 to <4 x i64> 3745 ret <4 x i64> %6 3746} 3747 3748define <2 x double> @test_mm_mask2_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x double> %__B) { 3749; X86-LABEL: 
test_mm_mask2_permutex2var_pd: 3750; X86: # %bb.0: # %entry 3751; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3752; X86-NEXT: kmovw %eax, %k1 3753; X86-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} 3754; X86-NEXT: vmovapd %xmm1, %xmm0 3755; X86-NEXT: retl 3756; 3757; X64-LABEL: test_mm_mask2_permutex2var_pd: 3758; X64: # %bb.0: # %entry 3759; X64-NEXT: kmovw %edi, %k1 3760; X64-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} 3761; X64-NEXT: vmovapd %xmm1, %xmm0 3762; X64-NEXT: retq 3763entry: 3764 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) 3765 %1 = bitcast <2 x i64> %__I to <2 x double> 3766 %2 = bitcast i8 %__U to <8 x i1> 3767 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3768 %3 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %1 3769 ret <2 x double> %3 3770} 3771 3772define <4 x double> @test_mm256_mask2_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x double> %__B) { 3773; X86-LABEL: test_mm256_mask2_permutex2var_pd: 3774; X86: # %bb.0: # %entry 3775; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3776; X86-NEXT: kmovw %eax, %k1 3777; X86-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} 3778; X86-NEXT: vmovapd %ymm1, %ymm0 3779; X86-NEXT: retl 3780; 3781; X64-LABEL: test_mm256_mask2_permutex2var_pd: 3782; X64: # %bb.0: # %entry 3783; X64-NEXT: kmovw %edi, %k1 3784; X64-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} 3785; X64-NEXT: vmovapd %ymm1, %ymm0 3786; X64-NEXT: retq 3787entry: 3788 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) 3789 %1 = bitcast <4 x i64> %__I to <4 x double> 3790 %2 = bitcast i8 %__U to <8 x i1> 3791 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3792 %3 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %1 3793 ret <4 x double> %3 3794} 3795 3796define <4 x float> @test_mm_mask2_permutex2var_ps(<4 x float> 
%__A, <2 x i64> %__I, i8 zeroext %__U, <4 x float> %__B) { 3797; X86-LABEL: test_mm_mask2_permutex2var_ps: 3798; X86: # %bb.0: # %entry 3799; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3800; X86-NEXT: kmovw %eax, %k1 3801; X86-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} 3802; X86-NEXT: vmovaps %xmm1, %xmm0 3803; X86-NEXT: retl 3804; 3805; X64-LABEL: test_mm_mask2_permutex2var_ps: 3806; X64: # %bb.0: # %entry 3807; X64-NEXT: kmovw %edi, %k1 3808; X64-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} 3809; X64-NEXT: vmovaps %xmm1, %xmm0 3810; X64-NEXT: retq 3811entry: 3812 %0 = bitcast <2 x i64> %__I to <4 x i32> 3813 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B) 3814 %2 = bitcast <2 x i64> %__I to <4 x float> 3815 %3 = bitcast i8 %__U to <8 x i1> 3816 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3817 %4 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %2 3818 ret <4 x float> %4 3819} 3820 3821define <8 x float> @test_mm256_mask2_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, i8 zeroext %__U, <8 x float> %__B) { 3822; X86-LABEL: test_mm256_mask2_permutex2var_ps: 3823; X86: # %bb.0: # %entry 3824; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3825; X86-NEXT: kmovw %eax, %k1 3826; X86-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} 3827; X86-NEXT: vmovaps %ymm1, %ymm0 3828; X86-NEXT: retl 3829; 3830; X64-LABEL: test_mm256_mask2_permutex2var_ps: 3831; X64: # %bb.0: # %entry 3832; X64-NEXT: kmovw %edi, %k1 3833; X64-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} 3834; X64-NEXT: vmovaps %ymm1, %ymm0 3835; X64-NEXT: retq 3836entry: 3837 %0 = bitcast <4 x i64> %__I to <8 x i32> 3838 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B) 3839 %2 = bitcast <4 x i64> %__I to <8 x float> 3840 %3 = bitcast i8 %__U to <8 x i1> 3841 %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2 3842 ret <8 x float> %4 3843} 3844 3845define 
<2 x i64> @test_mm_mask2_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, i8 zeroext %__U, <2 x i64> %__B) { 3846; X86-LABEL: test_mm_mask2_permutex2var_epi64: 3847; X86: # %bb.0: # %entry 3848; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3849; X86-NEXT: kmovw %eax, %k1 3850; X86-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} 3851; X86-NEXT: vmovdqa %xmm1, %xmm0 3852; X86-NEXT: retl 3853; 3854; X64-LABEL: test_mm_mask2_permutex2var_epi64: 3855; X64: # %bb.0: # %entry 3856; X64-NEXT: kmovw %edi, %k1 3857; X64-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} 3858; X64-NEXT: vmovdqa %xmm1, %xmm0 3859; X64-NEXT: retq 3860entry: 3861 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) 3862 %1 = bitcast i8 %__U to <8 x i1> 3863 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 3864 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__I 3865 ret <2 x i64> %2 3866} 3867 3868define <4 x i64> @test_mm256_mask2_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, i8 zeroext %__U, <4 x i64> %__B) { 3869; X86-LABEL: test_mm256_mask2_permutex2var_epi64: 3870; X86: # %bb.0: # %entry 3871; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3872; X86-NEXT: kmovw %eax, %k1 3873; X86-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} 3874; X86-NEXT: vmovdqa %ymm1, %ymm0 3875; X86-NEXT: retl 3876; 3877; X64-LABEL: test_mm256_mask2_permutex2var_epi64: 3878; X64: # %bb.0: # %entry 3879; X64-NEXT: kmovw %edi, %k1 3880; X64-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} 3881; X64-NEXT: vmovdqa %ymm1, %ymm0 3882; X64-NEXT: retq 3883entry: 3884 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) 3885 %1 = bitcast i8 %__U to <8 x i1> 3886 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3887 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__I 3888 ret <4 x i64> %2 3889} 3890 3891define <2 x i64> @test_mm_permutex2var_epi32(<2 x i64> %__A, <2 
x i64> %__I, <2 x i64> %__B) { 3892; CHECK-LABEL: test_mm_permutex2var_epi32: 3893; CHECK: # %bb.0: # %entry 3894; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 3895; CHECK-NEXT: ret{{[l|q]}} 3896entry: 3897 %0 = bitcast <2 x i64> %__A to <4 x i32> 3898 %1 = bitcast <2 x i64> %__I to <4 x i32> 3899 %2 = bitcast <2 x i64> %__B to <4 x i32> 3900 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) 3901 %4 = bitcast <4 x i32> %3 to <2 x i64> 3902 ret <2 x i64> %4 3903} 3904 3905define <2 x i64> @test_mm_mask_permutex2var_epi32(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) { 3906; X86-LABEL: test_mm_mask_permutex2var_epi32: 3907; X86: # %bb.0: # %entry 3908; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3909; X86-NEXT: kmovw %eax, %k1 3910; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} 3911; X86-NEXT: retl 3912; 3913; X64-LABEL: test_mm_mask_permutex2var_epi32: 3914; X64: # %bb.0: # %entry 3915; X64-NEXT: kmovw %edi, %k1 3916; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} 3917; X64-NEXT: retq 3918entry: 3919 %0 = bitcast <2 x i64> %__A to <4 x i32> 3920 %1 = bitcast <2 x i64> %__I to <4 x i32> 3921 %2 = bitcast <2 x i64> %__B to <4 x i32> 3922 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) 3923 %4 = bitcast i8 %__U to <8 x i1> 3924 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3925 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> %0 3926 %6 = bitcast <4 x i32> %5 to <2 x i64> 3927 ret <2 x i64> %6 3928} 3929 3930define <2 x i64> @test_mm_maskz_permutex2var_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) { 3931; X86-LABEL: test_mm_maskz_permutex2var_epi32: 3932; X86: # %bb.0: # %entry 3933; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3934; X86-NEXT: kmovw %eax, %k1 3935; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z} 3936; X86-NEXT: retl 3937; 3938; X64-LABEL: test_mm_maskz_permutex2var_epi32: 
3939; X64: # %bb.0: # %entry 3940; X64-NEXT: kmovw %edi, %k1 3941; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 {%k1} {z} 3942; X64-NEXT: retq 3943entry: 3944 %0 = bitcast <2 x i64> %__A to <4 x i32> 3945 %1 = bitcast <2 x i64> %__I to <4 x i32> 3946 %2 = bitcast <2 x i64> %__B to <4 x i32> 3947 %3 = tail call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2) 3948 %4 = bitcast i8 %__U to <8 x i1> 3949 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 3950 %5 = select <4 x i1> %extract.i, <4 x i32> %3, <4 x i32> zeroinitializer 3951 %6 = bitcast <4 x i32> %5 to <2 x i64> 3952 ret <2 x i64> %6 3953} 3954 3955define <4 x i64> @test_mm256_permutex2var_epi32(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) { 3956; CHECK-LABEL: test_mm256_permutex2var_epi32: 3957; CHECK: # %bb.0: # %entry 3958; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 3959; CHECK-NEXT: ret{{[l|q]}} 3960entry: 3961 %0 = bitcast <4 x i64> %__A to <8 x i32> 3962 %1 = bitcast <4 x i64> %__I to <8 x i32> 3963 %2 = bitcast <4 x i64> %__B to <8 x i32> 3964 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2) 3965 %4 = bitcast <8 x i32> %3 to <4 x i64> 3966 ret <4 x i64> %4 3967} 3968 3969define <4 x i64> @test_mm256_mask_permutex2var_epi32(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) { 3970; X86-LABEL: test_mm256_mask_permutex2var_epi32: 3971; X86: # %bb.0: # %entry 3972; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3973; X86-NEXT: kmovw %eax, %k1 3974; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} 3975; X86-NEXT: retl 3976; 3977; X64-LABEL: test_mm256_mask_permutex2var_epi32: 3978; X64: # %bb.0: # %entry 3979; X64-NEXT: kmovw %edi, %k1 3980; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} 3981; X64-NEXT: retq 3982entry: 3983 %0 = bitcast <4 x i64> %__A to <8 x i32> 3984 %1 = bitcast <4 x i64> %__I to <8 x i32> 3985 %2 = bitcast <4 x i64> %__B to <8 x i32> 3986 %3 = tail call <8 x 
i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2) 3987 %4 = bitcast i8 %__U to <8 x i1> 3988 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> %0 3989 %6 = bitcast <8 x i32> %5 to <4 x i64> 3990 ret <4 x i64> %6 3991} 3992 3993define <4 x i64> @test_mm256_maskz_permutex2var_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) { 3994; X86-LABEL: test_mm256_maskz_permutex2var_epi32: 3995; X86: # %bb.0: # %entry 3996; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3997; X86-NEXT: kmovw %eax, %k1 3998; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z} 3999; X86-NEXT: retl 4000; 4001; X64-LABEL: test_mm256_maskz_permutex2var_epi32: 4002; X64: # %bb.0: # %entry 4003; X64-NEXT: kmovw %edi, %k1 4004; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 {%k1} {z} 4005; X64-NEXT: retq 4006entry: 4007 %0 = bitcast <4 x i64> %__A to <8 x i32> 4008 %1 = bitcast <4 x i64> %__I to <8 x i32> 4009 %2 = bitcast <4 x i64> %__B to <8 x i32> 4010 %3 = tail call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %0, <8 x i32> %1, <8 x i32> %2) 4011 %4 = bitcast i8 %__U to <8 x i1> 4012 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> zeroinitializer 4013 %6 = bitcast <8 x i32> %5 to <4 x i64> 4014 ret <4 x i64> %6 4015} 4016 4017define <2 x double> @test_mm_permutex2var_pd(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) { 4018; CHECK-LABEL: test_mm_permutex2var_pd: 4019; CHECK: # %bb.0: # %entry 4020; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 4021; CHECK-NEXT: ret{{[l|q]}} 4022entry: 4023 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) 4024 ret <2 x double> %0 4025} 4026 4027define <2 x double> @test_mm_mask_permutex2var_pd(<2 x double> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x double> %__B) { 4028; X86-LABEL: test_mm_mask_permutex2var_pd: 4029; X86: # %bb.0: # %entry 4030; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4031; X86-NEXT: kmovw %eax, %k1 4032; X86-NEXT: vpermt2pd %xmm2, %xmm1, 
%xmm0 {%k1} 4033; X86-NEXT: retl 4034; 4035; X64-LABEL: test_mm_mask_permutex2var_pd: 4036; X64: # %bb.0: # %entry 4037; X64-NEXT: kmovw %edi, %k1 4038; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} 4039; X64-NEXT: retq 4040entry: 4041 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) 4042 %1 = bitcast i8 %__U to <8 x i1> 4043 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4044 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 4045 ret <2 x double> %2 4046} 4047 4048define <2 x double> @test_mm_maskz_permutex2var_pd(i8 zeroext %__U, <2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) { 4049; X86-LABEL: test_mm_maskz_permutex2var_pd: 4050; X86: # %bb.0: # %entry 4051; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4052; X86-NEXT: kmovw %eax, %k1 4053; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z} 4054; X86-NEXT: retl 4055; 4056; X64-LABEL: test_mm_maskz_permutex2var_pd: 4057; X64: # %bb.0: # %entry 4058; X64-NEXT: kmovw %edi, %k1 4059; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 {%k1} {z} 4060; X64-NEXT: retq 4061entry: 4062 %0 = tail call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %__A, <2 x i64> %__I, <2 x double> %__B) 4063 %1 = bitcast i8 %__U to <8 x i1> 4064 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4065 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4066 ret <2 x double> %2 4067} 4068 4069define <4 x double> @test_mm256_permutex2var_pd(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) { 4070; CHECK-LABEL: test_mm256_permutex2var_pd: 4071; CHECK: # %bb.0: # %entry 4072; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 4073; CHECK-NEXT: ret{{[l|q]}} 4074entry: 4075 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) 4076 ret <4 x double> %0 4077} 4078 4079define <4 x double> 
@test_mm256_mask_permutex2var_pd(<4 x double> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x double> %__B) { 4080; X86-LABEL: test_mm256_mask_permutex2var_pd: 4081; X86: # %bb.0: # %entry 4082; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4083; X86-NEXT: kmovw %eax, %k1 4084; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} 4085; X86-NEXT: retl 4086; 4087; X64-LABEL: test_mm256_mask_permutex2var_pd: 4088; X64: # %bb.0: # %entry 4089; X64-NEXT: kmovw %edi, %k1 4090; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} 4091; X64-NEXT: retq 4092entry: 4093 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) 4094 %1 = bitcast i8 %__U to <8 x i1> 4095 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4096 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 4097 ret <4 x double> %2 4098} 4099 4100define <4 x double> @test_mm256_maskz_permutex2var_pd(i8 zeroext %__U, <4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) { 4101; X86-LABEL: test_mm256_maskz_permutex2var_pd: 4102; X86: # %bb.0: # %entry 4103; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4104; X86-NEXT: kmovw %eax, %k1 4105; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z} 4106; X86-NEXT: retl 4107; 4108; X64-LABEL: test_mm256_maskz_permutex2var_pd: 4109; X64: # %bb.0: # %entry 4110; X64-NEXT: kmovw %edi, %k1 4111; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 {%k1} {z} 4112; X64-NEXT: retq 4113entry: 4114 %0 = tail call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %__A, <4 x i64> %__I, <4 x double> %__B) 4115 %1 = bitcast i8 %__U to <8 x i1> 4116 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4117 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4118 ret <4 x double> %2 4119} 4120 4121define <4 x float> @test_mm_permutex2var_ps(<4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) { 4122; CHECK-LABEL: test_mm_permutex2var_ps: 
4123; CHECK: # %bb.0: # %entry 4124; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 4125; CHECK-NEXT: ret{{[l|q]}} 4126entry: 4127 %0 = bitcast <2 x i64> %__I to <4 x i32> 4128 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B) 4129 ret <4 x float> %1 4130} 4131 4132define <4 x float> @test_mm_mask_permutex2var_ps(<4 x float> %__A, i8 zeroext %__U, <2 x i64> %__I, <4 x float> %__B) { 4133; X86-LABEL: test_mm_mask_permutex2var_ps: 4134; X86: # %bb.0: # %entry 4135; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4136; X86-NEXT: kmovw %eax, %k1 4137; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} 4138; X86-NEXT: retl 4139; 4140; X64-LABEL: test_mm_mask_permutex2var_ps: 4141; X64: # %bb.0: # %entry 4142; X64-NEXT: kmovw %edi, %k1 4143; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} 4144; X64-NEXT: retq 4145entry: 4146 %0 = bitcast <2 x i64> %__I to <4 x i32> 4147 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B) 4148 %2 = bitcast i8 %__U to <8 x i1> 4149 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4150 %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> %__A 4151 ret <4 x float> %3 4152} 4153 4154define <4 x float> @test_mm_maskz_permutex2var_ps(i8 zeroext %__U, <4 x float> %__A, <2 x i64> %__I, <4 x float> %__B) { 4155; X86-LABEL: test_mm_maskz_permutex2var_ps: 4156; X86: # %bb.0: # %entry 4157; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4158; X86-NEXT: kmovw %eax, %k1 4159; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z} 4160; X86-NEXT: retl 4161; 4162; X64-LABEL: test_mm_maskz_permutex2var_ps: 4163; X64: # %bb.0: # %entry 4164; X64-NEXT: kmovw %edi, %k1 4165; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 {%k1} {z} 4166; X64-NEXT: retq 4167entry: 4168 %0 = bitcast <2 x i64> %__I to <4 x i32> 4169 %1 = tail call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %__A, <4 x i32> %0, <4 x float> %__B) 4170 %2 
= bitcast i8 %__U to <8 x i1> 4171 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4172 %3 = select <4 x i1> %extract.i, <4 x float> %1, <4 x float> zeroinitializer 4173 ret <4 x float> %3 4174} 4175 4176define <8 x float> @test_mm256_permutex2var_ps(<8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) { 4177; CHECK-LABEL: test_mm256_permutex2var_ps: 4178; CHECK: # %bb.0: # %entry 4179; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 4180; CHECK-NEXT: ret{{[l|q]}} 4181entry: 4182 %0 = bitcast <4 x i64> %__I to <8 x i32> 4183 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B) 4184 ret <8 x float> %1 4185} 4186 4187define <8 x float> @test_mm256_mask_permutex2var_ps(<8 x float> %__A, i8 zeroext %__U, <4 x i64> %__I, <8 x float> %__B) { 4188; X86-LABEL: test_mm256_mask_permutex2var_ps: 4189; X86: # %bb.0: # %entry 4190; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4191; X86-NEXT: kmovw %eax, %k1 4192; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} 4193; X86-NEXT: retl 4194; 4195; X64-LABEL: test_mm256_mask_permutex2var_ps: 4196; X64: # %bb.0: # %entry 4197; X64-NEXT: kmovw %edi, %k1 4198; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} 4199; X64-NEXT: retq 4200entry: 4201 %0 = bitcast <4 x i64> %__I to <8 x i32> 4202 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B) 4203 %2 = bitcast i8 %__U to <8 x i1> 4204 %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %__A 4205 ret <8 x float> %3 4206} 4207 4208define <8 x float> @test_mm256_maskz_permutex2var_ps(i8 zeroext %__U, <8 x float> %__A, <4 x i64> %__I, <8 x float> %__B) { 4209; X86-LABEL: test_mm256_maskz_permutex2var_ps: 4210; X86: # %bb.0: # %entry 4211; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4212; X86-NEXT: kmovw %eax, %k1 4213; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z} 4214; X86-NEXT: retl 4215; 4216; X64-LABEL: test_mm256_maskz_permutex2var_ps: 4217; X64: 
# %bb.0: # %entry 4218; X64-NEXT: kmovw %edi, %k1 4219; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 {%k1} {z} 4220; X64-NEXT: retq 4221entry: 4222 %0 = bitcast <4 x i64> %__I to <8 x i32> 4223 %1 = tail call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %__A, <8 x i32> %0, <8 x float> %__B) 4224 %2 = bitcast i8 %__U to <8 x i1> 4225 %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer 4226 ret <8 x float> %3 4227} 4228 4229define <2 x i64> @test_mm_permutex2var_epi64(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) { 4230; CHECK-LABEL: test_mm_permutex2var_epi64: 4231; CHECK: # %bb.0: # %entry 4232; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 4233; CHECK-NEXT: ret{{[l|q]}} 4234entry: 4235 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) 4236 ret <2 x i64> %0 4237} 4238 4239define <2 x i64> @test_mm_mask_permutex2var_epi64(<2 x i64> %__A, i8 zeroext %__U, <2 x i64> %__I, <2 x i64> %__B) { 4240; X86-LABEL: test_mm_mask_permutex2var_epi64: 4241; X86: # %bb.0: # %entry 4242; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4243; X86-NEXT: kmovw %eax, %k1 4244; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} 4245; X86-NEXT: retl 4246; 4247; X64-LABEL: test_mm_mask_permutex2var_epi64: 4248; X64: # %bb.0: # %entry 4249; X64-NEXT: kmovw %edi, %k1 4250; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} 4251; X64-NEXT: retq 4252entry: 4253 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) 4254 %1 = bitcast i8 %__U to <8 x i1> 4255 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4256 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__A 4257 ret <2 x i64> %2 4258} 4259 4260define <2 x i64> @test_mm_maskz_permutex2var_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) { 4261; X86-LABEL: test_mm_maskz_permutex2var_epi64: 4262; X86: # %bb.0: # %entry 4263; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4264; 
X86-NEXT: kmovw %eax, %k1 4265; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z} 4266; X86-NEXT: retl 4267; 4268; X64-LABEL: test_mm_maskz_permutex2var_epi64: 4269; X64: # %bb.0: # %entry 4270; X64-NEXT: kmovw %edi, %k1 4271; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 {%k1} {z} 4272; X64-NEXT: retq 4273entry: 4274 %0 = tail call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %__A, <2 x i64> %__I, <2 x i64> %__B) 4275 %1 = bitcast i8 %__U to <8 x i1> 4276 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4277 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer 4278 ret <2 x i64> %2 4279} 4280 4281define <4 x i64> @test_mm256_permutex2var_epi64(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) { 4282; CHECK-LABEL: test_mm256_permutex2var_epi64: 4283; CHECK: # %bb.0: # %entry 4284; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 4285; CHECK-NEXT: ret{{[l|q]}} 4286entry: 4287 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) 4288 ret <4 x i64> %0 4289} 4290 4291define <4 x i64> @test_mm256_mask_permutex2var_epi64(<4 x i64> %__A, i8 zeroext %__U, <4 x i64> %__I, <4 x i64> %__B) { 4292; X86-LABEL: test_mm256_mask_permutex2var_epi64: 4293; X86: # %bb.0: # %entry 4294; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4295; X86-NEXT: kmovw %eax, %k1 4296; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} 4297; X86-NEXT: retl 4298; 4299; X64-LABEL: test_mm256_mask_permutex2var_epi64: 4300; X64: # %bb.0: # %entry 4301; X64-NEXT: kmovw %edi, %k1 4302; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} 4303; X64-NEXT: retq 4304entry: 4305 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) 4306 %1 = bitcast i8 %__U to <8 x i1> 4307 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4308 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__A 4309 ret <4 x i64> %2 4310} 4311 4312define <4 x i64> 
@test_mm256_maskz_permutex2var_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) { 4313; X86-LABEL: test_mm256_maskz_permutex2var_epi64: 4314; X86: # %bb.0: # %entry 4315; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4316; X86-NEXT: kmovw %eax, %k1 4317; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z} 4318; X86-NEXT: retl 4319; 4320; X64-LABEL: test_mm256_maskz_permutex2var_epi64: 4321; X64: # %bb.0: # %entry 4322; X64-NEXT: kmovw %edi, %k1 4323; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 {%k1} {z} 4324; X64-NEXT: retq 4325entry: 4326 %0 = tail call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %__A, <4 x i64> %__I, <4 x i64> %__B) 4327 %1 = bitcast i8 %__U to <8 x i1> 4328 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4329 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer 4330 ret <4 x i64> %2 4331} 4332 4333 4334define <2 x double> @test_mm_mask_fmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 4335; X86-LABEL: test_mm_mask_fmadd_pd: 4336; X86: # %bb.0: # %entry 4337; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4338; X86-NEXT: kmovw %eax, %k1 4339; X86-NEXT: vfmadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2 4340; X86-NEXT: retl 4341; 4342; X64-LABEL: test_mm_mask_fmadd_pd: 4343; X64: # %bb.0: # %entry 4344; X64-NEXT: kmovw %edi, %k1 4345; X64-NEXT: vfmadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2 4346; X64-NEXT: retq 4347entry: 4348 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 4349 %1 = bitcast i8 %__U to <8 x i1> 4350 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4351 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 4352 ret <2 x double> %2 4353} 4354 4355define <2 x double> @test_mm_mask_fmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 4356; X86-LABEL: test_mm_mask_fmsub_pd: 4357; X86: # 
%bb.0: # %entry 4358; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4359; X86-NEXT: kmovw %eax, %k1 4360; X86-NEXT: vfmsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2 4361; X86-NEXT: retl 4362; 4363; X64-LABEL: test_mm_mask_fmsub_pd: 4364; X64: # %bb.0: # %entry 4365; X64-NEXT: kmovw %edi, %k1 4366; X64-NEXT: vfmsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2 4367; X64-NEXT: retq 4368entry: 4369 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 4370 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 4371 %1 = bitcast i8 %__U to <8 x i1> 4372 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4373 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 4374 ret <2 x double> %2 4375} 4376 4377define <2 x double> @test_mm_mask3_fmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 4378; X86-LABEL: test_mm_mask3_fmadd_pd: 4379; X86: # %bb.0: # %entry 4380; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4381; X86-NEXT: kmovw %eax, %k1 4382; X86-NEXT: vfmadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2 4383; X86-NEXT: vmovapd %xmm2, %xmm0 4384; X86-NEXT: retl 4385; 4386; X64-LABEL: test_mm_mask3_fmadd_pd: 4387; X64: # %bb.0: # %entry 4388; X64-NEXT: kmovw %edi, %k1 4389; X64-NEXT: vfmadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2 4390; X64-NEXT: vmovapd %xmm2, %xmm0 4391; X64-NEXT: retq 4392entry: 4393 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 4394 %1 = bitcast i8 %__U to <8 x i1> 4395 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4396 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 4397 ret <2 x double> %2 4398} 4399 4400define <2 x double> @test_mm_mask3_fnmadd_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 4401; X86-LABEL: test_mm_mask3_fnmadd_pd: 4402; X86: # %bb.0: # 
%entry 4403; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4404; X86-NEXT: kmovw %eax, %k1 4405; X86-NEXT: vfnmadd231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 4406; X86-NEXT: vmovapd %xmm2, %xmm0 4407; X86-NEXT: retl 4408; 4409; X64-LABEL: test_mm_mask3_fnmadd_pd: 4410; X64: # %bb.0: # %entry 4411; X64-NEXT: kmovw %edi, %k1 4412; X64-NEXT: vfnmadd231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 4413; X64-NEXT: vmovapd %xmm2, %xmm0 4414; X64-NEXT: retq 4415entry: 4416 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 4417 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9 4418 %1 = bitcast i8 %__U to <8 x i1> 4419 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4420 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 4421 ret <2 x double> %2 4422} 4423 4424define <2 x double> @test_mm_maskz_fmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 4425; X86-LABEL: test_mm_maskz_fmadd_pd: 4426; X86: # %bb.0: # %entry 4427; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4428; X86-NEXT: kmovw %eax, %k1 4429; X86-NEXT: vfmadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 4430; X86-NEXT: retl 4431; 4432; X64-LABEL: test_mm_maskz_fmadd_pd: 4433; X64: # %bb.0: # %entry 4434; X64-NEXT: kmovw %edi, %k1 4435; X64-NEXT: vfmadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 4436; X64-NEXT: retq 4437entry: 4438 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 4439 %1 = bitcast i8 %__U to <8 x i1> 4440 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4441 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4442 ret <2 x double> %2 4443} 4444 4445define <2 x double> @test_mm_maskz_fmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 4446; X86-LABEL: test_mm_maskz_fmsub_pd: 4447; X86: 
# %bb.0: # %entry 4448; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4449; X86-NEXT: kmovw %eax, %k1 4450; X86-NEXT: vfmsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 4451; X86-NEXT: retl 4452; 4453; X64-LABEL: test_mm_maskz_fmsub_pd: 4454; X64: # %bb.0: # %entry 4455; X64-NEXT: kmovw %edi, %k1 4456; X64-NEXT: vfmsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 4457; X64-NEXT: retq 4458entry: 4459 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 4460 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 4461 %1 = bitcast i8 %__U to <8 x i1> 4462 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4463 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4464 ret <2 x double> %2 4465} 4466 4467define <2 x double> @test_mm_maskz_fnmadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 4468; X86-LABEL: test_mm_maskz_fnmadd_pd: 4469; X86: # %bb.0: # %entry 4470; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4471; X86-NEXT: kmovw %eax, %k1 4472; X86-NEXT: vfnmadd213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 4473; X86-NEXT: retl 4474; 4475; X64-LABEL: test_mm_maskz_fnmadd_pd: 4476; X64: # %bb.0: # %entry 4477; X64-NEXT: kmovw %edi, %k1 4478; X64-NEXT: vfnmadd213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 4479; X64-NEXT: retq 4480entry: 4481 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 4482 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %__C) #9 4483 %1 = bitcast i8 %__U to <8 x i1> 4484 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4485 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4486 ret <2 x double> %2 4487} 4488 4489define <2 x double> @test_mm_maskz_fnmsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 4490; 
X86-LABEL: test_mm_maskz_fnmsub_pd: 4491; X86: # %bb.0: # %entry 4492; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4493; X86-NEXT: kmovw %eax, %k1 4494; X86-NEXT: vfnmsub213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 4495; X86-NEXT: retl 4496; 4497; X64-LABEL: test_mm_maskz_fnmsub_pd: 4498; X64: # %bb.0: # %entry 4499; X64-NEXT: kmovw %edi, %k1 4500; X64-NEXT: vfnmsub213pd {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 4501; X64-NEXT: retq 4502entry: 4503 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__A 4504 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 4505 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %sub.i, <2 x double> %__B, <2 x double> %sub1.i) #9 4506 %1 = bitcast i8 %__U to <8 x i1> 4507 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 4508 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 4509 ret <2 x double> %2 4510} 4511 4512define <4 x double> @test_mm256_mask_fmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 4513; X86-LABEL: test_mm256_mask_fmadd_pd: 4514; X86: # %bb.0: # %entry 4515; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4516; X86-NEXT: kmovw %eax, %k1 4517; X86-NEXT: vfmadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2 4518; X86-NEXT: retl 4519; 4520; X64-LABEL: test_mm256_mask_fmadd_pd: 4521; X64: # %bb.0: # %entry 4522; X64-NEXT: kmovw %edi, %k1 4523; X64-NEXT: vfmadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2 4524; X64-NEXT: retq 4525entry: 4526 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 4527 %1 = bitcast i8 %__U to <8 x i1> 4528 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4529 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 4530 ret <4 x double> %2 4531} 4532 4533define <4 x double> @test_mm256_mask_fmsub_pd(<4 x double> %__A, i8 zeroext 
%__U, <4 x double> %__B, <4 x double> %__C) { 4534; X86-LABEL: test_mm256_mask_fmsub_pd: 4535; X86: # %bb.0: # %entry 4536; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4537; X86-NEXT: kmovw %eax, %k1 4538; X86-NEXT: vfmsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2 4539; X86-NEXT: retl 4540; 4541; X64-LABEL: test_mm256_mask_fmsub_pd: 4542; X64: # %bb.0: # %entry 4543; X64-NEXT: kmovw %edi, %k1 4544; X64-NEXT: vfmsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2 4545; X64-NEXT: retq 4546entry: 4547 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4548 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 4549 %1 = bitcast i8 %__U to <8 x i1> 4550 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4551 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 4552 ret <4 x double> %2 4553} 4554 4555define <4 x double> @test_mm256_mask3_fmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 4556; X86-LABEL: test_mm256_mask3_fmadd_pd: 4557; X86: # %bb.0: # %entry 4558; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4559; X86-NEXT: kmovw %eax, %k1 4560; X86-NEXT: vfmadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2 4561; X86-NEXT: vmovapd %ymm2, %ymm0 4562; X86-NEXT: retl 4563; 4564; X64-LABEL: test_mm256_mask3_fmadd_pd: 4565; X64: # %bb.0: # %entry 4566; X64-NEXT: kmovw %edi, %k1 4567; X64-NEXT: vfmadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2 4568; X64-NEXT: vmovapd %ymm2, %ymm0 4569; X64-NEXT: retq 4570entry: 4571 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 4572 %1 = bitcast i8 %__U to <8 x i1> 4573 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4574 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 4575 ret <4 x double> %2 4576} 4577 
4578define <4 x double> @test_mm256_mask3_fnmadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 4579; X86-LABEL: test_mm256_mask3_fnmadd_pd: 4580; X86: # %bb.0: # %entry 4581; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4582; X86-NEXT: kmovw %eax, %k1 4583; X86-NEXT: vfnmadd231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2 4584; X86-NEXT: vmovapd %ymm2, %ymm0 4585; X86-NEXT: retl 4586; 4587; X64-LABEL: test_mm256_mask3_fnmadd_pd: 4588; X64: # %bb.0: # %entry 4589; X64-NEXT: kmovw %edi, %k1 4590; X64-NEXT: vfnmadd231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2 4591; X64-NEXT: vmovapd %ymm2, %ymm0 4592; X64-NEXT: retq 4593entry: 4594 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4595 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %__C) #9 4596 %1 = bitcast i8 %__U to <8 x i1> 4597 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4598 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 4599 ret <4 x double> %2 4600} 4601 4602define <4 x double> @test_mm256_maskz_fmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4603; X86-LABEL: test_mm256_maskz_fmadd_pd: 4604; X86: # %bb.0: # %entry 4605; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4606; X86-NEXT: kmovw %eax, %k1 4607; X86-NEXT: vfmadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2 4608; X86-NEXT: retl 4609; 4610; X64-LABEL: test_mm256_maskz_fmadd_pd: 4611; X64: # %bb.0: # %entry 4612; X64-NEXT: kmovw %edi, %k1 4613; X64-NEXT: vfmadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2 4614; X64-NEXT: retq 4615entry: 4616 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 4617 %1 = bitcast i8 %__U to <8 x i1> 4618 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4619 %2 = select 
<4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4620 ret <4 x double> %2 4621} 4622 4623define <4 x double> @test_mm256_maskz_fmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4624; X86-LABEL: test_mm256_maskz_fmsub_pd: 4625; X86: # %bb.0: # %entry 4626; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4627; X86-NEXT: kmovw %eax, %k1 4628; X86-NEXT: vfmsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2 4629; X86-NEXT: retl 4630; 4631; X64-LABEL: test_mm256_maskz_fmsub_pd: 4632; X64: # %bb.0: # %entry 4633; X64-NEXT: kmovw %edi, %k1 4634; X64-NEXT: vfmsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2 4635; X64-NEXT: retq 4636entry: 4637 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4638 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 4639 %1 = bitcast i8 %__U to <8 x i1> 4640 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4641 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4642 ret <4 x double> %2 4643} 4644 4645define <4 x double> @test_mm256_maskz_fnmadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4646; X86-LABEL: test_mm256_maskz_fnmadd_pd: 4647; X86: # %bb.0: # %entry 4648; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4649; X86-NEXT: kmovw %eax, %k1 4650; X86-NEXT: vfnmadd213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2 4651; X86-NEXT: retl 4652; 4653; X64-LABEL: test_mm256_maskz_fnmadd_pd: 4654; X64: # %bb.0: # %entry 4655; X64-NEXT: kmovw %edi, %k1 4656; X64-NEXT: vfnmadd213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2 4657; X64-NEXT: retq 4658entry: 4659 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4660 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, 
<4 x double> %__C) #9 4661 %1 = bitcast i8 %__U to <8 x i1> 4662 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4663 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4664 ret <4 x double> %2 4665} 4666 4667define <4 x double> @test_mm256_maskz_fnmsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 4668; X86-LABEL: test_mm256_maskz_fnmsub_pd: 4669; X86: # %bb.0: # %entry 4670; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4671; X86-NEXT: kmovw %eax, %k1 4672; X86-NEXT: vfnmsub213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2 4673; X86-NEXT: retl 4674; 4675; X64-LABEL: test_mm256_maskz_fnmsub_pd: 4676; X64: # %bb.0: # %entry 4677; X64-NEXT: kmovw %edi, %k1 4678; X64-NEXT: vfnmsub213pd {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2 4679; X64-NEXT: retq 4680entry: 4681 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4682 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4683 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %sub.i, <4 x double> %__B, <4 x double> %sub1.i) #9 4684 %1 = bitcast i8 %__U to <8 x i1> 4685 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4686 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 4687 ret <4 x double> %2 4688} 4689 4690define <4 x float> @test_mm_mask_fmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 4691; X86-LABEL: test_mm_mask_fmadd_ps: 4692; X86: # %bb.0: # %entry 4693; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4694; X86-NEXT: kmovw %eax, %k1 4695; X86-NEXT: vfmadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2 4696; X86-NEXT: retl 4697; 4698; X64-LABEL: test_mm_mask_fmadd_ps: 4699; X64: # %bb.0: # %entry 4700; X64-NEXT: kmovw %edi, %k1 4701; X64-NEXT: vfmadd132ps 
{{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) + xmm2 4702; X64-NEXT: retq 4703entry: 4704 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 4705 %1 = bitcast i8 %__U to <8 x i1> 4706 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4707 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 4708 ret <4 x float> %2 4709} 4710 4711define <4 x float> @test_mm_mask_fmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 4712; X86-LABEL: test_mm_mask_fmsub_ps: 4713; X86: # %bb.0: # %entry 4714; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4715; X86-NEXT: kmovw %eax, %k1 4716; X86-NEXT: vfmsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2 4717; X86-NEXT: retl 4718; 4719; X64-LABEL: test_mm_mask_fmsub_ps: 4720; X64: # %bb.0: # %entry 4721; X64-NEXT: kmovw %edi, %k1 4722; X64-NEXT: vfmsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) - xmm2 4723; X64-NEXT: retq 4724entry: 4725 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4726 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 4727 %1 = bitcast i8 %__U to <8 x i1> 4728 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4729 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 4730 ret <4 x float> %2 4731} 4732 4733define <4 x float> @test_mm_mask3_fmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 4734; X86-LABEL: test_mm_mask3_fmadd_ps: 4735; X86: # %bb.0: # %entry 4736; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4737; X86-NEXT: kmovw %eax, %k1 4738; X86-NEXT: vfmadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2 4739; X86-NEXT: vmovaps %xmm2, %xmm0 4740; X86-NEXT: retl 4741; 4742; X64-LABEL: test_mm_mask3_fmadd_ps: 4743; X64: # %bb.0: # %entry 4744; X64-NEXT: kmovw %edi, %k1 4745; X64-NEXT: vfmadd231ps 
{{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) + xmm2 4746; X64-NEXT: vmovaps %xmm2, %xmm0 4747; X64-NEXT: retq 4748entry: 4749 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 4750 %1 = bitcast i8 %__U to <8 x i1> 4751 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4752 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 4753 ret <4 x float> %2 4754} 4755 4756define <4 x float> @test_mm_mask3_fnmadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 4757; X86-LABEL: test_mm_mask3_fnmadd_ps: 4758; X86: # %bb.0: # %entry 4759; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4760; X86-NEXT: kmovw %eax, %k1 4761; X86-NEXT: vfnmadd231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 4762; X86-NEXT: vmovaps %xmm2, %xmm0 4763; X86-NEXT: retl 4764; 4765; X64-LABEL: test_mm_mask3_fnmadd_ps: 4766; X64: # %bb.0: # %entry 4767; X64-NEXT: kmovw %edi, %k1 4768; X64-NEXT: vfnmadd231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2 4769; X64-NEXT: vmovaps %xmm2, %xmm0 4770; X64-NEXT: retq 4771entry: 4772 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4773 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9 4774 %1 = bitcast i8 %__U to <8 x i1> 4775 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4776 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 4777 ret <4 x float> %2 4778} 4779 4780define <4 x float> @test_mm_maskz_fmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4781; X86-LABEL: test_mm_maskz_fmadd_ps: 4782; X86: # %bb.0: # %entry 4783; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4784; X86-NEXT: kmovw %eax, %k1 4785; X86-NEXT: vfmadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 4786; X86-NEXT: retl 4787; 4788; X64-LABEL: test_mm_maskz_fmadd_ps: 4789; 
X64: # %bb.0: # %entry 4790; X64-NEXT: kmovw %edi, %k1 4791; X64-NEXT: vfmadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) + xmm2 4792; X64-NEXT: retq 4793entry: 4794 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 4795 %1 = bitcast i8 %__U to <8 x i1> 4796 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4797 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 4798 ret <4 x float> %2 4799} 4800 4801define <4 x float> @test_mm_maskz_fmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4802; X86-LABEL: test_mm_maskz_fmsub_ps: 4803; X86: # %bb.0: # %entry 4804; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4805; X86-NEXT: kmovw %eax, %k1 4806; X86-NEXT: vfmsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 4807; X86-NEXT: retl 4808; 4809; X64-LABEL: test_mm_maskz_fmsub_ps: 4810; X64: # %bb.0: # %entry 4811; X64-NEXT: kmovw %edi, %k1 4812; X64-NEXT: vfmsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) - xmm2 4813; X64-NEXT: retq 4814entry: 4815 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4816 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 4817 %1 = bitcast i8 %__U to <8 x i1> 4818 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4819 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 4820 ret <4 x float> %2 4821} 4822 4823define <4 x float> @test_mm_maskz_fnmadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4824; X86-LABEL: test_mm_maskz_fnmadd_ps: 4825; X86: # %bb.0: # %entry 4826; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4827; X86-NEXT: kmovw %eax, %k1 4828; X86-NEXT: vfnmadd213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 4829; X86-NEXT: retl 4830; 4831; X64-LABEL: test_mm_maskz_fnmadd_ps: 
4832; X64: # %bb.0: # %entry 4833; X64-NEXT: kmovw %edi, %k1 4834; X64-NEXT: vfnmadd213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) + xmm2 4835; X64-NEXT: retq 4836entry: 4837 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4838 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %__C) #9 4839 %1 = bitcast i8 %__U to <8 x i1> 4840 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4841 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 4842 ret <4 x float> %2 4843} 4844 4845define <4 x float> @test_mm_maskz_fnmsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4846; X86-LABEL: test_mm_maskz_fnmsub_ps: 4847; X86: # %bb.0: # %entry 4848; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4849; X86-NEXT: kmovw %eax, %k1 4850; X86-NEXT: vfnmsub213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 4851; X86-NEXT: retl 4852; 4853; X64-LABEL: test_mm_maskz_fnmsub_ps: 4854; X64: # %bb.0: # %entry 4855; X64-NEXT: kmovw %edi, %k1 4856; X64-NEXT: vfnmsub213ps {{.*#+}} xmm0 {%k1} {z} = -(xmm1 * xmm0) - xmm2 4857; X64-NEXT: retq 4858entry: 4859 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4860 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4861 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %sub.i, <4 x float> %__B, <4 x float> %sub1.i) #9 4862 %1 = bitcast i8 %__U to <8 x i1> 4863 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 4864 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 4865 ret <4 x float> %2 4866} 4867 4868define <8 x float> @test_mm256_mask_fmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 4869; X86-LABEL: 
test_mm256_mask_fmadd_ps: 4870; X86: # %bb.0: # %entry 4871; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4872; X86-NEXT: kmovw %eax, %k1 4873; X86-NEXT: vfmadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2 4874; X86-NEXT: retl 4875; 4876; X64-LABEL: test_mm256_mask_fmadd_ps: 4877; X64: # %bb.0: # %entry 4878; X64-NEXT: kmovw %edi, %k1 4879; X64-NEXT: vfmadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) + ymm2 4880; X64-NEXT: retq 4881entry: 4882 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 4883 %1 = bitcast i8 %__U to <8 x i1> 4884 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 4885 ret <8 x float> %2 4886} 4887 4888define <8 x float> @test_mm256_mask_fmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 4889; X86-LABEL: test_mm256_mask_fmsub_ps: 4890; X86: # %bb.0: # %entry 4891; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4892; X86-NEXT: kmovw %eax, %k1 4893; X86-NEXT: vfmsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2 4894; X86-NEXT: retl 4895; 4896; X64-LABEL: test_mm256_mask_fmsub_ps: 4897; X64: # %bb.0: # %entry 4898; X64-NEXT: kmovw %edi, %k1 4899; X64-NEXT: vfmsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) - ymm2 4900; X64-NEXT: retq 4901entry: 4902 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4903 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 4904 %1 = bitcast i8 %__U to <8 x i1> 4905 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 4906 ret <8 x float> %2 4907} 4908 4909define <8 x float> @test_mm256_mask3_fmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 4910; X86-LABEL: test_mm256_mask3_fmadd_ps: 4911; X86: # %bb.0: # %entry 4912; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4913; X86-NEXT: kmovw %eax, %k1 4914; X86-NEXT: vfmadd231ps 
{{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2 4915; X86-NEXT: vmovaps %ymm2, %ymm0 4916; X86-NEXT: retl 4917; 4918; X64-LABEL: test_mm256_mask3_fmadd_ps: 4919; X64: # %bb.0: # %entry 4920; X64-NEXT: kmovw %edi, %k1 4921; X64-NEXT: vfmadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) + ymm2 4922; X64-NEXT: vmovaps %ymm2, %ymm0 4923; X64-NEXT: retq 4924entry: 4925 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 4926 %1 = bitcast i8 %__U to <8 x i1> 4927 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 4928 ret <8 x float> %2 4929} 4930 4931define <8 x float> @test_mm256_mask3_fnmadd_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 4932; X86-LABEL: test_mm256_mask3_fnmadd_ps: 4933; X86: # %bb.0: # %entry 4934; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4935; X86-NEXT: kmovw %eax, %k1 4936; X86-NEXT: vfnmadd231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2 4937; X86-NEXT: vmovaps %ymm2, %ymm0 4938; X86-NEXT: retl 4939; 4940; X64-LABEL: test_mm256_mask3_fnmadd_ps: 4941; X64: # %bb.0: # %entry 4942; X64-NEXT: kmovw %edi, %k1 4943; X64-NEXT: vfnmadd231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) + ymm2 4944; X64-NEXT: vmovaps %ymm2, %ymm0 4945; X64-NEXT: retq 4946entry: 4947 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4948 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9 4949 %1 = bitcast i8 %__U to <8 x i1> 4950 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 4951 ret <8 x float> %2 4952} 4953 4954define <8 x float> @test_mm256_maskz_fmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 4955; X86-LABEL: test_mm256_maskz_fmadd_ps: 4956; X86: # %bb.0: # %entry 4957; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4958; X86-NEXT: kmovw %eax, %k1 4959; X86-NEXT: vfmadd213ps 
{{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2 4960; X86-NEXT: retl 4961; 4962; X64-LABEL: test_mm256_maskz_fmadd_ps: 4963; X64: # %bb.0: # %entry 4964; X64-NEXT: kmovw %edi, %k1 4965; X64-NEXT: vfmadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) + ymm2 4966; X64-NEXT: retq 4967entry: 4968 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 4969 %1 = bitcast i8 %__U to <8 x i1> 4970 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 4971 ret <8 x float> %2 4972} 4973 4974define <8 x float> @test_mm256_maskz_fmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 4975; X86-LABEL: test_mm256_maskz_fmsub_ps: 4976; X86: # %bb.0: # %entry 4977; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4978; X86-NEXT: kmovw %eax, %k1 4979; X86-NEXT: vfmsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2 4980; X86-NEXT: retl 4981; 4982; X64-LABEL: test_mm256_maskz_fmsub_ps: 4983; X64: # %bb.0: # %entry 4984; X64-NEXT: kmovw %edi, %k1 4985; X64-NEXT: vfmsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) - ymm2 4986; X64-NEXT: retq 4987entry: 4988 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4989 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 4990 %1 = bitcast i8 %__U to <8 x i1> 4991 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 4992 ret <8 x float> %2 4993} 4994 4995define <8 x float> @test_mm256_maskz_fnmadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 4996; X86-LABEL: test_mm256_maskz_fnmadd_ps: 4997; X86: # %bb.0: # %entry 4998; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4999; X86-NEXT: kmovw %eax, %k1 5000; X86-NEXT: vfnmadd213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2 5001; X86-NEXT: retl 5002; 5003; X64-LABEL: 
test_mm256_maskz_fnmadd_ps: 5004; X64: # %bb.0: # %entry 5005; X64-NEXT: kmovw %edi, %k1 5006; X64-NEXT: vfnmadd213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) + ymm2 5007; X64-NEXT: retq 5008entry: 5009 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 5010 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %__C) #9 5011 %1 = bitcast i8 %__U to <8 x i1> 5012 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 5013 ret <8 x float> %2 5014} 5015 5016define <8 x float> @test_mm256_maskz_fnmsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 5017; X86-LABEL: test_mm256_maskz_fnmsub_ps: 5018; X86: # %bb.0: # %entry 5019; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5020; X86-NEXT: kmovw %eax, %k1 5021; X86-NEXT: vfnmsub213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2 5022; X86-NEXT: retl 5023; 5024; X64-LABEL: test_mm256_maskz_fnmsub_ps: 5025; X64: # %bb.0: # %entry 5026; X64-NEXT: kmovw %edi, %k1 5027; X64-NEXT: vfnmsub213ps {{.*#+}} ymm0 {%k1} {z} = -(ymm1 * ymm0) - ymm2 5028; X64-NEXT: retq 5029entry: 5030 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 5031 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5032 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %sub.i, <8 x float> %__B, <8 x float> %sub1.i) #9 5033 %1 = bitcast i8 %__U to <8 x i1> 5034 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 5035 ret <8 x float> %2 5036} 5037 5038define <2 x double> @test_mm_mask_fmaddsub_pd(<2 x double> %__A, i8 zeroext 
%__U, <2 x double> %__B, <2 x double> %__C) { 5039; X86-LABEL: test_mm_mask_fmaddsub_pd: 5040; X86: # %bb.0: # %entry 5041; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5042; X86-NEXT: kmovw %eax, %k1 5043; X86-NEXT: vfmaddsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2 5044; X86-NEXT: retl 5045; 5046; X64-LABEL: test_mm_mask_fmaddsub_pd: 5047; X64: # %bb.0: # %entry 5048; X64-NEXT: kmovw %edi, %k1 5049; X64-NEXT: vfmaddsub132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2 5050; X64-NEXT: retq 5051entry: 5052 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 5053 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5054 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9 5055 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3> 5056 %4 = bitcast i8 %__U to <8 x i1> 5057 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5058 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__A 5059 ret <2 x double> %5 5060} 5061 5062define <2 x double> @test_mm_mask_fmsubadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 5063; X86-LABEL: test_mm_mask_fmsubadd_pd: 5064; X86: # %bb.0: # %entry 5065; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5066; X86-NEXT: kmovw %eax, %k1 5067; X86-NEXT: vfmsubadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2 5068; X86-NEXT: retl 5069; 5070; X64-LABEL: test_mm_mask_fmsubadd_pd: 5071; X64: # %bb.0: # %entry 5072; X64-NEXT: kmovw %edi, %k1 5073; X64-NEXT: vfmsubadd132pd {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2 5074; X64-NEXT: retq 5075entry: 5076 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5077 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 5078 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x 
double> %__C) #9 5079 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3> 5080 %3 = bitcast i8 %__U to <8 x i1> 5081 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5082 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__A 5083 ret <2 x double> %4 5084} 5085 5086define <2 x double> @test_mm_mask3_fmaddsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 5087; X86-LABEL: test_mm_mask3_fmaddsub_pd: 5088; X86: # %bb.0: # %entry 5089; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5090; X86-NEXT: kmovw %eax, %k1 5091; X86-NEXT: vfmaddsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2 5092; X86-NEXT: vmovapd %xmm2, %xmm0 5093; X86-NEXT: retl 5094; 5095; X64-LABEL: test_mm_mask3_fmaddsub_pd: 5096; X64: # %bb.0: # %entry 5097; X64-NEXT: kmovw %edi, %k1 5098; X64-NEXT: vfmaddsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2 5099; X64-NEXT: vmovapd %xmm2, %xmm0 5100; X64-NEXT: retq 5101entry: 5102 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 5103 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5104 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9 5105 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3> 5106 %4 = bitcast i8 %__U to <8 x i1> 5107 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5108 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> %__C 5109 ret <2 x double> %5 5110} 5111 5112define <2 x double> @test_mm_maskz_fmaddsub_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5113; X86-LABEL: test_mm_maskz_fmaddsub_pd: 5114; X86: # %bb.0: # %entry 5115; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5116; X86-NEXT: kmovw %eax, %k1 5117; X86-NEXT: vfmaddsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2 5118; X86-NEXT: retl 5119; 5120; 
X64-LABEL: test_mm_maskz_fmaddsub_pd: 5121; X64: # %bb.0: # %entry 5122; X64-NEXT: kmovw %edi, %k1 5123; X64-NEXT: vfmaddsub213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2 5124; X64-NEXT: retq 5125entry: 5126 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 5127 %1 = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5128 %2 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %1) #9 5129 %3 = shufflevector <2 x double> %2, <2 x double> %0, <2 x i32> <i32 0, i32 3> 5130 %4 = bitcast i8 %__U to <8 x i1> 5131 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5132 %5 = select <2 x i1> %extract.i, <2 x double> %3, <2 x double> zeroinitializer 5133 ret <2 x double> %5 5134} 5135 5136define <2 x double> @test_mm_maskz_fmsubadd_pd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5137; X86-LABEL: test_mm_maskz_fmsubadd_pd: 5138; X86: # %bb.0: # %entry 5139; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5140; X86-NEXT: kmovw %eax, %k1 5141; X86-NEXT: vfmsubadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2 5142; X86-NEXT: retl 5143; 5144; X64-LABEL: test_mm_maskz_fmsubadd_pd: 5145; X64: # %bb.0: # %entry 5146; X64-NEXT: kmovw %edi, %k1 5147; X64-NEXT: vfmsubadd213pd {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2 5148; X64-NEXT: retq 5149entry: 5150 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5151 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 5152 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 5153 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3> 5154 %3 = bitcast i8 %__U to <8 x i1> 5155 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5156 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 
x double> zeroinitializer 5157 ret <2 x double> %4 5158} 5159 5160define <4 x double> @test_mm256_mask_fmaddsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 5161; X86-LABEL: test_mm256_mask_fmaddsub_pd: 5162; X86: # %bb.0: # %entry 5163; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5164; X86-NEXT: kmovw %eax, %k1 5165; X86-NEXT: vfmaddsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2 5166; X86-NEXT: retl 5167; 5168; X64-LABEL: test_mm256_mask_fmaddsub_pd: 5169; X64: # %bb.0: # %entry 5170; X64-NEXT: kmovw %edi, %k1 5171; X64-NEXT: vfmaddsub132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2 5172; X64-NEXT: retq 5173entry: 5174 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5175 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5176 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9 5177 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5178 %4 = bitcast i8 %__U to <8 x i1> 5179 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5180 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__A 5181 ret <4 x double> %5 5182} 5183 5184define <4 x double> @test_mm256_mask_fmsubadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 5185; X86-LABEL: test_mm256_mask_fmsubadd_pd: 5186; X86: # %bb.0: # %entry 5187; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5188; X86-NEXT: kmovw %eax, %k1 5189; X86-NEXT: vfmsubadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2 5190; X86-NEXT: retl 5191; 5192; X64-LABEL: test_mm256_mask_fmsubadd_pd: 5193; X64: # %bb.0: # %entry 5194; X64-NEXT: kmovw %edi, %k1 5195; X64-NEXT: vfmsubadd132pd {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2 5196; X64-NEXT: retq 5197entry: 5198 %sub.i = fsub <4 x double> <double -0.000000e+00, 
double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5199 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 5200 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5201 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5202 %3 = bitcast i8 %__U to <8 x i1> 5203 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5204 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__A 5205 ret <4 x double> %4 5206} 5207 5208define <4 x double> @test_mm256_mask3_fmaddsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 5209; X86-LABEL: test_mm256_mask3_fmaddsub_pd: 5210; X86: # %bb.0: # %entry 5211; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5212; X86-NEXT: kmovw %eax, %k1 5213; X86-NEXT: vfmaddsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2 5214; X86-NEXT: vmovapd %ymm2, %ymm0 5215; X86-NEXT: retl 5216; 5217; X64-LABEL: test_mm256_mask3_fmaddsub_pd: 5218; X64: # %bb.0: # %entry 5219; X64-NEXT: kmovw %edi, %k1 5220; X64-NEXT: vfmaddsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2 5221; X64-NEXT: vmovapd %ymm2, %ymm0 5222; X64-NEXT: retq 5223entry: 5224 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5225 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5226 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9 5227 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5228 %4 = bitcast i8 %__U to <8 x i1> 5229 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5230 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> %__C 5231 ret <4 x double> %5 5232} 5233 
5234define <4 x double> @test_mm256_maskz_fmaddsub_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 5235; X86-LABEL: test_mm256_maskz_fmaddsub_pd: 5236; X86: # %bb.0: # %entry 5237; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5238; X86-NEXT: kmovw %eax, %k1 5239; X86-NEXT: vfmaddsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2 5240; X86-NEXT: retl 5241; 5242; X64-LABEL: test_mm256_maskz_fmaddsub_pd: 5243; X64: # %bb.0: # %entry 5244; X64-NEXT: kmovw %edi, %k1 5245; X64-NEXT: vfmaddsub213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2 5246; X64-NEXT: retq 5247entry: 5248 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5249 %1 = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5250 %2 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %1) #9 5251 %3 = shufflevector <4 x double> %2, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5252 %4 = bitcast i8 %__U to <8 x i1> 5253 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5254 %5 = select <4 x i1> %extract.i, <4 x double> %3, <4 x double> zeroinitializer 5255 ret <4 x double> %5 5256} 5257 5258define <4 x double> @test_mm256_maskz_fmsubadd_pd(i8 zeroext %__U, <4 x double> %__A, <4 x double> %__B, <4 x double> %__C) { 5259; X86-LABEL: test_mm256_maskz_fmsubadd_pd: 5260; X86: # %bb.0: # %entry 5261; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5262; X86-NEXT: kmovw %eax, %k1 5263; X86-NEXT: vfmsubadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2 5264; X86-NEXT: retl 5265; 5266; X64-LABEL: test_mm256_maskz_fmsubadd_pd: 5267; X64: # %bb.0: # %entry 5268; X64-NEXT: kmovw %edi, %k1 5269; X64-NEXT: vfmsubadd213pd {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2 5270; X64-NEXT: retq 5271entry: 5272 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double 
-0.000000e+00, double -0.000000e+00>, %__C 5273 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 5274 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5275 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5276 %3 = bitcast i8 %__U to <8 x i1> 5277 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5278 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> zeroinitializer 5279 ret <4 x double> %4 5280} 5281 5282define <4 x float> @test_mm_mask_fmaddsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 5283; X86-LABEL: test_mm_mask_fmaddsub_ps: 5284; X86: # %bb.0: # %entry 5285; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5286; X86-NEXT: kmovw %eax, %k1 5287; X86-NEXT: vfmaddsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2 5288; X86-NEXT: retl 5289; 5290; X64-LABEL: test_mm_mask_fmaddsub_ps: 5291; X64: # %bb.0: # %entry 5292; X64-NEXT: kmovw %edi, %k1 5293; X64-NEXT: vfmaddsub132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) +/- xmm2 5294; X64-NEXT: retq 5295entry: 5296 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5297 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5298 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 5299 %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5300 %4 = bitcast i8 %__U to <8 x i1> 5301 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5302 %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__A 5303 ret <4 x float> %5 5304} 5305 5306define <4 x float> @test_mm_mask_fmsubadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 5307; 
X86-LABEL: test_mm_mask_fmsubadd_ps: 5308; X86: # %bb.0: # %entry 5309; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5310; X86-NEXT: kmovw %eax, %k1 5311; X86-NEXT: vfmsubadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2 5312; X86-NEXT: retl 5313; 5314; X64-LABEL: test_mm_mask_fmsubadd_ps: 5315; X64: # %bb.0: # %entry 5316; X64-NEXT: kmovw %edi, %k1 5317; X64-NEXT: vfmsubadd132ps {{.*#+}} xmm0 {%k1} = (xmm0 * xmm1) -/+ xmm2 5318; X64-NEXT: retq 5319entry: 5320 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5321 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 5322 %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5323 %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5324 %3 = bitcast i8 %__U to <8 x i1> 5325 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5326 %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__A 5327 ret <4 x float> %4 5328} 5329 5330define <4 x float> @test_mm_mask3_fmaddsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 5331; X86-LABEL: test_mm_mask3_fmaddsub_ps: 5332; X86: # %bb.0: # %entry 5333; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5334; X86-NEXT: kmovw %eax, %k1 5335; X86-NEXT: vfmaddsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2 5336; X86-NEXT: vmovaps %xmm2, %xmm0 5337; X86-NEXT: retl 5338; 5339; X64-LABEL: test_mm_mask3_fmaddsub_ps: 5340; X64: # %bb.0: # %entry 5341; X64-NEXT: kmovw %edi, %k1 5342; X64-NEXT: vfmaddsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) +/- xmm2 5343; X64-NEXT: vmovaps %xmm2, %xmm0 5344; X64-NEXT: retq 5345entry: 5346 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5347 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00>, %__C 5348 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 5349 %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5350 %4 = bitcast i8 %__U to <8 x i1> 5351 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5352 %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> %__C 5353 ret <4 x float> %5 5354} 5355 5356define <4 x float> @test_mm_maskz_fmaddsub_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5357; X86-LABEL: test_mm_maskz_fmaddsub_ps: 5358; X86: # %bb.0: # %entry 5359; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5360; X86-NEXT: kmovw %eax, %k1 5361; X86-NEXT: vfmaddsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2 5362; X86-NEXT: retl 5363; 5364; X64-LABEL: test_mm_maskz_fmaddsub_ps: 5365; X64: # %bb.0: # %entry 5366; X64-NEXT: kmovw %edi, %k1 5367; X64-NEXT: vfmaddsub213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) +/- xmm2 5368; X64-NEXT: retq 5369entry: 5370 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5371 %1 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5372 %2 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %1) #9 5373 %3 = shufflevector <4 x float> %2, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5374 %4 = bitcast i8 %__U to <8 x i1> 5375 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5376 %5 = select <4 x i1> %extract.i, <4 x float> %3, <4 x float> zeroinitializer 5377 ret <4 x float> %5 5378} 5379 5380define <4 x float> @test_mm_maskz_fmsubadd_ps(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5381; X86-LABEL: test_mm_maskz_fmsubadd_ps: 5382; X86: # %bb.0: # %entry 5383; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5384; X86-NEXT: kmovw 
%eax, %k1 5385; X86-NEXT: vfmsubadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2 5386; X86-NEXT: retl 5387; 5388; X64-LABEL: test_mm_maskz_fmsubadd_ps: 5389; X64: # %bb.0: # %entry 5390; X64-NEXT: kmovw %edi, %k1 5391; X64-NEXT: vfmsubadd213ps {{.*#+}} xmm0 {%k1} {z} = (xmm1 * xmm0) -/+ xmm2 5392; X64-NEXT: retq 5393entry: 5394 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5395 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 5396 %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5397 %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5398 %3 = bitcast i8 %__U to <8 x i1> 5399 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5400 %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> zeroinitializer 5401 ret <4 x float> %4 5402} 5403 5404define <8 x float> @test_mm256_mask_fmaddsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 5405; X86-LABEL: test_mm256_mask_fmaddsub_ps: 5406; X86: # %bb.0: # %entry 5407; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5408; X86-NEXT: kmovw %eax, %k1 5409; X86-NEXT: vfmaddsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2 5410; X86-NEXT: retl 5411; 5412; X64-LABEL: test_mm256_mask_fmaddsub_ps: 5413; X64: # %bb.0: # %entry 5414; X64-NEXT: kmovw %edi, %k1 5415; X64-NEXT: vfmaddsub132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) +/- ymm2 5416; X64-NEXT: retq 5417entry: 5418 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5419 %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5420 %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> 
%__B, <8 x float> %1) #9 5421 %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5422 %4 = bitcast i8 %__U to <8 x i1> 5423 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__A 5424 ret <8 x float> %5 5425} 5426 5427define <8 x float> @test_mm256_mask_fmsubadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 5428; X86-LABEL: test_mm256_mask_fmsubadd_ps: 5429; X86: # %bb.0: # %entry 5430; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5431; X86-NEXT: kmovw %eax, %k1 5432; X86-NEXT: vfmsubadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2 5433; X86-NEXT: retl 5434; 5435; X64-LABEL: test_mm256_mask_fmsubadd_ps: 5436; X64: # %bb.0: # %entry 5437; X64-NEXT: kmovw %edi, %k1 5438; X64-NEXT: vfmsubadd132ps {{.*#+}} ymm0 {%k1} = (ymm0 * ymm1) -/+ ymm2 5439; X64-NEXT: retq 5440entry: 5441 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5442 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 5443 %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5444 %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5445 %3 = bitcast i8 %__U to <8 x i1> 5446 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__A 5447 ret <8 x float> %4 5448} 5449 5450define <8 x float> @test_mm256_mask3_fmaddsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 5451; X86-LABEL: test_mm256_mask3_fmaddsub_ps: 5452; X86: # %bb.0: # %entry 5453; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5454; X86-NEXT: kmovw %eax, %k1 5455; X86-NEXT: vfmaddsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2 5456; X86-NEXT: vmovaps %ymm2, %ymm0 5457; X86-NEXT: retl 5458; 5459; X64-LABEL: 
test_mm256_mask3_fmaddsub_ps: 5460; X64: # %bb.0: # %entry 5461; X64-NEXT: kmovw %edi, %k1 5462; X64-NEXT: vfmaddsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) +/- ymm2 5463; X64-NEXT: vmovaps %ymm2, %ymm0 5464; X64-NEXT: retq 5465entry: 5466 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5467 %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5468 %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9 5469 %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5470 %4 = bitcast i8 %__U to <8 x i1> 5471 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> %__C 5472 ret <8 x float> %5 5473} 5474 5475define <8 x float> @test_mm256_maskz_fmaddsub_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 5476; X86-LABEL: test_mm256_maskz_fmaddsub_ps: 5477; X86: # %bb.0: # %entry 5478; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5479; X86-NEXT: kmovw %eax, %k1 5480; X86-NEXT: vfmaddsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2 5481; X86-NEXT: retl 5482; 5483; X64-LABEL: test_mm256_maskz_fmaddsub_ps: 5484; X64: # %bb.0: # %entry 5485; X64-NEXT: kmovw %edi, %k1 5486; X64-NEXT: vfmaddsub213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) +/- ymm2 5487; X64-NEXT: retq 5488entry: 5489 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5490 %1 = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5491 %2 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %1) #9 5492 %3 = shufflevector <8 x float> %2, <8 x float> %0, <8 x i32> <i32 0, i32 
9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5493 %4 = bitcast i8 %__U to <8 x i1> 5494 %5 = select <8 x i1> %4, <8 x float> %3, <8 x float> zeroinitializer 5495 ret <8 x float> %5 5496} 5497 5498define <8 x float> @test_mm256_maskz_fmsubadd_ps(i8 zeroext %__U, <8 x float> %__A, <8 x float> %__B, <8 x float> %__C) { 5499; X86-LABEL: test_mm256_maskz_fmsubadd_ps: 5500; X86: # %bb.0: # %entry 5501; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5502; X86-NEXT: kmovw %eax, %k1 5503; X86-NEXT: vfmsubadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2 5504; X86-NEXT: retl 5505; 5506; X64-LABEL: test_mm256_maskz_fmsubadd_ps: 5507; X64: # %bb.0: # %entry 5508; X64-NEXT: kmovw %edi, %k1 5509; X64-NEXT: vfmsubadd213ps {{.*#+}} ymm0 {%k1} {z} = (ymm1 * ymm0) -/+ ymm2 5510; X64-NEXT: retq 5511entry: 5512 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5513 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 5514 %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5515 %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5516 %3 = bitcast i8 %__U to <8 x i1> 5517 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> zeroinitializer 5518 ret <8 x float> %4 5519} 5520 5521define <2 x double> @test_mm_mask3_fmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 5522; X86-LABEL: test_mm_mask3_fmsub_pd: 5523; X86: # %bb.0: # %entry 5524; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5525; X86-NEXT: kmovw %eax, %k1 5526; X86-NEXT: vfmsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2 5527; X86-NEXT: vmovapd %xmm2, %xmm0 5528; X86-NEXT: retl 5529; 5530; X64-LABEL: test_mm_mask3_fmsub_pd: 5531; X64: # %bb.0: # %entry 5532; X64-NEXT: kmovw %edi, %k1 
5533; X64-NEXT: vfmsub231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2 5534; X64-NEXT: vmovapd %xmm2, %xmm0 5535; X64-NEXT: retq 5536entry: 5537 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5538 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 5539 %1 = bitcast i8 %__U to <8 x i1> 5540 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5541 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 5542 ret <2 x double> %2 5543} 5544 5545define <4 x double> @test_mm256_mask3_fmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 5546; X86-LABEL: test_mm256_mask3_fmsub_pd: 5547; X86: # %bb.0: # %entry 5548; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5549; X86-NEXT: kmovw %eax, %k1 5550; X86-NEXT: vfmsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2 5551; X86-NEXT: vmovapd %ymm2, %ymm0 5552; X86-NEXT: retl 5553; 5554; X64-LABEL: test_mm256_mask3_fmsub_pd: 5555; X64: # %bb.0: # %entry 5556; X64-NEXT: kmovw %edi, %k1 5557; X64-NEXT: vfmsub231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2 5558; X64-NEXT: vmovapd %ymm2, %ymm0 5559; X64-NEXT: retq 5560entry: 5561 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5562 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 5563 %1 = bitcast i8 %__U to <8 x i1> 5564 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5565 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 5566 ret <4 x double> %2 5567} 5568 5569define <4 x float> @test_mm_mask3_fmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 5570; X86-LABEL: test_mm_mask3_fmsub_ps: 5571; X86: # %bb.0: # %entry 5572; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5573; X86-NEXT: kmovw %eax, %k1 5574; X86-NEXT: vfmsub231ps 
{{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2 5575; X86-NEXT: vmovaps %xmm2, %xmm0 5576; X86-NEXT: retl 5577; 5578; X64-LABEL: test_mm_mask3_fmsub_ps: 5579; X64: # %bb.0: # %entry 5580; X64-NEXT: kmovw %edi, %k1 5581; X64-NEXT: vfmsub231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) - xmm2 5582; X64-NEXT: vmovaps %xmm2, %xmm0 5583; X64-NEXT: retq 5584entry: 5585 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5586 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 5587 %1 = bitcast i8 %__U to <8 x i1> 5588 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5589 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 5590 ret <4 x float> %2 5591} 5592 5593define <8 x float> @test_mm256_mask3_fmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 5594; X86-LABEL: test_mm256_mask3_fmsub_ps: 5595; X86: # %bb.0: # %entry 5596; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5597; X86-NEXT: kmovw %eax, %k1 5598; X86-NEXT: vfmsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2 5599; X86-NEXT: vmovaps %ymm2, %ymm0 5600; X86-NEXT: retl 5601; 5602; X64-LABEL: test_mm256_mask3_fmsub_ps: 5603; X64: # %bb.0: # %entry 5604; X64-NEXT: kmovw %edi, %k1 5605; X64-NEXT: vfmsub231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) - ymm2 5606; X64-NEXT: vmovaps %ymm2, %ymm0 5607; X64-NEXT: retq 5608entry: 5609 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5610 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 5611 %1 = bitcast i8 %__U to <8 x i1> 5612 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 5613 ret <8 x float> %2 5614} 5615 5616define <2 x double> @test_mm_mask3_fmsubadd_pd(<2 x double> %__A, <2 x 
double> %__B, <2 x double> %__C, i8 zeroext %__U) { 5617; X86-LABEL: test_mm_mask3_fmsubadd_pd: 5618; X86: # %bb.0: # %entry 5619; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5620; X86-NEXT: kmovw %eax, %k1 5621; X86-NEXT: vfmsubadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2 5622; X86-NEXT: vmovapd %xmm2, %xmm0 5623; X86-NEXT: retl 5624; 5625; X64-LABEL: test_mm_mask3_fmsubadd_pd: 5626; X64: # %bb.0: # %entry 5627; X64-NEXT: kmovw %edi, %k1 5628; X64-NEXT: vfmsubadd231pd {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2 5629; X64-NEXT: vmovapd %xmm2, %xmm0 5630; X64-NEXT: retq 5631entry: 5632 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5633 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %sub.i) #9 5634 %1 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C) #9 5635 %2 = shufflevector <2 x double> %1, <2 x double> %0, <2 x i32> <i32 0, i32 3> 5636 %3 = bitcast i8 %__U to <8 x i1> 5637 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5638 %4 = select <2 x i1> %extract.i, <2 x double> %2, <2 x double> %__C 5639 ret <2 x double> %4 5640} 5641 5642define <4 x double> @test_mm256_mask3_fmsubadd_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 5643; X86-LABEL: test_mm256_mask3_fmsubadd_pd: 5644; X86: # %bb.0: # %entry 5645; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5646; X86-NEXT: kmovw %eax, %k1 5647; X86-NEXT: vfmsubadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2 5648; X86-NEXT: vmovapd %ymm2, %ymm0 5649; X86-NEXT: retl 5650; 5651; X64-LABEL: test_mm256_mask3_fmsubadd_pd: 5652; X64: # %bb.0: # %entry 5653; X64-NEXT: kmovw %edi, %k1 5654; X64-NEXT: vfmsubadd231pd {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2 5655; X64-NEXT: vmovapd %ymm2, %ymm0 5656; X64-NEXT: retq 5657entry: 5658 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double 
-0.000000e+00>, %__C 5659 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %sub.i) #9 5660 %1 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C) #9 5661 %2 = shufflevector <4 x double> %1, <4 x double> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5662 %3 = bitcast i8 %__U to <8 x i1> 5663 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5664 %4 = select <4 x i1> %extract.i, <4 x double> %2, <4 x double> %__C 5665 ret <4 x double> %4 5666} 5667 5668define <4 x float> @test_mm_mask3_fmsubadd_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 5669; X86-LABEL: test_mm_mask3_fmsubadd_ps: 5670; X86: # %bb.0: # %entry 5671; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5672; X86-NEXT: kmovw %eax, %k1 5673; X86-NEXT: vfmsubadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2 5674; X86-NEXT: vmovaps %xmm2, %xmm0 5675; X86-NEXT: retl 5676; 5677; X64-LABEL: test_mm_mask3_fmsubadd_ps: 5678; X64: # %bb.0: # %entry 5679; X64-NEXT: kmovw %edi, %k1 5680; X64-NEXT: vfmsubadd231ps {{.*#+}} xmm2 {%k1} = (xmm0 * xmm1) -/+ xmm2 5681; X64-NEXT: vmovaps %xmm2, %xmm0 5682; X64-NEXT: retq 5683entry: 5684 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5685 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %sub.i) #9 5686 %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C) #9 5687 %2 = shufflevector <4 x float> %1, <4 x float> %0, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 5688 %3 = bitcast i8 %__U to <8 x i1> 5689 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5690 %4 = select <4 x i1> %extract.i, <4 x float> %2, <4 x float> %__C 5691 ret <4 x float> %4 5692} 5693 5694define <8 x float> @test_mm256_mask3_fmsubadd_ps(<8 x float> %__A, <8 x 
float> %__B, <8 x float> %__C, i8 zeroext %__U) { 5695; X86-LABEL: test_mm256_mask3_fmsubadd_ps: 5696; X86: # %bb.0: # %entry 5697; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5698; X86-NEXT: kmovw %eax, %k1 5699; X86-NEXT: vfmsubadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2 5700; X86-NEXT: vmovaps %ymm2, %ymm0 5701; X86-NEXT: retl 5702; 5703; X64-LABEL: test_mm256_mask3_fmsubadd_ps: 5704; X64: # %bb.0: # %entry 5705; X64-NEXT: kmovw %edi, %k1 5706; X64-NEXT: vfmsubadd231ps {{.*#+}} ymm2 {%k1} = (ymm0 * ymm1) -/+ ymm2 5707; X64-NEXT: vmovaps %ymm2, %ymm0 5708; X64-NEXT: retq 5709entry: 5710 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5711 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %sub.i) #9 5712 %1 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C) #9 5713 %2 = shufflevector <8 x float> %1, <8 x float> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 5714 %3 = bitcast i8 %__U to <8 x i1> 5715 %4 = select <8 x i1> %3, <8 x float> %2, <8 x float> %__C 5716 ret <8 x float> %4 5717} 5718 5719define <2 x double> @test_mm_mask_fnmadd_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 5720; X86-LABEL: test_mm_mask_fnmadd_pd: 5721; X86: # %bb.0: # %entry 5722; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5723; X86-NEXT: kmovw %eax, %k1 5724; X86-NEXT: vfnmadd132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2 5725; X86-NEXT: retl 5726; 5727; X64-LABEL: test_mm_mask_fnmadd_pd: 5728; X64: # %bb.0: # %entry 5729; X64-NEXT: kmovw %edi, %k1 5730; X64-NEXT: vfnmadd132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2 5731; X64-NEXT: retq 5732entry: 5733 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B 5734 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> 
%__A, <2 x double> %sub.i, <2 x double> %__C) #9 5735 %1 = bitcast i8 %__U to <8 x i1> 5736 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5737 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 5738 ret <2 x double> %2 5739} 5740 5741define <4 x double> @test_mm256_mask_fnmadd_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 5742; X86-LABEL: test_mm256_mask_fnmadd_pd: 5743; X86: # %bb.0: # %entry 5744; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5745; X86-NEXT: kmovw %eax, %k1 5746; X86-NEXT: vfnmadd132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2 5747; X86-NEXT: retl 5748; 5749; X64-LABEL: test_mm256_mask_fnmadd_pd: 5750; X64: # %bb.0: # %entry 5751; X64-NEXT: kmovw %edi, %k1 5752; X64-NEXT: vfnmadd132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2 5753; X64-NEXT: retq 5754entry: 5755 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 5756 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %__C) #9 5757 %1 = bitcast i8 %__U to <8 x i1> 5758 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5759 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 5760 ret <4 x double> %2 5761} 5762 5763define <4 x float> @test_mm_mask_fnmadd_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 5764; X86-LABEL: test_mm_mask_fnmadd_ps: 5765; X86: # %bb.0: # %entry 5766; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5767; X86-NEXT: kmovw %eax, %k1 5768; X86-NEXT: vfnmadd132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2 5769; X86-NEXT: retl 5770; 5771; X64-LABEL: test_mm_mask_fnmadd_ps: 5772; X64: # %bb.0: # %entry 5773; X64-NEXT: kmovw %edi, %k1 5774; X64-NEXT: vfnmadd132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) + xmm2 5775; X64-NEXT: retq 5776entry: 5777 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, 
float -0.000000e+00, float -0.000000e+00>, %__B 5778 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %__C) #9 5779 %1 = bitcast i8 %__U to <8 x i1> 5780 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5781 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 5782 ret <4 x float> %2 5783} 5784 5785define <8 x float> @test_mm256_mask_fnmadd_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 5786; X86-LABEL: test_mm256_mask_fnmadd_ps: 5787; X86: # %bb.0: # %entry 5788; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5789; X86-NEXT: kmovw %eax, %k1 5790; X86-NEXT: vfnmadd132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2 5791; X86-NEXT: retl 5792; 5793; X64-LABEL: test_mm256_mask_fnmadd_ps: 5794; X64: # %bb.0: # %entry 5795; X64-NEXT: kmovw %edi, %k1 5796; X64-NEXT: vfnmadd132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) + ymm2 5797; X64-NEXT: retq 5798entry: 5799 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5800 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %__C) #9 5801 %1 = bitcast i8 %__U to <8 x i1> 5802 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 5803 ret <8 x float> %2 5804} 5805 5806define <2 x double> @test_mm_mask_fnmsub_pd(<2 x double> %__A, i8 zeroext %__U, <2 x double> %__B, <2 x double> %__C) { 5807; X86-LABEL: test_mm_mask_fnmsub_pd: 5808; X86: # %bb.0: # %entry 5809; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5810; X86-NEXT: kmovw %eax, %k1 5811; X86-NEXT: vfnmsub132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2 5812; X86-NEXT: retl 5813; 5814; X64-LABEL: test_mm_mask_fnmsub_pd: 5815; X64: # %bb.0: # %entry 5816; X64-NEXT: kmovw %edi, %k1 5817; X64-NEXT: vfnmsub132pd {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2 5818; X64-NEXT: retq 
5819entry: 5820 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B 5821 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5822 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9 5823 %1 = bitcast i8 %__U to <8 x i1> 5824 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5825 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__A 5826 ret <2 x double> %2 5827} 5828 5829define <2 x double> @test_mm_mask3_fnmsub_pd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__C, i8 zeroext %__U) { 5830; X86-LABEL: test_mm_mask3_fnmsub_pd: 5831; X86: # %bb.0: # %entry 5832; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5833; X86-NEXT: kmovw %eax, %k1 5834; X86-NEXT: vfnmsub231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2 5835; X86-NEXT: vmovapd %xmm2, %xmm0 5836; X86-NEXT: retl 5837; 5838; X64-LABEL: test_mm_mask3_fnmsub_pd: 5839; X64: # %bb.0: # %entry 5840; X64-NEXT: kmovw %edi, %k1 5841; X64-NEXT: vfnmsub231pd {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2 5842; X64-NEXT: vmovapd %xmm2, %xmm0 5843; X64-NEXT: retq 5844entry: 5845 %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__B 5846 %sub1.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %__C 5847 %0 = tail call <2 x double> @llvm.fma.v2f64(<2 x double> %__A, <2 x double> %sub.i, <2 x double> %sub1.i) #9 5848 %1 = bitcast i8 %__U to <8 x i1> 5849 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 5850 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__C 5851 ret <2 x double> %2 5852} 5853 5854define <4 x double> @test_mm256_mask_fnmsub_pd(<4 x double> %__A, i8 zeroext %__U, <4 x double> %__B, <4 x double> %__C) { 5855; X86-LABEL: test_mm256_mask_fnmsub_pd: 5856; X86: # %bb.0: # %entry 5857; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5858; X86-NEXT: kmovw %eax, %k1 5859; X86-NEXT: vfnmsub132pd 
{{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2 5860; X86-NEXT: retl 5861; 5862; X64-LABEL: test_mm256_mask_fnmsub_pd: 5863; X64: # %bb.0: # %entry 5864; X64-NEXT: kmovw %edi, %k1 5865; X64-NEXT: vfnmsub132pd {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2 5866; X64-NEXT: retq 5867entry: 5868 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 5869 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5870 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9 5871 %1 = bitcast i8 %__U to <8 x i1> 5872 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5873 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__A 5874 ret <4 x double> %2 5875} 5876 5877define <4 x double> @test_mm256_mask3_fnmsub_pd(<4 x double> %__A, <4 x double> %__B, <4 x double> %__C, i8 zeroext %__U) { 5878; X86-LABEL: test_mm256_mask3_fnmsub_pd: 5879; X86: # %bb.0: # %entry 5880; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5881; X86-NEXT: kmovw %eax, %k1 5882; X86-NEXT: vfnmsub231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2 5883; X86-NEXT: vmovapd %ymm2, %ymm0 5884; X86-NEXT: retl 5885; 5886; X64-LABEL: test_mm256_mask3_fnmsub_pd: 5887; X64: # %bb.0: # %entry 5888; X64-NEXT: kmovw %edi, %k1 5889; X64-NEXT: vfnmsub231pd {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2 5890; X64-NEXT: vmovapd %ymm2, %ymm0 5891; X64-NEXT: retq 5892entry: 5893 %sub.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 5894 %sub1.i = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 5895 %0 = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %__A, <4 x double> %sub.i, <4 x double> %sub1.i) #9 5896 %1 = bitcast i8 %__U to <8 x i1> 5897 %extract.i = shufflevector 
<8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5898 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__C 5899 ret <4 x double> %2 5900} 5901 5902define <4 x float> @test_mm_mask_fnmsub_ps(<4 x float> %__A, i8 zeroext %__U, <4 x float> %__B, <4 x float> %__C) { 5903; X86-LABEL: test_mm_mask_fnmsub_ps: 5904; X86: # %bb.0: # %entry 5905; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5906; X86-NEXT: kmovw %eax, %k1 5907; X86-NEXT: vfnmsub132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2 5908; X86-NEXT: retl 5909; 5910; X64-LABEL: test_mm_mask_fnmsub_ps: 5911; X64: # %bb.0: # %entry 5912; X64-NEXT: kmovw %edi, %k1 5913; X64-NEXT: vfnmsub132ps {{.*#+}} xmm0 {%k1} = -(xmm0 * xmm1) - xmm2 5914; X64-NEXT: retq 5915entry: 5916 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5917 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5918 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9 5919 %1 = bitcast i8 %__U to <8 x i1> 5920 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5921 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__A 5922 ret <4 x float> %2 5923} 5924 5925define <4 x float> @test_mm_mask3_fnmsub_ps(<4 x float> %__A, <4 x float> %__B, <4 x float> %__C, i8 zeroext %__U) { 5926; X86-LABEL: test_mm_mask3_fnmsub_ps: 5927; X86: # %bb.0: # %entry 5928; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5929; X86-NEXT: kmovw %eax, %k1 5930; X86-NEXT: vfnmsub231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2 5931; X86-NEXT: vmovaps %xmm2, %xmm0 5932; X86-NEXT: retl 5933; 5934; X64-LABEL: test_mm_mask3_fnmsub_ps: 5935; X64: # %bb.0: # %entry 5936; X64-NEXT: kmovw %edi, %k1 5937; X64-NEXT: vfnmsub231ps {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) - xmm2 5938; X64-NEXT: vmovaps %xmm2, %xmm0 5939; X64-NEXT: retq 5940entry: 
5941 %sub.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5942 %sub1.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5943 %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %__A, <4 x float> %sub.i, <4 x float> %sub1.i) #9 5944 %1 = bitcast i8 %__U to <8 x i1> 5945 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 5946 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__C 5947 ret <4 x float> %2 5948} 5949 5950define <8 x float> @test_mm256_mask_fnmsub_ps(<8 x float> %__A, i8 zeroext %__U, <8 x float> %__B, <8 x float> %__C) { 5951; X86-LABEL: test_mm256_mask_fnmsub_ps: 5952; X86: # %bb.0: # %entry 5953; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5954; X86-NEXT: kmovw %eax, %k1 5955; X86-NEXT: vfnmsub132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2 5956; X86-NEXT: retl 5957; 5958; X64-LABEL: test_mm256_mask_fnmsub_ps: 5959; X64: # %bb.0: # %entry 5960; X64-NEXT: kmovw %edi, %k1 5961; X64-NEXT: vfnmsub132ps {{.*#+}} ymm0 {%k1} = -(ymm0 * ymm1) - ymm2 5962; X64-NEXT: retq 5963entry: 5964 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5965 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5966 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9 5967 %1 = bitcast i8 %__U to <8 x i1> 5968 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__A 5969 ret <8 x float> %2 5970} 5971 5972define <8 x float> @test_mm256_mask3_fnmsub_ps(<8 x float> %__A, <8 x float> %__B, <8 x float> %__C, i8 zeroext %__U) { 5973; X86-LABEL: 
test_mm256_mask3_fnmsub_ps: 5974; X86: # %bb.0: # %entry 5975; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5976; X86-NEXT: kmovw %eax, %k1 5977; X86-NEXT: vfnmsub231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2 5978; X86-NEXT: vmovaps %ymm2, %ymm0 5979; X86-NEXT: retl 5980; 5981; X64-LABEL: test_mm256_mask3_fnmsub_ps: 5982; X64: # %bb.0: # %entry 5983; X64-NEXT: kmovw %edi, %k1 5984; X64-NEXT: vfnmsub231ps {{.*#+}} ymm2 {%k1} = -(ymm0 * ymm1) - ymm2 5985; X64-NEXT: vmovaps %ymm2, %ymm0 5986; X64-NEXT: retq 5987entry: 5988 %sub.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 5989 %sub1.i = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 5990 %0 = tail call <8 x float> @llvm.fma.v8f32(<8 x float> %__A, <8 x float> %sub.i, <8 x float> %sub1.i) #9 5991 %1 = bitcast i8 %__U to <8 x i1> 5992 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__C 5993 ret <8 x float> %2 5994} 5995 5996define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) { 5997; X86-LABEL: test_mm_mask_expandloadu_pd: 5998; X86: # %bb.0: # %entry 5999; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6000; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6001; X86-NEXT: kmovw %ecx, %k1 6002; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} 6003; X86-NEXT: retl 6004; 6005; X64-LABEL: test_mm_mask_expandloadu_pd: 6006; X64: # %bb.0: # %entry 6007; X64-NEXT: kmovw %edi, %k1 6008; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} 6009; X64-NEXT: retq 6010entry: 6011 %0 = bitcast i8* %__P to double* 6012 %1 = bitcast i8 %__U to <8 x i1> 6013 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6014 %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 
x double> %__W) 6015 ret <2 x double> %2 6016} 6017 6018define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) { 6019; X86-LABEL: test_mm_maskz_expandloadu_pd: 6020; X86: # %bb.0: # %entry 6021; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6022; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6023; X86-NEXT: kmovw %ecx, %k1 6024; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} {z} 6025; X86-NEXT: retl 6026; 6027; X64-LABEL: test_mm_maskz_expandloadu_pd: 6028; X64: # %bb.0: # %entry 6029; X64-NEXT: kmovw %edi, %k1 6030; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} {z} 6031; X64-NEXT: retq 6032entry: 6033 %0 = bitcast i8* %__P to double* 6034 %1 = bitcast i8 %__U to <8 x i1> 6035 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6036 %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer) 6037 ret <2 x double> %2 6038} 6039 6040define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) { 6041; X86-LABEL: test_mm256_mask_expandloadu_pd: 6042; X86: # %bb.0: # %entry 6043; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6044; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6045; X86-NEXT: kmovw %ecx, %k1 6046; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} 6047; X86-NEXT: retl 6048; 6049; X64-LABEL: test_mm256_mask_expandloadu_pd: 6050; X64: # %bb.0: # %entry 6051; X64-NEXT: kmovw %edi, %k1 6052; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} 6053; X64-NEXT: retq 6054entry: 6055 %0 = bitcast i8* %__P to double* 6056 %1 = bitcast i8 %__U to <8 x i1> 6057 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6058 %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W) 6059 ret <4 x double> %2 6060} 6061 6062define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) { 6063; X86-LABEL: test_mm256_maskz_expandloadu_pd: 6064; X86: # %bb.0: # 
%entry 6065; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6066; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6067; X86-NEXT: kmovw %ecx, %k1 6068; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} {z} 6069; X86-NEXT: retl 6070; 6071; X64-LABEL: test_mm256_maskz_expandloadu_pd: 6072; X64: # %bb.0: # %entry 6073; X64-NEXT: kmovw %edi, %k1 6074; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} {z} 6075; X64-NEXT: retq 6076entry: 6077 %0 = bitcast i8* %__P to double* 6078 %1 = bitcast i8 %__U to <8 x i1> 6079 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6080 %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer) 6081 ret <4 x double> %2 6082} 6083 6084define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6085; X86-LABEL: test_mm_mask_expandloadu_epi64: 6086; X86: # %bb.0: # %entry 6087; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6088; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6089; X86-NEXT: kmovw %ecx, %k1 6090; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} 6091; X86-NEXT: retl 6092; 6093; X64-LABEL: test_mm_mask_expandloadu_epi64: 6094; X64: # %bb.0: # %entry 6095; X64-NEXT: kmovw %edi, %k1 6096; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} 6097; X64-NEXT: retq 6098entry: 6099 %0 = bitcast i8* %__P to i64* 6100 %1 = bitcast i8 %__U to <8 x i1> 6101 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6102 %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10 6103 ret <2 x i64> %2 6104} 6105 6106define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) { 6107; X86-LABEL: test_mm_maskz_expandloadu_epi64: 6108; X86: # %bb.0: # %entry 6109; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6110; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6111; X86-NEXT: kmovw %ecx, %k1 6112; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} {z} 6113; X86-NEXT: retl 6114; 6115; X64-LABEL: 
test_mm_maskz_expandloadu_epi64: 6116; X64: # %bb.0: # %entry 6117; X64-NEXT: kmovw %edi, %k1 6118; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} {z} 6119; X64-NEXT: retq 6120entry: 6121 %0 = bitcast i8* %__P to i64* 6122 %1 = bitcast i8 %__U to <8 x i1> 6123 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6124 %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer) 6125 ret <2 x i64> %2 6126} 6127 6128define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6129; X86-LABEL: test_mm256_mask_expandloadu_epi64: 6130; X86: # %bb.0: # %entry 6131; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6132; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6133; X86-NEXT: kmovw %ecx, %k1 6134; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} 6135; X86-NEXT: retl 6136; 6137; X64-LABEL: test_mm256_mask_expandloadu_epi64: 6138; X64: # %bb.0: # %entry 6139; X64-NEXT: kmovw %edi, %k1 6140; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} 6141; X64-NEXT: retq 6142entry: 6143 %0 = bitcast i8* %__P to i64* 6144 %1 = bitcast i8 %__U to <8 x i1> 6145 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6146 %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10 6147 ret <4 x i64> %2 6148} 6149 6150define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) { 6151; X86-LABEL: test_mm256_maskz_expandloadu_epi64: 6152; X86: # %bb.0: # %entry 6153; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6154; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6155; X86-NEXT: kmovw %ecx, %k1 6156; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} {z} 6157; X86-NEXT: retl 6158; 6159; X64-LABEL: test_mm256_maskz_expandloadu_epi64: 6160; X64: # %bb.0: # %entry 6161; X64-NEXT: kmovw %edi, %k1 6162; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} {z} 6163; X64-NEXT: retq 6164entry: 6165 %0 = bitcast i8* %__P to i64* 6166 %1 = bitcast i8 
%__U to <8 x i1> 6167 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6168 %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer) 6169 ret <4 x i64> %2 6170} 6171 6172define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) { 6173; X86-LABEL: test_mm_mask_expandloadu_ps: 6174; X86: # %bb.0: # %entry 6175; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6176; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6177; X86-NEXT: kmovw %ecx, %k1 6178; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} 6179; X86-NEXT: retl 6180; 6181; X64-LABEL: test_mm_mask_expandloadu_ps: 6182; X64: # %bb.0: # %entry 6183; X64-NEXT: kmovw %edi, %k1 6184; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} 6185; X64-NEXT: retq 6186entry: 6187 %0 = bitcast i8* %__P to float* 6188 %1 = bitcast i8 %__U to <8 x i1> 6189 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6190 %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W) 6191 ret <4 x float> %2 6192} 6193 6194define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) { 6195; X86-LABEL: test_mm_maskz_expandloadu_ps: 6196; X86: # %bb.0: # %entry 6197; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6198; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6199; X86-NEXT: kmovw %ecx, %k1 6200; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} {z} 6201; X86-NEXT: retl 6202; 6203; X64-LABEL: test_mm_maskz_expandloadu_ps: 6204; X64: # %bb.0: # %entry 6205; X64-NEXT: kmovw %edi, %k1 6206; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} {z} 6207; X64-NEXT: retq 6208entry: 6209 %0 = bitcast i8* %__P to float* 6210 %1 = bitcast i8 %__U to <8 x i1> 6211 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6212 %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer) 
6213 ret <4 x float> %2 6214} 6215 6216define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) { 6217; X86-LABEL: test_mm256_mask_expandloadu_ps: 6218; X86: # %bb.0: # %entry 6219; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6220; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6221; X86-NEXT: kmovw %ecx, %k1 6222; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} 6223; X86-NEXT: retl 6224; 6225; X64-LABEL: test_mm256_mask_expandloadu_ps: 6226; X64: # %bb.0: # %entry 6227; X64-NEXT: kmovw %edi, %k1 6228; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} 6229; X64-NEXT: retq 6230entry: 6231 %0 = bitcast i8* %__P to float* 6232 %1 = bitcast i8 %__U to <8 x i1> 6233 %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W) 6234 ret <8 x float> %2 6235} 6236 6237define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) { 6238; X86-LABEL: test_mm256_maskz_expandloadu_ps: 6239; X86: # %bb.0: # %entry 6240; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6241; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6242; X86-NEXT: kmovw %ecx, %k1 6243; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} {z} 6244; X86-NEXT: retl 6245; 6246; X64-LABEL: test_mm256_maskz_expandloadu_ps: 6247; X64: # %bb.0: # %entry 6248; X64-NEXT: kmovw %edi, %k1 6249; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} {z} 6250; X64-NEXT: retq 6251entry: 6252 %0 = bitcast i8* %__P to float* 6253 %1 = bitcast i8 %__U to <8 x i1> 6254 %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer) 6255 ret <8 x float> %2 6256} 6257 6258define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6259; X86-LABEL: test_mm_mask_expandloadu_epi32: 6260; X86: # %bb.0: # %entry 6261; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6262; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6263; X86-NEXT: kmovw %ecx, %k1 6264; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} 6265; X86-NEXT: retl 6266; 6267; X64-LABEL: 
test_mm_mask_expandloadu_epi32: 6268; X64: # %bb.0: # %entry 6269; X64-NEXT: kmovw %edi, %k1 6270; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} 6271; X64-NEXT: retq 6272entry: 6273 %0 = bitcast <2 x i64> %__W to <4 x i32> 6274 %1 = bitcast i8* %__P to i32* 6275 %2 = bitcast i8 %__U to <8 x i1> 6276 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6277 %3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0) 6278 %4 = bitcast <4 x i32> %3 to <2 x i64> 6279 ret <2 x i64> %4 6280} 6281 6282define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) { 6283; X86-LABEL: test_mm_maskz_expandloadu_epi32: 6284; X86: # %bb.0: # %entry 6285; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6286; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6287; X86-NEXT: kmovw %ecx, %k1 6288; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} {z} 6289; X86-NEXT: retl 6290; 6291; X64-LABEL: test_mm_maskz_expandloadu_epi32: 6292; X64: # %bb.0: # %entry 6293; X64-NEXT: kmovw %edi, %k1 6294; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} {z} 6295; X64-NEXT: retq 6296entry: 6297 %0 = bitcast i8* %__P to i32* 6298 %1 = bitcast i8 %__U to <8 x i1> 6299 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6300 %2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer) 6301 %3 = bitcast <4 x i32> %2 to <2 x i64> 6302 ret <2 x i64> %3 6303} 6304 6305define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6306; X86-LABEL: test_mm256_mask_expandloadu_epi32: 6307; X86: # %bb.0: # %entry 6308; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6309; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6310; X86-NEXT: kmovw %ecx, %k1 6311; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} 6312; X86-NEXT: retl 6313; 6314; X64-LABEL: test_mm256_mask_expandloadu_epi32: 6315; X64: # %bb.0: # %entry 6316; X64-NEXT: kmovw %edi, %k1 6317; 
X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} 6318; X64-NEXT: retq 6319entry: 6320 %0 = bitcast <4 x i64> %__W to <8 x i32> 6321 %1 = bitcast i8* %__P to i32* 6322 %2 = bitcast i8 %__U to <8 x i1> 6323 %3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0) 6324 %4 = bitcast <8 x i32> %3 to <4 x i64> 6325 ret <4 x i64> %4 6326} 6327 6328define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) { 6329; X86-LABEL: test_mm256_maskz_expandloadu_epi32: 6330; X86: # %bb.0: # %entry 6331; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6332; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6333; X86-NEXT: kmovw %ecx, %k1 6334; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} {z} 6335; X86-NEXT: retl 6336; 6337; X64-LABEL: test_mm256_maskz_expandloadu_epi32: 6338; X64: # %bb.0: # %entry 6339; X64-NEXT: kmovw %edi, %k1 6340; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} {z} 6341; X64-NEXT: retq 6342entry: 6343 %0 = bitcast i8* %__P to i32* 6344 %1 = bitcast i8 %__U to <8 x i1> 6345 %2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer) 6346 %3 = bitcast <8 x i32> %2 to <4 x i64> 6347 ret <4 x i64> %3 6348} 6349 6350define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) { 6351; X86-LABEL: test_mm_mask_compressstoreu_pd: 6352; X86: # %bb.0: # %entry 6353; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6354; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6355; X86-NEXT: kmovw %eax, %k1 6356; X86-NEXT: vcompresspd %xmm0, (%ecx) {%k1} 6357; X86-NEXT: retl 6358; 6359; X64-LABEL: test_mm_mask_compressstoreu_pd: 6360; X64: # %bb.0: # %entry 6361; X64-NEXT: kmovw %esi, %k1 6362; X64-NEXT: vcompresspd %xmm0, (%rdi) {%k1} 6363; X64-NEXT: retq 6364entry: 6365 %0 = bitcast i8* %__P to double* 6366 %1 = bitcast i8 %__U to <8 x i1> 6367 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6368 tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 
x i1> %extract.i) 6369 ret void 6370} 6371 6372define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) { 6373; X86-LABEL: test_mm256_mask_compressstoreu_pd: 6374; X86: # %bb.0: # %entry 6375; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6376; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6377; X86-NEXT: kmovw %eax, %k1 6378; X86-NEXT: vcompresspd %ymm0, (%ecx) {%k1} 6379; X86-NEXT: vzeroupper 6380; X86-NEXT: retl 6381; 6382; X64-LABEL: test_mm256_mask_compressstoreu_pd: 6383; X64: # %bb.0: # %entry 6384; X64-NEXT: kmovw %esi, %k1 6385; X64-NEXT: vcompresspd %ymm0, (%rdi) {%k1} 6386; X64-NEXT: vzeroupper 6387; X64-NEXT: retq 6388entry: 6389 %0 = bitcast i8* %__P to double* 6390 %1 = bitcast i8 %__U to <8 x i1> 6391 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6392 tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i) 6393 ret void 6394} 6395 6396define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) { 6397; X86-LABEL: test_mm_mask_compressstoreu_epi64: 6398; X86: # %bb.0: # %entry 6399; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6400; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6401; X86-NEXT: kmovw %eax, %k1 6402; X86-NEXT: vpcompressq %xmm0, (%ecx) {%k1} 6403; X86-NEXT: retl 6404; 6405; X64-LABEL: test_mm_mask_compressstoreu_epi64: 6406; X64: # %bb.0: # %entry 6407; X64-NEXT: kmovw %esi, %k1 6408; X64-NEXT: vpcompressq %xmm0, (%rdi) {%k1} 6409; X64-NEXT: retq 6410entry: 6411 %0 = bitcast i8* %__P to i64* 6412 %1 = bitcast i8 %__U to <8 x i1> 6413 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6414 tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i) 6415 ret void 6416} 6417 6418define void @test_mm256_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) { 6419; X86-LABEL: test_mm256_mask_compressstoreu_epi64: 6420; X86: # %bb.0: # %entry 
6421; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6422; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6423; X86-NEXT: kmovw %eax, %k1 6424; X86-NEXT: vpcompressq %ymm0, (%ecx) {%k1} 6425; X86-NEXT: vzeroupper 6426; X86-NEXT: retl 6427; 6428; X64-LABEL: test_mm256_mask_compressstoreu_epi64: 6429; X64: # %bb.0: # %entry 6430; X64-NEXT: kmovw %esi, %k1 6431; X64-NEXT: vpcompressq %ymm0, (%rdi) {%k1} 6432; X64-NEXT: vzeroupper 6433; X64-NEXT: retq 6434entry: 6435 %0 = bitcast i8* %__P to i64* 6436 %1 = bitcast i8 %__U to <8 x i1> 6437 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6438 tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i) 6439 ret void 6440} 6441 6442define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) { 6443; X86-LABEL: test_mm_mask_compressstoreu_ps: 6444; X86: # %bb.0: # %entry 6445; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6446; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6447; X86-NEXT: kmovw %eax, %k1 6448; X86-NEXT: vcompressps %xmm0, (%ecx) {%k1} 6449; X86-NEXT: retl 6450; 6451; X64-LABEL: test_mm_mask_compressstoreu_ps: 6452; X64: # %bb.0: # %entry 6453; X64-NEXT: kmovw %esi, %k1 6454; X64-NEXT: vcompressps %xmm0, (%rdi) {%k1} 6455; X64-NEXT: retq 6456entry: 6457 %0 = bitcast i8* %__P to float* 6458 %1 = bitcast i8 %__U to <8 x i1> 6459 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6460 tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i) 6461 ret void 6462} 6463 6464define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) { 6465; X86-LABEL: test_mm256_mask_compressstoreu_ps: 6466; X86: # %bb.0: # %entry 6467; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6468; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6469; X86-NEXT: kmovw %eax, %k1 6470; X86-NEXT: vcompressps %ymm0, (%ecx) {%k1} 6471; X86-NEXT: vzeroupper 6472; X86-NEXT: retl 
6473; 6474; X64-LABEL: test_mm256_mask_compressstoreu_ps: 6475; X64: # %bb.0: # %entry 6476; X64-NEXT: kmovw %esi, %k1 6477; X64-NEXT: vcompressps %ymm0, (%rdi) {%k1} 6478; X64-NEXT: vzeroupper 6479; X64-NEXT: retq 6480entry: 6481 %0 = bitcast i8* %__P to float* 6482 %1 = bitcast i8 %__U to <8 x i1> 6483 tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1) 6484 ret void 6485} 6486 6487define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) { 6488; X86-LABEL: test_mm_mask_compressstoreu_epi32: 6489; X86: # %bb.0: # %entry 6490; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6491; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6492; X86-NEXT: kmovw %eax, %k1 6493; X86-NEXT: vpcompressd %xmm0, (%ecx) {%k1} 6494; X86-NEXT: retl 6495; 6496; X64-LABEL: test_mm_mask_compressstoreu_epi32: 6497; X64: # %bb.0: # %entry 6498; X64-NEXT: kmovw %esi, %k1 6499; X64-NEXT: vpcompressd %xmm0, (%rdi) {%k1} 6500; X64-NEXT: retq 6501entry: 6502 %0 = bitcast <2 x i64> %__A to <4 x i32> 6503 %1 = bitcast i8* %__P to i32* 6504 %2 = bitcast i8 %__U to <8 x i1> 6505 %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6506 tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i) 6507 ret void 6508} 6509 6510define void @test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) { 6511; X86-LABEL: test_mm256_mask_compressstoreu_epi32: 6512; X86: # %bb.0: # %entry 6513; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6514; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6515; X86-NEXT: kmovw %eax, %k1 6516; X86-NEXT: vpcompressd %ymm0, (%ecx) {%k1} 6517; X86-NEXT: vzeroupper 6518; X86-NEXT: retl 6519; 6520; X64-LABEL: test_mm256_mask_compressstoreu_epi32: 6521; X64: # %bb.0: # %entry 6522; X64-NEXT: kmovw %esi, %k1 6523; X64-NEXT: vpcompressd %ymm0, (%rdi) {%k1} 6524; X64-NEXT: vzeroupper 6525; X64-NEXT: retq 6526entry: 6527 %0 = bitcast <4 x i64> %__A to <8 x i32> 
6528 %1 = bitcast i8* %__P to i32* 6529 %2 = bitcast i8 %__U to <8 x i1> 6530 tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10 6531 ret void 6532} 6533 6534 6535declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8 6536declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8 6537declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8 6538declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) #8 6539 6540define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) { 6541; X86-LABEL: test_mm_mask_sqrt_pd: 6542; X86: # %bb.0: # %entry 6543; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6544; X86-NEXT: kmovw %eax, %k1 6545; X86-NEXT: vsqrtpd %xmm1, %xmm0 {%k1} 6546; X86-NEXT: retl 6547; 6548; X64-LABEL: test_mm_mask_sqrt_pd: 6549; X64: # %bb.0: # %entry 6550; X64-NEXT: kmovw %edi, %k1 6551; X64-NEXT: vsqrtpd %xmm1, %xmm0 {%k1} 6552; X64-NEXT: retq 6553entry: 6554 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2 6555 %1 = bitcast i8 %__U to <8 x i1> 6556 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6557 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W 6558 ret <2 x double> %2 6559} 6560 6561declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) 6562 6563define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) { 6564; X86-LABEL: test_mm_maskz_sqrt_pd: 6565; X86: # %bb.0: # %entry 6566; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6567; X86-NEXT: kmovw %eax, %k1 6568; X86-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z} 6569; X86-NEXT: retl 6570; 6571; X64-LABEL: test_mm_maskz_sqrt_pd: 6572; X64: # %bb.0: # %entry 6573; X64-NEXT: kmovw %edi, %k1 6574; X64-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z} 6575; X64-NEXT: retq 6576entry: 6577 %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2 6578 %1 = bitcast i8 %__U to <8 x i1> 6579 
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6580 %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer 6581 ret <2 x double> %2 6582} 6583 6584define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) { 6585; X86-LABEL: test_mm256_mask_sqrt_pd: 6586; X86: # %bb.0: # %entry 6587; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6588; X86-NEXT: kmovw %eax, %k1 6589; X86-NEXT: vsqrtpd %ymm1, %ymm0 {%k1} 6590; X86-NEXT: retl 6591; 6592; X64-LABEL: test_mm256_mask_sqrt_pd: 6593; X64: # %bb.0: # %entry 6594; X64-NEXT: kmovw %edi, %k1 6595; X64-NEXT: vsqrtpd %ymm1, %ymm0 {%k1} 6596; X64-NEXT: retq 6597entry: 6598 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2 6599 %1 = bitcast i8 %__U to <8 x i1> 6600 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6601 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W 6602 ret <4 x double> %2 6603} 6604 6605declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) 6606 6607define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) { 6608; X86-LABEL: test_mm256_maskz_sqrt_pd: 6609; X86: # %bb.0: # %entry 6610; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6611; X86-NEXT: kmovw %eax, %k1 6612; X86-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} 6613; X86-NEXT: retl 6614; 6615; X64-LABEL: test_mm256_maskz_sqrt_pd: 6616; X64: # %bb.0: # %entry 6617; X64-NEXT: kmovw %edi, %k1 6618; X64-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} 6619; X64-NEXT: retq 6620entry: 6621 %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2 6622 %1 = bitcast i8 %__U to <8 x i1> 6623 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6624 %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer 6625 ret <4 x double> %2 6626} 6627 6628define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> 
%__A) { 6629; X86-LABEL: test_mm_mask_sqrt_ps: 6630; X86: # %bb.0: # %entry 6631; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6632; X86-NEXT: kmovw %eax, %k1 6633; X86-NEXT: vsqrtps %xmm1, %xmm0 {%k1} 6634; X86-NEXT: retl 6635; 6636; X64-LABEL: test_mm_mask_sqrt_ps: 6637; X64: # %bb.0: # %entry 6638; X64-NEXT: kmovw %edi, %k1 6639; X64-NEXT: vsqrtps %xmm1, %xmm0 {%k1} 6640; X64-NEXT: retq 6641entry: 6642 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2 6643 %1 = bitcast i8 %__U to <8 x i1> 6644 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6645 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W 6646 ret <4 x float> %2 6647} 6648 6649declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) 6650 6651define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) { 6652; X86-LABEL: test_mm_maskz_sqrt_ps: 6653; X86: # %bb.0: # %entry 6654; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6655; X86-NEXT: kmovw %eax, %k1 6656; X86-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z} 6657; X86-NEXT: retl 6658; 6659; X64-LABEL: test_mm_maskz_sqrt_ps: 6660; X64: # %bb.0: # %entry 6661; X64-NEXT: kmovw %edi, %k1 6662; X64-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z} 6663; X64-NEXT: retq 6664entry: 6665 %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2 6666 %1 = bitcast i8 %__U to <8 x i1> 6667 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6668 %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer 6669 ret <4 x float> %2 6670} 6671 6672define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) { 6673; X86-LABEL: test_mm256_mask_sqrt_ps: 6674; X86: # %bb.0: # %entry 6675; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6676; X86-NEXT: kmovw %eax, %k1 6677; X86-NEXT: vsqrtps %ymm1, %ymm0 {%k1} 6678; X86-NEXT: retl 6679; 6680; X64-LABEL: test_mm256_mask_sqrt_ps: 6681; X64: # %bb.0: # %entry 6682; X64-NEXT: kmovw %edi, %k1 
6683; X64-NEXT: vsqrtps %ymm1, %ymm0 {%k1} 6684; X64-NEXT: retq 6685entry: 6686 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2 6687 %1 = bitcast i8 %__U to <8 x i1> 6688 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W 6689 ret <8 x float> %2 6690} 6691 6692define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) { 6693; X86-LABEL: test_mm256_maskz_sqrt_ps: 6694; X86: # %bb.0: # %entry 6695; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6696; X86-NEXT: kmovw %eax, %k1 6697; X86-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} 6698; X86-NEXT: retl 6699; 6700; X64-LABEL: test_mm256_maskz_sqrt_ps: 6701; X64: # %bb.0: # %entry 6702; X64-NEXT: kmovw %edi, %k1 6703; X64-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} 6704; X64-NEXT: retq 6705entry: 6706 %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2 6707 %1 = bitcast i8 %__U to <8 x i1> 6708 %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer 6709 ret <8 x float> %2 6710} 6711 6712declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) 6713 6714define <2 x i64> @test_mm_rol_epi32(<2 x i64> %__A) { 6715; CHECK-LABEL: test_mm_rol_epi32: 6716; CHECK: # %bb.0: # %entry 6717; CHECK-NEXT: vprold $5, %xmm0, %xmm0 6718; CHECK-NEXT: ret{{[l|q]}} 6719entry: 6720 %0 = bitcast <2 x i64> %__A to <4 x i32> 6721 %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>) 6722 %2 = bitcast <4 x i32> %1 to <2 x i64> 6723 ret <2 x i64> %2 6724} 6725 6726define <2 x i64> @test_mm_mask_rol_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) { 6727; X86-LABEL: test_mm_mask_rol_epi32: 6728; X86: # %bb.0: # %entry 6729; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6730; X86-NEXT: kmovw %eax, %k1 6731; X86-NEXT: vprold $5, %xmm1, %xmm0 {%k1} 6732; X86-NEXT: retl 6733; 6734; X64-LABEL: test_mm_mask_rol_epi32: 6735; X64: # %bb.0: # %entry 6736; X64-NEXT: kmovw %edi, %k1 6737; X64-NEXT: vprold $5, %xmm1, %xmm0 {%k1} 6738; X64-NEXT: retq 6739entry: 6740 %0 = 
bitcast <2 x i64> %__A to <4 x i32> 6741 %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>) 6742 %2 = bitcast <2 x i64> %__W to <4 x i32> 6743 %3 = bitcast i8 %__U to <8 x i1> 6744 %extract = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6745 %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2 6746 %5 = bitcast <4 x i32> %4 to <2 x i64> 6747 ret <2 x i64> %5 6748} 6749 6750define <2 x i64> @test_mm_maskz_rol_epi32(i8 zeroext %__U, <2 x i64> %__A) { 6751; X86-LABEL: test_mm_maskz_rol_epi32: 6752; X86: # %bb.0: # %entry 6753; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6754; X86-NEXT: kmovw %eax, %k1 6755; X86-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z} 6756; X86-NEXT: retl 6757; 6758; X64-LABEL: test_mm_maskz_rol_epi32: 6759; X64: # %bb.0: # %entry 6760; X64-NEXT: kmovw %edi, %k1 6761; X64-NEXT: vprold $5, %xmm0, %xmm0 {%k1} {z} 6762; X64-NEXT: retq 6763entry: 6764 %0 = bitcast <2 x i64> %__A to <4 x i32> 6765 %1 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>) 6766 %2 = bitcast i8 %__U to <8 x i1> 6767 %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6768 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer 6769 %4 = bitcast <4 x i32> %3 to <2 x i64> 6770 ret <2 x i64> %4 6771} 6772 6773define <4 x i64> @test_mm256_rol_epi32(<4 x i64> %__A) { 6774; CHECK-LABEL: test_mm256_rol_epi32: 6775; CHECK: # %bb.0: # %entry 6776; CHECK-NEXT: vprold $5, %ymm0, %ymm0 6777; CHECK-NEXT: ret{{[l|q]}} 6778entry: 6779 %0 = bitcast <4 x i64> %__A to <8 x i32> 6780 %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>) 6781 %2 = bitcast <8 x i32> %1 to <4 x i64> 6782 ret <4 x i64> %2 6783} 6784 6785define <4 x i64> @test_mm256_mask_rol_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) { 6786; 
X86-LABEL: test_mm256_mask_rol_epi32: 6787; X86: # %bb.0: # %entry 6788; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6789; X86-NEXT: kmovw %eax, %k1 6790; X86-NEXT: vprold $5, %ymm1, %ymm0 {%k1} 6791; X86-NEXT: retl 6792; 6793; X64-LABEL: test_mm256_mask_rol_epi32: 6794; X64: # %bb.0: # %entry 6795; X64-NEXT: kmovw %edi, %k1 6796; X64-NEXT: vprold $5, %ymm1, %ymm0 {%k1} 6797; X64-NEXT: retq 6798entry: 6799 %0 = bitcast <4 x i64> %__A to <8 x i32> 6800 %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>) 6801 %2 = bitcast <4 x i64> %__W to <8 x i32> 6802 %3 = bitcast i8 %__U to <8 x i1> 6803 %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2 6804 %5 = bitcast <8 x i32> %4 to <4 x i64> 6805 ret <4 x i64> %5 6806} 6807 6808define <4 x i64> @test_mm256_maskz_rol_epi32(i8 zeroext %__U, <4 x i64> %__A) { 6809; X86-LABEL: test_mm256_maskz_rol_epi32: 6810; X86: # %bb.0: # %entry 6811; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6812; X86-NEXT: kmovw %eax, %k1 6813; X86-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z} 6814; X86-NEXT: retl 6815; 6816; X64-LABEL: test_mm256_maskz_rol_epi32: 6817; X64: # %bb.0: # %entry 6818; X64-NEXT: kmovw %edi, %k1 6819; X64-NEXT: vprold $5, %ymm0, %ymm0 {%k1} {z} 6820; X64-NEXT: retq 6821entry: 6822 %0 = bitcast <4 x i64> %__A to <8 x i32> 6823 %1 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>) 6824 %2 = bitcast i8 %__U to <8 x i1> 6825 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer 6826 %4 = bitcast <8 x i32> %3 to <4 x i64> 6827 ret <4 x i64> %4 6828} 6829 6830define <2 x i64> @test_mm_rol_epi64(<2 x i64> %__A) { 6831; CHECK-LABEL: test_mm_rol_epi64: 6832; CHECK: # %bb.0: # %entry 6833; CHECK-NEXT: vprolq $5, %xmm0, %xmm0 6834; CHECK-NEXT: ret{{[l|q]}} 6835entry: 6836 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>) 6837 
ret <2 x i64> %0 6838} 6839 6840define <2 x i64> @test_mm_mask_rol_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) { 6841; X86-LABEL: test_mm_mask_rol_epi64: 6842; X86: # %bb.0: # %entry 6843; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6844; X86-NEXT: kmovw %eax, %k1 6845; X86-NEXT: vprolq $5, %xmm1, %xmm0 {%k1} 6846; X86-NEXT: retl 6847; 6848; X64-LABEL: test_mm_mask_rol_epi64: 6849; X64: # %bb.0: # %entry 6850; X64-NEXT: kmovw %edi, %k1 6851; X64-NEXT: vprolq $5, %xmm1, %xmm0 {%k1} 6852; X64-NEXT: retq 6853entry: 6854 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>) 6855 %1 = bitcast i8 %__U to <8 x i1> 6856 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6857 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W 6858 ret <2 x i64> %2 6859} 6860 6861define <2 x i64> @test_mm_maskz_rol_epi64(i8 zeroext %__U, <2 x i64> %__A) { 6862; X86-LABEL: test_mm_maskz_rol_epi64: 6863; X86: # %bb.0: # %entry 6864; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6865; X86-NEXT: kmovw %eax, %k1 6866; X86-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z} 6867; X86-NEXT: retl 6868; 6869; X64-LABEL: test_mm_maskz_rol_epi64: 6870; X64: # %bb.0: # %entry 6871; X64-NEXT: kmovw %edi, %k1 6872; X64-NEXT: vprolq $5, %xmm0, %xmm0 {%k1} {z} 6873; X64-NEXT: retq 6874entry: 6875 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>) 6876 %1 = bitcast i8 %__U to <8 x i1> 6877 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 6878 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer 6879 ret <2 x i64> %2 6880} 6881 6882define <4 x i64> @test_mm256_rol_epi64(<4 x i64> %__A) { 6883; CHECK-LABEL: test_mm256_rol_epi64: 6884; CHECK: # %bb.0: # %entry 6885; CHECK-NEXT: vprolq $5, %ymm0, %ymm0 6886; CHECK-NEXT: ret{{[l|q]}} 6887entry: 6888 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5,i64 5, 
i64 5>) 6889 ret <4 x i64> %0 6890} 6891 6892define <4 x i64> @test_mm256_mask_rol_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) { 6893; X86-LABEL: test_mm256_mask_rol_epi64: 6894; X86: # %bb.0: # %entry 6895; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6896; X86-NEXT: kmovw %eax, %k1 6897; X86-NEXT: vprolq $5, %ymm1, %ymm0 {%k1} 6898; X86-NEXT: retl 6899; 6900; X64-LABEL: test_mm256_mask_rol_epi64: 6901; X64: # %bb.0: # %entry 6902; X64-NEXT: kmovw %edi, %k1 6903; X64-NEXT: vprolq $5, %ymm1, %ymm0 {%k1} 6904; X64-NEXT: retq 6905entry: 6906 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5,i64 5, i64 5>) 6907 %1 = bitcast i8 %__U to <8 x i1> 6908 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6909 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W 6910 ret <4 x i64> %2 6911} 6912 6913define <4 x i64> @test_mm256_maskz_rol_epi64(i8 zeroext %__U, <4 x i64> %__A) { 6914; X86-LABEL: test_mm256_maskz_rol_epi64: 6915; X86: # %bb.0: # %entry 6916; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6917; X86-NEXT: kmovw %eax, %k1 6918; X86-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z} 6919; X86-NEXT: retl 6920; 6921; X64-LABEL: test_mm256_maskz_rol_epi64: 6922; X64: # %bb.0: # %entry 6923; X64-NEXT: kmovw %edi, %k1 6924; X64-NEXT: vprolq $5, %ymm0, %ymm0 {%k1} {z} 6925; X64-NEXT: retq 6926entry: 6927 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5,i64 5, i64 5>) 6928 %1 = bitcast i8 %__U to <8 x i1> 6929 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6930 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer 6931 ret <4 x i64> %2 6932} 6933 6934define <2 x i64> @test_mm_rolv_epi32(<2 x i64> %__A, <2 x i64> %__B) { 6935; CHECK-LABEL: test_mm_rolv_epi32: 6936; CHECK: # %bb.0: # %entry 6937; CHECK-NEXT: vprolvd %xmm1, %xmm0, %xmm0 6938; CHECK-NEXT: ret{{[l|q]}} 6939entry: 6940 %0 = 
bitcast <2 x i64> %__A to <4 x i32> 6941 %1 = bitcast <2 x i64> %__B to <4 x i32> 6942 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1) 6943 %3 = bitcast <4 x i32> %2 to <2 x i64> 6944 ret <2 x i64> %3 6945} 6946 6947define <2 x i64> @test_mm_mask_rolv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 6948; X86-LABEL: test_mm_mask_rolv_epi32: 6949; X86: # %bb.0: # %entry 6950; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6951; X86-NEXT: kmovw %eax, %k1 6952; X86-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1} 6953; X86-NEXT: retl 6954; 6955; X64-LABEL: test_mm_mask_rolv_epi32: 6956; X64: # %bb.0: # %entry 6957; X64-NEXT: kmovw %edi, %k1 6958; X64-NEXT: vprolvd %xmm2, %xmm1, %xmm0 {%k1} 6959; X64-NEXT: retq 6960entry: 6961 %0 = bitcast <2 x i64> %__A to <4 x i32> 6962 %1 = bitcast <2 x i64> %__B to <4 x i32> 6963 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1) 6964 %3 = bitcast <2 x i64> %__W to <4 x i32> 6965 %4 = bitcast i8 %__U to <8 x i1> 6966 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6967 %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3 6968 %6 = bitcast <4 x i32> %5 to <2 x i64> 6969 ret <2 x i64> %6 6970} 6971 6972define <2 x i64> @test_mm_maskz_rolv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 6973; X86-LABEL: test_mm_maskz_rolv_epi32: 6974; X86: # %bb.0: # %entry 6975; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6976; X86-NEXT: kmovw %eax, %k1 6977; X86-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z} 6978; X86-NEXT: retl 6979; 6980; X64-LABEL: test_mm_maskz_rolv_epi32: 6981; X64: # %bb.0: # %entry 6982; X64-NEXT: kmovw %edi, %k1 6983; X64-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z} 6984; X64-NEXT: retq 6985entry: 6986 %0 = bitcast <2 x i64> %__A to <4 x i32> 6987 %1 = bitcast <2 x i64> %__B to <4 x i32> 6988 %2 = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1) 6989 %3 = bitcast i8 %__U 
to <8 x i1> 6990 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6991 %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer 6992 %5 = bitcast <4 x i32> %4 to <2 x i64> 6993 ret <2 x i64> %5 6994} 6995 6996define <4 x i64> @test_mm256_rolv_epi32(<4 x i64> %__A, <4 x i64> %__B) { 6997; CHECK-LABEL: test_mm256_rolv_epi32: 6998; CHECK: # %bb.0: # %entry 6999; CHECK-NEXT: vprolvd %ymm1, %ymm0, %ymm0 7000; CHECK-NEXT: ret{{[l|q]}} 7001entry: 7002 %0 = bitcast <4 x i64> %__A to <8 x i32> 7003 %1 = bitcast <4 x i64> %__B to <8 x i32> 7004 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1) 7005 %3 = bitcast <8 x i32> %2 to <4 x i64> 7006 ret <4 x i64> %3 7007} 7008 7009define <4 x i64> @test_mm256_mask_rolv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7010; X86-LABEL: test_mm256_mask_rolv_epi32: 7011; X86: # %bb.0: # %entry 7012; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7013; X86-NEXT: kmovw %eax, %k1 7014; X86-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1} 7015; X86-NEXT: retl 7016; 7017; X64-LABEL: test_mm256_mask_rolv_epi32: 7018; X64: # %bb.0: # %entry 7019; X64-NEXT: kmovw %edi, %k1 7020; X64-NEXT: vprolvd %ymm2, %ymm1, %ymm0 {%k1} 7021; X64-NEXT: retq 7022entry: 7023 %0 = bitcast <4 x i64> %__A to <8 x i32> 7024 %1 = bitcast <4 x i64> %__B to <8 x i32> 7025 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1) 7026 %3 = bitcast <4 x i64> %__W to <8 x i32> 7027 %4 = bitcast i8 %__U to <8 x i1> 7028 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3 7029 %6 = bitcast <8 x i32> %5 to <4 x i64> 7030 ret <4 x i64> %6 7031} 7032 7033define <4 x i64> @test_mm256_maskz_rolv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7034; X86-LABEL: test_mm256_maskz_rolv_epi32: 7035; X86: # %bb.0: # %entry 7036; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7037; X86-NEXT: kmovw %eax, %k1 7038; X86-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z} 
7039; X86-NEXT: retl 7040; 7041; X64-LABEL: test_mm256_maskz_rolv_epi32: 7042; X64: # %bb.0: # %entry 7043; X64-NEXT: kmovw %edi, %k1 7044; X64-NEXT: vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z} 7045; X64-NEXT: retq 7046entry: 7047 %0 = bitcast <4 x i64> %__A to <8 x i32> 7048 %1 = bitcast <4 x i64> %__B to <8 x i32> 7049 %2 = tail call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1) 7050 %3 = bitcast i8 %__U to <8 x i1> 7051 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 7052 %5 = bitcast <8 x i32> %4 to <4 x i64> 7053 ret <4 x i64> %5 7054} 7055 7056define <2 x i64> @test_mm_rolv_epi64(<2 x i64> %__A, <2 x i64> %__B) { 7057; CHECK-LABEL: test_mm_rolv_epi64: 7058; CHECK: # %bb.0: # %entry 7059; CHECK-NEXT: vprolvq %xmm1, %xmm0, %xmm0 7060; CHECK-NEXT: ret{{[l|q]}} 7061entry: 7062 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B) 7063 ret <2 x i64> %0 7064} 7065 7066define <2 x i64> @test_mm_mask_rolv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 7067; X86-LABEL: test_mm_mask_rolv_epi64: 7068; X86: # %bb.0: # %entry 7069; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7070; X86-NEXT: kmovw %eax, %k1 7071; X86-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1} 7072; X86-NEXT: retl 7073; 7074; X64-LABEL: test_mm_mask_rolv_epi64: 7075; X64: # %bb.0: # %entry 7076; X64-NEXT: kmovw %edi, %k1 7077; X64-NEXT: vprolvq %xmm2, %xmm1, %xmm0 {%k1} 7078; X64-NEXT: retq 7079entry: 7080 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B) 7081 %1 = bitcast i8 %__U to <8 x i1> 7082 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 7083 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W 7084 ret <2 x i64> %2 7085} 7086 7087define <2 x i64> @test_mm_maskz_rolv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 7088; X86-LABEL: test_mm_maskz_rolv_epi64: 7089; X86: # %bb.0: # %entry 7090; X86-NEXT: movb {{[0-9]+}}(%esp), %al 
7091; X86-NEXT: kmovw %eax, %k1 7092; X86-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z} 7093; X86-NEXT: retl 7094; 7095; X64-LABEL: test_mm_maskz_rolv_epi64: 7096; X64: # %bb.0: # %entry 7097; X64-NEXT: kmovw %edi, %k1 7098; X64-NEXT: vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z} 7099; X64-NEXT: retq 7100entry: 7101 %0 = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B) 7102 %1 = bitcast i8 %__U to <8 x i1> 7103 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 7104 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer 7105 ret <2 x i64> %2 7106} 7107 7108define <4 x i64> @test_mm256_rolv_epi64(<4 x i64> %__A, <4 x i64> %__B) { 7109; CHECK-LABEL: test_mm256_rolv_epi64: 7110; CHECK: # %bb.0: # %entry 7111; CHECK-NEXT: vprolvq %ymm1, %ymm0, %ymm0 7112; CHECK-NEXT: ret{{[l|q]}} 7113entry: 7114 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B) 7115 ret <4 x i64> %0 7116} 7117 7118define <4 x i64> @test_mm256_mask_rolv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7119; X86-LABEL: test_mm256_mask_rolv_epi64: 7120; X86: # %bb.0: # %entry 7121; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7122; X86-NEXT: kmovw %eax, %k1 7123; X86-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1} 7124; X86-NEXT: retl 7125; 7126; X64-LABEL: test_mm256_mask_rolv_epi64: 7127; X64: # %bb.0: # %entry 7128; X64-NEXT: kmovw %edi, %k1 7129; X64-NEXT: vprolvq %ymm2, %ymm1, %ymm0 {%k1} 7130; X64-NEXT: retq 7131entry: 7132 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B) 7133 %1 = bitcast i8 %__U to <8 x i1> 7134 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7135 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W 7136 ret <4 x i64> %2 7137} 7138 7139define <4 x i64> @test_mm256_maskz_rolv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7140; X86-LABEL: 
test_mm256_maskz_rolv_epi64: 7141; X86: # %bb.0: # %entry 7142; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7143; X86-NEXT: kmovw %eax, %k1 7144; X86-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z} 7145; X86-NEXT: retl 7146; 7147; X64-LABEL: test_mm256_maskz_rolv_epi64: 7148; X64: # %bb.0: # %entry 7149; X64-NEXT: kmovw %edi, %k1 7150; X64-NEXT: vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z} 7151; X64-NEXT: retq 7152entry: 7153 %0 = tail call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B) 7154 %1 = bitcast i8 %__U to <8 x i1> 7155 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7156 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer 7157 ret <4 x i64> %2 7158} 7159 7160define <2 x i64> @test_mm_ror_epi32(<2 x i64> %__A) { 7161; CHECK-LABEL: test_mm_ror_epi32: 7162; CHECK: # %bb.0: # %entry 7163; CHECK-NEXT: vprord $5, %xmm0, %xmm0 7164; CHECK-NEXT: ret{{[l|q]}} 7165entry: 7166 %0 = bitcast <2 x i64> %__A to <4 x i32> 7167 %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>) 7168 %2 = bitcast <4 x i32> %1 to <2 x i64> 7169 ret <2 x i64> %2 7170} 7171 7172define <2 x i64> @test_mm_mask_ror_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) { 7173; X86-LABEL: test_mm_mask_ror_epi32: 7174; X86: # %bb.0: # %entry 7175; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7176; X86-NEXT: kmovw %eax, %k1 7177; X86-NEXT: vprord $5, %xmm1, %xmm0 {%k1} 7178; X86-NEXT: retl 7179; 7180; X64-LABEL: test_mm_mask_ror_epi32: 7181; X64: # %bb.0: # %entry 7182; X64-NEXT: kmovw %edi, %k1 7183; X64-NEXT: vprord $5, %xmm1, %xmm0 {%k1} 7184; X64-NEXT: retq 7185entry: 7186 %0 = bitcast <2 x i64> %__A to <4 x i32> 7187 %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>) 7188 %2 = bitcast <2 x i64> %__W to <4 x i32> 7189 %3 = bitcast i8 %__U to <8 x i1> 7190 %extract = shufflevector <8 x i1> %3, <8 x i1> undef, 
<4 x i32> <i32 0, i32 1, i32 2, i32 3> 7191 %4 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %2 7192 %5 = bitcast <4 x i32> %4 to <2 x i64> 7193 ret <2 x i64> %5 7194} 7195 7196define <2 x i64> @test_mm_maskz_ror_epi32(i8 zeroext %__U, <2 x i64> %__A) { 7197; X86-LABEL: test_mm_maskz_ror_epi32: 7198; X86: # %bb.0: # %entry 7199; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7200; X86-NEXT: kmovw %eax, %k1 7201; X86-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z} 7202; X86-NEXT: retl 7203; 7204; X64-LABEL: test_mm_maskz_ror_epi32: 7205; X64: # %bb.0: # %entry 7206; X64-NEXT: kmovw %edi, %k1 7207; X64-NEXT: vprord $5, %xmm0, %xmm0 {%k1} {z} 7208; X64-NEXT: retq 7209entry: 7210 %0 = bitcast <2 x i64> %__A to <4 x i32> 7211 %1 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> <i32 5, i32 5, i32 5, i32 5>) 7212 %2 = bitcast i8 %__U to <8 x i1> 7213 %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7214 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer 7215 %4 = bitcast <4 x i32> %3 to <2 x i64> 7216 ret <2 x i64> %4 7217} 7218 7219define <4 x i64> @test_mm256_ror_epi32(<4 x i64> %__A) { 7220; CHECK-LABEL: test_mm256_ror_epi32: 7221; CHECK: # %bb.0: # %entry 7222; CHECK-NEXT: vprord $5, %ymm0, %ymm0 7223; CHECK-NEXT: ret{{[l|q]}} 7224entry: 7225 %0 = bitcast <4 x i64> %__A to <8 x i32> 7226 %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>) 7227 %2 = bitcast <8 x i32> %1 to <4 x i64> 7228 ret <4 x i64> %2 7229} 7230 7231define <4 x i64> @test_mm256_mask_ror_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) { 7232; X86-LABEL: test_mm256_mask_ror_epi32: 7233; X86: # %bb.0: # %entry 7234; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7235; X86-NEXT: kmovw %eax, %k1 7236; X86-NEXT: vprord $5, %ymm1, %ymm0 {%k1} 7237; X86-NEXT: retl 7238; 7239; X64-LABEL: test_mm256_mask_ror_epi32: 7240; X64: # %bb.0: # %entry 7241; 
X64-NEXT: kmovw %edi, %k1 7242; X64-NEXT: vprord $5, %ymm1, %ymm0 {%k1} 7243; X64-NEXT: retq 7244entry: 7245 %0 = bitcast <4 x i64> %__A to <8 x i32> 7246 %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>) 7247 %2 = bitcast <4 x i64> %__W to <8 x i32> 7248 %3 = bitcast i8 %__U to <8 x i1> 7249 %4 = select <8 x i1> %3, <8 x i32> %1, <8 x i32> %2 7250 %5 = bitcast <8 x i32> %4 to <4 x i64> 7251 ret <4 x i64> %5 7252} 7253 7254define <4 x i64> @test_mm256_maskz_ror_epi32(i8 zeroext %__U, <4 x i64> %__A) { 7255; X86-LABEL: test_mm256_maskz_ror_epi32: 7256; X86: # %bb.0: # %entry 7257; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7258; X86-NEXT: kmovw %eax, %k1 7259; X86-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z} 7260; X86-NEXT: retl 7261; 7262; X64-LABEL: test_mm256_maskz_ror_epi32: 7263; X64: # %bb.0: # %entry 7264; X64-NEXT: kmovw %edi, %k1 7265; X64-NEXT: vprord $5, %ymm0, %ymm0 {%k1} {z} 7266; X64-NEXT: retq 7267entry: 7268 %0 = bitcast <4 x i64> %__A to <8 x i32> 7269 %1 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>) 7270 %2 = bitcast i8 %__U to <8 x i1> 7271 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer 7272 %4 = bitcast <8 x i32> %3 to <4 x i64> 7273 ret <4 x i64> %4 7274} 7275 7276define <2 x i64> @test_mm_ror_epi64(<2 x i64> %__A) { 7277; CHECK-LABEL: test_mm_ror_epi64: 7278; CHECK: # %bb.0: # %entry 7279; CHECK-NEXT: vprorq $5, %xmm0, %xmm0 7280; CHECK-NEXT: ret{{[l|q]}} 7281entry: 7282 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>) 7283 ret <2 x i64> %0 7284} 7285 7286define <2 x i64> @test_mm_mask_ror_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A) { 7287; X86-LABEL: test_mm_mask_ror_epi64: 7288; X86: # %bb.0: # %entry 7289; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7290; X86-NEXT: kmovw %eax, %k1 7291; X86-NEXT: vprorq $5, 
%xmm1, %xmm0 {%k1} 7292; X86-NEXT: retl 7293; 7294; X64-LABEL: test_mm_mask_ror_epi64: 7295; X64: # %bb.0: # %entry 7296; X64-NEXT: kmovw %edi, %k1 7297; X64-NEXT: vprorq $5, %xmm1, %xmm0 {%k1} 7298; X64-NEXT: retq 7299entry: 7300 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>) 7301 %1 = bitcast i8 %__U to <8 x i1> 7302 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 7303 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> %__W 7304 ret <2 x i64> %2 7305} 7306 7307define <2 x i64> @test_mm_maskz_ror_epi64(i8 zeroext %__U, <2 x i64> %__A) { 7308; X86-LABEL: test_mm_maskz_ror_epi64: 7309; X86: # %bb.0: # %entry 7310; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7311; X86-NEXT: kmovw %eax, %k1 7312; X86-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z} 7313; X86-NEXT: retl 7314; 7315; X64-LABEL: test_mm_maskz_ror_epi64: 7316; X64: # %bb.0: # %entry 7317; X64-NEXT: kmovw %edi, %k1 7318; X64-NEXT: vprorq $5, %xmm0, %xmm0 {%k1} {z} 7319; X64-NEXT: retq 7320entry: 7321 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> <i64 5, i64 5>) 7322 %1 = bitcast i8 %__U to <8 x i1> 7323 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 7324 %2 = select <2 x i1> %extract, <2 x i64> %0, <2 x i64> zeroinitializer 7325 ret <2 x i64> %2 7326} 7327 7328define <4 x i64> @test_mm256_ror_epi64(<4 x i64> %__A) { 7329; CHECK-LABEL: test_mm256_ror_epi64: 7330; CHECK: # %bb.0: # %entry 7331; CHECK-NEXT: vprorq $5, %ymm0, %ymm0 7332; CHECK-NEXT: ret{{[l|q]}} 7333entry: 7334 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>) 7335 ret <4 x i64> %0 7336} 7337 7338define <4 x i64> @test_mm256_mask_ror_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A) { 7339; X86-LABEL: test_mm256_mask_ror_epi64: 7340; X86: # %bb.0: # %entry 7341; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7342; X86-NEXT: kmovw %eax, %k1 7343; 
X86-NEXT: vprorq $5, %ymm1, %ymm0 {%k1} 7344; X86-NEXT: retl 7345; 7346; X64-LABEL: test_mm256_mask_ror_epi64: 7347; X64: # %bb.0: # %entry 7348; X64-NEXT: kmovw %edi, %k1 7349; X64-NEXT: vprorq $5, %ymm1, %ymm0 {%k1} 7350; X64-NEXT: retq 7351entry: 7352 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>) 7353 %1 = bitcast i8 %__U to <8 x i1> 7354 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7355 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> %__W 7356 ret <4 x i64> %2 7357} 7358 7359define <4 x i64> @test_mm256_maskz_ror_epi64(i8 zeroext %__U, <4 x i64> %__A) { 7360; X86-LABEL: test_mm256_maskz_ror_epi64: 7361; X86: # %bb.0: # %entry 7362; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7363; X86-NEXT: kmovw %eax, %k1 7364; X86-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z} 7365; X86-NEXT: retl 7366; 7367; X64-LABEL: test_mm256_maskz_ror_epi64: 7368; X64: # %bb.0: # %entry 7369; X64-NEXT: kmovw %edi, %k1 7370; X64-NEXT: vprorq $5, %ymm0, %ymm0 {%k1} {z} 7371; X64-NEXT: retq 7372entry: 7373 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> <i64 5, i64 5, i64 5, i64 5>) 7374 %1 = bitcast i8 %__U to <8 x i1> 7375 %extract = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7376 %2 = select <4 x i1> %extract, <4 x i64> %0, <4 x i64> zeroinitializer 7377 ret <4 x i64> %2 7378} 7379 7380define <2 x i64> @test_mm_rorv_epi32(<2 x i64> %__A, <2 x i64> %__B) { 7381; CHECK-LABEL: test_mm_rorv_epi32: 7382; CHECK: # %bb.0: # %entry 7383; CHECK-NEXT: vprorvd %xmm1, %xmm0, %xmm0 7384; CHECK-NEXT: ret{{[l|q]}} 7385entry: 7386 %0 = bitcast <2 x i64> %__A to <4 x i32> 7387 %1 = bitcast <2 x i64> %__B to <4 x i32> 7388 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1) 7389 %3 = bitcast <4 x i32> %2 to <2 x i64> 7390 ret <2 x i64> %3 7391} 7392 7393define <2 x i64> 
@test_mm_mask_rorv_epi32(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 7394; X86-LABEL: test_mm_mask_rorv_epi32: 7395; X86: # %bb.0: # %entry 7396; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7397; X86-NEXT: kmovw %eax, %k1 7398; X86-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1} 7399; X86-NEXT: retl 7400; 7401; X64-LABEL: test_mm_mask_rorv_epi32: 7402; X64: # %bb.0: # %entry 7403; X64-NEXT: kmovw %edi, %k1 7404; X64-NEXT: vprorvd %xmm2, %xmm1, %xmm0 {%k1} 7405; X64-NEXT: retq 7406entry: 7407 %0 = bitcast <2 x i64> %__A to <4 x i32> 7408 %1 = bitcast <2 x i64> %__B to <4 x i32> 7409 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1) 7410 %3 = bitcast <2 x i64> %__W to <4 x i32> 7411 %4 = bitcast i8 %__U to <8 x i1> 7412 %extract.i = shufflevector <8 x i1> %4, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7413 %5 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> %3 7414 %6 = bitcast <4 x i32> %5 to <2 x i64> 7415 ret <2 x i64> %6 7416} 7417 7418define <2 x i64> @test_mm_maskz_rorv_epi32(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 7419; X86-LABEL: test_mm_maskz_rorv_epi32: 7420; X86: # %bb.0: # %entry 7421; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7422; X86-NEXT: kmovw %eax, %k1 7423; X86-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z} 7424; X86-NEXT: retl 7425; 7426; X64-LABEL: test_mm_maskz_rorv_epi32: 7427; X64: # %bb.0: # %entry 7428; X64-NEXT: kmovw %edi, %k1 7429; X64-NEXT: vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z} 7430; X64-NEXT: retq 7431entry: 7432 %0 = bitcast <2 x i64> %__A to <4 x i32> 7433 %1 = bitcast <2 x i64> %__B to <4 x i32> 7434 %2 = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %0, <4 x i32> %0, <4 x i32> %1) 7435 %3 = bitcast i8 %__U to <8 x i1> 7436 %extract.i = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7437 %4 = select <4 x i1> %extract.i, <4 x i32> %2, <4 x i32> zeroinitializer 7438 %5 = bitcast <4 x i32> %4 to <2 x i64> 7439 ret <2 x i64> %5 7440} 7441 
7442define <4 x i64> @test_mm256_rorv_epi32(<4 x i64> %__A, <4 x i64> %__B) { 7443; CHECK-LABEL: test_mm256_rorv_epi32: 7444; CHECK: # %bb.0: # %entry 7445; CHECK-NEXT: vprorvd %ymm1, %ymm0, %ymm0 7446; CHECK-NEXT: ret{{[l|q]}} 7447entry: 7448 %0 = bitcast <4 x i64> %__A to <8 x i32> 7449 %1 = bitcast <4 x i64> %__B to <8 x i32> 7450 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1) 7451 %3 = bitcast <8 x i32> %2 to <4 x i64> 7452 ret <4 x i64> %3 7453} 7454 7455define <4 x i64> @test_mm256_mask_rorv_epi32(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7456; X86-LABEL: test_mm256_mask_rorv_epi32: 7457; X86: # %bb.0: # %entry 7458; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7459; X86-NEXT: kmovw %eax, %k1 7460; X86-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1} 7461; X86-NEXT: retl 7462; 7463; X64-LABEL: test_mm256_mask_rorv_epi32: 7464; X64: # %bb.0: # %entry 7465; X64-NEXT: kmovw %edi, %k1 7466; X64-NEXT: vprorvd %ymm2, %ymm1, %ymm0 {%k1} 7467; X64-NEXT: retq 7468entry: 7469 %0 = bitcast <4 x i64> %__A to <8 x i32> 7470 %1 = bitcast <4 x i64> %__B to <8 x i32> 7471 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1) 7472 %3 = bitcast <4 x i64> %__W to <8 x i32> 7473 %4 = bitcast i8 %__U to <8 x i1> 7474 %5 = select <8 x i1> %4, <8 x i32> %2, <8 x i32> %3 7475 %6 = bitcast <8 x i32> %5 to <4 x i64> 7476 ret <4 x i64> %6 7477} 7478 7479define <4 x i64> @test_mm256_maskz_rorv_epi32(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7480; X86-LABEL: test_mm256_maskz_rorv_epi32: 7481; X86: # %bb.0: # %entry 7482; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7483; X86-NEXT: kmovw %eax, %k1 7484; X86-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z} 7485; X86-NEXT: retl 7486; 7487; X64-LABEL: test_mm256_maskz_rorv_epi32: 7488; X64: # %bb.0: # %entry 7489; X64-NEXT: kmovw %edi, %k1 7490; X64-NEXT: vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z} 7491; X64-NEXT: retq 7492entry: 7493 %0 = bitcast <4 x i64> %__A to <8 x i32> 
7494 %1 = bitcast <4 x i64> %__B to <8 x i32> 7495 %2 = tail call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %0, <8 x i32> %0, <8 x i32> %1) 7496 %3 = bitcast i8 %__U to <8 x i1> 7497 %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer 7498 %5 = bitcast <8 x i32> %4 to <4 x i64> 7499 ret <4 x i64> %5 7500} 7501 7502define <2 x i64> @test_mm_rorv_epi64(<2 x i64> %__A, <2 x i64> %__B) { 7503; CHECK-LABEL: test_mm_rorv_epi64: 7504; CHECK: # %bb.0: # %entry 7505; CHECK-NEXT: vprorvq %xmm1, %xmm0, %xmm0 7506; CHECK-NEXT: ret{{[l|q]}} 7507entry: 7508 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B) 7509 ret <2 x i64> %0 7510} 7511 7512define <2 x i64> @test_mm_mask_rorv_epi64(<2 x i64> %__W, i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 7513; X86-LABEL: test_mm_mask_rorv_epi64: 7514; X86: # %bb.0: # %entry 7515; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7516; X86-NEXT: kmovw %eax, %k1 7517; X86-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1} 7518; X86-NEXT: retl 7519; 7520; X64-LABEL: test_mm_mask_rorv_epi64: 7521; X64: # %bb.0: # %entry 7522; X64-NEXT: kmovw %edi, %k1 7523; X64-NEXT: vprorvq %xmm2, %xmm1, %xmm0 {%k1} 7524; X64-NEXT: retq 7525entry: 7526 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B) 7527 %1 = bitcast i8 %__U to <8 x i1> 7528 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 7529 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W 7530 ret <2 x i64> %2 7531} 7532 7533define <2 x i64> @test_mm_maskz_rorv_epi64(i8 zeroext %__U, <2 x i64> %__A, <2 x i64> %__B) { 7534; X86-LABEL: test_mm_maskz_rorv_epi64: 7535; X86: # %bb.0: # %entry 7536; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7537; X86-NEXT: kmovw %eax, %k1 7538; X86-NEXT: vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z} 7539; X86-NEXT: retl 7540; 7541; X64-LABEL: test_mm_maskz_rorv_epi64: 7542; X64: # %bb.0: # %entry 7543; X64-NEXT: kmovw %edi, %k1 7544; X64-NEXT: vprorvq %xmm1, %xmm0, %xmm0 
{%k1} {z} 7545; X64-NEXT: retq 7546entry: 7547 %0 = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %__A, <2 x i64> %__A, <2 x i64> %__B) 7548 %1 = bitcast i8 %__U to <8 x i1> 7549 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1> 7550 %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer 7551 ret <2 x i64> %2 7552} 7553 7554define <4 x i64> @test_mm256_rorv_epi64(<4 x i64> %__A, <4 x i64> %__B) { 7555; CHECK-LABEL: test_mm256_rorv_epi64: 7556; CHECK: # %bb.0: # %entry 7557; CHECK-NEXT: vprorvq %ymm1, %ymm0, %ymm0 7558; CHECK-NEXT: ret{{[l|q]}} 7559entry: 7560 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B) 7561 ret <4 x i64> %0 7562} 7563 7564define <4 x i64> @test_mm256_mask_rorv_epi64(<4 x i64> %__W, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7565; X86-LABEL: test_mm256_mask_rorv_epi64: 7566; X86: # %bb.0: # %entry 7567; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7568; X86-NEXT: kmovw %eax, %k1 7569; X86-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1} 7570; X86-NEXT: retl 7571; 7572; X64-LABEL: test_mm256_mask_rorv_epi64: 7573; X64: # %bb.0: # %entry 7574; X64-NEXT: kmovw %edi, %k1 7575; X64-NEXT: vprorvq %ymm2, %ymm1, %ymm0 {%k1} 7576; X64-NEXT: retq 7577entry: 7578 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B) 7579 %1 = bitcast i8 %__U to <8 x i1> 7580 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7581 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W 7582 ret <4 x i64> %2 7583} 7584 7585define <4 x i64> @test_mm256_maskz_rorv_epi64(i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) { 7586; X86-LABEL: test_mm256_maskz_rorv_epi64: 7587; X86: # %bb.0: # %entry 7588; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7589; X86-NEXT: kmovw %eax, %k1 7590; X86-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z} 7591; X86-NEXT: retl 7592; 7593; X64-LABEL: test_mm256_maskz_rorv_epi64: 7594; X64: # %bb.0: 
# %entry 7595; X64-NEXT: kmovw %edi, %k1 7596; X64-NEXT: vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z} 7597; X64-NEXT: retq 7598entry: 7599 %0 = tail call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %__A, <4 x i64> %__A, <4 x i64> %__B) 7600 %1 = bitcast i8 %__U to <8 x i1> 7601 %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7602 %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer 7603 ret <4 x i64> %2 7604} 7605 7606declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) 7607declare <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32>) 7608declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double>, <4 x i32>, i8) 7609declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) 7610declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8) 7611declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) 7612declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8) 7613declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8) 7614declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) 7615declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) 7616declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8) 7617declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8) 7618declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8) 7619declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) 7620declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8) 7621declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8) 7622declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) 7623declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) 7624declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8) 7625declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8) 
7626declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8) 7627declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>) 7628declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>) 7629declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>) 7630declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>) 7631declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>) 7632declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>) 7633declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>) 7634declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>) 7635declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>) 7636declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>) 7637declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>) 7638declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>) 7639declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>) 7640declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>) 7641declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>) 7642declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>) 7643declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>) 7644declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>) 7645declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>) 7646declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>) 7647declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>) 7648declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>) 7649declare void 
@llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>) 7650declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>) 7651declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) 7652declare <8 x i32> @llvm.fshl.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) 7653declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) 7654declare <4 x i64> @llvm.fshl.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) 7655declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) 7656declare <8 x i32> @llvm.fshr.v8i32(<8 x i32>, <8 x i32>, <8 x i32>) 7657declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) 7658declare <4 x i64> @llvm.fshr.v4i64(<4 x i64>, <4 x i64>, <4 x i64>) 7659 7660!0 = !{i32 1} 7661