; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512f-builtins.c

; NOTE(review): Autogenerated llc/FileCheck regression test for AVX-512F
; intrinsic lowering under FastISel. The X86/X64 CHECK blocks are maintained
; by utils/update_llc_test_checks.py -- do not hand-edit assertion lines;
; regenerate them with that script instead. Comments below only group and
; describe the test cases; the IR mirrors clang's intrinsic expansions.

; _mm512_kunpackb: two <16 x i32> compares produce k-masks, low halves are
; concatenated with kunpckbw, and the result masks a third compare.
define zeroext i16 @test_mm512_kunpackb(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D, <8 x i64> %__E, <8 x i64> %__F) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_kunpackb:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vmovdqa64 136(%ebp), %zmm3
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    kunpckbw %k0, %k1, %k1
; X86-NEXT:    vpcmpneqd 72(%ebp), %zmm3, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kunpackb:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    kunpckbw %k0, %k1, %k1
; X64-NEXT:    vpcmpneqd %zmm5, %zmm4, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__E to <16 x i32>
  %1 = bitcast <8 x i64> %__F to <16 x i32>
  %2 = bitcast <8 x i64> %__A to <16 x i32>
  %3 = bitcast <8 x i64> %__B to <16 x i32>
  %4 = icmp ne <16 x i32> %2, %3
  %5 = bitcast <8 x i64> %__C to <16 x i32>
  %6 = bitcast <8 x i64> %__D to <16 x i32>
  %7 = icmp ne <16 x i32> %5, %6
  %8 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %9 = shufflevector <16 x i1> %7, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %10 = shufflevector <8 x i1> %8, <8 x i1> %9, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %11 = icmp ne <16 x i32> %0, %1
  %12 = and <16 x i1> %11, %10
  %13 = bitcast <16 x i1> %12 to i16
  ret i16 %13
}

; _mm512_kortestc: OR of two k-masks, then test for all-ones (-1).
define i32 @test_mm512_kortestc(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestc:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    korw %k0, %k1, %k0
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    cmpw $-1, %ax
; X86-NEXT:    sete %al
; X86-NEXT:    andb $1, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kortestc:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    korw %k0, %k1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    cmpw $-1, %ax
; X64-NEXT:    sete %al
; X64-NEXT:    andb $1, %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, -1
  %9 = zext i1 %8 to i32
  ret i32 %9
}

; _mm512_kortestz: OR of two k-masks, then test for all-zeros.
define i32 @test_mm512_kortestz(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, <8 x i64> %__D) {
; X86-LABEL: test_mm512_kortestz:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-64, %esp
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X86-NEXT:    vpcmpneqd 8(%ebp), %zmm2, %k1
; X86-NEXT:    korw %k0, %k1, %k0
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    cmpw $0, %ax
; X86-NEXT:    sete %al
; X86-NEXT:    andb $1, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_kortestz:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0
; X64-NEXT:    vpcmpneqd %zmm3, %zmm2, %k1
; X64-NEXT:    korw %k0, %k1, %k0
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    cmpw $0, %ax
; X64-NEXT:    sete %al
; X64-NEXT:    andb $1, %al
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = icmp ne <16 x i32> %0, %1
  %3 = bitcast <8 x i64> %__C to <16 x i32>
  %4 = bitcast <8 x i64> %__D to <16 x i32>
  %5 = icmp ne <16 x i32> %3, %4
  %6 = or <16 x i1> %5, %2
  %7 = bitcast <16 x i1> %6 to i16
  %8 = icmp eq i16 %7, 0
  %9 = zext i1 %8 to i32
  ret i32 %9
}

; 128-bit lane shuffles (_mm512_shuffle_{f32x4,f64x2,i32x4,i64x2}) in plain,
; merge-masked, and zero-masked forms.
define <16 x float> @test_mm512_shuffle_f32x4(<16 x float> %__A, <16 x float> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshuff32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  ret <16 x float> %shuffle
}


define <16 x float> @test_mm512_mask_shuffle_f32x4(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> %__W
  ret <16 x float> %1
}

define <16 x float> @test_mm512_maskz_shuffle_f32x4(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <16 x float> %__A, <16 x float> %__B, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
  %0 = bitcast i16 %__U to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %shuffle, <16 x float> zeroinitializer
  ret <16 x float> %1
}

define <8 x double> @test_mm512_shuffle_f64x2(<8 x double> %__A, <8 x double> %__B) {
; CHECK-LABEL: test_mm512_shuffle_f64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x double> %shuffle
}

define <8 x double> @test_mm512_mask_shuffle_f64x2(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> %__W
  ret <8 x double> %1
}

define <8 x double> @test_mm512_maskz_shuffle_f64x2(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) {
; X86-LABEL: test_mm512_maskz_shuffle_f64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_f64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x double> %__A, <8 x double> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %shuffle, <8 x double> zeroinitializer
  ret <8 x double> %1
}

; NOTE(review): unmasked i32x4 lowers to vshufi64x2 (equivalent lane move);
; the masked forms below need the i32-granular vshufi32x4 for merge semantics.
define <8 x i64> @test_mm512_shuffle_i32x4(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i32x4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i32x4(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3,4,5,6,7],zmm2[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast <8 x i64> %__W to <16 x i32>
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %0, <16 x i32> %1
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_maskz_shuffle_i32x4(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i32x4:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i32x4:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,7],zmm1[0,1,2,3,0,1,2,3]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast <8 x i64> %shuffle to <16 x i32>
  %1 = bitcast i16 %__U to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_shuffle_i64x2(<8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; CHECK-LABEL: test_mm512_shuffle_i64x2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  ret <8 x i64> %shuffle
}

define <8 x i64> @test_mm512_mask_shuffle_i64x2(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_mask_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,3],zmm2[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> %__W
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_shuffle_i64x2(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) local_unnamed_addr #0 {
; X86-LABEL: test_mm512_maskz_shuffle_i64x2:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_shuffle_i64x2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],zmm1[0,1,0,1]
; X64-NEXT:    retq
entry:
  %shuffle = shufflevector <8 x i64> %__A, <8 x i64> %__B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
  %0 = bitcast i8 %__U to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %shuffle, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}


; _mm512_[mask_]test[n]_epi{32,64}_mask: (A & B) compared against zero,
; lowering to vptestmd/vptestmq/vptestnmd/vptestnmq.
define zeroext i16 @test_mm512_testn_epi32_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi32_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmd %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzwl %ax, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast <16 x i1> %1 to i16
  ret i16 %2
}

define zeroext i16 @test_mm512_mask_testn_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp eq <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_testn_epi64_mask(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_testn_epi64_mask:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vptestnmq %zmm0, %zmm1, %k0
; CHECK-NEXT:    kmovw %k0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast <8 x i1> %0 to i8
  ret i8 %1
}

define zeroext i8 @test_mm512_mask_testn_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_testn_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_testn_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestnmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp eq <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

define zeroext i16 @test_mm512_mask_test_epi32_mask(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi32_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi32_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmd %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = bitcast <8 x i64> %and1.i.i to <16 x i32>
  %1 = icmp ne <16 x i32> %0, zeroinitializer
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = and <16 x i1> %1, %2
  %4 = bitcast <16 x i1> %3 to i16
  ret i16 %4
}

define zeroext i8 @test_mm512_mask_test_epi64_mask(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_test_epi64_mask:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
; X86-NEXT:    kmovw %k0, %eax
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_test_epi64_mask:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
; X64-NEXT:    kmovw %k0, %eax
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %and1.i.i = and <8 x i64> %__B, %__A
  %0 = icmp ne <8 x i64> %and1.i.i, zeroinitializer
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = and <8 x i1> %0, %1
  %3 = bitcast <8 x i1> %2 to i8
  ret i8 %3
}

; _mm512_{mask,maskz}_set1_epi{32,64}: masked scalar broadcast. On X86 the
; i64 variant must assemble the 64-bit scalar from two 32-bit stack words.
define <8 x i64> @test_mm512_mask_set1_epi32(<8 x i64> %__O, i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast <8 x i64> %__O to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %vecinit15.i.i, <16 x i32> %0
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_maskz_set1_epi32(i16 zeroext %__M, i32 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastd %eax, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %esi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <16 x i32> undef, i32 %__A, i32 0
  %vecinit15.i.i = shufflevector <16 x i32> %vecinit.i.i, <16 x i32> undef, <16 x i32> zeroinitializer
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x i32> %vecinit15.i.i, <16 x i32> zeroinitializer
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_mask_set1_epi64(<8 x i64> %__O, i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_mask_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> %__O
  ret <8 x i64> %1
}

define <8 x i64> @test_mm512_maskz_set1_epi64(i8 zeroext %__M, i64 %__A) {
; X86-LABEL: test_mm512_maskz_set1_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_set1_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %rsi, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %vecinit.i.i = insertelement <8 x i64> undef, i64 %__A, i32 0
  %vecinit7.i.i = shufflevector <8 x i64> %vecinit.i.i, <8 x i64> undef, <8 x i32> zeroinitializer
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %vecinit7.i.i, <8 x i64> zeroinitializer
  ret <8 x i64> %1
}


; Element broadcasts from an xmm source (_mm512_broadcast{d,q,sd,ss}); the
; unmasked forms fold to FP broadcast instructions (vbroadcastss/sd).
define <8 x i64> @test_mm512_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = bitcast <16 x i32> %res0 to <8 x i64>
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_mask_broadcastd_epi32(<8 x i64> %a0, i16 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastd_epi32:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastd_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg0 = bitcast <8 x i64> %a0 to <16 x i32>
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg2, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_maskz_broadcastd_epi32(i16 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastd_epi32:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastd_epi32:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res0 = shufflevector <4 x i32> %arg1, <4 x i32> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer
  %res2 = bitcast <16 x i32> %res1 to <8 x i64>
  ret <8 x i64> %res2
}

define <8 x i64> @test_mm512_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm512_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <8 x i32> zeroinitializer
  ret <8 x i64> %res
}

define <8 x i64> @test_mm512_mask_broadcastq_epi64(<8 x i64> %a0, i8 %a1, <2 x i64> %a2) {
; X86-LABEL: test_mm512_mask_broadcastq_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastq_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a2, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0
  ret <8 x i64> %res1
}

define <8 x i64> @test_mm512_maskz_broadcastq_epi64(i8 %a0, <2 x i64> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastq_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastq_epi64:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x i64> %a1, <2 x i64> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer
  ret <8 x i64> %res1
}

define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm512_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <8 x i32> zeroinitializer
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_broadcastsd_pd(<8 x double> %a0, i8 %a1, <2 x double> %a2) {
; X86-LABEL: test_mm512_mask_broadcastsd_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastsd_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <2 x double> %a2, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_broadcastsd_pd(i8 %a0, <2 x double> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastsd_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastsd_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <2 x double> %a1, <2 x double> undef, <8 x i32> zeroinitializer
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm512_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> zeroinitializer
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_broadcastss_ps(<16 x float> %a0, i16 %a1, <4 x float> %a2) {
; X86-LABEL: test_mm512_mask_broadcastss_ps:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_broadcastss_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm1, %zmm0 {%k1}
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <4 x float> %a2, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_broadcastss_ps(i16 %a0, <4 x float> %a1) {
; X86-LABEL: test_mm512_maskz_broadcastss_ps:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_broadcastss_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss %xmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <4 x float> %a1, <4 x float> undef, <16 x i32> zeroinitializer
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

; In-register duplication shuffles (_mm512_move{dup,hdup,ldup}_{pd,ps}),
; lowering to vmovddup / vmovshdup / vmovsldup.
define <8 x double> @test_mm512_movedup_pd(<8 x double> %a0) {
; CHECK-LABEL: test_mm512_movedup_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x double> %res
}

define <8 x double> @test_mm512_mask_movedup_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) {
; X86-LABEL: test_mm512_mask_movedup_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_movedup_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg1 = bitcast i8 %a1 to <8 x i1>
  %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0
  ret <8 x double> %res1
}

define <8 x double> @test_mm512_maskz_movedup_pd(i8 %a0, <8 x double> %a1) {
; X86-LABEL: test_mm512_maskz_movedup_pd:
; X86:       # %bb.0:
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_movedup_pd:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
; X64-NEXT:    retq
  %arg0 = bitcast i8 %a0 to <8 x i1>
  %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer
  ret <8 x double> %res1
}

define <16 x float> @test_mm512_movehdup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_movehdup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovshdup {{.*#+}} zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_movehdup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_movehdup_ps:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_movehdup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_maskz_movehdup_ps(i16 %a0, <16 x float> %a1) {
; X86-LABEL: test_mm512_maskz_movehdup_ps:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_movehdup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; X64-NEXT:    retq
  %arg0 = bitcast i16 %a0 to <16 x i1>
  %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer
  ret <16 x float> %res1
}

define <16 x float> @test_mm512_moveldup_ps(<16 x float> %a0) {
; CHECK-LABEL: test_mm512_moveldup_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovsldup {{.*#+}} zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  ret <16 x float> %res
}

define <16 x float> @test_mm512_mask_moveldup_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) {
; X86-LABEL: test_mm512_mask_moveldup_ps:
; X86:       # %bb.0:
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_moveldup_ps:
; X64:       # %bb.0:
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
; X64-NEXT:    retq
  %arg1 = bitcast i16 %a1 to <16 x i1>
  %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0
  ret <16 x float> %res1
}

; NOTE(review): the chunk is truncated mid-definition below ("define <16 x
; ..."); the remainder of this function is outside the visible source and has
; deliberately not been reconstructed.
define <16 x
float> @test_mm512_maskz_moveldup_ps(i16 %a0, <16 x float> %a1) { 907; X86-LABEL: test_mm512_maskz_moveldup_ps: 908; X86: # %bb.0: 909; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 910; X86-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 911; X86-NEXT: retl 912; 913; X64-LABEL: test_mm512_maskz_moveldup_ps: 914; X64: # %bb.0: 915; X64-NEXT: kmovw %edi, %k1 916; X64-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] 917; X64-NEXT: retq 918 %arg0 = bitcast i16 %a0 to <16 x i1> 919 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14> 920 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer 921 ret <16 x float> %res1 922} 923 924define <8 x double> @test_mm512_permute_pd(<8 x double> %a0) { 925; CHECK-LABEL: test_mm512_permute_pd: 926; CHECK: # %bb.0: 927; CHECK-NEXT: vpermilpd {{.*#+}} zmm0 = zmm0[0,1,2,2,4,4,6,6] 928; CHECK-NEXT: ret{{[l|q]}} 929 %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 930 ret <8 x double> %res 931} 932 933define <8 x double> @test_mm512_mask_permute_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) { 934; X86-LABEL: test_mm512_mask_permute_pd: 935; X86: # %bb.0: 936; X86-NEXT: movb {{[0-9]+}}(%esp), %al 937; X86-NEXT: kmovw %eax, %k1 938; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6] 939; X86-NEXT: retl 940; 941; X64-LABEL: test_mm512_mask_permute_pd: 942; X64: # %bb.0: 943; X64-NEXT: kmovw %edi, %k1 944; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} = zmm1[0,1,2,2,4,4,6,6] 945; X64-NEXT: retq 946 %arg1 = bitcast i8 %a1 to <8 x i1> 947 %res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 948 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> 
%a0 949 ret <8 x double> %res1 950} 951 952define <8 x double> @test_mm512_maskz_permute_pd(i8 %a0, <8 x double> %a1) { 953; X86-LABEL: test_mm512_maskz_permute_pd: 954; X86: # %bb.0: 955; X86-NEXT: movb {{[0-9]+}}(%esp), %al 956; X86-NEXT: kmovw %eax, %k1 957; X86-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6] 958; X86-NEXT: retl 959; 960; X64-LABEL: test_mm512_maskz_permute_pd: 961; X64: # %bb.0: 962; X64-NEXT: kmovw %edi, %k1 963; X64-NEXT: vpermilpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,2,4,4,6,6] 964; X64-NEXT: retq 965 %arg0 = bitcast i8 %a0 to <8 x i1> 966 %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6> 967 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer 968 ret <8 x double> %res1 969} 970 971define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) { 972; CHECK-LABEL: test_mm512_permute_ps: 973; CHECK: # %bb.0: 974; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] 975; CHECK-NEXT: ret{{[l|q]}} 976 %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12> 977 ret <16 x float> %res 978} 979 980define <16 x float> @test_mm512_mask_permute_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2) { 981; X86-LABEL: test_mm512_mask_permute_ps: 982; X86: # %bb.0: 983; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 984; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] 985; X86-NEXT: retl 986; 987; X64-LABEL: test_mm512_mask_permute_ps: 988; X64: # %bb.0: 989; X64-NEXT: kmovw %edi, %k1 990; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} = zmm1[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] 991; X64-NEXT: retq 992 %arg1 = bitcast i16 %a1 to <16 x i1> 993 %res0 = shufflevector <16 x float> %a2, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, 
i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12> 994 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0 995 ret <16 x float> %res1 996} 997 998define <16 x float> @test_mm512_maskz_permute_ps(i16 %a0, <16 x float> %a1) { 999; X86-LABEL: test_mm512_maskz_permute_ps: 1000; X86: # %bb.0: 1001; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1002; X86-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] 1003; X86-NEXT: retl 1004; 1005; X64-LABEL: test_mm512_maskz_permute_ps: 1006; X64: # %bb.0: 1007; X64-NEXT: kmovw %edi, %k1 1008; X64-NEXT: vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] 1009; X64-NEXT: retq 1010 %arg0 = bitcast i16 %a0 to <16 x i1> 1011 %res0 = shufflevector <16 x float> %a1, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12> 1012 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer 1013 ret <16 x float> %res1 1014} 1015 1016define <8 x i64> @test_mm512_permutex_epi64(<8 x i64> %a0) { 1017; CHECK-LABEL: test_mm512_permutex_epi64: 1018; CHECK: # %bb.0: 1019; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4] 1020; CHECK-NEXT: ret{{[l|q]}} 1021 %res = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> 1022 ret <8 x i64> %res 1023} 1024 1025define <8 x i64> @test_mm512_mask_permutex_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2) { 1026; X86-LABEL: test_mm512_mask_permutex_epi64: 1027; X86: # %bb.0: 1028; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1029; X86-NEXT: kmovw %eax, %k1 1030; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4] 1031; X86-NEXT: retl 1032; 1033; X64-LABEL: test_mm512_mask_permutex_epi64: 1034; X64: # %bb.0: 1035; X64-NEXT: kmovw %edi, %k1 1036; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4] 1037; X64-NEXT: retq 1038 %arg1 = bitcast i8 %a1 to <8 
x i1> 1039 %res0 = shufflevector <8 x i64> %a2, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> 1040 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0 1041 ret <8 x i64> %res1 1042} 1043 1044define <8 x i64> @test_mm512_maskz_permutex_epi64(i8 %a0, <8 x i64> %a1) { 1045; X86-LABEL: test_mm512_maskz_permutex_epi64: 1046; X86: # %bb.0: 1047; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1048; X86-NEXT: kmovw %eax, %k1 1049; X86-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] 1050; X86-NEXT: retl 1051; 1052; X64-LABEL: test_mm512_maskz_permutex_epi64: 1053; X64: # %bb.0: 1054; X64-NEXT: kmovw %edi, %k1 1055; X64-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] 1056; X64-NEXT: retq 1057 %arg0 = bitcast i8 %a0 to <8 x i1> 1058 %res0 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> 1059 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer 1060 ret <8 x i64> %res1 1061} 1062 1063define <8 x double> @test_mm512_permutex_pd(<8 x double> %a0) { 1064; CHECK-LABEL: test_mm512_permutex_pd: 1065; CHECK: # %bb.0: 1066; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[0,0,0,0,4,4,4,4] 1067; CHECK-NEXT: ret{{[l|q]}} 1068 %res = shufflevector <8 x double> %a0, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> 1069 ret <8 x double> %res 1070} 1071 1072define <8 x double> @test_mm512_mask_permutex_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2) { 1073; X86-LABEL: test_mm512_mask_permutex_pd: 1074; X86: # %bb.0: 1075; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1076; X86-NEXT: kmovw %eax, %k1 1077; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4] 1078; X86-NEXT: retl 1079; 1080; X64-LABEL: test_mm512_mask_permutex_pd: 1081; X64: # %bb.0: 1082; X64-NEXT: kmovw %edi, %k1 1083; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = zmm1[0,0,0,0,4,4,4,4] 1084; X64-NEXT: retq 1085 %arg1 = bitcast i8 %a1 to <8 x i1> 1086 
%res0 = shufflevector <8 x double> %a2, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> 1087 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0 1088 ret <8 x double> %res1 1089} 1090 1091define <8 x double> @test_mm512_maskz_permutex_pd(i8 %a0, <8 x double> %a1) { 1092; X86-LABEL: test_mm512_maskz_permutex_pd: 1093; X86: # %bb.0: 1094; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1095; X86-NEXT: kmovw %eax, %k1 1096; X86-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] 1097; X86-NEXT: retl 1098; 1099; X64-LABEL: test_mm512_maskz_permutex_pd: 1100; X64: # %bb.0: 1101; X64-NEXT: kmovw %edi, %k1 1102; X64-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] 1103; X64-NEXT: retq 1104 %arg0 = bitcast i8 %a0 to <8 x i1> 1105 %res0 = shufflevector <8 x double> %a1, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4> 1106 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer 1107 ret <8 x double> %res1 1108} 1109 1110define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) { 1111; CHECK-LABEL: test_mm512_shuffle_epi32: 1112; CHECK: # %bb.0: 1113; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] 1114; CHECK-NEXT: ret{{[l|q]}} 1115 %arg0 = bitcast <8 x i64> %a0 to <16 x i32> 1116 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12> 1117 %res1 = bitcast <16 x i32> %res0 to <8 x i64> 1118 ret <8 x i64> %res1 1119} 1120 1121define <8 x i64> @test_mm512_mask_shuffle_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2) { 1122; X86-LABEL: test_mm512_mask_shuffle_epi32: 1123; X86: # %bb.0: 1124; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1125; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] 1126; X86-NEXT: retl 1127; 1128; X64-LABEL: test_mm512_mask_shuffle_epi32: 1129; 
X64: # %bb.0: 1130; X64-NEXT: kmovw %edi, %k1 1131; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = zmm1[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] 1132; X64-NEXT: retq 1133 %arg0 = bitcast <8 x i64> %a0 to <16 x i32> 1134 %arg1 = bitcast i16 %a1 to <16 x i1> 1135 %arg2 = bitcast <8 x i64> %a2 to <16 x i32> 1136 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12> 1137 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0 1138 %res2 = bitcast <16 x i32> %res1 to <8 x i64> 1139 ret <8 x i64> %res2 1140} 1141 1142define <8 x i64> @test_mm512_maskz_shuffle_epi32(i16 %a0, <8 x i64> %a1) { 1143; X86-LABEL: test_mm512_maskz_shuffle_epi32: 1144; X86: # %bb.0: 1145; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1146; X86-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] 1147; X86-NEXT: retl 1148; 1149; X64-LABEL: test_mm512_maskz_shuffle_epi32: 1150; X64: # %bb.0: 1151; X64-NEXT: kmovw %edi, %k1 1152; X64-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] 1153; X64-NEXT: retq 1154 %arg0 = bitcast i16 %a0 to <16 x i1> 1155 %arg1 = bitcast <8 x i64> %a1 to <16 x i32> 1156 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12> 1157 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer 1158 %res2 = bitcast <16 x i32> %res1 to <8 x i64> 1159 ret <8 x i64> %res2 1160} 1161 1162define <8 x double> @test_mm512_shuffle_pd(<8 x double> %a0, <8 x double> %a1) { 1163; CHECK-LABEL: test_mm512_shuffle_pd: 1164; CHECK: # %bb.0: 1165; CHECK-NEXT: vshufpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1166; CHECK-NEXT: ret{{[l|q]}} 1167 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 
3, i32 10, i32 4, i32 12, i32 6, i32 14> 1168 ret <8 x double> %res 1169} 1170 1171define <8 x double> @test_mm512_mask_shuffle_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) { 1172; X86-LABEL: test_mm512_mask_shuffle_pd: 1173; X86: # %bb.0: 1174; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1175; X86-NEXT: kmovw %eax, %k1 1176; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1177; X86-NEXT: retl 1178; 1179; X64-LABEL: test_mm512_mask_shuffle_pd: 1180; X64: # %bb.0: 1181; X64-NEXT: kmovw %edi, %k1 1182; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[3],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1183; X64-NEXT: retq 1184 %arg1 = bitcast i8 %a1 to <8 x i1> 1185 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14> 1186 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0 1187 ret <8 x double> %res1 1188} 1189 1190define <8 x double> @test_mm512_maskz_shuffle_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) { 1191; X86-LABEL: test_mm512_maskz_shuffle_pd: 1192; X86: # %bb.0: 1193; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1194; X86-NEXT: kmovw %eax, %k1 1195; X86-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1196; X86-NEXT: retl 1197; 1198; X64-LABEL: test_mm512_maskz_shuffle_pd: 1199; X64: # %bb.0: 1200; X64-NEXT: kmovw %edi, %k1 1201; X64-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[3],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1202; X64-NEXT: retq 1203 %arg0 = bitcast i8 %a0 to <8 x i1> 1204 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14> 1205 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer 1206 ret <8 x double> %res1 1207} 1208 1209define <8 x i64> @test_mm512_unpackhi_epi32(<8 x i64> %a0, <8 x i64> %a1) { 1210; CHECK-LABEL: 
test_mm512_unpackhi_epi32: 1211; CHECK: # %bb.0: 1212; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 1213; CHECK-NEXT: ret{{[l|q]}} 1214 %arg0 = bitcast <8 x i64> %a0 to <16 x i32> 1215 %arg1 = bitcast <8 x i64> %a1 to <16 x i32> 1216 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> 1217 %res1 = bitcast <16 x i32> %res0 to <8 x i64> 1218 ret <8 x i64> %res1 1219} 1220 1221define <8 x i64> @test_mm512_mask_unpackhi_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) { 1222; X86-LABEL: test_mm512_mask_unpackhi_epi32: 1223; X86: # %bb.0: 1224; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1225; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] 1226; X86-NEXT: retl 1227; 1228; X64-LABEL: test_mm512_mask_unpackhi_epi32: 1229; X64: # %bb.0: 1230; X64-NEXT: kmovw %edi, %k1 1231; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] 1232; X64-NEXT: retq 1233 %arg0 = bitcast <8 x i64> %a0 to <16 x i32> 1234 %arg1 = bitcast i16 %a1 to <16 x i1> 1235 %arg2 = bitcast <8 x i64> %a2 to <16 x i32> 1236 %arg3 = bitcast <8 x i64> %a3 to <16 x i32> 1237 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> 1238 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0 1239 %res2 = bitcast <16 x i32> %res1 to <8 x i64> 1240 ret <8 x i64> %res2 1241} 1242 1243define <8 x i64> @test_mm512_maskz_unpackhi_epi32(i16 %a0, <8 x 
i64> %a1, <8 x i64> %a2) { 1244; X86-LABEL: test_mm512_maskz_unpackhi_epi32: 1245; X86: # %bb.0: 1246; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1247; X86-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 1248; X86-NEXT: retl 1249; 1250; X64-LABEL: test_mm512_maskz_unpackhi_epi32: 1251; X64: # %bb.0: 1252; X64-NEXT: kmovw %edi, %k1 1253; X64-NEXT: vpunpckhdq {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 1254; X64-NEXT: retq 1255 %arg0 = bitcast i16 %a0 to <16 x i1> 1256 %arg1 = bitcast <8 x i64> %a1 to <16 x i32> 1257 %arg2 = bitcast <8 x i64> %a2 to <16 x i32> 1258 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> 1259 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer 1260 %res2 = bitcast <16 x i32> %res1 to <8 x i64> 1261 ret <8 x i64> %res2 1262} 1263 1264define <8 x i64> @test_mm512_unpackhi_epi64(<8 x i64> %a0, <8 x i64> %a1) { 1265; CHECK-LABEL: test_mm512_unpackhi_epi64: 1266; CHECK: # %bb.0: 1267; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 1268; CHECK-NEXT: ret{{[l|q]}} 1269 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 1270 ret <8 x i64> %res 1271} 1272 1273define <8 x i64> @test_mm512_mask_unpackhi_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) { 1274; X86-LABEL: test_mm512_mask_unpackhi_epi64: 1275; X86: # %bb.0: 1276; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1277; X86-NEXT: kmovw %eax, %k1 1278; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7] 1279; 
X86-NEXT: retl 1280; 1281; X64-LABEL: test_mm512_mask_unpackhi_epi64: 1282; X64: # %bb.0: 1283; X64-NEXT: kmovw %edi, %k1 1284; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7] 1285; X64-NEXT: retq 1286 %arg1 = bitcast i8 %a1 to <8 x i1> 1287 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 1288 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0 1289 ret <8 x i64> %res1 1290} 1291 1292define <8 x i64> @test_mm512_maskz_unpackhi_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) { 1293; X86-LABEL: test_mm512_maskz_unpackhi_epi64: 1294; X86: # %bb.0: 1295; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1296; X86-NEXT: kmovw %eax, %k1 1297; X86-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 1298; X86-NEXT: retl 1299; 1300; X64-LABEL: test_mm512_maskz_unpackhi_epi64: 1301; X64: # %bb.0: 1302; X64-NEXT: kmovw %edi, %k1 1303; X64-NEXT: vpunpckhqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 1304; X64-NEXT: retq 1305 %arg0 = bitcast i8 %a0 to <8 x i1> 1306 %res0 = shufflevector <8 x i64> %a1, <8 x i64> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 1307 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer 1308 ret <8 x i64> %res1 1309} 1310 1311define <8 x double> @test_mm512_unpackhi_pd(<8 x double> %a0, <8 x double> %a1) { 1312; CHECK-LABEL: test_mm512_unpackhi_pd: 1313; CHECK: # %bb.0: 1314; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 1315; CHECK-NEXT: ret{{[l|q]}} 1316 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 1317 ret <8 x double> %res 1318} 1319 1320define <8 x double> @test_mm512_mask_unpackhi_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x 
double> %a3) { 1321; X86-LABEL: test_mm512_mask_unpackhi_pd: 1322; X86: # %bb.0: 1323; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1324; X86-NEXT: kmovw %eax, %k1 1325; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7] 1326; X86-NEXT: retl 1327; 1328; X64-LABEL: test_mm512_mask_unpackhi_pd: 1329; X64: # %bb.0: 1330; X64-NEXT: kmovw %edi, %k1 1331; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} = zmm1[1],zmm2[1],zmm1[3],zmm2[3],zmm1[5],zmm2[5],zmm1[7],zmm2[7] 1332; X64-NEXT: retq 1333 %arg1 = bitcast i8 %a1 to <8 x i1> 1334 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 1335 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0 1336 ret <8 x double> %res1 1337} 1338 1339define <8 x double> @test_mm512_maskz_unpackhi_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) { 1340; X86-LABEL: test_mm512_maskz_unpackhi_pd: 1341; X86: # %bb.0: 1342; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1343; X86-NEXT: kmovw %eax, %k1 1344; X86-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 1345; X86-NEXT: retl 1346; 1347; X64-LABEL: test_mm512_maskz_unpackhi_pd: 1348; X64: # %bb.0: 1349; X64-NEXT: kmovw %edi, %k1 1350; X64-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] 1351; X64-NEXT: retq 1352 %arg0 = bitcast i8 %a0 to <8 x i1> 1353 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15> 1354 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer 1355 ret <8 x double> %res1 1356} 1357 1358define <16 x float> @test_mm512_unpackhi_ps(<16 x float> %a0, <16 x float> %a1) { 1359; CHECK-LABEL: test_mm512_unpackhi_ps: 1360; CHECK: # %bb.0: 1361; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = 
zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 1362; CHECK-NEXT: ret{{[l|q]}} 1363 %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> 1364 ret <16 x float> %res 1365} 1366 1367define <16 x float> @test_mm512_mask_unpackhi_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) { 1368; X86-LABEL: test_mm512_mask_unpackhi_ps: 1369; X86: # %bb.0: 1370; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1371; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] 1372; X86-NEXT: retl 1373; 1374; X64-LABEL: test_mm512_mask_unpackhi_ps: 1375; X64: # %bb.0: 1376; X64-NEXT: kmovw %edi, %k1 1377; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} = zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[14],zmm2[14],zmm1[15],zmm2[15] 1378; X64-NEXT: retq 1379 %arg1 = bitcast i16 %a1 to <16 x i1> 1380 %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> 1381 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0 1382 ret <16 x float> %res1 1383} 1384 1385define <16 x float> @test_mm512_maskz_unpackhi_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) { 1386; X86-LABEL: test_mm512_maskz_unpackhi_ps: 1387; X86: # %bb.0: 1388; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1389; X86-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 1390; X86-NEXT: retl 1391; 1392; X64-LABEL: test_mm512_maskz_unpackhi_ps: 1393; X64: # %bb.0: 1394; 
X64-NEXT: kmovw %edi, %k1 1395; X64-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] 1396; X64-NEXT: retq 1397 %arg0 = bitcast i16 %a0 to <16 x i1> 1398 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31> 1399 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer 1400 ret <16 x float> %res1 1401} 1402 1403define <8 x i64> @test_mm512_unpacklo_epi32(<8 x i64> %a0, <8 x i64> %a1) { 1404; CHECK-LABEL: test_mm512_unpacklo_epi32: 1405; CHECK: # %bb.0: 1406; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1407; CHECK-NEXT: ret{{[l|q]}} 1408 %arg0 = bitcast <8 x i64> %a0 to <16 x i32> 1409 %arg1 = bitcast <8 x i64> %a1 to <16 x i32> 1410 %res0 = shufflevector <16 x i32> %arg0, <16 x i32> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1411 %res1 = bitcast <16 x i32> %res0 to <8 x i64> 1412 ret <8 x i64> %res1 1413} 1414 1415define <8 x i64> @test_mm512_mask_unpacklo_epi32(<8 x i64> %a0, i16 %a1, <8 x i64> %a2, <8 x i64> %a3) { 1416; X86-LABEL: test_mm512_mask_unpacklo_epi32: 1417; X86: # %bb.0: 1418; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1419; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] 1420; X86-NEXT: retl 1421; 1422; X64-LABEL: test_mm512_mask_unpacklo_epi32: 1423; X64: # %bb.0: 1424; X64-NEXT: kmovw %edi, %k1 1425; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} = 
zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] 1426; X64-NEXT: retq 1427 %arg0 = bitcast <8 x i64> %a0 to <16 x i32> 1428 %arg1 = bitcast i16 %a1 to <16 x i1> 1429 %arg2 = bitcast <8 x i64> %a2 to <16 x i32> 1430 %arg3 = bitcast <8 x i64> %a3 to <16 x i32> 1431 %res0 = shufflevector <16 x i32> %arg2, <16 x i32> %arg3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1432 %res1 = select <16 x i1> %arg1, <16 x i32> %res0, <16 x i32> %arg0 1433 %res2 = bitcast <16 x i32> %res1 to <8 x i64> 1434 ret <8 x i64> %res2 1435} 1436 1437define <8 x i64> @test_mm512_maskz_unpacklo_epi32(i16 %a0, <8 x i64> %a1, <8 x i64> %a2) { 1438; X86-LABEL: test_mm512_maskz_unpacklo_epi32: 1439; X86: # %bb.0: 1440; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1441; X86-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1442; X86-NEXT: retl 1443; 1444; X64-LABEL: test_mm512_maskz_unpacklo_epi32: 1445; X64: # %bb.0: 1446; X64-NEXT: kmovw %edi, %k1 1447; X64-NEXT: vpunpckldq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1448; X64-NEXT: retq 1449 %arg0 = bitcast i16 %a0 to <16 x i1> 1450 %arg1 = bitcast <8 x i64> %a1 to <16 x i32> 1451 %arg2 = bitcast <8 x i64> %a2 to <16 x i32> 1452 %res0 = shufflevector <16 x i32> %arg1, <16 x i32> %arg2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1453 %res1 = select <16 x i1> %arg0, <16 x i32> %res0, <16 x i32> zeroinitializer 1454 %res2 = bitcast <16 x i32> %res1 to <8 x i64> 1455 ret <8 x i64> %res2 1456} 1457 1458define <8 x i64> @test_mm512_unpacklo_epi64(<8 x i64> %a0, <8 
x i64> %a1) { 1459; CHECK-LABEL: test_mm512_unpacklo_epi64: 1460; CHECK: # %bb.0: 1461; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1462; CHECK-NEXT: ret{{[l|q]}} 1463 %res = shufflevector <8 x i64> %a0, <8 x i64> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1464 ret <8 x i64> %res 1465} 1466 1467define <8 x i64> @test_mm512_mask_unpacklo_epi64(<8 x i64> %a0, i8 %a1, <8 x i64> %a2, <8 x i64> %a3) { 1468; X86-LABEL: test_mm512_mask_unpacklo_epi64: 1469; X86: # %bb.0: 1470; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1471; X86-NEXT: kmovw %eax, %k1 1472; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1473; X86-NEXT: retl 1474; 1475; X64-LABEL: test_mm512_mask_unpacklo_epi64: 1476; X64: # %bb.0: 1477; X64-NEXT: kmovw %edi, %k1 1478; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1479; X64-NEXT: retq 1480 %arg1 = bitcast i8 %a1 to <8 x i1> 1481 %res0 = shufflevector <8 x i64> %a2, <8 x i64> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1482 %res1 = select <8 x i1> %arg1, <8 x i64> %res0, <8 x i64> %a0 1483 ret <8 x i64> %res1 1484} 1485 1486define <8 x i64> @test_mm512_maskz_unpacklo_epi64(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) { 1487; X86-LABEL: test_mm512_maskz_unpacklo_epi64: 1488; X86: # %bb.0: 1489; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1490; X86-NEXT: kmovw %eax, %k1 1491; X86-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1492; X86-NEXT: retl 1493; 1494; X64-LABEL: test_mm512_maskz_unpacklo_epi64: 1495; X64: # %bb.0: 1496; X64-NEXT: kmovw %edi, %k1 1497; X64-NEXT: vpunpcklqdq {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1498; X64-NEXT: retq 1499 %arg0 = bitcast i8 %a0 to <8 x i1> 1500 %res0 = shufflevector <8 x i64> %a1, <8 x i64> 
%a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1501 %res1 = select <8 x i1> %arg0, <8 x i64> %res0, <8 x i64> zeroinitializer 1502 ret <8 x i64> %res1 1503} 1504 1505define <8 x double> @test_mm512_unpacklo_pd(<8 x double> %a0, <8 x double> %a1) { 1506; CHECK-LABEL: test_mm512_unpacklo_pd: 1507; CHECK: # %bb.0: 1508; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1509; CHECK-NEXT: ret{{[l|q]}} 1510 %res = shufflevector <8 x double> %a0, <8 x double> %a1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1511 ret <8 x double> %res 1512} 1513 1514define <8 x double> @test_mm512_mask_unpacklo_pd(<8 x double> %a0, i8 %a1, <8 x double> %a2, <8 x double> %a3) { 1515; X86-LABEL: test_mm512_mask_unpacklo_pd: 1516; X86: # %bb.0: 1517; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1518; X86-NEXT: kmovw %eax, %k1 1519; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1520; X86-NEXT: retl 1521; 1522; X64-LABEL: test_mm512_mask_unpacklo_pd: 1523; X64: # %bb.0: 1524; X64-NEXT: kmovw %edi, %k1 1525; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[2],zmm2[2],zmm1[4],zmm2[4],zmm1[6],zmm2[6] 1526; X64-NEXT: retq 1527 %arg1 = bitcast i8 %a1 to <8 x i1> 1528 %res0 = shufflevector <8 x double> %a2, <8 x double> %a3, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1529 %res1 = select <8 x i1> %arg1, <8 x double> %res0, <8 x double> %a0 1530 ret <8 x double> %res1 1531} 1532 1533define <8 x double> @test_mm512_maskz_unpacklo_pd(i8 %a0, <8 x double> %a1, <8 x double> %a2) { 1534; X86-LABEL: test_mm512_maskz_unpacklo_pd: 1535; X86: # %bb.0: 1536; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1537; X86-NEXT: kmovw %eax, %k1 1538; X86-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1539; X86-NEXT: retl 1540; 1541; X64-LABEL: test_mm512_maskz_unpacklo_pd: 
1542; X64: # %bb.0: 1543; X64-NEXT: kmovw %edi, %k1 1544; X64-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] 1545; X64-NEXT: retq 1546 %arg0 = bitcast i8 %a0 to <8 x i1> 1547 %res0 = shufflevector <8 x double> %a1, <8 x double> %a2, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14> 1548 %res1 = select <8 x i1> %arg0, <8 x double> %res0, <8 x double> zeroinitializer 1549 ret <8 x double> %res1 1550} 1551 1552define <16 x float> @test_mm512_unpacklo_ps(<16 x float> %a0, <16 x float> %a1) { 1553; CHECK-LABEL: test_mm512_unpacklo_ps: 1554; CHECK: # %bb.0: 1555; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1556; CHECK-NEXT: ret{{[l|q]}} 1557 %res = shufflevector <16 x float> %a0, <16 x float> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1558 ret <16 x float> %res 1559} 1560 1561define <16 x float> @test_mm512_mask_unpacklo_ps(<16 x float> %a0, i16 %a1, <16 x float> %a2, <16 x float> %a3) { 1562; X86-LABEL: test_mm512_mask_unpacklo_ps: 1563; X86: # %bb.0: 1564; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1565; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] 1566; X86-NEXT: retl 1567; 1568; X64-LABEL: test_mm512_mask_unpacklo_ps: 1569; X64: # %bb.0: 1570; X64-NEXT: kmovw %edi, %k1 1571; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[12],zmm2[12],zmm1[13],zmm2[13] 1572; X64-NEXT: retq 1573 %arg1 = bitcast i16 %a1 to <16 x i1> 1574 %res0 = shufflevector <16 x float> %a2, <16 x float> %a3, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 
24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1575 %res1 = select <16 x i1> %arg1, <16 x float> %res0, <16 x float> %a0 1576 ret <16 x float> %res1 1577} 1578 1579define <16 x float> @test_mm512_maskz_unpacklo_ps(i16 %a0, <16 x float> %a1, <16 x float> %a2) { 1580; X86-LABEL: test_mm512_maskz_unpacklo_ps: 1581; X86: # %bb.0: 1582; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1583; X86-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1584; X86-NEXT: retl 1585; 1586; X64-LABEL: test_mm512_maskz_unpacklo_ps: 1587; X64: # %bb.0: 1588; X64-NEXT: kmovw %edi, %k1 1589; X64-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] 1590; X64-NEXT: retq 1591 %arg0 = bitcast i16 %a0 to <16 x i1> 1592 %res0 = shufflevector <16 x float> %a1, <16 x float> %a2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29> 1593 %res1 = select <16 x i1> %arg0, <16 x float> %res0, <16 x float> zeroinitializer 1594 ret <16 x float> %res1 1595} 1596 1597define <8 x double> @test_mm512_zextpd128_pd512(<2 x double> %a0) nounwind { 1598; CHECK-LABEL: test_mm512_zextpd128_pd512: 1599; CHECK: # %bb.0: 1600; CHECK-NEXT: vmovaps %xmm0, %xmm0 1601; CHECK-NEXT: ret{{[l|q]}} 1602 %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 1603 ret <8 x double> %res 1604} 1605 1606define <8 x double> @test_mm512_zextpd256_pd512(<4 x double> %a0) nounwind { 1607; CHECK-LABEL: test_mm512_zextpd256_pd512: 1608; CHECK: # %bb.0: 1609; CHECK-NEXT: vmovaps %ymm0, %ymm0 1610; CHECK-NEXT: ret{{[l|q]}} 1611 %res = shufflevector <4 x double> %a0, <4 x double> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 
5, i32 6, i32 7> 1612 ret <8 x double> %res 1613} 1614 1615define <16 x float> @test_mm512_zextps128_ps512(<4 x float> %a0) nounwind { 1616; CHECK-LABEL: test_mm512_zextps128_ps512: 1617; CHECK: # %bb.0: 1618; CHECK-NEXT: vmovaps %xmm0, %xmm0 1619; CHECK-NEXT: ret{{[l|q]}} 1620 %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7> 1621 ret <16 x float> %res 1622} 1623 1624define <16 x float> @test_mm512_zextps256_ps512(<8 x float> %a0) nounwind { 1625; CHECK-LABEL: test_mm512_zextps256_ps512: 1626; CHECK: # %bb.0: 1627; CHECK-NEXT: vmovaps %ymm0, %ymm0 1628; CHECK-NEXT: ret{{[l|q]}} 1629 %res = shufflevector <8 x float> %a0, <8 x float> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1630 ret <16 x float> %res 1631} 1632 1633define <8 x i64> @test_mm512_zextsi128_si512(<2 x i64> %a0) nounwind { 1634; CHECK-LABEL: test_mm512_zextsi128_si512: 1635; CHECK: # %bb.0: 1636; CHECK-NEXT: vmovaps %xmm0, %xmm0 1637; CHECK-NEXT: ret{{[l|q]}} 1638 %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3> 1639 ret <8 x i64> %res 1640} 1641 1642define <8 x i64> @test_mm512_zextsi256_si512(<4 x i64> %a0) nounwind { 1643; CHECK-LABEL: test_mm512_zextsi256_si512: 1644; CHECK: # %bb.0: 1645; CHECK-NEXT: vmovaps %ymm0, %ymm0 1646; CHECK-NEXT: ret{{[l|q]}} 1647 %res = shufflevector <4 x i64> %a0, <4 x i64> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1648 ret <8 x i64> %res 1649} 1650 1651define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind { 1652; CHECK-LABEL: test_mm512_mul_epi32: 1653; CHECK: # %bb.0: 1654; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0 1655; CHECK-NEXT: vpsraq $32, %zmm0, %zmm0 1656; CHECK-NEXT: vpsllq $32, 
%zmm1, %zmm1 1657; CHECK-NEXT: vpsraq $32, %zmm1, %zmm1 1658; CHECK-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 1659; CHECK-NEXT: ret{{[l|q]}} 1660 %tmp = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1661 %tmp1 = ashr exact <8 x i64> %tmp, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1662 %tmp2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1663 %tmp3 = ashr exact <8 x i64> %tmp2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1664 %tmp4 = mul nsw <8 x i64> %tmp3, %tmp1 1665 ret <8 x i64> %tmp4 1666} 1667 1668define <8 x i64> @test_mm512_maskz_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind { 1669; X86-LABEL: test_mm512_maskz_mul_epi32: 1670; X86: # %bb.0: # %entry 1671; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1672; X86-NEXT: kmovw %eax, %k1 1673; X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z} 1674; X86-NEXT: retl 1675; 1676; X64-LABEL: test_mm512_maskz_mul_epi32: 1677; X64: # %bb.0: # %entry 1678; X64-NEXT: kmovw %edi, %k1 1679; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm0 {%k1} {z} 1680; X64-NEXT: retq 1681entry: 1682 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1683 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1684 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1685 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1686 %4 = mul nsw <8 x i64> %3, %1 1687 %5 = bitcast i8 %__k to <8 x i1> 1688 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer 1689 ret <8 x i64> %6 1690} 1691 1692define <8 x i64> @test_mm512_mask_mul_epi32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind { 1693; X86-LABEL: test_mm512_mask_mul_epi32: 1694; X86: # %bb.0: # %entry 1695; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1696; X86-NEXT: kmovw %eax, %k1 1697; 
X86-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1} 1698; X86-NEXT: vmovdqa64 %zmm2, %zmm0 1699; X86-NEXT: retl 1700; 1701; X64-LABEL: test_mm512_mask_mul_epi32: 1702; X64: # %bb.0: # %entry 1703; X64-NEXT: kmovw %edi, %k1 1704; X64-NEXT: vpmuldq %zmm0, %zmm1, %zmm2 {%k1} 1705; X64-NEXT: vmovdqa64 %zmm2, %zmm0 1706; X64-NEXT: retq 1707entry: 1708 %0 = shl <8 x i64> %__A, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1709 %1 = ashr exact <8 x i64> %0, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1710 %2 = shl <8 x i64> %__B, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1711 %3 = ashr exact <8 x i64> %2, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> 1712 %4 = mul nsw <8 x i64> %3, %1 1713 %5 = bitcast i8 %__k to <8 x i1> 1714 %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %__src 1715 ret <8 x i64> %6 1716} 1717 1718define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind { 1719; CHECK-LABEL: test_mm512_mul_epu32: 1720; CHECK: # %bb.0: 1721; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA 1722; CHECK-NEXT: kmovw %eax, %k0 1723; CHECK-NEXT: knotw %k0, %k1 1724; CHECK-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} 1725; CHECK-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z} 1726; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 1727; CHECK-NEXT: ret{{[l|q]}} 1728 %tmp = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1729 %tmp1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1730 %tmp2 = mul nuw <8 x i64> %tmp1, %tmp 1731 ret <8 x i64> %tmp2 1732} 1733 1734define <8 x i64> @test_mm512_maskz_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B) nounwind { 1735; X86-LABEL: test_mm512_maskz_mul_epu32: 1736; X86: # %bb.0: # %entry 1737; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1738; X86-NEXT: kmovw %eax, %k1 
1739; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z} 1740; X86-NEXT: retl 1741; 1742; X64-LABEL: test_mm512_maskz_mul_epu32: 1743; X64: # %bb.0: # %entry 1744; X64-NEXT: kmovw %edi, %k1 1745; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm0 {%k1} {z} 1746; X64-NEXT: retq 1747entry: 1748 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1749 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1750 %2 = mul nuw <8 x i64> %1, %0 1751 %3 = bitcast i8 %__k to <8 x i1> 1752 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 1753 ret <8 x i64> %4 1754} 1755 1756define <8 x i64> @test_mm512_mask_mul_epu32(i8 zeroext %__k, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__src) nounwind { 1757; X86-LABEL: test_mm512_mask_mul_epu32: 1758; X86: # %bb.0: # %entry 1759; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1760; X86-NEXT: kmovw %eax, %k1 1761; X86-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1} 1762; X86-NEXT: vmovdqa64 %zmm2, %zmm0 1763; X86-NEXT: retl 1764; 1765; X64-LABEL: test_mm512_mask_mul_epu32: 1766; X64: # %bb.0: # %entry 1767; X64-NEXT: kmovw %edi, %k1 1768; X64-NEXT: vpmuludq %zmm0, %zmm1, %zmm2 {%k1} 1769; X64-NEXT: vmovdqa64 %zmm2, %zmm0 1770; X64-NEXT: retq 1771entry: 1772 %0 = and <8 x i64> %__A, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1773 %1 = and <8 x i64> %__B, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1774 %2 = mul nuw <8 x i64> %1, %0 1775 %3 = bitcast i8 %__k to <8 x i1> 1776 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %__src 1777 ret <8 x i64> %4 1778} 1779 1780define <8 x double> @test_mm512_set1_epi8(i8 signext %d) nounwind { 1781; X86-LABEL: test_mm512_set1_epi8: 1782; X86: # 
%bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpbroadcastb %xmm0, %ymm0
; X86-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_set1_epi8:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpbroadcastb %xmm0, %ymm0
; X64-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-NEXT:    retq
entry:
  %vecinit.i = insertelement <64 x i8> undef, i8 %d, i32 0
  %vecinit63.i = shufflevector <64 x i8> %vecinit.i, <64 x i8> undef, <64 x i32> zeroinitializer
  %0 = bitcast <64 x i8> %vecinit63.i to <8 x double>
  ret <8 x double> %0
}

; _mm_cvtu32_sd: convert u32 to double (uitofp) and insert into element 0 of %__A.
; Lowers directly to vcvtusi2sdl on both targets.
define <2 x double> @test_mm_cvtu32_sd(<2 x double> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtusi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu32_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i32 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

; _mm_cvtu64_sd: convert u64 to double and insert into element 0 of %__A.
; X64 uses vcvtusi2sdq; i386 has no 64-bit GPR form, so the u64->f64 conversion
; is expanded to the vpunpckldq/vsubpd/vhaddpd constant-bias sequence below.
define <2 x double> @test_mm_cvtu64_sd(<2 x double> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_sd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; X86-NEXT:    vsubpd {{\.LCPI.*}}, %xmm1, %xmm1
; X86-NEXT:    vhaddpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu64_sd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2sdq %rdi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i64 %__B to double
  %vecins.i = insertelement <2 x double> %__A, double %conv.i, i32 0
  ret <2 x double> %vecins.i
}

; _mm_cvtu32_ss: convert u32 to float and insert into element 0 of %__A.
define <4 x float> @test_mm_cvtu32_ss(<4 x float> %__A, i32 %__B) {
; X86-LABEL: test_mm_cvtu32_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu32_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i32 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

; _mm_cvtu64_ss: convert u64 to float and insert into element 0 of %__A.
; X64 uses vcvtusi2ssq; i386 expands u64->f32 via x87 (fildll + conditional
; bias fadds keyed off the sign bit), needing an aligned stack slot (ebp frame).
define <4 x float> @test_mm_cvtu64_ss(<4 x float> %__A, i64 %__B) {
; X86-LABEL: test_mm_cvtu64_ss:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $16, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
; X86-NEXT:    vmovq %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    xorl %ecx, %ecx
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    setns %cl
; X86-NEXT:    fildll {{[0-9]+}}(%esp)
; X86-NEXT:    fadds {{\.LCPI.*}}(,%ecx,4)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_cvtu64_ss:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vcvtusi2ssq %rdi, %xmm0, %xmm0
; X64-NEXT:    retq
entry:
  %conv.i = uitofp i64 %__B to float
  %vecins.i = insertelement <4 x float> %__A, float %conv.i, i32 0
  ret <4 x float> %vecins.i
}

; _mm512_cvtps_pd: fpext <8 x float> -> <8 x double> (vcvtps2pd ymm->zmm).
define <8 x double> @test_mm512_cvtps_pd(<8 x float> %__A) {
; CHECK-LABEL: test_mm512_cvtps_pd:
; CHECK:       # %bb.0: 
# %entry 1895; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0 1896; CHECK-NEXT: ret{{[l|q]}} 1897entry: 1898 %conv.i = fpext <8 x float> %__A to <8 x double> 1899 ret <8 x double> %conv.i 1900} 1901 1902define <8 x double> @test_mm512_cvtpslo_pd(<16 x float> %__A) { 1903; CHECK-LABEL: test_mm512_cvtpslo_pd: 1904; CHECK: # %bb.0: # %entry 1905; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0 1906; CHECK-NEXT: ret{{[l|q]}} 1907entry: 1908 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1909 %conv.i.i = fpext <8 x float> %shuffle.i.i to <8 x double> 1910 ret <8 x double> %conv.i.i 1911} 1912 1913define <8 x double> @test_mm512_mask_cvtps_pd(<8 x double> %__W, i8 zeroext %__U, <8 x float> %__A) { 1914; X86-LABEL: test_mm512_mask_cvtps_pd: 1915; X86: # %bb.0: # %entry 1916; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1917; X86-NEXT: kmovw %eax, %k1 1918; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1} 1919; X86-NEXT: retl 1920; 1921; X64-LABEL: test_mm512_mask_cvtps_pd: 1922; X64: # %bb.0: # %entry 1923; X64-NEXT: kmovw %edi, %k1 1924; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1} 1925; X64-NEXT: retq 1926entry: 1927 %conv.i.i = fpext <8 x float> %__A to <8 x double> 1928 %0 = bitcast i8 %__U to <8 x i1> 1929 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> %__W 1930 ret <8 x double> %1 1931} 1932 1933define <8 x double> @test_mm512_mask_cvtpslo_pd(<8 x double> %__W, i8 zeroext %__U, <16 x float> %__A) { 1934; X86-LABEL: test_mm512_mask_cvtpslo_pd: 1935; X86: # %bb.0: # %entry 1936; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1937; X86-NEXT: kmovw %eax, %k1 1938; X86-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1} 1939; X86-NEXT: retl 1940; 1941; X64-LABEL: test_mm512_mask_cvtpslo_pd: 1942; X64: # %bb.0: # %entry 1943; X64-NEXT: kmovw %edi, %k1 1944; X64-NEXT: vcvtps2pd %ymm1, %zmm0 {%k1} 1945; X64-NEXT: retq 1946entry: 1947 %shuffle.i.i = shufflevector <16 x float> %__A, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, 
i32 5, i32 6, i32 7> 1948 %conv.i.i.i = fpext <8 x float> %shuffle.i.i to <8 x double> 1949 %0 = bitcast i8 %__U to <8 x i1> 1950 %1 = select <8 x i1> %0, <8 x double> %conv.i.i.i, <8 x double> %__W 1951 ret <8 x double> %1 1952} 1953 1954define <8 x double> @test_mm512_maskz_cvtps_pd(i8 zeroext %__U, <8 x float> %__A) { 1955; X86-LABEL: test_mm512_maskz_cvtps_pd: 1956; X86: # %bb.0: # %entry 1957; X86-NEXT: movb {{[0-9]+}}(%esp), %al 1958; X86-NEXT: kmovw %eax, %k1 1959; X86-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z} 1960; X86-NEXT: retl 1961; 1962; X64-LABEL: test_mm512_maskz_cvtps_pd: 1963; X64: # %bb.0: # %entry 1964; X64-NEXT: kmovw %edi, %k1 1965; X64-NEXT: vcvtps2pd %ymm0, %zmm0 {%k1} {z} 1966; X64-NEXT: retq 1967entry: 1968 %conv.i.i = fpext <8 x float> %__A to <8 x double> 1969 %0 = bitcast i8 %__U to <8 x i1> 1970 %1 = select <8 x i1> %0, <8 x double> %conv.i.i, <8 x double> zeroinitializer 1971 ret <8 x double> %1 1972} 1973 1974define <2 x i64> @test_mm512_cvtepi32_epi8(<8 x i64> %__A) { 1975; CHECK-LABEL: test_mm512_cvtepi32_epi8: 1976; CHECK: # %bb.0: # %entry 1977; CHECK-NEXT: vpmovdb %zmm0, %xmm0 1978; CHECK-NEXT: vzeroupper 1979; CHECK-NEXT: ret{{[l|q]}} 1980entry: 1981 %0 = bitcast <8 x i64> %__A to <16 x i32> 1982 %conv.i = trunc <16 x i32> %0 to <16 x i8> 1983 %1 = bitcast <16 x i8> %conv.i to <2 x i64> 1984 ret <2 x i64> %1 1985} 1986 1987define <2 x i64> @test_mm512_mask_cvtepi32_epi8(<2 x i64> %__O, i16 zeroext %__M, <8 x i64> %__A) { 1988; X86-LABEL: test_mm512_mask_cvtepi32_epi8: 1989; X86: # %bb.0: # %entry 1990; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 1991; X86-NEXT: vpmovdb %zmm1, %xmm0 {%k1} 1992; X86-NEXT: vzeroupper 1993; X86-NEXT: retl 1994; 1995; X64-LABEL: test_mm512_mask_cvtepi32_epi8: 1996; X64: # %bb.0: # %entry 1997; X64-NEXT: kmovw %edi, %k1 1998; X64-NEXT: vpmovdb %zmm1, %xmm0 {%k1} 1999; X64-NEXT: vzeroupper 2000; X64-NEXT: retq 2001entry: 2002 %0 = bitcast <8 x i64> %__A to <16 x i32> 2003 %1 = bitcast <2 x i64> %__O to <16 x i8> 
2004 %2 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> %1, i16 %__M) 2005 %3 = bitcast <16 x i8> %2 to <2 x i64> 2006 ret <2 x i64> %3 2007} 2008 2009define <2 x i64> @test_mm512_maskz_cvtepi32_epi8(i16 zeroext %__M, <8 x i64> %__A) { 2010; X86-LABEL: test_mm512_maskz_cvtepi32_epi8: 2011; X86: # %bb.0: # %entry 2012; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 2013; X86-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z} 2014; X86-NEXT: vzeroupper 2015; X86-NEXT: retl 2016; 2017; X64-LABEL: test_mm512_maskz_cvtepi32_epi8: 2018; X64: # %bb.0: # %entry 2019; X64-NEXT: kmovw %edi, %k1 2020; X64-NEXT: vpmovdb %zmm0, %xmm0 {%k1} {z} 2021; X64-NEXT: vzeroupper 2022; X64-NEXT: retq 2023entry: 2024 %0 = bitcast <8 x i64> %__A to <16 x i32> 2025 %1 = tail call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %0, <16 x i8> zeroinitializer, i16 %__M) 2026 %2 = bitcast <16 x i8> %1 to <2 x i64> 2027 ret <2 x i64> %2 2028} 2029 2030define <4 x i64> @test_mm512_cvtepi64_epi32(<8 x i64> %__A) { 2031; CHECK-LABEL: test_mm512_cvtepi64_epi32: 2032; CHECK: # %bb.0: # %entry 2033; CHECK-NEXT: vpmovqd %zmm0, %ymm0 2034; CHECK-NEXT: ret{{[l|q]}} 2035entry: 2036 %conv.i = trunc <8 x i64> %__A to <8 x i32> 2037 %0 = bitcast <8 x i32> %conv.i to <4 x i64> 2038 ret <4 x i64> %0 2039} 2040 2041define <4 x i64> @test_mm512_mask_cvtepi64_epi32(<4 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) { 2042; X86-LABEL: test_mm512_mask_cvtepi64_epi32: 2043; X86: # %bb.0: # %entry 2044; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2045; X86-NEXT: kmovw %eax, %k1 2046; X86-NEXT: vpmovqd %zmm1, %ymm0 {%k1} 2047; X86-NEXT: retl 2048; 2049; X64-LABEL: test_mm512_mask_cvtepi64_epi32: 2050; X64: # %bb.0: # %entry 2051; X64-NEXT: kmovw %edi, %k1 2052; X64-NEXT: vpmovqd %zmm1, %ymm0 {%k1} 2053; X64-NEXT: retq 2054entry: 2055 %conv.i.i = trunc <8 x i64> %__A to <8 x i32> 2056 %0 = bitcast <4 x i64> %__O to <8 x i32> 2057 %1 = bitcast i8 %__M to <8 x i1> 2058 %2 = select <8 x i1> %1, <8 x i32> 
%conv.i.i, <8 x i32> %0 2059 %3 = bitcast <8 x i32> %2 to <4 x i64> 2060 ret <4 x i64> %3 2061} 2062 2063define <4 x i64> @test_mm512_maskz_cvtepi64_epi32(i8 zeroext %__M, <8 x i64> %__A) { 2064; X86-LABEL: test_mm512_maskz_cvtepi64_epi32: 2065; X86: # %bb.0: # %entry 2066; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2067; X86-NEXT: kmovw %eax, %k1 2068; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} 2069; X86-NEXT: retl 2070; 2071; X64-LABEL: test_mm512_maskz_cvtepi64_epi32: 2072; X64: # %bb.0: # %entry 2073; X64-NEXT: kmovw %edi, %k1 2074; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} 2075; X64-NEXT: retq 2076entry: 2077 %conv.i.i = trunc <8 x i64> %__A to <8 x i32> 2078 %0 = bitcast i8 %__M to <8 x i1> 2079 %1 = select <8 x i1> %0, <8 x i32> %conv.i.i, <8 x i32> zeroinitializer 2080 %2 = bitcast <8 x i32> %1 to <4 x i64> 2081 ret <4 x i64> %2 2082} 2083 2084define <2 x i64> @test_mm512_cvtepi64_epi16(<8 x i64> %__A) { 2085; CHECK-LABEL: test_mm512_cvtepi64_epi16: 2086; CHECK: # %bb.0: # %entry 2087; CHECK-NEXT: vpmovqw %zmm0, %xmm0 2088; CHECK-NEXT: vzeroupper 2089; CHECK-NEXT: ret{{[l|q]}} 2090entry: 2091 %conv.i = trunc <8 x i64> %__A to <8 x i16> 2092 %0 = bitcast <8 x i16> %conv.i to <2 x i64> 2093 ret <2 x i64> %0 2094} 2095 2096define <2 x i64> @test_mm512_mask_cvtepi64_epi16(<2 x i64> %__O, i8 zeroext %__M, <8 x i64> %__A) { 2097; X86-LABEL: test_mm512_mask_cvtepi64_epi16: 2098; X86: # %bb.0: # %entry 2099; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2100; X86-NEXT: kmovw %eax, %k1 2101; X86-NEXT: vpmovqw %zmm1, %xmm0 {%k1} 2102; X86-NEXT: vzeroupper 2103; X86-NEXT: retl 2104; 2105; X64-LABEL: test_mm512_mask_cvtepi64_epi16: 2106; X64: # %bb.0: # %entry 2107; X64-NEXT: kmovw %edi, %k1 2108; X64-NEXT: vpmovqw %zmm1, %xmm0 {%k1} 2109; X64-NEXT: vzeroupper 2110; X64-NEXT: retq 2111entry: 2112 %0 = bitcast <2 x i64> %__O to <8 x i16> 2113 %1 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> %0, i8 %__M) 2114 %2 = bitcast <8 x i16> %1 to <2 x i64> 2115 
ret <2 x i64> %2
}

; _mm512_maskz_cvtepi64_epi16: zero-masked qword->word truncation via the
; target intrinsic (passthru = zeroinitializer), selecting vpmovqw {z}.
define <2 x i64> @test_mm512_maskz_cvtepi64_epi16(i8 zeroext %__M, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_cvtepi64_epi16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpmovqw %zmm0, %xmm0 {%k1} {z}
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %__A, <8 x i16> zeroinitializer, i8 %__M)
  %1 = bitcast <8 x i16> %0 to <2 x i64>
  ret <2 x i64> %1
}

declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)

; _mm512_ternarylogic_epi32: vpternlogd with truth-table immediate 4.
define <8 x i64> @test_mm512_ternarylogic_epi32(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) {
; CHECK-LABEL: test_mm512_ternarylogic_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = bitcast <8 x i64> %__C to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4)
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32) #1

; _mm512_mask_ternarylogic_epi32: merge-masked vpternlogd; unselected lanes
; keep %__A (the select falls back to %0).
define <8 x i64> @test_mm512_mask_ternarylogic_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) {
; X86-LABEL: test_mm512_mask_ternarylogic_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
2164; 2165; X64-LABEL: test_mm512_mask_ternarylogic_epi32: 2166; X64: # %bb.0: # %entry 2167; X64-NEXT: kmovw %edi, %k1 2168; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} 2169; X64-NEXT: retq 2170entry: 2171 %0 = bitcast <8 x i64> %__A to <16 x i32> 2172 %1 = bitcast <8 x i64> %__B to <16 x i32> 2173 %2 = bitcast <8 x i64> %__C to <16 x i32> 2174 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4) 2175 %4 = bitcast i16 %__U to <16 x i1> 2176 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0 2177 %6 = bitcast <16 x i32> %5 to <8 x i64> 2178 ret <8 x i64> %6 2179} 2180 2181define <8 x i64> @test_mm512_maskz_ternarylogic_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) { 2182; X86-LABEL: test_mm512_maskz_ternarylogic_epi32: 2183; X86: # %bb.0: # %entry 2184; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 2185; X86-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z} 2186; X86-NEXT: retl 2187; 2188; X64-LABEL: test_mm512_maskz_ternarylogic_epi32: 2189; X64: # %bb.0: # %entry 2190; X64-NEXT: kmovw %edi, %k1 2191; X64-NEXT: vpternlogd $4, %zmm2, %zmm1, %zmm0 {%k1} {z} 2192; X64-NEXT: retq 2193entry: 2194 %0 = bitcast <8 x i64> %__A to <16 x i32> 2195 %1 = bitcast <8 x i64> %__B to <16 x i32> 2196 %2 = bitcast <8 x i64> %__C to <16 x i32> 2197 %3 = tail call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2, i32 4) 2198 %4 = bitcast i16 %__U to <16 x i1> 2199 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 2200 %6 = bitcast <16 x i32> %5 to <8 x i64> 2201 ret <8 x i64> %6 2202} 2203 2204define <8 x i64> @test_mm512_ternarylogic_epi64(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) { 2205; CHECK-LABEL: test_mm512_ternarylogic_epi64: 2206; CHECK: # %bb.0: # %entry 2207; CHECK-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 2208; CHECK-NEXT: ret{{[l|q]}} 2209entry: 2210 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x 
i64> %__B, <8 x i64> %__C, i32 4) 2211 ret <8 x i64> %0 2212} 2213 2214declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32) #1 2215 2216define <8 x i64> @test_mm512_mask_ternarylogic_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__B, <8 x i64> %__C) { 2217; X86-LABEL: test_mm512_mask_ternarylogic_epi64: 2218; X86: # %bb.0: # %entry 2219; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2220; X86-NEXT: kmovw %eax, %k1 2221; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} 2222; X86-NEXT: retl 2223; 2224; X64-LABEL: test_mm512_mask_ternarylogic_epi64: 2225; X64: # %bb.0: # %entry 2226; X64-NEXT: kmovw %edi, %k1 2227; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} 2228; X64-NEXT: retq 2229entry: 2230 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4) 2231 %1 = bitcast i8 %__U to <8 x i1> 2232 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A 2233 ret <8 x i64> %2 2234} 2235 2236define <8 x i64> @test_mm512_maskz_ternarylogic_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C) { 2237; X86-LABEL: test_mm512_maskz_ternarylogic_epi64: 2238; X86: # %bb.0: # %entry 2239; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2240; X86-NEXT: kmovw %eax, %k1 2241; X86-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z} 2242; X86-NEXT: retl 2243; 2244; X64-LABEL: test_mm512_maskz_ternarylogic_epi64: 2245; X64: # %bb.0: # %entry 2246; X64-NEXT: kmovw %edi, %k1 2247; X64-NEXT: vpternlogq $4, %zmm2, %zmm1, %zmm0 {%k1} {z} 2248; X64-NEXT: retq 2249entry: 2250 %0 = tail call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %__A, <8 x i64> %__B, <8 x i64> %__C, i32 4) 2251 %1 = bitcast i8 %__U to <8 x i1> 2252 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer 2253 ret <8 x i64> %2 2254} 2255 2256declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) 2257 2258define <8 x i64> @test_mm512_mask2_permutex2var_epi32(<8 x i64> %__A, <8 x 
i64> %__I, i16 zeroext %__U, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovdqa64 %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2d %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__I to <16 x i32>
  %2 = bitcast <8 x i64> %__B to <16 x i32>
  %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %1
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)

; _mm512_mask2_permutex2var_pd: two-source double permute (vpermi2pd).
; "mask2" variant: unselected lanes fall back to the index operand %__I
; (bitcast to double), which is why the select's false arm is %1.
define <8 x double> @test_mm512_mask2_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x double> %__B) {
; X86-LABEL: test_mm512_mask2_permutex2var_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X86-NEXT:    vmovapd %zmm1, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask2_permutex2var_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
; X64-NEXT:    vmovapd %zmm1, %zmm0
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B)
  %1 = bitcast <8 x i64> %__I to <8 x double>
  %2 = bitcast i8 %__U to <8 x i1>
  %3 = select <8 x i1> %2, <8 x double> %0, <8 x double> %1
  ret <8 x double> %3
}
2307 2308declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>) 2309 2310define <16 x float> @test_mm512_mask2_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, i16 zeroext %__U, <16 x float> %__B) { 2311; X86-LABEL: test_mm512_mask2_permutex2var_ps: 2312; X86: # %bb.0: # %entry 2313; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 2314; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} 2315; X86-NEXT: vmovaps %zmm1, %zmm0 2316; X86-NEXT: retl 2317; 2318; X64-LABEL: test_mm512_mask2_permutex2var_ps: 2319; X64: # %bb.0: # %entry 2320; X64-NEXT: kmovw %edi, %k1 2321; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} 2322; X64-NEXT: vmovaps %zmm1, %zmm0 2323; X64-NEXT: retq 2324entry: 2325 %0 = bitcast <8 x i64> %__I to <16 x i32> 2326 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B) 2327 %2 = bitcast <8 x i64> %__I to <16 x float> 2328 %3 = bitcast i16 %__U to <16 x i1> 2329 %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2 2330 ret <16 x float> %4 2331} 2332 2333declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) 2334 2335define <8 x i64> @test_mm512_mask2_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, i8 zeroext %__U, <8 x i64> %__B) { 2336; X86-LABEL: test_mm512_mask2_permutex2var_epi64: 2337; X86: # %bb.0: # %entry 2338; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2339; X86-NEXT: kmovw %eax, %k1 2340; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} 2341; X86-NEXT: vmovdqa64 %zmm1, %zmm0 2342; X86-NEXT: retl 2343; 2344; X64-LABEL: test_mm512_mask2_permutex2var_epi64: 2345; X64: # %bb.0: # %entry 2346; X64-NEXT: kmovw %edi, %k1 2347; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} 2348; X64-NEXT: vmovdqa64 %zmm1, %zmm0 2349; X64-NEXT: retq 2350entry: 2351 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) 2352 %1 = bitcast i8 %__U to <8 x i1> 2353 %2 = select <8 x i1> %1, <8 x i64> %0, <8 
; Tests for _mm512_{,maskz_,mask_}permutex2var_epi32: vpermt2d lowering (plain, zero-masked, merge-masked). Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
x i64> %__I 2354 ret <8 x i64> %2 2355} 2356 2357define <8 x i64> @test_mm512_permutex2var_epi32(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) { 2358; CHECK-LABEL: test_mm512_permutex2var_epi32: 2359; CHECK: # %bb.0: # %entry 2360; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 2361; CHECK-NEXT: ret{{[l|q]}} 2362entry: 2363 %0 = bitcast <8 x i64> %__A to <16 x i32> 2364 %1 = bitcast <8 x i64> %__I to <16 x i32> 2365 %2 = bitcast <8 x i64> %__B to <16 x i32> 2366 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) 2367 %4 = bitcast <16 x i32> %3 to <8 x i64> 2368 ret <8 x i64> %4 2369} 2370 2371define <8 x i64> @test_mm512_maskz_permutex2var_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) { 2372; X86-LABEL: test_mm512_maskz_permutex2var_epi32: 2373; X86: # %bb.0: # %entry 2374; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 2375; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z} 2376; X86-NEXT: retl 2377; 2378; X64-LABEL: test_mm512_maskz_permutex2var_epi32: 2379; X64: # %bb.0: # %entry 2380; X64-NEXT: kmovw %edi, %k1 2381; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} {z} 2382; X64-NEXT: retq 2383entry: 2384 %0 = bitcast <8 x i64> %__A to <16 x i32> 2385 %1 = bitcast <8 x i64> %__I to <16 x i32> 2386 %2 = bitcast <8 x i64> %__B to <16 x i32> 2387 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) 2388 %4 = bitcast i16 %__U to <16 x i1> 2389 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> zeroinitializer 2390 %6 = bitcast <16 x i32> %5 to <8 x i64> 2391 ret <8 x i64> %6 2392} 2393 2394define <8 x i64> @test_mm512_mask_permutex2var_epi32(<8 x i64> %__A, i16 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) { 2395; X86-LABEL: test_mm512_mask_permutex2var_epi32: 2396; X86: # %bb.0: # %entry 2397; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 2398; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} 2399; X86-NEXT: retl 2400; 2401; X64-LABEL: 
; Tail of mask_permutex2var_epi32, then _mm512_{,mask_}permutex2var_pd: vpermt2pd lowering. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
test_mm512_mask_permutex2var_epi32: 2402; X64: # %bb.0: # %entry 2403; X64-NEXT: kmovw %edi, %k1 2404; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 {%k1} 2405; X64-NEXT: retq 2406entry: 2407 %0 = bitcast <8 x i64> %__A to <16 x i32> 2408 %1 = bitcast <8 x i64> %__I to <16 x i32> 2409 %2 = bitcast <8 x i64> %__B to <16 x i32> 2410 %3 = tail call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) 2411 %4 = bitcast i16 %__U to <16 x i1> 2412 %5 = select <16 x i1> %4, <16 x i32> %3, <16 x i32> %0 2413 %6 = bitcast <16 x i32> %5 to <8 x i64> 2414 ret <8 x i64> %6 2415} 2416 2417define <8 x double> @test_mm512_permutex2var_pd(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) { 2418; CHECK-LABEL: test_mm512_permutex2var_pd: 2419; CHECK: # %bb.0: # %entry 2420; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 2421; CHECK-NEXT: ret{{[l|q]}} 2422entry: 2423 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) 2424 ret <8 x double> %0 2425} 2426 2427define <8 x double> @test_mm512_mask_permutex2var_pd(<8 x double> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x double> %__B) { 2428; X86-LABEL: test_mm512_mask_permutex2var_pd: 2429; X86: # %bb.0: # %entry 2430; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2431; X86-NEXT: kmovw %eax, %k1 2432; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} 2433; X86-NEXT: retl 2434; 2435; X64-LABEL: test_mm512_mask_permutex2var_pd: 2436; X64: # %bb.0: # %entry 2437; X64-NEXT: kmovw %edi, %k1 2438; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} 2439; X64-NEXT: retq 2440entry: 2441 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) 2442 %1 = bitcast i8 %__U to <8 x i1> 2443 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 2444 ret <8 x double> %2 2445} 2446 2447define <8 x double> @test_mm512_maskz_permutex2var_pd(i8 zeroext %__U, <8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) { 2448; 
; _mm512_maskz_permutex2var_pd and _mm512_{,mask_}permutex2var_ps: vpermt2pd/vpermt2ps lowering. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
X86-LABEL: test_mm512_maskz_permutex2var_pd: 2449; X86: # %bb.0: # %entry 2450; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2451; X86-NEXT: kmovw %eax, %k1 2452; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z} 2453; X86-NEXT: retl 2454; 2455; X64-LABEL: test_mm512_maskz_permutex2var_pd: 2456; X64: # %bb.0: # %entry 2457; X64-NEXT: kmovw %edi, %k1 2458; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 {%k1} {z} 2459; X64-NEXT: retq 2460entry: 2461 %0 = tail call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %__A, <8 x i64> %__I, <8 x double> %__B) 2462 %1 = bitcast i8 %__U to <8 x i1> 2463 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 2464 ret <8 x double> %2 2465} 2466 2467define <16 x float> @test_mm512_permutex2var_ps(<16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) { 2468; CHECK-LABEL: test_mm512_permutex2var_ps: 2469; CHECK: # %bb.0: # %entry 2470; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 2471; CHECK-NEXT: ret{{[l|q]}} 2472entry: 2473 %0 = bitcast <8 x i64> %__I to <16 x i32> 2474 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B) 2475 ret <16 x float> %1 2476} 2477 2478define <16 x float> @test_mm512_mask_permutex2var_ps(<16 x float> %__A, i16 zeroext %__U, <8 x i64> %__I, <16 x float> %__B) { 2479; X86-LABEL: test_mm512_mask_permutex2var_ps: 2480; X86: # %bb.0: # %entry 2481; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 2482; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} 2483; X86-NEXT: retl 2484; 2485; X64-LABEL: test_mm512_mask_permutex2var_ps: 2486; X64: # %bb.0: # %entry 2487; X64-NEXT: kmovw %edi, %k1 2488; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} 2489; X64-NEXT: retq 2490entry: 2491 %0 = bitcast <8 x i64> %__I to <16 x i32> 2492 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B) 2493 %2 = bitcast i16 %__U to <16 x i1> 2494 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %__A 2495 ret <16 
; _mm512_maskz_permutex2var_ps and _mm512_{,mask_}permutex2var_epi64: vpermt2ps/vpermt2q lowering. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
x float> %3 2496} 2497 2498define <16 x float> @test_mm512_maskz_permutex2var_ps(i16 zeroext %__U, <16 x float> %__A, <8 x i64> %__I, <16 x float> %__B) { 2499; X86-LABEL: test_mm512_maskz_permutex2var_ps: 2500; X86: # %bb.0: # %entry 2501; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 2502; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z} 2503; X86-NEXT: retl 2504; 2505; X64-LABEL: test_mm512_maskz_permutex2var_ps: 2506; X64: # %bb.0: # %entry 2507; X64-NEXT: kmovw %edi, %k1 2508; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 {%k1} {z} 2509; X64-NEXT: retq 2510entry: 2511 %0 = bitcast <8 x i64> %__I to <16 x i32> 2512 %1 = tail call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %__A, <16 x i32> %0, <16 x float> %__B) 2513 %2 = bitcast i16 %__U to <16 x i1> 2514 %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer 2515 ret <16 x float> %3 2516} 2517 2518define <8 x i64> @test_mm512_permutex2var_epi64(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) { 2519; CHECK-LABEL: test_mm512_permutex2var_epi64: 2520; CHECK: # %bb.0: # %entry 2521; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 2522; CHECK-NEXT: ret{{[l|q]}} 2523entry: 2524 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) 2525 ret <8 x i64> %0 2526} 2527 2528define <8 x i64> @test_mm512_mask_permutex2var_epi64(<8 x i64> %__A, i8 zeroext %__U, <8 x i64> %__I, <8 x i64> %__B) { 2529; X86-LABEL: test_mm512_mask_permutex2var_epi64: 2530; X86: # %bb.0: # %entry 2531; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2532; X86-NEXT: kmovw %eax, %k1 2533; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} 2534; X86-NEXT: retl 2535; 2536; X64-LABEL: test_mm512_mask_permutex2var_epi64: 2537; X64: # %bb.0: # %entry 2538; X64-NEXT: kmovw %edi, %k1 2539; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} 2540; X64-NEXT: retq 2541entry: 2542 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) 2543 %1 = bitcast i8 %__U to <8 
; _mm512_maskz_permutex2var_epi64, then start of scalar masked-FP tests (_mm_mask_add_ss / _mm_maskz_add_ss) using bit 0 of the i8 mask. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
x i1> 2544 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__A 2545 ret <8 x i64> %2 2546} 2547 2548define <8 x i64> @test_mm512_maskz_permutex2var_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) { 2549; X86-LABEL: test_mm512_maskz_permutex2var_epi64: 2550; X86: # %bb.0: # %entry 2551; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2552; X86-NEXT: kmovw %eax, %k1 2553; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z} 2554; X86-NEXT: retl 2555; 2556; X64-LABEL: test_mm512_maskz_permutex2var_epi64: 2557; X64: # %bb.0: # %entry 2558; X64-NEXT: kmovw %edi, %k1 2559; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 {%k1} {z} 2560; X64-NEXT: retq 2561entry: 2562 %0 = tail call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %__A, <8 x i64> %__I, <8 x i64> %__B) 2563 %1 = bitcast i8 %__U to <8 x i1> 2564 %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer 2565 ret <8 x i64> %2 2566} 2567define <4 x float> @test_mm_mask_add_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2568; X86-LABEL: test_mm_mask_add_ss: 2569; X86: # %bb.0: # %entry 2570; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2571; X86-NEXT: kmovw %eax, %k1 2572; X86-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1} 2573; X86-NEXT: retl 2574; 2575; X64-LABEL: test_mm_mask_add_ss: 2576; X64: # %bb.0: # %entry 2577; X64-NEXT: kmovw %edi, %k1 2578; X64-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1} 2579; X64-NEXT: retq 2580entry: 2581 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2582 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2583 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i 2584 %0 = and i8 %__U, 1 2585 %tobool.i = icmp eq i8 %0, 0 2586 %vecext1.i = extractelement <4 x float> %__W, i32 0 2587 %cond.i = select i1 %tobool.i, float %vecext1.i, float %add.i.i 2588 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2589 ret <4 x float> %vecins.i 2590} 2591 2592define <4 x float> @test_mm_maskz_add_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 
; _mm_maskz_add_ss body and _mm_{mask,maskz}_add_sd: masked scalar vaddss/vaddsd selecting on mask bit 0. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
2593; X86-LABEL: test_mm_maskz_add_ss: 2594; X86: # %bb.0: # %entry 2595; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2596; X86-NEXT: kmovw %eax, %k1 2597; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z} 2598; X86-NEXT: retl 2599; 2600; X64-LABEL: test_mm_maskz_add_ss: 2601; X64: # %bb.0: # %entry 2602; X64-NEXT: kmovw %edi, %k1 2603; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z} 2604; X64-NEXT: retq 2605entry: 2606 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2607 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2608 %add.i.i = fadd float %vecext1.i.i, %vecext.i.i 2609 %0 = and i8 %__U, 1 2610 %tobool.i = icmp eq i8 %0, 0 2611 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %add.i.i 2612 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2613 ret <4 x float> %vecins.i 2614} 2615 2616define <2 x double> @test_mm_mask_add_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2617; X86-LABEL: test_mm_mask_add_sd: 2618; X86: # %bb.0: # %entry 2619; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2620; X86-NEXT: kmovw %eax, %k1 2621; X86-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1} 2622; X86-NEXT: retl 2623; 2624; X64-LABEL: test_mm_mask_add_sd: 2625; X64: # %bb.0: # %entry 2626; X64-NEXT: kmovw %edi, %k1 2627; X64-NEXT: vaddsd %xmm2, %xmm1, %xmm0 {%k1} 2628; X64-NEXT: retq 2629entry: 2630 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2631 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2632 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i 2633 %0 = and i8 %__U, 1 2634 %tobool.i = icmp eq i8 %0, 0 2635 %vecext1.i = extractelement <2 x double> %__W, i32 0 2636 %cond.i = select i1 %tobool.i, double %vecext1.i, double %add.i.i 2637 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2638 ret <2 x double> %vecins.i 2639} 2640 2641define <2 x double> @test_mm_maskz_add_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2642; X86-LABEL: test_mm_maskz_add_sd: 2643; X86: # %bb.0: # %entry 2644; 
; _mm_maskz_add_sd body and _mm_{mask,maskz}_sub_ss: masked scalar vaddsd/vsubss selecting on mask bit 0. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
X86-NEXT: movb {{[0-9]+}}(%esp), %al 2645; X86-NEXT: kmovw %eax, %k1 2646; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2647; X86-NEXT: retl 2648; 2649; X64-LABEL: test_mm_maskz_add_sd: 2650; X64: # %bb.0: # %entry 2651; X64-NEXT: kmovw %edi, %k1 2652; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2653; X64-NEXT: retq 2654entry: 2655 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2656 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2657 %add.i.i = fadd double %vecext1.i.i, %vecext.i.i 2658 %0 = and i8 %__U, 1 2659 %tobool.i = icmp eq i8 %0, 0 2660 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %add.i.i 2661 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2662 ret <2 x double> %vecins.i 2663} 2664 2665define <4 x float> @test_mm_mask_sub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2666; X86-LABEL: test_mm_mask_sub_ss: 2667; X86: # %bb.0: # %entry 2668; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2669; X86-NEXT: kmovw %eax, %k1 2670; X86-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1} 2671; X86-NEXT: retl 2672; 2673; X64-LABEL: test_mm_mask_sub_ss: 2674; X64: # %bb.0: # %entry 2675; X64-NEXT: kmovw %edi, %k1 2676; X64-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1} 2677; X64-NEXT: retq 2678entry: 2679 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2680 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2681 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i 2682 %0 = and i8 %__U, 1 2683 %tobool.i = icmp eq i8 %0, 0 2684 %vecext1.i = extractelement <4 x float> %__W, i32 0 2685 %cond.i = select i1 %tobool.i, float %vecext1.i, float %sub.i.i 2686 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2687 ret <4 x float> %vecins.i 2688} 2689 2690define <4 x float> @test_mm_maskz_sub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2691; X86-LABEL: test_mm_maskz_sub_ss: 2692; X86: # %bb.0: # %entry 2693; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2694; X86-NEXT: kmovw %eax, %k1 2695; X86-NEXT: 
; _mm_maskz_sub_ss body and _mm_{mask,maskz}_sub_sd: masked scalar vsubss/vsubsd selecting on mask bit 0. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
vsubss %xmm1, %xmm0, %xmm0 {%k1} {z} 2696; X86-NEXT: retl 2697; 2698; X64-LABEL: test_mm_maskz_sub_ss: 2699; X64: # %bb.0: # %entry 2700; X64-NEXT: kmovw %edi, %k1 2701; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z} 2702; X64-NEXT: retq 2703entry: 2704 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2705 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2706 %sub.i.i = fsub float %vecext1.i.i, %vecext.i.i 2707 %0 = and i8 %__U, 1 2708 %tobool.i = icmp eq i8 %0, 0 2709 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %sub.i.i 2710 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2711 ret <4 x float> %vecins.i 2712} 2713 2714define <2 x double> @test_mm_mask_sub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2715; X86-LABEL: test_mm_mask_sub_sd: 2716; X86: # %bb.0: # %entry 2717; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2718; X86-NEXT: kmovw %eax, %k1 2719; X86-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1} 2720; X86-NEXT: retl 2721; 2722; X64-LABEL: test_mm_mask_sub_sd: 2723; X64: # %bb.0: # %entry 2724; X64-NEXT: kmovw %edi, %k1 2725; X64-NEXT: vsubsd %xmm2, %xmm1, %xmm0 {%k1} 2726; X64-NEXT: retq 2727entry: 2728 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2729 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2730 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i 2731 %0 = and i8 %__U, 1 2732 %tobool.i = icmp eq i8 %0, 0 2733 %vecext1.i = extractelement <2 x double> %__W, i32 0 2734 %cond.i = select i1 %tobool.i, double %vecext1.i, double %sub.i.i 2735 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2736 ret <2 x double> %vecins.i 2737} 2738 2739define <2 x double> @test_mm_maskz_sub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2740; X86-LABEL: test_mm_maskz_sub_sd: 2741; X86: # %bb.0: # %entry 2742; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2743; X86-NEXT: kmovw %eax, %k1 2744; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2745; X86-NEXT: retl 2746; 2747; X64-LABEL: 
; _mm_maskz_sub_sd body and _mm_{mask,maskz}_mul_ss: masked scalar vsubsd/vmulss selecting on mask bit 0. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
test_mm_maskz_sub_sd: 2748; X64: # %bb.0: # %entry 2749; X64-NEXT: kmovw %edi, %k1 2750; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2751; X64-NEXT: retq 2752entry: 2753 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2754 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2755 %sub.i.i = fsub double %vecext1.i.i, %vecext.i.i 2756 %0 = and i8 %__U, 1 2757 %tobool.i = icmp eq i8 %0, 0 2758 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %sub.i.i 2759 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2760 ret <2 x double> %vecins.i 2761} 2762 2763define <4 x float> @test_mm_mask_mul_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2764; X86-LABEL: test_mm_mask_mul_ss: 2765; X86: # %bb.0: # %entry 2766; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2767; X86-NEXT: kmovw %eax, %k1 2768; X86-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1} 2769; X86-NEXT: retl 2770; 2771; X64-LABEL: test_mm_mask_mul_ss: 2772; X64: # %bb.0: # %entry 2773; X64-NEXT: kmovw %edi, %k1 2774; X64-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1} 2775; X64-NEXT: retq 2776entry: 2777 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2778 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2779 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i 2780 %0 = and i8 %__U, 1 2781 %tobool.i = icmp eq i8 %0, 0 2782 %vecext1.i = extractelement <4 x float> %__W, i32 0 2783 %cond.i = select i1 %tobool.i, float %vecext1.i, float %mul.i.i 2784 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2785 ret <4 x float> %vecins.i 2786} 2787 2788define <4 x float> @test_mm_maskz_mul_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2789; X86-LABEL: test_mm_maskz_mul_ss: 2790; X86: # %bb.0: # %entry 2791; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2792; X86-NEXT: kmovw %eax, %k1 2793; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} 2794; X86-NEXT: retl 2795; 2796; X64-LABEL: test_mm_maskz_mul_ss: 2797; X64: # %bb.0: # %entry 2798; X64-NEXT: kmovw %edi, %k1 2799; 
; _mm_maskz_mul_ss body and _mm_{mask,maskz}_mul_sd: masked scalar vmulss/vmulsd selecting on mask bit 0. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
X64-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} 2800; X64-NEXT: retq 2801entry: 2802 %vecext.i.i = extractelement <4 x float> %__B, i32 0 2803 %vecext1.i.i = extractelement <4 x float> %__A, i32 0 2804 %mul.i.i = fmul float %vecext1.i.i, %vecext.i.i 2805 %0 = and i8 %__U, 1 2806 %tobool.i = icmp eq i8 %0, 0 2807 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %mul.i.i 2808 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 2809 ret <4 x float> %vecins.i 2810} 2811 2812define <2 x double> @test_mm_mask_mul_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2813; X86-LABEL: test_mm_mask_mul_sd: 2814; X86: # %bb.0: # %entry 2815; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2816; X86-NEXT: kmovw %eax, %k1 2817; X86-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1} 2818; X86-NEXT: retl 2819; 2820; X64-LABEL: test_mm_mask_mul_sd: 2821; X64: # %bb.0: # %entry 2822; X64-NEXT: kmovw %edi, %k1 2823; X64-NEXT: vmulsd %xmm2, %xmm1, %xmm0 {%k1} 2824; X64-NEXT: retq 2825entry: 2826 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2827 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2828 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i 2829 %0 = and i8 %__U, 1 2830 %tobool.i = icmp eq i8 %0, 0 2831 %vecext1.i = extractelement <2 x double> %__W, i32 0 2832 %cond.i = select i1 %tobool.i, double %vecext1.i, double %mul.i.i 2833 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2834 ret <2 x double> %vecins.i 2835} 2836 2837define <2 x double> @test_mm_maskz_mul_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2838; X86-LABEL: test_mm_maskz_mul_sd: 2839; X86: # %bb.0: # %entry 2840; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2841; X86-NEXT: kmovw %eax, %k1 2842; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2843; X86-NEXT: retl 2844; 2845; X64-LABEL: test_mm_maskz_mul_sd: 2846; X64: # %bb.0: # %entry 2847; X64-NEXT: kmovw %edi, %k1 2848; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2849; X64-NEXT: retq 2850entry: 
; _mm_maskz_mul_sd body and _mm_{mask,maskz}_div_ss: masked scalar vmulsd/vdivss; div variants use bitcast-to-<8 x i1> + extractelement instead of and/icmp. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
2851 %vecext.i.i = extractelement <2 x double> %__B, i32 0 2852 %vecext1.i.i = extractelement <2 x double> %__A, i32 0 2853 %mul.i.i = fmul double %vecext1.i.i, %vecext.i.i 2854 %0 = and i8 %__U, 1 2855 %tobool.i = icmp eq i8 %0, 0 2856 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %mul.i.i 2857 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 2858 ret <2 x double> %vecins.i 2859} 2860 2861define <4 x float> @test_mm_mask_div_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2862; X86-LABEL: test_mm_mask_div_ss: 2863; X86: # %bb.0: # %entry 2864; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2865; X86-NEXT: kmovw %eax, %k1 2866; X86-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1} 2867; X86-NEXT: retl 2868; 2869; X64-LABEL: test_mm_mask_div_ss: 2870; X64: # %bb.0: # %entry 2871; X64-NEXT: kmovw %edi, %k1 2872; X64-NEXT: vdivss %xmm2, %xmm1, %xmm0 {%k1} 2873; X64-NEXT: retq 2874entry: 2875 %0 = extractelement <4 x float> %__A, i64 0 2876 %1 = extractelement <4 x float> %__B, i64 0 2877 %2 = extractelement <4 x float> %__W, i64 0 2878 %3 = fdiv float %0, %1 2879 %4 = bitcast i8 %__U to <8 x i1> 2880 %5 = extractelement <8 x i1> %4, i64 0 2881 %6 = select i1 %5, float %3, float %2 2882 %7 = insertelement <4 x float> %__A, float %6, i64 0 2883 ret <4 x float> %7 2884} 2885 2886define <4 x float> @test_mm_maskz_div_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 2887; X86-LABEL: test_mm_maskz_div_ss: 2888; X86: # %bb.0: # %entry 2889; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2890; X86-NEXT: kmovw %eax, %k1 2891; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z} 2892; X86-NEXT: retl 2893; 2894; X64-LABEL: test_mm_maskz_div_ss: 2895; X64: # %bb.0: # %entry 2896; X64-NEXT: kmovw %edi, %k1 2897; X64-NEXT: vdivss %xmm1, %xmm0, %xmm0 {%k1} {z} 2898; X64-NEXT: retq 2899entry: 2900 %0 = extractelement <4 x float> %__A, i64 0 2901 %1 = extractelement <4 x float> %__B, i64 0 2902 %2 = fdiv float %0, %1 2903 %3 = bitcast i8 %__U to <8 x i1> 
; _mm_maskz_div_ss body and _mm_{mask,maskz}_div_sd: masked scalar vdivss/vdivsd via mask-vector bit 0. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
2904 %4 = extractelement <8 x i1> %3, i64 0 2905 %5 = select i1 %4, float %2, float 0.000000e+00 2906 %6 = insertelement <4 x float> %__A, float %5, i64 0 2907 ret <4 x float> %6 2908} 2909 2910define <2 x double> @test_mm_mask_div_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2911; X86-LABEL: test_mm_mask_div_sd: 2912; X86: # %bb.0: # %entry 2913; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2914; X86-NEXT: kmovw %eax, %k1 2915; X86-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1} 2916; X86-NEXT: retl 2917; 2918; X64-LABEL: test_mm_mask_div_sd: 2919; X64: # %bb.0: # %entry 2920; X64-NEXT: kmovw %edi, %k1 2921; X64-NEXT: vdivsd %xmm2, %xmm1, %xmm0 {%k1} 2922; X64-NEXT: retq 2923entry: 2924 %0 = extractelement <2 x double> %__A, i64 0 2925 %1 = extractelement <2 x double> %__B, i64 0 2926 %2 = extractelement <2 x double> %__W, i64 0 2927 %3 = fdiv double %0, %1 2928 %4 = bitcast i8 %__U to <8 x i1> 2929 %5 = extractelement <8 x i1> %4, i64 0 2930 %6 = select i1 %5, double %3, double %2 2931 %7 = insertelement <2 x double> %__A, double %6, i64 0 2932 ret <2 x double> %7 2933} 2934 2935define <2 x double> @test_mm_maskz_div_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 2936; X86-LABEL: test_mm_maskz_div_sd: 2937; X86: # %bb.0: # %entry 2938; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2939; X86-NEXT: kmovw %eax, %k1 2940; X86-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2941; X86-NEXT: retl 2942; 2943; X64-LABEL: test_mm_maskz_div_sd: 2944; X64: # %bb.0: # %entry 2945; X64-NEXT: kmovw %edi, %k1 2946; X64-NEXT: vdivsd %xmm1, %xmm0, %xmm0 {%k1} {z} 2947; X64-NEXT: retq 2948entry: 2949 %0 = extractelement <2 x double> %__A, i64 0 2950 %1 = extractelement <2 x double> %__B, i64 0 2951 %2 = fdiv double %0, %1 2952 %3 = bitcast i8 %__U to <8 x i1> 2953 %4 = extractelement <8 x i1> %3, i64 0 2954 %5 = select i1 %4, double %2, double 0.000000e+00 2955 %6 = insertelement <2 x double> %__A, double %5, i64 0 2956 ret <2 x double> %6 2957} 2958 2959 2960define 
; FMA-with-rounding tests: _mm512_{,mask_,mask3_}fmadd_round_pd via @llvm.x86.avx512.vfmadd.pd.512 with rounding mode 8 ({rn-sae}). Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
<8 x double> @test_mm512_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 2961; CHECK-LABEL: test_mm512_fmadd_round_pd: 2962; CHECK: # %bb.0: # %entry 2963; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 2964; CHECK-NEXT: ret{{[l|q]}} 2965entry: 2966 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 2967 ret <8 x double> %0 2968} 2969 2970declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1 2971 2972define <8 x double> @test_mm512_mask_fmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 2973; X86-LABEL: test_mm512_mask_fmadd_round_pd: 2974; X86: # %bb.0: # %entry 2975; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2976; X86-NEXT: kmovw %eax, %k1 2977; X86-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 2978; X86-NEXT: retl 2979; 2980; X64-LABEL: test_mm512_mask_fmadd_round_pd: 2981; X64: # %bb.0: # %entry 2982; X64-NEXT: kmovw %edi, %k1 2983; X64-NEXT: vfmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 2984; X64-NEXT: retq 2985entry: 2986 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 2987 %1 = bitcast i8 %__U to <8 x i1> 2988 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 2989 ret <8 x double> %2 2990} 2991 2992define <8 x double> @test_mm512_mask3_fmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 2993; X86-LABEL: test_mm512_mask3_fmadd_round_pd: 2994; X86: # %bb.0: # %entry 2995; X86-NEXT: movb {{[0-9]+}}(%esp), %al 2996; X86-NEXT: kmovw %eax, %k1 2997; X86-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 2998; X86-NEXT: vmovapd %zmm2, %zmm0 2999; X86-NEXT: retl 3000; 3001; X64-LABEL: test_mm512_mask3_fmadd_round_pd: 3002; X64: # %bb.0: # %entry 3003; X64-NEXT: kmovw %edi, %k1 3004; X64-NEXT: vfmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3005; 
; mask3_fmadd_round_pd tail, _mm512_maskz_fmadd_round_pd, and _mm512_fmsub_round_pd (fmsub expressed as fmadd of fneg'd C via fsub from -0.0). Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
X64-NEXT: vmovapd %zmm2, %zmm0 3006; X64-NEXT: retq 3007entry: 3008 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3009 %1 = bitcast i8 %__U to <8 x i1> 3010 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3011 ret <8 x double> %2 3012} 3013 3014define <8 x double> @test_mm512_maskz_fmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3015; X86-LABEL: test_mm512_maskz_fmadd_round_pd: 3016; X86: # %bb.0: # %entry 3017; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3018; X86-NEXT: kmovw %eax, %k1 3019; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3020; X86-NEXT: retl 3021; 3022; X64-LABEL: test_mm512_maskz_fmadd_round_pd: 3023; X64: # %bb.0: # %entry 3024; X64-NEXT: kmovw %edi, %k1 3025; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3026; X64-NEXT: retq 3027entry: 3028 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3029 %1 = bitcast i8 %__U to <8 x i1> 3030 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3031 ret <8 x double> %2 3032} 3033 3034define <8 x double> @test_mm512_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3035; X86-LABEL: test_mm512_fmsub_round_pd: 3036; X86: # %bb.0: # %entry 3037; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2 3038; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3039; X86-NEXT: retl 3040; 3041; X64-LABEL: test_mm512_fmsub_round_pd: 3042; X64: # %bb.0: # %entry 3043; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3044; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3045; X64-NEXT: retq 3046entry: 3047 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3048 %0 = tail call <8 x double> 
; fmsub_round_pd tail, _mm512_{mask_,maskz_}fmsub_round_pd: negated-C operand folds into vfmsub{132,213}pd under the mask. Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
@llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 3049 ret <8 x double> %0 3050} 3051 3052define <8 x double> @test_mm512_mask_fmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3053; X86-LABEL: test_mm512_mask_fmsub_round_pd: 3054; X86: # %bb.0: # %entry 3055; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3056; X86-NEXT: kmovw %eax, %k1 3057; X86-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3058; X86-NEXT: retl 3059; 3060; X64-LABEL: test_mm512_mask_fmsub_round_pd: 3061; X64: # %bb.0: # %entry 3062; X64-NEXT: kmovw %edi, %k1 3063; X64-NEXT: vfmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3064; X64-NEXT: retq 3065entry: 3066 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3067 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 3068 %1 = bitcast i8 %__U to <8 x i1> 3069 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3070 ret <8 x double> %2 3071} 3072 3073define <8 x double> @test_mm512_maskz_fmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3074; X86-LABEL: test_mm512_maskz_fmsub_round_pd: 3075; X86: # %bb.0: # %entry 3076; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3077; X86-NEXT: kmovw %eax, %k1 3078; X86-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3079; X86-NEXT: retl 3080; 3081; X64-LABEL: test_mm512_maskz_fmsub_round_pd: 3082; X64: # %bb.0: # %entry 3083; X64-NEXT: kmovw %edi, %k1 3084; X64-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3085; X64-NEXT: retq 3086entry: 3087 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 
; maskz_fmsub_round_pd tail, _mm512_fnmadd_round_pd (fneg'd A), and _mm512_mask3_fnmadd_round_pd (folds to vfnmadd231pd under mask). Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
3088 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 3089 %1 = bitcast i8 %__U to <8 x i1> 3090 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3091 ret <8 x double> %2 3092} 3093 3094define <8 x double> @test_mm512_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3095; X86-LABEL: test_mm512_fnmadd_round_pd: 3096; X86: # %bb.0: # %entry 3097; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0 3098; X86-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3099; X86-NEXT: retl 3100; 3101; X64-LABEL: test_mm512_fnmadd_round_pd: 3102; X64: # %bb.0: # %entry 3103; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0 3104; X64-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3105; X64-NEXT: retq 3106entry: 3107 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3108 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8) 3109 ret <8 x double> %0 3110} 3111 3112define <8 x double> @test_mm512_mask3_fnmadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 3113; X86-LABEL: test_mm512_mask3_fnmadd_round_pd: 3114; X86: # %bb.0: # %entry 3115; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3116; X86-NEXT: kmovw %eax, %k1 3117; X86-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3118; X86-NEXT: vmovapd %zmm2, %zmm0 3119; X86-NEXT: retl 3120; 3121; X64-LABEL: test_mm512_mask3_fnmadd_round_pd: 3122; X64: # %bb.0: # %entry 3123; X64-NEXT: kmovw %edi, %k1 3124; X64-NEXT: vfnmadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3125; X64-NEXT: vmovapd %zmm2, %zmm0 3126; X64-NEXT: retq 3127entry: 3128 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double 
; mask3_fnmadd_round_pd tail, _mm512_maskz_fnmadd_round_pd, and _mm512_fnmsub_round_pd (both A and C negated). Autogenerated CHECK lines — regenerate with utils/update_llc_test_checks.py, do not hand-edit.
-0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3129 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8) 3130 %1 = bitcast i8 %__U to <8 x i1> 3131 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3132 ret <8 x double> %2 3133} 3134 3135define <8 x double> @test_mm512_maskz_fnmadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3136; X86-LABEL: test_mm512_maskz_fnmadd_round_pd: 3137; X86: # %bb.0: # %entry 3138; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3139; X86-NEXT: kmovw %eax, %k1 3140; X86-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3141; X86-NEXT: retl 3142; 3143; X64-LABEL: test_mm512_maskz_fnmadd_round_pd: 3144; X64: # %bb.0: # %entry 3145; X64-NEXT: kmovw %edi, %k1 3146; X64-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3147; X64-NEXT: retq 3148entry: 3149 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3150 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8) 3151 %1 = bitcast i8 %__U to <8 x i1> 3152 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3153 ret <8 x double> %2 3154} 3155 3156define <8 x double> @test_mm512_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3157; CHECK-LABEL: test_mm512_fnmsub_round_pd: 3158; CHECK: # %bb.0: # %entry 3159; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0] 3160; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 3161; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 3162; CHECK-NEXT: vfmadd231pd {rn-sae}, %zmm4, %zmm1, %zmm0 3163; CHECK-NEXT: ret{{[l|q]}} 3164entry: 3165 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, 
double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3166 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3167 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8) 3168 ret <8 x double> %0 3169} 3170 3171define <8 x double> @test_mm512_maskz_fnmsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3172; X86-LABEL: test_mm512_maskz_fnmsub_round_pd: 3173; X86: # %bb.0: # %entry 3174; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3175; X86-NEXT: kmovw %eax, %k1 3176; X86-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3177; X86-NEXT: retl 3178; 3179; X64-LABEL: test_mm512_maskz_fnmsub_round_pd: 3180; X64: # %bb.0: # %entry 3181; X64-NEXT: kmovw %edi, %k1 3182; X64-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3183; X64-NEXT: retq 3184entry: 3185 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3186 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3187 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %sub1, i32 8) 3188 %1 = bitcast i8 %__U to <8 x i1> 3189 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3190 ret <8 x double> %2 3191} 3192 3193define <8 x double> @test_mm512_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3194; CHECK-LABEL: test_mm512_fmadd_pd: 3195; CHECK: # %bb.0: # %entry 3196; CHECK-NEXT: 
vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3197; CHECK-NEXT: ret{{[l|q]}} 3198entry: 3199 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 3200 ret <8 x double> %0 3201} 3202 3203define <8 x double> @test_mm512_mask_fmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3204; X86-LABEL: test_mm512_mask_fmadd_pd: 3205; X86: # %bb.0: # %entry 3206; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3207; X86-NEXT: kmovw %eax, %k1 3208; X86-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2 3209; X86-NEXT: retl 3210; 3211; X64-LABEL: test_mm512_mask_fmadd_pd: 3212; X64: # %bb.0: # %entry 3213; X64-NEXT: kmovw %edi, %k1 3214; X64-NEXT: vfmadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2 3215; X64-NEXT: retq 3216entry: 3217 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 3218 %1 = bitcast i8 %__U to <8 x i1> 3219 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3220 ret <8 x double> %2 3221} 3222 3223define <8 x double> @test_mm512_mask3_fmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 3224; X86-LABEL: test_mm512_mask3_fmadd_pd: 3225; X86: # %bb.0: # %entry 3226; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3227; X86-NEXT: kmovw %eax, %k1 3228; X86-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2 3229; X86-NEXT: vmovapd %zmm2, %zmm0 3230; X86-NEXT: retl 3231; 3232; X64-LABEL: test_mm512_mask3_fmadd_pd: 3233; X64: # %bb.0: # %entry 3234; X64-NEXT: kmovw %edi, %k1 3235; X64-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2 3236; X64-NEXT: vmovapd %zmm2, %zmm0 3237; X64-NEXT: retq 3238entry: 3239 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 3240 %1 = bitcast i8 %__U to <8 x i1> 3241 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3242 ret <8 x double> %2 3243} 3244 3245define <8 x double> @test_mm512_maskz_fmadd_pd(i8 
zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3246; X86-LABEL: test_mm512_maskz_fmadd_pd: 3247; X86: # %bb.0: # %entry 3248; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3249; X86-NEXT: kmovw %eax, %k1 3250; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3251; X86-NEXT: retl 3252; 3253; X64-LABEL: test_mm512_maskz_fmadd_pd: 3254; X64: # %bb.0: # %entry 3255; X64-NEXT: kmovw %edi, %k1 3256; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3257; X64-NEXT: retq 3258entry: 3259 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 3260 %1 = bitcast i8 %__U to <8 x i1> 3261 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3262 ret <8 x double> %2 3263} 3264 3265define <8 x double> @test_mm512_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3266; X86-LABEL: test_mm512_fmsub_pd: 3267; X86: # %bb.0: # %entry 3268; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2 3269; X86-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3270; X86-NEXT: retl 3271; 3272; X64-LABEL: test_mm512_fmsub_pd: 3273; X64: # %bb.0: # %entry 3274; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3275; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3276; X64-NEXT: retq 3277entry: 3278 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3279 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 3280 ret <8 x double> %0 3281} 3282 3283define <8 x double> @test_mm512_mask_fmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3284; X86-LABEL: test_mm512_mask_fmsub_pd: 3285; X86: # %bb.0: # %entry 3286; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3287; X86-NEXT: kmovw %eax, %k1 3288; X86-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 
* zmm1) - zmm2 3289; X86-NEXT: retl 3290; 3291; X64-LABEL: test_mm512_mask_fmsub_pd: 3292; X64: # %bb.0: # %entry 3293; X64-NEXT: kmovw %edi, %k1 3294; X64-NEXT: vfmsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2 3295; X64-NEXT: retq 3296entry: 3297 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3298 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 3299 %1 = bitcast i8 %__U to <8 x i1> 3300 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3301 ret <8 x double> %2 3302} 3303 3304define <8 x double> @test_mm512_maskz_fmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3305; X86-LABEL: test_mm512_maskz_fmsub_pd: 3306; X86: # %bb.0: # %entry 3307; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3308; X86-NEXT: kmovw %eax, %k1 3309; X86-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 3310; X86-NEXT: retl 3311; 3312; X64-LABEL: test_mm512_maskz_fmsub_pd: 3313; X64: # %bb.0: # %entry 3314; X64-NEXT: kmovw %edi, %k1 3315; X64-NEXT: vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 3316; X64-NEXT: retq 3317entry: 3318 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3319 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 3320 %1 = bitcast i8 %__U to <8 x i1> 3321 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3322 ret <8 x double> %2 3323} 3324 3325define <8 x double> @test_mm512_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3326; X86-LABEL: test_mm512_fnmadd_pd: 3327; X86: # %bb.0: # %entry 3328; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm0, %zmm0 3329; X86-NEXT: 
vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3330; X86-NEXT: retl 3331; 3332; X64-LABEL: test_mm512_fnmadd_pd: 3333; X64: # %bb.0: # %entry 3334; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm0, %zmm0 3335; X64-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3336; X64-NEXT: retq 3337entry: 3338 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3339 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10 3340 ret <8 x double> %0 3341} 3342 3343define <8 x double> @test_mm512_mask3_fnmadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 3344; X86-LABEL: test_mm512_mask3_fnmadd_pd: 3345; X86: # %bb.0: # %entry 3346; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3347; X86-NEXT: kmovw %eax, %k1 3348; X86-NEXT: vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2 3349; X86-NEXT: vmovapd %zmm2, %zmm0 3350; X86-NEXT: retl 3351; 3352; X64-LABEL: test_mm512_mask3_fnmadd_pd: 3353; X64: # %bb.0: # %entry 3354; X64-NEXT: kmovw %edi, %k1 3355; X64-NEXT: vfnmadd231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2 3356; X64-NEXT: vmovapd %zmm2, %zmm0 3357; X64-NEXT: retq 3358entry: 3359 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3360 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10 3361 %1 = bitcast i8 %__U to <8 x i1> 3362 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3363 ret <8 x double> %2 3364} 3365 3366define <8 x double> @test_mm512_maskz_fnmadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3367; X86-LABEL: test_mm512_maskz_fnmadd_pd: 3368; X86: # %bb.0: # %entry 3369; X86-NEXT: movb 
{{[0-9]+}}(%esp), %al 3370; X86-NEXT: kmovw %eax, %k1 3371; X86-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 3372; X86-NEXT: retl 3373; 3374; X64-LABEL: test_mm512_maskz_fnmadd_pd: 3375; X64: # %bb.0: # %entry 3376; X64-NEXT: kmovw %edi, %k1 3377; X64-NEXT: vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 3378; X64-NEXT: retq 3379entry: 3380 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3381 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10 3382 %1 = bitcast i8 %__U to <8 x i1> 3383 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3384 ret <8 x double> %2 3385} 3386 3387define <8 x double> @test_mm512_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3388; CHECK-LABEL: test_mm512_fnmsub_pd: 3389; CHECK: # %bb.0: # %entry 3390; CHECK-NEXT: vpbroadcastq {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0] 3391; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 3392; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 3393; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0 3394; CHECK-NEXT: ret{{[l|q]}} 3395entry: 3396 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3397 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3398 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10 3399 ret <8 x double> %0 3400} 3401 3402define <8 x double> @test_mm512_maskz_fnmsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3403; 
X86-LABEL: test_mm512_maskz_fnmsub_pd: 3404; X86: # %bb.0: # %entry 3405; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3406; X86-NEXT: kmovw %eax, %k1 3407; X86-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 3408; X86-NEXT: retl 3409; 3410; X64-LABEL: test_mm512_maskz_fnmsub_pd: 3411; X64: # %bb.0: # %entry 3412; X64-NEXT: kmovw %edi, %k1 3413; X64-NEXT: vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 3414; X64-NEXT: retq 3415entry: 3416 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 3417 %sub1.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3418 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %sub1.i) #10 3419 %1 = bitcast i8 %__U to <8 x i1> 3420 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3421 ret <8 x double> %2 3422} 3423 3424define <16 x float> @test_mm512_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3425; CHECK-LABEL: test_mm512_fmadd_round_ps: 3426; CHECK: # %bb.0: # %entry 3427; CHECK-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3428; CHECK-NEXT: ret{{[l|q]}} 3429entry: 3430 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 3431 ret <16 x float> %0 3432} 3433 3434declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1 3435 3436define <16 x float> @test_mm512_mask_fmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 3437; X86-LABEL: test_mm512_mask_fmadd_round_ps: 3438; X86: # %bb.0: # %entry 3439; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3440; X86-NEXT: vfmadd132ps {rn-sae}, 
%zmm1, %zmm2, %zmm0 {%k1} 3441; X86-NEXT: retl 3442; 3443; X64-LABEL: test_mm512_mask_fmadd_round_ps: 3444; X64: # %bb.0: # %entry 3445; X64-NEXT: kmovw %edi, %k1 3446; X64-NEXT: vfmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3447; X64-NEXT: retq 3448entry: 3449 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 3450 %1 = bitcast i16 %__U to <16 x i1> 3451 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 3452 ret <16 x float> %2 3453} 3454 3455define <16 x float> @test_mm512_mask3_fmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 3456; X86-LABEL: test_mm512_mask3_fmadd_round_ps: 3457; X86: # %bb.0: # %entry 3458; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3459; X86-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3460; X86-NEXT: vmovaps %zmm2, %zmm0 3461; X86-NEXT: retl 3462; 3463; X64-LABEL: test_mm512_mask3_fmadd_round_ps: 3464; X64: # %bb.0: # %entry 3465; X64-NEXT: kmovw %edi, %k1 3466; X64-NEXT: vfmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3467; X64-NEXT: vmovaps %zmm2, %zmm0 3468; X64-NEXT: retq 3469entry: 3470 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 3471 %1 = bitcast i16 %__U to <16 x i1> 3472 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 3473 ret <16 x float> %2 3474} 3475 3476define <16 x float> @test_mm512_maskz_fmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3477; X86-LABEL: test_mm512_maskz_fmadd_round_ps: 3478; X86: # %bb.0: # %entry 3479; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3480; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3481; X86-NEXT: retl 3482; 3483; X64-LABEL: test_mm512_maskz_fmadd_round_ps: 3484; X64: # %bb.0: # %entry 3485; X64-NEXT: kmovw %edi, %k1 3486; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3487; X64-NEXT: retq 3488entry: 3489 %0 
= tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 3490 %1 = bitcast i16 %__U to <16 x i1> 3491 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3492 ret <16 x float> %2 3493} 3494 3495define <16 x float> @test_mm512_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3496; X86-LABEL: test_mm512_fmsub_round_ps: 3497; X86: # %bb.0: # %entry 3498; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2 3499; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3500; X86-NEXT: retl 3501; 3502; X64-LABEL: test_mm512_fmsub_round_ps: 3503; X64: # %bb.0: # %entry 3504; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2 3505; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3506; X64-NEXT: retq 3507entry: 3508 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3509 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 3510 ret <16 x float> %0 3511} 3512 3513define <16 x float> @test_mm512_mask_fmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 3514; X86-LABEL: test_mm512_mask_fmsub_round_ps: 3515; X86: # %bb.0: # %entry 3516; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3517; X86-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3518; X86-NEXT: retl 3519; 3520; X64-LABEL: test_mm512_mask_fmsub_round_ps: 3521; X64: # %bb.0: # %entry 3522; X64-NEXT: kmovw %edi, %k1 3523; X64-NEXT: vfmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3524; X64-NEXT: retq 3525entry: 3526 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3527 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 3528 %1 = bitcast i16 %__U to <16 x i1> 3529 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 3530 ret <16 x float> %2 3531} 3532 3533define <16 x float> @test_mm512_maskz_fmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3534; X86-LABEL: test_mm512_maskz_fmsub_round_ps: 3535; X86: # %bb.0: # %entry 3536; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3537; X86-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3538; X86-NEXT: retl 3539; 3540; X64-LABEL: test_mm512_maskz_fmsub_round_ps: 3541; X64: # %bb.0: # %entry 3542; X64-NEXT: kmovw %edi, %k1 3543; X64-NEXT: vfmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3544; X64-NEXT: retq 3545entry: 3546 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3547 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 3548 %1 = bitcast i16 %__U to <16 x i1> 3549 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3550 ret <16 x float> %2 3551} 3552 3553define <16 x float> @test_mm512_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3554; X86-LABEL: test_mm512_fnmadd_round_ps: 3555; X86: # %bb.0: # %entry 3556; X86-NEXT: vpxord 
{{\.LCPI.*}}{1to16}, %zmm0, %zmm0 3557; X86-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3558; X86-NEXT: retl 3559; 3560; X64-LABEL: test_mm512_fnmadd_round_ps: 3561; X64: # %bb.0: # %entry 3562; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0 3563; X64-NEXT: vfmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 3564; X64-NEXT: retq 3565entry: 3566 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3567 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8) 3568 ret <16 x float> %0 3569} 3570 3571define <16 x float> @test_mm512_mask3_fnmadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 3572; X86-LABEL: test_mm512_mask3_fnmadd_round_ps: 3573; X86: # %bb.0: # %entry 3574; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3575; X86-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3576; X86-NEXT: vmovaps %zmm2, %zmm0 3577; X86-NEXT: retl 3578; 3579; X64-LABEL: test_mm512_mask3_fnmadd_round_ps: 3580; X64: # %bb.0: # %entry 3581; X64-NEXT: kmovw %edi, %k1 3582; X64-NEXT: vfnmadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3583; X64-NEXT: vmovaps %zmm2, %zmm0 3584; X64-NEXT: retq 3585entry: 3586 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3587 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> 
%__C, i32 8) 3588 %1 = bitcast i16 %__U to <16 x i1> 3589 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 3590 ret <16 x float> %2 3591} 3592 3593define <16 x float> @test_mm512_maskz_fnmadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3594; X86-LABEL: test_mm512_maskz_fnmadd_round_ps: 3595; X86: # %bb.0: # %entry 3596; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3597; X86-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3598; X86-NEXT: retl 3599; 3600; X64-LABEL: test_mm512_maskz_fnmadd_round_ps: 3601; X64: # %bb.0: # %entry 3602; X64-NEXT: kmovw %edi, %k1 3603; X64-NEXT: vfnmadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3604; X64-NEXT: retq 3605entry: 3606 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3607 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8) 3608 %1 = bitcast i16 %__U to <16 x i1> 3609 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3610 ret <16 x float> %2 3611} 3612 3613define <16 x float> @test_mm512_fnmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3614; CHECK-LABEL: test_mm512_fnmsub_round_ps: 3615; CHECK: # %bb.0: # %entry 3616; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0] 3617; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 3618; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 3619; CHECK-NEXT: vfmadd231ps {rn-sae}, %zmm4, %zmm1, %zmm0 3620; CHECK-NEXT: ret{{[l|q]}} 3621entry: 3622 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3623 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3624 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8) 3625 ret <16 x float> %0 3626} 3627 3628define <16 x float> @test_mm512_maskz_fnmsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3629; X86-LABEL: test_mm512_maskz_fnmsub_round_ps: 3630; X86: # %bb.0: # %entry 3631; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3632; X86-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3633; X86-NEXT: retl 3634; 3635; X64-LABEL: test_mm512_maskz_fnmsub_round_ps: 3636; X64: # %bb.0: # %entry 3637; X64-NEXT: kmovw %edi, %k1 3638; X64-NEXT: vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3639; X64-NEXT: retq 3640entry: 3641 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3642 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, 
float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3643 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %sub1, i32 8) 3644 %1 = bitcast i16 %__U to <16 x i1> 3645 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3646 ret <16 x float> %2 3647} 3648 3649define <16 x float> @test_mm512_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3650; CHECK-LABEL: test_mm512_fmadd_ps: 3651; CHECK: # %bb.0: # %entry 3652; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3653; CHECK-NEXT: ret{{[l|q]}} 3654entry: 3655 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 3656 ret <16 x float> %0 3657} 3658 3659define <16 x float> @test_mm512_mask_fmadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 3660; X86-LABEL: test_mm512_mask_fmadd_ps: 3661; X86: # %bb.0: # %entry 3662; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3663; X86-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2 3664; X86-NEXT: retl 3665; 3666; X64-LABEL: test_mm512_mask_fmadd_ps: 3667; X64: # %bb.0: # %entry 3668; X64-NEXT: kmovw %edi, %k1 3669; X64-NEXT: vfmadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) + zmm2 3670; X64-NEXT: retq 3671entry: 3672 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 3673 %1 = bitcast i16 %__U to <16 x i1> 3674 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 3675 ret <16 x float> %2 3676} 3677 3678define <16 x float> @test_mm512_mask3_fmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 3679; X86-LABEL: test_mm512_mask3_fmadd_ps: 3680; X86: # %bb.0: # %entry 3681; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3682; X86-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2 3683; X86-NEXT: vmovaps %zmm2, %zmm0 3684; X86-NEXT: retl 3685; 3686; X64-LABEL: 
test_mm512_mask3_fmadd_ps: 3687; X64: # %bb.0: # %entry 3688; X64-NEXT: kmovw %edi, %k1 3689; X64-NEXT: vfmadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) + zmm2 3690; X64-NEXT: vmovaps %zmm2, %zmm0 3691; X64-NEXT: retq 3692entry: 3693 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 3694 %1 = bitcast i16 %__U to <16 x i1> 3695 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 3696 ret <16 x float> %2 3697} 3698 3699define <16 x float> @test_mm512_maskz_fmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3700; X86-LABEL: test_mm512_maskz_fmadd_ps: 3701; X86: # %bb.0: # %entry 3702; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3703; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3704; X86-NEXT: retl 3705; 3706; X64-LABEL: test_mm512_maskz_fmadd_ps: 3707; X64: # %bb.0: # %entry 3708; X64-NEXT: kmovw %edi, %k1 3709; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3710; X64-NEXT: retq 3711entry: 3712 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 3713 %1 = bitcast i16 %__U to <16 x i1> 3714 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3715 ret <16 x float> %2 3716} 3717 3718define <16 x float> @test_mm512_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3719; X86-LABEL: test_mm512_fmsub_ps: 3720; X86: # %bb.0: # %entry 3721; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2 3722; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3723; X86-NEXT: retl 3724; 3725; X64-LABEL: test_mm512_fmsub_ps: 3726; X64: # %bb.0: # %entry 3727; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2 3728; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3729; X64-NEXT: retq 3730entry: 3731 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, 
float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3732 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 3733 ret <16 x float> %0 3734} 3735 3736define <16 x float> @test_mm512_mask_fmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 3737; X86-LABEL: test_mm512_mask_fmsub_ps: 3738; X86: # %bb.0: # %entry 3739; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3740; X86-NEXT: vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2 3741; X86-NEXT: retl 3742; 3743; X64-LABEL: test_mm512_mask_fmsub_ps: 3744; X64: # %bb.0: # %entry 3745; X64-NEXT: kmovw %edi, %k1 3746; X64-NEXT: vfmsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) - zmm2 3747; X64-NEXT: retq 3748entry: 3749 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3750 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 3751 %1 = bitcast i16 %__U to <16 x i1> 3752 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 3753 ret <16 x float> %2 3754} 3755 3756define <16 x float> @test_mm512_maskz_fmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3757; X86-LABEL: test_mm512_maskz_fmsub_ps: 3758; X86: # %bb.0: # %entry 3759; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3760; X86-NEXT: vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2 3761; X86-NEXT: retl 3762; 3763; X64-LABEL: test_mm512_maskz_fmsub_ps: 3764; X64: # %bb.0: # %entry 3765; X64-NEXT: kmovw %edi, %k1 3766; X64-NEXT: vfmsub213ps {{.*#+}} zmm0 = 
(zmm1 * zmm0) - zmm2 3767; X64-NEXT: retq 3768entry: 3769 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3770 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 3771 %1 = bitcast i16 %__U to <16 x i1> 3772 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3773 ret <16 x float> %2 3774} 3775 3776define <16 x float> @test_mm512_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3777; X86-LABEL: test_mm512_fnmadd_ps: 3778; X86: # %bb.0: # %entry 3779; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm0, %zmm0 3780; X86-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3781; X86-NEXT: retl 3782; 3783; X64-LABEL: test_mm512_fnmadd_ps: 3784; X64: # %bb.0: # %entry 3785; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm0, %zmm0 3786; X64-NEXT: vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2 3787; X64-NEXT: retq 3788entry: 3789 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3790 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10 3791 ret <16 x float> %0 3792} 3793 3794define <16 x float> @test_mm512_mask3_fnmadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 3795; X86-LABEL: test_mm512_mask3_fnmadd_ps: 3796; X86: # %bb.0: # %entry 3797; X86-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 3798; X86-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2 3799; X86-NEXT: vmovaps %zmm2, %zmm0 3800; X86-NEXT: retl 3801; 3802; X64-LABEL: test_mm512_mask3_fnmadd_ps: 3803; X64: # %bb.0: # %entry 3804; X64-NEXT: kmovw %edi, %k1 3805; X64-NEXT: vfnmadd231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) + zmm2 3806; X64-NEXT: vmovaps %zmm2, %zmm0 3807; X64-NEXT: retq 3808entry: 3809 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3810 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10 3811 %1 = bitcast i16 %__U to <16 x i1> 3812 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 3813 ret <16 x float> %2 3814} 3815 3816define <16 x float> @test_mm512_maskz_fnmadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3817; X86-LABEL: test_mm512_maskz_fnmadd_ps: 3818; X86: # %bb.0: # %entry 3819; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3820; X86-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 3821; X86-NEXT: retl 3822; 3823; X64-LABEL: test_mm512_maskz_fnmadd_ps: 3824; X64: # %bb.0: # %entry 3825; X64-NEXT: kmovw %edi, %k1 3826; X64-NEXT: vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2 3827; X64-NEXT: retq 3828entry: 3829 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3830 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 
x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10 3831 %1 = bitcast i16 %__U to <16 x i1> 3832 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3833 ret <16 x float> %2 3834} 3835 3836define <16 x float> @test_mm512_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3837; CHECK-LABEL: test_mm512_fnmsub_ps: 3838; CHECK: # %bb.0: # %entry 3839; CHECK-NEXT: vpbroadcastd {{.*#+}} zmm3 = [-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0] 3840; CHECK-NEXT: vpxorq %zmm3, %zmm0, %zmm4 3841; CHECK-NEXT: vpxorq %zmm3, %zmm2, %zmm0 3842; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm1 * zmm4) + zmm0 3843; CHECK-NEXT: ret{{[l|q]}} 3844entry: 3845 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3846 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3847 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10 3848 ret <16 x float> %0 3849} 3850 3851define <16 x float> @test_mm512_maskz_fnmsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 3852; X86-LABEL: test_mm512_maskz_fnmsub_ps: 3853; X86: # %bb.0: # %entry 3854; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 3855; X86-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 3856; X86-NEXT: retl 3857; 3858; X64-LABEL: test_mm512_maskz_fnmsub_ps: 3859; X64: # %bb.0: # 
%entry 3860; X64-NEXT: kmovw %edi, %k1 3861; X64-NEXT: vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2 3862; X64-NEXT: retq 3863entry: 3864 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 3865 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 3866 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %sub1.i) #10 3867 %1 = bitcast i16 %__U to <16 x i1> 3868 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 3869 ret <16 x float> %2 3870} 3871 3872define <8 x double> @test_mm512_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3873; CHECK-LABEL: test_mm512_fmaddsub_round_pd: 3874; CHECK: # %bb.0: # %entry 3875; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3876; CHECK-NEXT: ret{{[l|q]}} 3877entry: 3878 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3879 ret <8 x double> %0 3880} 3881 3882declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32) #1 3883 3884define <8 x double> @test_mm512_mask_fmaddsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3885; X86-LABEL: test_mm512_mask_fmaddsub_round_pd: 3886; X86: # %bb.0: # %entry 3887; X86-NEXT: movb {{[0-9]+}}(%esp), %al 
3888; X86-NEXT: kmovw %eax, %k1 3889; X86-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3890; X86-NEXT: retl 3891; 3892; X64-LABEL: test_mm512_mask_fmaddsub_round_pd: 3893; X64: # %bb.0: # %entry 3894; X64-NEXT: kmovw %edi, %k1 3895; X64-NEXT: vfmaddsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3896; X64-NEXT: retq 3897entry: 3898 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3899 %1 = bitcast i8 %__U to <8 x i1> 3900 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3901 ret <8 x double> %2 3902} 3903 3904define <8 x double> @test_mm512_mask3_fmaddsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 3905; X86-LABEL: test_mm512_mask3_fmaddsub_round_pd: 3906; X86: # %bb.0: # %entry 3907; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3908; X86-NEXT: kmovw %eax, %k1 3909; X86-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3910; X86-NEXT: vmovapd %zmm2, %zmm0 3911; X86-NEXT: retl 3912; 3913; X64-LABEL: test_mm512_mask3_fmaddsub_round_pd: 3914; X64: # %bb.0: # %entry 3915; X64-NEXT: kmovw %edi, %k1 3916; X64-NEXT: vfmaddsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 3917; X64-NEXT: vmovapd %zmm2, %zmm0 3918; X64-NEXT: retq 3919entry: 3920 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3921 %1 = bitcast i8 %__U to <8 x i1> 3922 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 3923 ret <8 x double> %2 3924} 3925 3926define <8 x double> @test_mm512_maskz_fmaddsub_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3927; X86-LABEL: test_mm512_maskz_fmaddsub_round_pd: 3928; X86: # %bb.0: # %entry 3929; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3930; X86-NEXT: kmovw %eax, %k1 3931; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3932; X86-NEXT: retl 3933; 3934; X64-LABEL: test_mm512_maskz_fmaddsub_round_pd: 
3935; X64: # %bb.0: # %entry 3936; X64-NEXT: kmovw %edi, %k1 3937; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3938; X64-NEXT: retq 3939entry: 3940 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i32 8) 3941 %1 = bitcast i8 %__U to <8 x i1> 3942 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 3943 ret <8 x double> %2 3944} 3945 3946define <8 x double> @test_mm512_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3947; X86-LABEL: test_mm512_fmsubadd_round_pd: 3948; X86: # %bb.0: # %entry 3949; X86-NEXT: vpxorq {{\.LCPI.*}}{1to8}, %zmm2, %zmm2 3950; X86-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3951; X86-NEXT: retl 3952; 3953; X64-LABEL: test_mm512_fmsubadd_round_pd: 3954; X64: # %bb.0: # %entry 3955; X64-NEXT: vpxorq {{.*}}(%rip){1to8}, %zmm2, %zmm2 3956; X64-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 3957; X64-NEXT: retq 3958entry: 3959 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3960 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 3961 ret <8 x double> %0 3962} 3963 3964define <8 x double> @test_mm512_mask_fmsubadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 3965; X86-LABEL: test_mm512_mask_fmsubadd_round_pd: 3966; X86: # %bb.0: # %entry 3967; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3968; X86-NEXT: kmovw %eax, %k1 3969; X86-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3970; X86-NEXT: retl 3971; 3972; X64-LABEL: test_mm512_mask_fmsubadd_round_pd: 3973; X64: # %bb.0: # %entry 3974; X64-NEXT: kmovw %edi, %k1 3975; X64-NEXT: vfmsubadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 3976; X64-NEXT: retq 3977entry: 3978 %sub = fsub <8 
x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 3979 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 3980 %1 = bitcast i8 %__U to <8 x i1> 3981 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 3982 ret <8 x double> %2 3983} 3984 3985define <8 x double> @test_mm512_maskz_fmsubadd_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 3986; X86-LABEL: test_mm512_maskz_fmsubadd_round_pd: 3987; X86: # %bb.0: # %entry 3988; X86-NEXT: movb {{[0-9]+}}(%esp), %al 3989; X86-NEXT: kmovw %eax, %k1 3990; X86-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3991; X86-NEXT: retl 3992; 3993; X64-LABEL: test_mm512_maskz_fmsubadd_round_pd: 3994; X64: # %bb.0: # %entry 3995; X64-NEXT: kmovw %edi, %k1 3996; X64-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 3997; X64-NEXT: retq 3998entry: 3999 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4000 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 4001 %1 = bitcast i8 %__U to <8 x i1> 4002 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 4003 ret <8 x double> %2 4004} 4005 4006define <8 x double> @test_mm512_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4007; CHECK-LABEL: test_mm512_fmaddsub_pd: 4008; CHECK: # %bb.0: # %entry 4009; CHECK-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 4010; CHECK-NEXT: ret{{[l|q]}} 4011entry: 4012 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4013 %1 = fsub <8 x double> 
<double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4014 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10 4015 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4016 ret <8 x double> %3 4017} 4018 4019define <8 x double> @test_mm512_mask_fmaddsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4020; X86-LABEL: test_mm512_mask_fmaddsub_pd: 4021; X86: # %bb.0: # %entry 4022; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4023; X86-NEXT: kmovw %eax, %k1 4024; X86-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2 4025; X86-NEXT: retl 4026; 4027; X64-LABEL: test_mm512_mask_fmaddsub_pd: 4028; X64: # %bb.0: # %entry 4029; X64-NEXT: kmovw %edi, %k1 4030; X64-NEXT: vfmaddsub132pd {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2 4031; X64-NEXT: retq 4032entry: 4033 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4034 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4035 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10 4036 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4037 %4 = bitcast i8 %__U to <8 x i1> 4038 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__A 4039 ret <8 x double> %5 4040} 4041 4042define <8 x double> @test_mm512_mask3_fmaddsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4043; X86-LABEL: test_mm512_mask3_fmaddsub_pd: 4044; X86: # %bb.0: # %entry 4045; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4046; 
X86-NEXT: kmovw %eax, %k1 4047; X86-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2 4048; X86-NEXT: vmovapd %zmm2, %zmm0 4049; X86-NEXT: retl 4050; 4051; X64-LABEL: test_mm512_mask3_fmaddsub_pd: 4052; X64: # %bb.0: # %entry 4053; X64-NEXT: kmovw %edi, %k1 4054; X64-NEXT: vfmaddsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2 4055; X64-NEXT: vmovapd %zmm2, %zmm0 4056; X64-NEXT: retq 4057entry: 4058 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4059 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4060 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10 4061 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4062 %4 = bitcast i8 %__U to <8 x i1> 4063 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> %__C 4064 ret <8 x double> %5 4065} 4066 4067define <8 x double> @test_mm512_maskz_fmaddsub_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4068; X86-LABEL: test_mm512_maskz_fmaddsub_pd: 4069; X86: # %bb.0: # %entry 4070; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4071; X86-NEXT: kmovw %eax, %k1 4072; X86-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 4073; X86-NEXT: retl 4074; 4075; X64-LABEL: test_mm512_maskz_fmaddsub_pd: 4076; X64: # %bb.0: # %entry 4077; X64-NEXT: kmovw %edi, %k1 4078; X64-NEXT: vfmaddsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 4079; X64-NEXT: retq 4080entry: 4081 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4082 %1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double 
-0.000000e+00>, %__C 4083 %2 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %1) #10 4084 %3 = shufflevector <8 x double> %2, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4085 %4 = bitcast i8 %__U to <8 x i1> 4086 %5 = select <8 x i1> %4, <8 x double> %3, <8 x double> zeroinitializer 4087 ret <8 x double> %5 4088} 4089 4090define <8 x double> @test_mm512_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4091; CHECK-LABEL: test_mm512_fmsubadd_pd: 4092; CHECK: # %bb.0: # %entry 4093; CHECK-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2 4094; CHECK-NEXT: ret{{[l|q]}} 4095entry: 4096 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4097 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 4098 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4099 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4100 ret <8 x double> %2 4101} 4102 4103define <8 x double> @test_mm512_mask_fmsubadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4104; X86-LABEL: test_mm512_mask_fmsubadd_pd: 4105; X86: # %bb.0: # %entry 4106; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4107; X86-NEXT: kmovw %eax, %k1 4108; X86-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2 4109; X86-NEXT: retl 4110; 4111; X64-LABEL: test_mm512_mask_fmsubadd_pd: 4112; X64: # %bb.0: # %entry 4113; X64-NEXT: kmovw %edi, %k1 4114; X64-NEXT: vfmsubadd132pd {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2 4115; X64-NEXT: retq 4116entry: 4117 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double 
-0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4118 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 4119 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4120 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4121 %3 = bitcast i8 %__U to <8 x i1> 4122 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__A 4123 ret <8 x double> %4 4124} 4125 4126define <8 x double> @test_mm512_maskz_fmsubadd_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B, <8 x double> %__C) { 4127; X86-LABEL: test_mm512_maskz_fmsubadd_pd: 4128; X86: # %bb.0: # %entry 4129; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4130; X86-NEXT: kmovw %eax, %k1 4131; X86-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2 4132; X86-NEXT: retl 4133; 4134; X64-LABEL: test_mm512_maskz_fmsubadd_pd: 4135; X64: # %bb.0: # %entry 4136; X64-NEXT: kmovw %edi, %k1 4137; X64-NEXT: vfmsubadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2 4138; X64-NEXT: retq 4139entry: 4140 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4141 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 4142 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4143 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4144 %3 = bitcast i8 %__U to <8 x i1> 4145 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> zeroinitializer 4146 ret <8 x double> %4 4147} 4148 4149define <16 x float> @test_mm512_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x 
float> %__C) { 4150; CHECK-LABEL: test_mm512_fmaddsub_round_ps: 4151; CHECK: # %bb.0: # %entry 4152; CHECK-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 4153; CHECK-NEXT: ret{{[l|q]}} 4154entry: 4155 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 4156 ret <16 x float> %0 4157} 4158 4159declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32) #1 4160 4161define <16 x float> @test_mm512_mask_fmaddsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4162; X86-LABEL: test_mm512_mask_fmaddsub_round_ps: 4163; X86: # %bb.0: # %entry 4164; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4165; X86-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4166; X86-NEXT: retl 4167; 4168; X64-LABEL: test_mm512_mask_fmaddsub_round_ps: 4169; X64: # %bb.0: # %entry 4170; X64-NEXT: kmovw %edi, %k1 4171; X64-NEXT: vfmaddsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4172; X64-NEXT: retq 4173entry: 4174 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 4175 %1 = bitcast i16 %__U to <16 x i1> 4176 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4177 ret <16 x float> %2 4178} 4179 4180define <16 x float> @test_mm512_mask3_fmaddsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4181; X86-LABEL: test_mm512_mask3_fmaddsub_round_ps: 4182; X86: # %bb.0: # %entry 4183; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4184; X86-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4185; X86-NEXT: vmovaps %zmm2, %zmm0 4186; X86-NEXT: retl 4187; 4188; X64-LABEL: test_mm512_mask3_fmaddsub_round_ps: 4189; X64: # %bb.0: # %entry 4190; X64-NEXT: kmovw %edi, %k1 4191; X64-NEXT: vfmaddsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4192; X64-NEXT: vmovaps %zmm2, %zmm0 4193; X64-NEXT: retq 4194entry: 4195 %0 = tail call <16 x float> 
@llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 4196 %1 = bitcast i16 %__U to <16 x i1> 4197 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4198 ret <16 x float> %2 4199} 4200 4201define <16 x float> @test_mm512_maskz_fmaddsub_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4202; X86-LABEL: test_mm512_maskz_fmaddsub_round_ps: 4203; X86: # %bb.0: # %entry 4204; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4205; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4206; X86-NEXT: retl 4207; 4208; X64-LABEL: test_mm512_maskz_fmaddsub_round_ps: 4209; X64: # %bb.0: # %entry 4210; X64-NEXT: kmovw %edi, %k1 4211; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4212; X64-NEXT: retq 4213entry: 4214 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i32 8) 4215 %1 = bitcast i16 %__U to <16 x i1> 4216 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 4217 ret <16 x float> %2 4218} 4219 4220define <16 x float> @test_mm512_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4221; X86-LABEL: test_mm512_fmsubadd_round_ps: 4222; X86: # %bb.0: # %entry 4223; X86-NEXT: vpxord {{\.LCPI.*}}{1to16}, %zmm2, %zmm2 4224; X86-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 4225; X86-NEXT: retl 4226; 4227; X64-LABEL: test_mm512_fmsubadd_round_ps: 4228; X64: # %bb.0: # %entry 4229; X64-NEXT: vpxord {{.*}}(%rip){1to16}, %zmm2, %zmm2 4230; X64-NEXT: vfmaddsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0 4231; X64-NEXT: retq 4232entry: 4233 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00>, %__C 4234 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4235 ret <16 x float> %0 4236} 4237 4238define <16 x float> @test_mm512_mask_fmsubadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4239; X86-LABEL: test_mm512_mask_fmsubadd_round_ps: 4240; X86: # %bb.0: # %entry 4241; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4242; X86-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4243; X86-NEXT: retl 4244; 4245; X64-LABEL: test_mm512_mask_fmsubadd_round_ps: 4246; X64: # %bb.0: # %entry 4247; X64-NEXT: kmovw %edi, %k1 4248; X64-NEXT: vfmsubadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4249; X64-NEXT: retq 4250entry: 4251 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4252 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4253 %1 = bitcast i16 %__U to <16 x i1> 4254 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4255 ret <16 x float> %2 4256} 4257 4258define <16 x float> @test_mm512_maskz_fmsubadd_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4259; X86-LABEL: test_mm512_maskz_fmsubadd_round_ps: 4260; X86: # %bb.0: # %entry 4261; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4262; X86-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4263; X86-NEXT: retl 4264; 4265; X64-LABEL: test_mm512_maskz_fmsubadd_round_ps: 4266; X64: # %bb.0: # %entry 4267; X64-NEXT: kmovw %edi, %k1 4268; X64-NEXT: vfmsubadd213ps {rn-sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} 4269; X64-NEXT: retq 4270entry: 
4271 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4272 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4273 %1 = bitcast i16 %__U to <16 x i1> 4274 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 4275 ret <16 x float> %2 4276} 4277 4278define <16 x float> @test_mm512_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4279; CHECK-LABEL: test_mm512_fmaddsub_ps: 4280; CHECK: # %bb.0: # %entry 4281; CHECK-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 4282; CHECK-NEXT: ret{{[l|q]}} 4283entry: 4284 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4285 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4286 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10 4287 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4288 ret <16 x float> %3 4289} 4290 4291define <16 x float> @test_mm512_mask_fmaddsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4292; X86-LABEL: test_mm512_mask_fmaddsub_ps: 4293; X86: # %bb.0: # %entry 4294; X86-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 4295; X86-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2 4296; X86-NEXT: retl 4297; 4298; X64-LABEL: test_mm512_mask_fmaddsub_ps: 4299; X64: # %bb.0: # %entry 4300; X64-NEXT: kmovw %edi, %k1 4301; X64-NEXT: vfmaddsub132ps {{.*#+}} zmm0 = (zmm0 * zmm1) +/- zmm2 4302; X64-NEXT: retq 4303entry: 4304 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4305 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4306 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10 4307 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4308 %4 = bitcast i16 %__U to <16 x i1> 4309 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__A 4310 ret <16 x float> %5 4311} 4312 4313define <16 x float> @test_mm512_mask3_fmaddsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4314; X86-LABEL: test_mm512_mask3_fmaddsub_ps: 4315; X86: # %bb.0: # %entry 4316; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4317; X86-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2 4318; X86-NEXT: vmovaps %zmm2, %zmm0 4319; X86-NEXT: retl 4320; 4321; X64-LABEL: test_mm512_mask3_fmaddsub_ps: 4322; X64: # %bb.0: # %entry 4323; X64-NEXT: kmovw %edi, %k1 4324; X64-NEXT: vfmaddsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) +/- zmm2 4325; X64-NEXT: vmovaps %zmm2, %zmm0 4326; X64-NEXT: retq 4327entry: 4328 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 
4329 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4330 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10 4331 %3 = shufflevector <16 x float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4332 %4 = bitcast i16 %__U to <16 x i1> 4333 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> %__C 4334 ret <16 x float> %5 4335} 4336 4337define <16 x float> @test_mm512_maskz_fmaddsub_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4338; X86-LABEL: test_mm512_maskz_fmaddsub_ps: 4339; X86: # %bb.0: # %entry 4340; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4341; X86-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 4342; X86-NEXT: retl 4343; 4344; X64-LABEL: test_mm512_maskz_fmaddsub_ps: 4345; X64: # %bb.0: # %entry 4346; X64-NEXT: kmovw %edi, %k1 4347; X64-NEXT: vfmaddsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) +/- zmm2 4348; X64-NEXT: retq 4349entry: 4350 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4351 %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4352 %2 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %1) #10 4353 %3 = shufflevector <16 x 
float> %2, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4354 %4 = bitcast i16 %__U to <16 x i1> 4355 %5 = select <16 x i1> %4, <16 x float> %3, <16 x float> zeroinitializer 4356 ret <16 x float> %5 4357} 4358 4359define <16 x float> @test_mm512_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4360; CHECK-LABEL: test_mm512_fmsubadd_ps: 4361; CHECK: # %bb.0: # %entry 4362; CHECK-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2 4363; CHECK-NEXT: ret{{[l|q]}} 4364entry: 4365 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4366 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4367 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4368 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4369 ret <16 x float> %2 4370} 4371 4372define <16 x float> @test_mm512_mask_fmsubadd_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4373; X86-LABEL: test_mm512_mask_fmsubadd_ps: 4374; X86: # %bb.0: # %entry 4375; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4376; X86-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2 4377; X86-NEXT: retl 4378; 4379; X64-LABEL: test_mm512_mask_fmsubadd_ps: 4380; X64: # %bb.0: # %entry 4381; X64-NEXT: kmovw %edi, %k1 4382; X64-NEXT: vfmsubadd132ps {{.*#+}} zmm0 = (zmm0 * zmm1) -/+ zmm2 4383; X64-NEXT: retq 4384entry: 4385 
%sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4386 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4387 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4388 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4389 %3 = bitcast i16 %__U to <16 x i1> 4390 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__A 4391 ret <16 x float> %4 4392} 4393 4394define <16 x float> @test_mm512_maskz_fmsubadd_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B, <16 x float> %__C) { 4395; X86-LABEL: test_mm512_maskz_fmsubadd_ps: 4396; X86: # %bb.0: # %entry 4397; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4398; X86-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2 4399; X86-NEXT: retl 4400; 4401; X64-LABEL: test_mm512_maskz_fmsubadd_ps: 4402; X64: # %bb.0: # %entry 4403; X64-NEXT: kmovw %edi, %k1 4404; X64-NEXT: vfmsubadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) -/+ zmm2 4405; X64-NEXT: retq 4406entry: 4407 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4408 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4409 %1 = tail call <16 
x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4410 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4411 %3 = bitcast i16 %__U to <16 x i1> 4412 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> zeroinitializer 4413 ret <16 x float> %4 4414} 4415 4416define <8 x double> @test_mm512_mask3_fmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4417; X86-LABEL: test_mm512_mask3_fmsub_round_pd: 4418; X86: # %bb.0: # %entry 4419; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4420; X86-NEXT: kmovw %eax, %k1 4421; X86-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4422; X86-NEXT: vmovapd %zmm2, %zmm0 4423; X86-NEXT: retl 4424; 4425; X64-LABEL: test_mm512_mask3_fmsub_round_pd: 4426; X64: # %bb.0: # %entry 4427; X64-NEXT: kmovw %edi, %k1 4428; X64-NEXT: vfmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4429; X64-NEXT: vmovapd %zmm2, %zmm0 4430; X64-NEXT: retq 4431entry: 4432 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4433 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 4434 %1 = bitcast i8 %__U to <8 x i1> 4435 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4436 ret <8 x double> %2 4437} 4438 4439define <8 x double> @test_mm512_mask3_fmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4440; X86-LABEL: test_mm512_mask3_fmsub_pd: 4441; X86: # %bb.0: # %entry 4442; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4443; X86-NEXT: kmovw %eax, %k1 4444; X86-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2 4445; X86-NEXT: vmovapd %zmm2, %zmm0 4446; X86-NEXT: retl 4447; 4448; X64-LABEL: 
test_mm512_mask3_fmsub_pd: 4449; X64: # %bb.0: # %entry 4450; X64-NEXT: kmovw %edi, %k1 4451; X64-NEXT: vfmsub231pd {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2 4452; X64-NEXT: vmovapd %zmm2, %zmm0 4453; X64-NEXT: retq 4454entry: 4455 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4456 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 4457 %1 = bitcast i8 %__U to <8 x i1> 4458 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4459 ret <8 x double> %2 4460} 4461 4462define <16 x float> @test_mm512_mask3_fmsub_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4463; X86-LABEL: test_mm512_mask3_fmsub_round_ps: 4464; X86: # %bb.0: # %entry 4465; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4466; X86-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4467; X86-NEXT: vmovaps %zmm2, %zmm0 4468; X86-NEXT: retl 4469; 4470; X64-LABEL: test_mm512_mask3_fmsub_round_ps: 4471; X64: # %bb.0: # %entry 4472; X64-NEXT: kmovw %edi, %k1 4473; X64-NEXT: vfmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4474; X64-NEXT: vmovaps %zmm2, %zmm0 4475; X64-NEXT: retq 4476entry: 4477 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4478 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4479 %1 = bitcast i16 %__U to <16 x i1> 4480 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4481 ret <16 x float> %2 4482} 4483 4484define <16 x float> 
@test_mm512_mask3_fmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4485; X86-LABEL: test_mm512_mask3_fmsub_ps: 4486; X86: # %bb.0: # %entry 4487; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4488; X86-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2 4489; X86-NEXT: vmovaps %zmm2, %zmm0 4490; X86-NEXT: retl 4491; 4492; X64-LABEL: test_mm512_mask3_fmsub_ps: 4493; X64: # %bb.0: # %entry 4494; X64-NEXT: kmovw %edi, %k1 4495; X64-NEXT: vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2 4496; X64-NEXT: vmovaps %zmm2, %zmm0 4497; X64-NEXT: retq 4498entry: 4499 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4500 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4501 %1 = bitcast i16 %__U to <16 x i1> 4502 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4503 ret <16 x float> %2 4504} 4505 4506define <8 x double> @test_mm512_mask3_fmsubadd_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4507; X86-LABEL: test_mm512_mask3_fmsubadd_round_pd: 4508; X86: # %bb.0: # %entry 4509; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4510; X86-NEXT: kmovw %eax, %k1 4511; X86-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4512; X86-NEXT: vmovapd %zmm2, %zmm0 4513; X86-NEXT: retl 4514; 4515; X64-LABEL: test_mm512_mask3_fmsubadd_round_pd: 4516; X64: # %bb.0: # %entry 4517; X64-NEXT: kmovw %edi, %k1 4518; X64-NEXT: vfmsubadd231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4519; X64-NEXT: vmovapd %zmm2, %zmm0 4520; X64-NEXT: retq 4521entry: 4522 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double 
-0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4523 %0 = tail call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub, i32 8) 4524 %1 = bitcast i8 %__U to <8 x i1> 4525 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4526 ret <8 x double> %2 4527} 4528 4529define <8 x double> @test_mm512_mask3_fmsubadd_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4530; X86-LABEL: test_mm512_mask3_fmsubadd_pd: 4531; X86: # %bb.0: # %entry 4532; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4533; X86-NEXT: kmovw %eax, %k1 4534; X86-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2 4535; X86-NEXT: vmovapd %zmm2, %zmm0 4536; X86-NEXT: retl 4537; 4538; X64-LABEL: test_mm512_mask3_fmsubadd_pd: 4539; X64: # %bb.0: # %entry 4540; X64-NEXT: kmovw %edi, %k1 4541; X64-NEXT: vfmsubadd231pd {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2 4542; X64-NEXT: vmovapd %zmm2, %zmm0 4543; X64-NEXT: retq 4544entry: 4545 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4546 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %sub.i) #10 4547 %1 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C) #10 4548 %2 = shufflevector <8 x double> %1, <8 x double> %0, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15> 4549 %3 = bitcast i8 %__U to <8 x i1> 4550 %4 = select <8 x i1> %3, <8 x double> %2, <8 x double> %__C 4551 ret <8 x double> %4 4552} 4553 4554define <16 x float> @test_mm512_mask3_fmsubadd_round_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4555; X86-LABEL: test_mm512_mask3_fmsubadd_round_ps: 4556; X86: # %bb.0: # %entry 4557; X86-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 4558; X86-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4559; X86-NEXT: vmovaps %zmm2, %zmm0 4560; X86-NEXT: retl 4561; 4562; X64-LABEL: test_mm512_mask3_fmsubadd_round_ps: 4563; X64: # %bb.0: # %entry 4564; X64-NEXT: kmovw %edi, %k1 4565; X64-NEXT: vfmsubadd231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4566; X64-NEXT: vmovaps %zmm2, %zmm0 4567; X64-NEXT: retq 4568entry: 4569 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4570 %0 = tail call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub, i32 8) 4571 %1 = bitcast i16 %__U to <16 x i1> 4572 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4573 ret <16 x float> %2 4574} 4575 4576define <16 x float> @test_mm512_mask3_fmsubadd_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4577; X86-LABEL: test_mm512_mask3_fmsubadd_ps: 4578; X86: # %bb.0: # %entry 4579; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4580; X86-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2 4581; X86-NEXT: vmovaps %zmm2, %zmm0 4582; X86-NEXT: retl 4583; 4584; X64-LABEL: test_mm512_mask3_fmsubadd_ps: 4585; X64: # %bb.0: # %entry 4586; X64-NEXT: kmovw %edi, %k1 4587; X64-NEXT: vfmsubadd231ps {{.*#+}} zmm2 = (zmm0 * zmm1) -/+ zmm2 4588; X64-NEXT: vmovaps %zmm2, %zmm0 4589; X64-NEXT: retq 4590entry: 4591 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4592 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %sub.i) #10 4593 %1 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C) #10 4594 %2 = shufflevector <16 x float> %1, <16 x float> %0, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31> 4595 %3 = bitcast i16 %__U to <16 x i1> 4596 %4 = select <16 x i1> %3, <16 x float> %2, <16 x float> %__C 4597 ret <16 x float> %4 4598} 4599 4600define <8 x double> @test_mm512_mask_fnmadd_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4601; X86-LABEL: test_mm512_mask_fnmadd_round_pd: 4602; X86: # %bb.0: # %entry 4603; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4604; X86-NEXT: kmovw %eax, %k1 4605; X86-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4606; X86-NEXT: retl 4607; 4608; X64-LABEL: test_mm512_mask_fnmadd_round_pd: 4609; X64: # %bb.0: # %entry 4610; X64-NEXT: kmovw %edi, %k1 4611; X64-NEXT: vfnmadd132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4612; X64-NEXT: retq 4613entry: 4614 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4615 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %sub, <8 x double> %__B, <8 x double> %__C, i32 8) 4616 %1 = bitcast i8 %__U to <8 x i1> 4617 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4618 ret <8 x double> %2 4619} 4620 4621define <8 x double> @test_mm512_mask_fnmadd_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4622; X86-LABEL: test_mm512_mask_fnmadd_pd: 4623; X86: # %bb.0: # %entry 4624; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4625; X86-NEXT: kmovw %eax, %k1 4626; X86-NEXT: vfnmadd132pd 
{{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2 4627; X86-NEXT: retl 4628; 4629; X64-LABEL: test_mm512_mask_fnmadd_pd: 4630; X64: # %bb.0: # %entry 4631; X64-NEXT: kmovw %edi, %k1 4632; X64-NEXT: vfnmadd132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2 4633; X64-NEXT: retq 4634entry: 4635 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__A 4636 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %sub.i, <8 x double> %__B, <8 x double> %__C) #10 4637 %1 = bitcast i8 %__U to <8 x i1> 4638 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4639 ret <8 x double> %2 4640} 4641 4642define <16 x float> @test_mm512_mask_fnmadd_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4643; X86-LABEL: test_mm512_mask_fnmadd_round_ps: 4644; X86: # %bb.0: # %entry 4645; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4646; X86-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4647; X86-NEXT: retl 4648; 4649; X64-LABEL: test_mm512_mask_fnmadd_round_ps: 4650; X64: # %bb.0: # %entry 4651; X64-NEXT: kmovw %edi, %k1 4652; X64-NEXT: vfnmadd132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4653; X64-NEXT: retq 4654entry: 4655 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4656 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub, <16 x float> %__B, <16 x float> %__C, i32 8) 4657 %1 = bitcast i16 %__U to <16 x i1> 4658 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4659 ret <16 x float> %2 4660} 4661 4662define <16 x float> @test_mm512_mask_fnmadd_ps(<16 x float> 
%__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4663; X86-LABEL: test_mm512_mask_fnmadd_ps: 4664; X86: # %bb.0: # %entry 4665; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4666; X86-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2 4667; X86-NEXT: retl 4668; 4669; X64-LABEL: test_mm512_mask_fnmadd_ps: 4670; X64: # %bb.0: # %entry 4671; X64-NEXT: kmovw %edi, %k1 4672; X64-NEXT: vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm2 4673; X64-NEXT: retq 4674entry: 4675 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__A 4676 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %sub.i, <16 x float> %__B, <16 x float> %__C) #10 4677 %1 = bitcast i16 %__U to <16 x i1> 4678 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4679 ret <16 x float> %2 4680} 4681 4682define <8 x double> @test_mm512_mask_fnmsub_round_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4683; X86-LABEL: test_mm512_mask_fnmsub_round_pd: 4684; X86: # %bb.0: # %entry 4685; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4686; X86-NEXT: kmovw %eax, %k1 4687; X86-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4688; X86-NEXT: retl 4689; 4690; X64-LABEL: test_mm512_mask_fnmsub_round_pd: 4691; X64: # %bb.0: # %entry 4692; X64-NEXT: kmovw %edi, %k1 4693; X64-NEXT: vfnmsub132pd {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4694; X64-NEXT: retq 4695entry: 4696 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 4697 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double 
-0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4698 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8) 4699 %1 = bitcast i8 %__U to <8 x i1> 4700 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4701 ret <8 x double> %2 4702} 4703 4704define <8 x double> @test_mm512_mask3_fnmsub_round_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4705; X86-LABEL: test_mm512_mask3_fnmsub_round_pd: 4706; X86: # %bb.0: # %entry 4707; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4708; X86-NEXT: kmovw %eax, %k1 4709; X86-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4710; X86-NEXT: vmovapd %zmm2, %zmm0 4711; X86-NEXT: retl 4712; 4713; X64-LABEL: test_mm512_mask3_fnmsub_round_pd: 4714; X64: # %bb.0: # %entry 4715; X64-NEXT: kmovw %edi, %k1 4716; X64-NEXT: vfnmsub231pd {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4717; X64-NEXT: vmovapd %zmm2, %zmm0 4718; X64-NEXT: retq 4719entry: 4720 %sub = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 4721 %sub1 = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4722 %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %__A, <8 x double> %sub, <8 x double> %sub1, i32 8) 4723 %1 = bitcast i8 %__U to <8 x i1> 4724 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4725 ret <8 x double> %2 4726} 4727 4728define <8 x double> @test_mm512_mask_fnmsub_pd(<8 x double> %__A, i8 zeroext %__U, <8 x double> %__B, <8 x double> %__C) { 4729; X86-LABEL: test_mm512_mask_fnmsub_pd: 4730; X86: # %bb.0: # %entry 4731; X86-NEXT: movb {{[0-9]+}}(%esp), 
%al 4732; X86-NEXT: kmovw %eax, %k1 4733; X86-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2 4734; X86-NEXT: retl 4735; 4736; X64-LABEL: test_mm512_mask_fnmsub_pd: 4737; X64: # %bb.0: # %entry 4738; X64-NEXT: kmovw %edi, %k1 4739; X64-NEXT: vfnmsub132pd {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2 4740; X64-NEXT: retq 4741entry: 4742 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 4743 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4744 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10 4745 %1 = bitcast i8 %__U to <8 x i1> 4746 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__A 4747 ret <8 x double> %2 4748} 4749 4750define <8 x double> @test_mm512_mask3_fnmsub_pd(<8 x double> %__A, <8 x double> %__B, <8 x double> %__C, i8 zeroext %__U) { 4751; X86-LABEL: test_mm512_mask3_fnmsub_pd: 4752; X86: # %bb.0: # %entry 4753; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4754; X86-NEXT: kmovw %eax, %k1 4755; X86-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2 4756; X86-NEXT: vmovapd %zmm2, %zmm0 4757; X86-NEXT: retl 4758; 4759; X64-LABEL: test_mm512_mask3_fnmsub_pd: 4760; X64: # %bb.0: # %entry 4761; X64-NEXT: kmovw %edi, %k1 4762; X64-NEXT: vfnmsub231pd {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2 4763; X64-NEXT: vmovapd %zmm2, %zmm0 4764; X64-NEXT: retq 4765entry: 4766 %sub.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__B 4767 %sub2.i = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double 
-0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %__C 4768 %0 = tail call <8 x double> @llvm.fma.v8f64(<8 x double> %__A, <8 x double> %sub.i, <8 x double> %sub2.i) #10 4769 %1 = bitcast i8 %__U to <8 x i1> 4770 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__C 4771 ret <8 x double> %2 4772} 4773 4774define <16 x float> @test_mm512_mask_fnmsub_round_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4775; X86-LABEL: test_mm512_mask_fnmsub_round_ps: 4776; X86: # %bb.0: # %entry 4777; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4778; X86-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4779; X86-NEXT: retl 4780; 4781; X64-LABEL: test_mm512_mask_fnmsub_round_ps: 4782; X64: # %bb.0: # %entry 4783; X64-NEXT: kmovw %edi, %k1 4784; X64-NEXT: vfnmsub132ps {rn-sae}, %zmm1, %zmm2, %zmm0 {%k1} 4785; X64-NEXT: retq 4786entry: 4787 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 4788 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4789 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8) 4790 %1 = bitcast i16 %__U to <16 x i1> 4791 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4792 ret <16 x float> %2 4793} 4794 4795define <16 x float> @test_mm512_mask3_fnmsub_round_ps(<16 
x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4796; X86-LABEL: test_mm512_mask3_fnmsub_round_ps: 4797; X86: # %bb.0: # %entry 4798; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4799; X86-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4800; X86-NEXT: vmovaps %zmm2, %zmm0 4801; X86-NEXT: retl 4802; 4803; X64-LABEL: test_mm512_mask3_fnmsub_round_ps: 4804; X64: # %bb.0: # %entry 4805; X64-NEXT: kmovw %edi, %k1 4806; X64-NEXT: vfnmsub231ps {rn-sae}, %zmm1, %zmm0, %zmm2 {%k1} 4807; X64-NEXT: vmovaps %zmm2, %zmm0 4808; X64-NEXT: retq 4809entry: 4810 %sub = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 4811 %sub1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4812 %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %__A, <16 x float> %sub, <16 x float> %sub1, i32 8) 4813 %1 = bitcast i16 %__U to <16 x i1> 4814 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4815 ret <16 x float> %2 4816} 4817 4818define <16 x float> @test_mm512_mask_fnmsub_ps(<16 x float> %__A, i16 zeroext %__U, <16 x float> %__B, <16 x float> %__C) { 4819; X86-LABEL: test_mm512_mask_fnmsub_ps: 4820; X86: # %bb.0: # %entry 4821; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4822; X86-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2 4823; X86-NEXT: retl 4824; 4825; X64-LABEL: test_mm512_mask_fnmsub_ps: 4826; X64: # %bb.0: # 
%entry 4827; X64-NEXT: kmovw %edi, %k1 4828; X64-NEXT: vfnmsub132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) - zmm2 4829; X64-NEXT: retq 4830entry: 4831 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 4832 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4833 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10 4834 %1 = bitcast i16 %__U to <16 x i1> 4835 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__A 4836 ret <16 x float> %2 4837} 4838 4839define <16 x float> @test_mm512_mask3_fnmsub_ps(<16 x float> %__A, <16 x float> %__B, <16 x float> %__C, i16 zeroext %__U) { 4840; X86-LABEL: test_mm512_mask3_fnmsub_ps: 4841; X86: # %bb.0: # %entry 4842; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 4843; X86-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2 4844; X86-NEXT: vmovaps %zmm2, %zmm0 4845; X86-NEXT: retl 4846; 4847; X64-LABEL: test_mm512_mask3_fnmsub_ps: 4848; X64: # %bb.0: # %entry 4849; X64-NEXT: kmovw %edi, %k1 4850; X64-NEXT: vfnmsub231ps {{.*#+}} zmm2 = -(zmm0 * zmm1) - zmm2 4851; X64-NEXT: vmovaps %zmm2, %zmm0 4852; X64-NEXT: retq 4853entry: 4854 %sub.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__B 4855 %sub1.i = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %__C 4856 %0 = tail call <16 x float> @llvm.fma.v16f32(<16 x float> %__A, <16 x float> %sub.i, <16 x float> %sub1.i) #10 4857 %1 = bitcast i16 %__U to <16 x i1> 4858 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__C 4859 ret <16 x float> %2 4860} 4861 4862define <4 x float> @test_mm_mask_fmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 4863; X86-LABEL: test_mm_mask_fmadd_ss: 4864; X86: # %bb.0: # %entry 4865; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4866; X86-NEXT: kmovw %eax, %k1 4867; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 4868; X86-NEXT: retl 4869; 4870; X64-LABEL: test_mm_mask_fmadd_ss: 4871; X64: # %bb.0: # %entry 4872; X64-NEXT: kmovw %edi, %k1 4873; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 4874; X64-NEXT: retq 4875entry: 4876 %0 = extractelement <4 x float> %__W, i64 0 4877 %1 = extractelement <4 x float> %__A, i64 0 4878 %2 = extractelement <4 x float> %__B, i64 0 4879 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 4880 %4 = and i8 %__U, 1 4881 %tobool.i = icmp eq i8 %4, 0 4882 %vecext1.i = extractelement <4 x float> %__W, i32 0 4883 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 4884 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 4885 ret <4 x float> %vecins.i 4886} 4887 4888define <4 x float> @test_mm_mask_fmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 4889; X86-LABEL: 
test_mm_mask_fmadd_round_ss: 4890; X86: # %bb.0: # %entry 4891; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4892; X86-NEXT: kmovw %eax, %k1 4893; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 4894; X86-NEXT: retl 4895; 4896; X64-LABEL: test_mm_mask_fmadd_round_ss: 4897; X64: # %bb.0: # %entry 4898; X64-NEXT: kmovw %edi, %k1 4899; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 4900; X64-NEXT: retq 4901entry: 4902 %0 = extractelement <4 x float> %__W, i64 0 4903 %1 = extractelement <4 x float> %__A, i64 0 4904 %2 = extractelement <4 x float> %__B, i64 0 4905 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 4906 %4 = bitcast i8 %__U to <8 x i1> 4907 %5 = extractelement <8 x i1> %4, i64 0 4908 %6 = select i1 %5, float %3, float %0 4909 %7 = insertelement <4 x float> %__W, float %6, i64 0 4910 ret <4 x float> %7 4911} 4912 4913declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #1 4914 4915define <4 x float> @test_mm_maskz_fmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4916; X86-LABEL: test_mm_maskz_fmadd_ss: 4917; X86: # %bb.0: # %entry 4918; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4919; X86-NEXT: kmovw %eax, %k1 4920; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 4921; X86-NEXT: retl 4922; 4923; X64-LABEL: test_mm_maskz_fmadd_ss: 4924; X64: # %bb.0: # %entry 4925; X64-NEXT: kmovw %edi, %k1 4926; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 4927; X64-NEXT: retq 4928entry: 4929 %0 = extractelement <4 x float> %__A, i64 0 4930 %1 = extractelement <4 x float> %__B, i64 0 4931 %2 = extractelement <4 x float> %__C, i64 0 4932 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 4933 %4 = and i8 %__U, 1 4934 %tobool.i = icmp eq i8 %4, 0 4935 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 4936 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 4937 ret <4 x float> %vecins.i 4938} 4939 4940define <4 x float> 
@test_mm_maskz_fmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 4941; X86-LABEL: test_mm_maskz_fmadd_round_ss: 4942; X86: # %bb.0: # %entry 4943; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4944; X86-NEXT: kmovw %eax, %k1 4945; X86-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 4946; X86-NEXT: retl 4947; 4948; X64-LABEL: test_mm_maskz_fmadd_round_ss: 4949; X64: # %bb.0: # %entry 4950; X64-NEXT: kmovw %edi, %k1 4951; X64-NEXT: vfmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 4952; X64-NEXT: retq 4953entry: 4954 %0 = extractelement <4 x float> %__A, i64 0 4955 %1 = extractelement <4 x float> %__B, i64 0 4956 %2 = extractelement <4 x float> %__C, i64 0 4957 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 4958 %4 = bitcast i8 %__U to <8 x i1> 4959 %5 = extractelement <8 x i1> %4, i64 0 4960 %6 = select i1 %5, float %3, float 0.000000e+00 4961 %7 = insertelement <4 x float> %__A, float %6, i64 0 4962 ret <4 x float> %7 4963} 4964 4965define <4 x float> @test_mm_mask3_fmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 4966; X86-LABEL: test_mm_mask3_fmadd_ss: 4967; X86: # %bb.0: # %entry 4968; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4969; X86-NEXT: kmovw %eax, %k1 4970; X86-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 4971; X86-NEXT: vmovaps %xmm2, %xmm0 4972; X86-NEXT: retl 4973; 4974; X64-LABEL: test_mm_mask3_fmadd_ss: 4975; X64: # %bb.0: # %entry 4976; X64-NEXT: kmovw %edi, %k1 4977; X64-NEXT: vfmadd231ss {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 4978; X64-NEXT: vmovaps %xmm2, %xmm0 4979; X64-NEXT: retq 4980entry: 4981 %0 = extractelement <4 x float> %__W, i64 0 4982 %1 = extractelement <4 x float> %__X, i64 0 4983 %2 = extractelement <4 x float> %__Y, i64 0 4984 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 4985 %4 = and i8 %__U, 1 4986 %tobool.i = icmp eq i8 %4, 0 4987 %vecext1.i = extractelement <4 x float> %__Y, i32 0 4988 
%cond.i = select i1 %tobool.i, float %vecext1.i, float %3 4989 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 4990 ret <4 x float> %vecins.i 4991} 4992 4993define <4 x float> @test_mm_mask3_fmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 4994; X86-LABEL: test_mm_mask3_fmadd_round_ss: 4995; X86: # %bb.0: # %entry 4996; X86-NEXT: movb {{[0-9]+}}(%esp), %al 4997; X86-NEXT: kmovw %eax, %k1 4998; X86-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 4999; X86-NEXT: vmovaps %xmm2, %xmm0 5000; X86-NEXT: retl 5001; 5002; X64-LABEL: test_mm_mask3_fmadd_round_ss: 5003; X64: # %bb.0: # %entry 5004; X64-NEXT: kmovw %edi, %k1 5005; X64-NEXT: vfmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5006; X64-NEXT: vmovaps %xmm2, %xmm0 5007; X64-NEXT: retq 5008entry: 5009 %0 = extractelement <4 x float> %__W, i64 0 5010 %1 = extractelement <4 x float> %__X, i64 0 5011 %2 = extractelement <4 x float> %__Y, i64 0 5012 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5013 %4 = bitcast i8 %__U to <8 x i1> 5014 %5 = extractelement <8 x i1> %4, i64 0 5015 %6 = select i1 %5, float %3, float %2 5016 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5017 ret <4 x float> %7 5018} 5019 5020define <4 x float> @test_mm_mask_fmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5021; X86-LABEL: test_mm_mask_fmsub_ss: 5022; X86: # %bb.0: # %entry 5023; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5024; X86-NEXT: kmovw %eax, %k1 5025; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5026; X86-NEXT: retl 5027; 5028; X64-LABEL: test_mm_mask_fmsub_ss: 5029; X64: # %bb.0: # %entry 5030; X64-NEXT: kmovw %edi, %k1 5031; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5032; X64-NEXT: retq 5033entry: 5034 %0 = extractelement <4 x float> %__W, i64 0 5035 %1 = extractelement <4 x float> %__A, i64 0 5036 %.rhs.i = extractelement <4 x float> %__B, i64 0 5037 %2 = fsub float 
-0.000000e+00, %.rhs.i 5038 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5039 %4 = and i8 %__U, 1 5040 %tobool.i = icmp eq i8 %4, 0 5041 %vecext1.i = extractelement <4 x float> %__W, i32 0 5042 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5043 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5044 ret <4 x float> %vecins.i 5045} 5046 5047define <4 x float> @test_mm_mask_fmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5048; X86-LABEL: test_mm_mask_fmsub_round_ss: 5049; X86: # %bb.0: # %entry 5050; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5051; X86-NEXT: kmovw %eax, %k1 5052; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5053; X86-NEXT: retl 5054; 5055; X64-LABEL: test_mm_mask_fmsub_round_ss: 5056; X64: # %bb.0: # %entry 5057; X64-NEXT: kmovw %edi, %k1 5058; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5059; X64-NEXT: retq 5060entry: 5061 %0 = extractelement <4 x float> %__W, i64 0 5062 %1 = extractelement <4 x float> %__A, i64 0 5063 %.rhs = extractelement <4 x float> %__B, i64 0 5064 %2 = fsub float -0.000000e+00, %.rhs 5065 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5066 %4 = bitcast i8 %__U to <8 x i1> 5067 %5 = extractelement <8 x i1> %4, i64 0 5068 %6 = select i1 %5, float %3, float %0 5069 %7 = insertelement <4 x float> %__W, float %6, i64 0 5070 ret <4 x float> %7 5071} 5072 5073define <4 x float> @test_mm_maskz_fmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5074; X86-LABEL: test_mm_maskz_fmsub_ss: 5075; X86: # %bb.0: # %entry 5076; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5077; X86-NEXT: kmovw %eax, %k1 5078; X86-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5079; X86-NEXT: retl 5080; 5081; X64-LABEL: test_mm_maskz_fmsub_ss: 5082; X64: # %bb.0: # %entry 5083; X64-NEXT: kmovw %edi, %k1 5084; X64-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5085; X64-NEXT: 
retq 5086entry: 5087 %0 = extractelement <4 x float> %__A, i64 0 5088 %1 = extractelement <4 x float> %__B, i64 0 5089 %.rhs.i = extractelement <4 x float> %__C, i64 0 5090 %2 = fsub float -0.000000e+00, %.rhs.i 5091 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5092 %4 = and i8 %__U, 1 5093 %tobool.i = icmp eq i8 %4, 0 5094 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 5095 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5096 ret <4 x float> %vecins.i 5097} 5098 5099define <4 x float> @test_mm_maskz_fmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5100; X86-LABEL: test_mm_maskz_fmsub_round_ss: 5101; X86: # %bb.0: # %entry 5102; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5103; X86-NEXT: kmovw %eax, %k1 5104; X86-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5105; X86-NEXT: retl 5106; 5107; X64-LABEL: test_mm_maskz_fmsub_round_ss: 5108; X64: # %bb.0: # %entry 5109; X64-NEXT: kmovw %edi, %k1 5110; X64-NEXT: vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5111; X64-NEXT: retq 5112entry: 5113 %0 = extractelement <4 x float> %__A, i64 0 5114 %1 = extractelement <4 x float> %__B, i64 0 5115 %.rhs = extractelement <4 x float> %__C, i64 0 5116 %2 = fsub float -0.000000e+00, %.rhs 5117 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5118 %4 = bitcast i8 %__U to <8 x i1> 5119 %5 = extractelement <8 x i1> %4, i64 0 5120 %6 = select i1 %5, float %3, float 0.000000e+00 5121 %7 = insertelement <4 x float> %__A, float %6, i64 0 5122 ret <4 x float> %7 5123} 5124 5125define <4 x float> @test_mm_mask3_fmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5126; X86-LABEL: test_mm_mask3_fmsub_ss: 5127; X86: # %bb.0: # %entry 5128; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5129; X86-NEXT: kmovw %eax, %k1 5130; X86-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5131; X86-NEXT: vmovaps %xmm2, %xmm0 5132; X86-NEXT: 
retl 5133; 5134; X64-LABEL: test_mm_mask3_fmsub_ss: 5135; X64: # %bb.0: # %entry 5136; X64-NEXT: kmovw %edi, %k1 5137; X64-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5138; X64-NEXT: vmovaps %xmm2, %xmm0 5139; X64-NEXT: retq 5140entry: 5141 %0 = extractelement <4 x float> %__W, i64 0 5142 %1 = extractelement <4 x float> %__X, i64 0 5143 %.rhs.i = extractelement <4 x float> %__Y, i64 0 5144 %2 = fsub float -0.000000e+00, %.rhs.i 5145 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5146 %4 = and i8 %__U, 1 5147 %tobool.i = icmp eq i8 %4, 0 5148 %vecext1.i = extractelement <4 x float> %__Y, i32 0 5149 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5150 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5151 ret <4 x float> %vecins.i 5152} 5153 5154define <4 x float> @test_mm_mask3_fmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5155; X86-LABEL: test_mm_mask3_fmsub_round_ss: 5156; X86: # %bb.0: # %entry 5157; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5158; X86-NEXT: kmovw %eax, %k1 5159; X86-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5160; X86-NEXT: vmovaps %xmm2, %xmm0 5161; X86-NEXT: retl 5162; 5163; X64-LABEL: test_mm_mask3_fmsub_round_ss: 5164; X64: # %bb.0: # %entry 5165; X64-NEXT: kmovw %edi, %k1 5166; X64-NEXT: vfmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5167; X64-NEXT: vmovaps %xmm2, %xmm0 5168; X64-NEXT: retq 5169entry: 5170 %0 = extractelement <4 x float> %__W, i64 0 5171 %1 = extractelement <4 x float> %__X, i64 0 5172 %.rhs = extractelement <4 x float> %__Y, i64 0 5173 %2 = fsub float -0.000000e+00, %.rhs 5174 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5175 %4 = bitcast i8 %__U to <8 x i1> 5176 %5 = extractelement <8 x i1> %4, i64 0 5177 %6 = select i1 %5, float %3, float %.rhs 5178 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5179 ret <4 x float> %7 5180} 5181 5182define <4 x float> 
@test_mm_mask_fnmadd_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5183; X86-LABEL: test_mm_mask_fnmadd_ss: 5184; X86: # %bb.0: # %entry 5185; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5186; X86-NEXT: kmovw %eax, %k1 5187; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5188; X86-NEXT: retl 5189; 5190; X64-LABEL: test_mm_mask_fnmadd_ss: 5191; X64: # %bb.0: # %entry 5192; X64-NEXT: kmovw %edi, %k1 5193; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5194; X64-NEXT: retq 5195entry: 5196 %0 = extractelement <4 x float> %__W, i64 0 5197 %.rhs.i = extractelement <4 x float> %__A, i64 0 5198 %1 = fsub float -0.000000e+00, %.rhs.i 5199 %2 = extractelement <4 x float> %__B, i64 0 5200 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5201 %4 = and i8 %__U, 1 5202 %tobool.i = icmp eq i8 %4, 0 5203 %vecext1.i = extractelement <4 x float> %__W, i32 0 5204 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5205 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5206 ret <4 x float> %vecins.i 5207} 5208 5209define <4 x float> @test_mm_mask_fnmadd_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5210; X86-LABEL: test_mm_mask_fnmadd_round_ss: 5211; X86: # %bb.0: # %entry 5212; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5213; X86-NEXT: kmovw %eax, %k1 5214; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5215; X86-NEXT: retl 5216; 5217; X64-LABEL: test_mm_mask_fnmadd_round_ss: 5218; X64: # %bb.0: # %entry 5219; X64-NEXT: kmovw %edi, %k1 5220; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5221; X64-NEXT: retq 5222entry: 5223 %0 = extractelement <4 x float> %__W, i64 0 5224 %.rhs = extractelement <4 x float> %__A, i64 0 5225 %1 = fsub float -0.000000e+00, %.rhs 5226 %2 = extractelement <4 x float> %__B, i64 0 5227 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5228 %4 = bitcast i8 %__U to <8 x i1> 5229 
%5 = extractelement <8 x i1> %4, i64 0 5230 %6 = select i1 %5, float %3, float %0 5231 %7 = insertelement <4 x float> %__W, float %6, i64 0 5232 ret <4 x float> %7 5233} 5234 5235define <4 x float> @test_mm_maskz_fnmadd_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5236; X86-LABEL: test_mm_maskz_fnmadd_ss: 5237; X86: # %bb.0: # %entry 5238; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5239; X86-NEXT: kmovw %eax, %k1 5240; X86-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5241; X86-NEXT: retl 5242; 5243; X64-LABEL: test_mm_maskz_fnmadd_ss: 5244; X64: # %bb.0: # %entry 5245; X64-NEXT: kmovw %edi, %k1 5246; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5247; X64-NEXT: retq 5248entry: 5249 %0 = extractelement <4 x float> %__A, i64 0 5250 %.rhs.i = extractelement <4 x float> %__B, i64 0 5251 %1 = fsub float -0.000000e+00, %.rhs.i 5252 %2 = extractelement <4 x float> %__C, i64 0 5253 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5254 %4 = and i8 %__U, 1 5255 %tobool.i = icmp eq i8 %4, 0 5256 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 5257 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5258 ret <4 x float> %vecins.i 5259} 5260 5261define <4 x float> @test_mm_maskz_fnmadd_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5262; X86-LABEL: test_mm_maskz_fnmadd_round_ss: 5263; X86: # %bb.0: # %entry 5264; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5265; X86-NEXT: kmovw %eax, %k1 5266; X86-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5267; X86-NEXT: retl 5268; 5269; X64-LABEL: test_mm_maskz_fnmadd_round_ss: 5270; X64: # %bb.0: # %entry 5271; X64-NEXT: kmovw %edi, %k1 5272; X64-NEXT: vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5273; X64-NEXT: retq 5274entry: 5275 %0 = extractelement <4 x float> %__A, i64 0 5276 %.rhs = extractelement <4 x float> %__B, i64 0 5277 %1 = fsub float -0.000000e+00, %.rhs 5278 %2 = extractelement <4 
x float> %__C, i64 0 5279 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5280 %4 = bitcast i8 %__U to <8 x i1> 5281 %5 = extractelement <8 x i1> %4, i64 0 5282 %6 = select i1 %5, float %3, float 0.000000e+00 5283 %7 = insertelement <4 x float> %__A, float %6, i64 0 5284 ret <4 x float> %7 5285} 5286 5287define <4 x float> @test_mm_mask3_fnmadd_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5288; X86-LABEL: test_mm_mask3_fnmadd_ss: 5289; X86: # %bb.0: # %entry 5290; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5291; X86-NEXT: kmovw %eax, %k1 5292; X86-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 5293; X86-NEXT: vmovaps %xmm2, %xmm0 5294; X86-NEXT: retl 5295; 5296; X64-LABEL: test_mm_mask3_fnmadd_ss: 5297; X64: # %bb.0: # %entry 5298; X64-NEXT: kmovw %edi, %k1 5299; X64-NEXT: vfnmadd231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 5300; X64-NEXT: vmovaps %xmm2, %xmm0 5301; X64-NEXT: retq 5302entry: 5303 %0 = extractelement <4 x float> %__W, i64 0 5304 %.rhs.i = extractelement <4 x float> %__X, i64 0 5305 %1 = fsub float -0.000000e+00, %.rhs.i 5306 %2 = extractelement <4 x float> %__Y, i64 0 5307 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5308 %4 = and i8 %__U, 1 5309 %tobool.i = icmp eq i8 %4, 0 5310 %vecext1.i = extractelement <4 x float> %__Y, i32 0 5311 %cond.i = select i1 %tobool.i, float %vecext1.i, float %3 5312 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5313 ret <4 x float> %vecins.i 5314} 5315 5316define <4 x float> @test_mm_mask3_fnmadd_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5317; X86-LABEL: test_mm_mask3_fnmadd_round_ss: 5318; X86: # %bb.0: # %entry 5319; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5320; X86-NEXT: kmovw %eax, %k1 5321; X86-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5322; X86-NEXT: vmovaps %xmm2, %xmm0 5323; X86-NEXT: retl 5324; 5325; X64-LABEL: test_mm_mask3_fnmadd_round_ss: 5326; 
X64: # %bb.0: # %entry 5327; X64-NEXT: kmovw %edi, %k1 5328; X64-NEXT: vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5329; X64-NEXT: vmovaps %xmm2, %xmm0 5330; X64-NEXT: retq 5331entry: 5332 %0 = extractelement <4 x float> %__W, i64 0 5333 %.rhs = extractelement <4 x float> %__X, i64 0 5334 %1 = fsub float -0.000000e+00, %.rhs 5335 %2 = extractelement <4 x float> %__Y, i64 0 5336 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5337 %4 = bitcast i8 %__U to <8 x i1> 5338 %5 = extractelement <8 x i1> %4, i64 0 5339 %6 = select i1 %5, float %3, float %2 5340 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5341 ret <4 x float> %7 5342} 5343 5344define <4 x float> @test_mm_mask_fnmsub_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5345; X86-LABEL: test_mm_mask_fnmsub_ss: 5346; X86: # %bb.0: # %entry 5347; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5348; X86-NEXT: kmovw %eax, %k1 5349; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 5350; X86-NEXT: retl 5351; 5352; X64-LABEL: test_mm_mask_fnmsub_ss: 5353; X64: # %bb.0: # %entry 5354; X64-NEXT: kmovw %edi, %k1 5355; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 5356; X64-NEXT: retq 5357entry: 5358 %0 = extractelement <4 x float> %__W, i64 0 5359 %.rhs.i = extractelement <4 x float> %__A, i64 0 5360 %1 = fsub float -0.000000e+00, %.rhs.i 5361 %.rhs7.i = extractelement <4 x float> %__B, i64 0 5362 %2 = fsub float -0.000000e+00, %.rhs7.i 5363 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5364 %4 = and i8 %__U, 1 5365 %tobool.i = icmp eq i8 %4, 0 5366 %vecext2.i = extractelement <4 x float> %__W, i32 0 5367 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3 5368 %vecins.i = insertelement <4 x float> %__W, float %cond.i, i32 0 5369 ret <4 x float> %vecins.i 5370} 5371 5372define <4 x float> @test_mm_mask_fnmsub_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { 5373; 
X86-LABEL: test_mm_mask_fnmsub_round_ss: 5374; X86: # %bb.0: # %entry 5375; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5376; X86-NEXT: kmovw %eax, %k1 5377; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5378; X86-NEXT: retl 5379; 5380; X64-LABEL: test_mm_mask_fnmsub_round_ss: 5381; X64: # %bb.0: # %entry 5382; X64-NEXT: kmovw %edi, %k1 5383; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5384; X64-NEXT: retq 5385entry: 5386 %0 = extractelement <4 x float> %__W, i64 0 5387 %.rhs = extractelement <4 x float> %__A, i64 0 5388 %1 = fsub float -0.000000e+00, %.rhs 5389 %.rhs2 = extractelement <4 x float> %__B, i64 0 5390 %2 = fsub float -0.000000e+00, %.rhs2 5391 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5392 %4 = bitcast i8 %__U to <8 x i1> 5393 %5 = extractelement <8 x i1> %4, i64 0 5394 %6 = select i1 %5, float %3, float %0 5395 %7 = insertelement <4 x float> %__W, float %6, i64 0 5396 ret <4 x float> %7 5397} 5398 5399define <4 x float> @test_mm_maskz_fnmsub_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5400; X86-LABEL: test_mm_maskz_fnmsub_ss: 5401; X86: # %bb.0: # %entry 5402; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5403; X86-NEXT: kmovw %eax, %k1 5404; X86-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 5405; X86-NEXT: retl 5406; 5407; X64-LABEL: test_mm_maskz_fnmsub_ss: 5408; X64: # %bb.0: # %entry 5409; X64-NEXT: kmovw %edi, %k1 5410; X64-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 5411; X64-NEXT: retq 5412entry: 5413 %0 = extractelement <4 x float> %__A, i64 0 5414 %.rhs.i = extractelement <4 x float> %__B, i64 0 5415 %1 = fsub float -0.000000e+00, %.rhs.i 5416 %.rhs5.i = extractelement <4 x float> %__C, i64 0 5417 %2 = fsub float -0.000000e+00, %.rhs5.i 5418 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5419 %4 = and i8 %__U, 1 5420 %tobool.i = icmp eq i8 %4, 0 5421 %cond.i = select i1 %tobool.i, float 0.000000e+00, float %3 
5422 %vecins.i = insertelement <4 x float> %__A, float %cond.i, i32 0 5423 ret <4 x float> %vecins.i 5424} 5425 5426define <4 x float> @test_mm_maskz_fnmsub_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B, <4 x float> %__C) { 5427; X86-LABEL: test_mm_maskz_fnmsub_round_ss: 5428; X86: # %bb.0: # %entry 5429; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5430; X86-NEXT: kmovw %eax, %k1 5431; X86-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5432; X86-NEXT: retl 5433; 5434; X64-LABEL: test_mm_maskz_fnmsub_round_ss: 5435; X64: # %bb.0: # %entry 5436; X64-NEXT: kmovw %edi, %k1 5437; X64-NEXT: vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5438; X64-NEXT: retq 5439entry: 5440 %0 = extractelement <4 x float> %__A, i64 0 5441 %.rhs = extractelement <4 x float> %__B, i64 0 5442 %1 = fsub float -0.000000e+00, %.rhs 5443 %.rhs2 = extractelement <4 x float> %__C, i64 0 5444 %2 = fsub float -0.000000e+00, %.rhs2 5445 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5446 %4 = bitcast i8 %__U to <8 x i1> 5447 %5 = extractelement <8 x i1> %4, i64 0 5448 %6 = select i1 %5, float %3, float 0.000000e+00 5449 %7 = insertelement <4 x float> %__A, float %6, i64 0 5450 ret <4 x float> %7 5451} 5452 5453define <4 x float> @test_mm_mask3_fnmsub_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5454; X86-LABEL: test_mm_mask3_fnmsub_ss: 5455; X86: # %bb.0: # %entry 5456; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5457; X86-NEXT: kmovw %eax, %k1 5458; X86-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5459; X86-NEXT: vmovaps %xmm2, %xmm0 5460; X86-NEXT: retl 5461; 5462; X64-LABEL: test_mm_mask3_fnmsub_ss: 5463; X64: # %bb.0: # %entry 5464; X64-NEXT: kmovw %edi, %k1 5465; X64-NEXT: vfnmsub231ss {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 5466; X64-NEXT: vmovaps %xmm2, %xmm0 5467; X64-NEXT: retq 5468entry: 5469 %0 = extractelement <4 x float> %__W, i64 0 5470 %.rhs.i = extractelement <4 x float> %__X, i64 0 5471 
%1 = fsub float -0.000000e+00, %.rhs.i 5472 %.rhs7.i = extractelement <4 x float> %__Y, i64 0 5473 %2 = fsub float -0.000000e+00, %.rhs7.i 5474 %3 = tail call float @llvm.fma.f32(float %0, float %1, float %2) #10 5475 %4 = and i8 %__U, 1 5476 %tobool.i = icmp eq i8 %4, 0 5477 %vecext2.i = extractelement <4 x float> %__Y, i32 0 5478 %cond.i = select i1 %tobool.i, float %vecext2.i, float %3 5479 %vecins.i = insertelement <4 x float> %__Y, float %cond.i, i32 0 5480 ret <4 x float> %vecins.i 5481} 5482 5483define <4 x float> @test_mm_mask3_fnmsub_round_ss(<4 x float> %__W, <4 x float> %__X, <4 x float> %__Y, i8 zeroext %__U) { 5484; X86-LABEL: test_mm_mask3_fnmsub_round_ss: 5485; X86: # %bb.0: # %entry 5486; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5487; X86-NEXT: kmovw %eax, %k1 5488; X86-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5489; X86-NEXT: vmovaps %xmm2, %xmm0 5490; X86-NEXT: retl 5491; 5492; X64-LABEL: test_mm_mask3_fnmsub_round_ss: 5493; X64: # %bb.0: # %entry 5494; X64-NEXT: kmovw %edi, %k1 5495; X64-NEXT: vfnmsub231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5496; X64-NEXT: vmovaps %xmm2, %xmm0 5497; X64-NEXT: retq 5498entry: 5499 %0 = extractelement <4 x float> %__W, i64 0 5500 %.rhs = extractelement <4 x float> %__X, i64 0 5501 %1 = fsub float -0.000000e+00, %.rhs 5502 %.rhs1 = extractelement <4 x float> %__Y, i64 0 5503 %2 = fsub float -0.000000e+00, %.rhs1 5504 %3 = tail call float @llvm.x86.avx512.vfmadd.f32(float %0, float %1, float %2, i32 8) 5505 %4 = bitcast i8 %__U to <8 x i1> 5506 %5 = extractelement <8 x i1> %4, i64 0 5507 %6 = select i1 %5, float %3, float %.rhs1 5508 %7 = insertelement <4 x float> %__Y, float %6, i64 0 5509 ret <4 x float> %7 5510} 5511 5512define <2 x double> @test_mm_mask_fmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5513; X86-LABEL: test_mm_mask_fmadd_sd: 5514; X86: # %bb.0: # %entry 5515; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5516; X86-NEXT: kmovw %eax, %k1 5517; X86-NEXT: 
vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 5518; X86-NEXT: retl 5519; 5520; X64-LABEL: test_mm_mask_fmadd_sd: 5521; X64: # %bb.0: # %entry 5522; X64-NEXT: kmovw %edi, %k1 5523; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 5524; X64-NEXT: retq 5525entry: 5526 %0 = extractelement <2 x double> %__W, i64 0 5527 %1 = extractelement <2 x double> %__A, i64 0 5528 %2 = extractelement <2 x double> %__B, i64 0 5529 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5530 %4 = and i8 %__U, 1 5531 %tobool.i = icmp eq i8 %4, 0 5532 %vecext1.i = extractelement <2 x double> %__W, i32 0 5533 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5534 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 5535 ret <2 x double> %vecins.i 5536} 5537 5538define <2 x double> @test_mm_mask_fmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5539; X86-LABEL: test_mm_mask_fmadd_round_sd: 5540; X86: # %bb.0: # %entry 5541; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5542; X86-NEXT: kmovw %eax, %k1 5543; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5544; X86-NEXT: retl 5545; 5546; X64-LABEL: test_mm_mask_fmadd_round_sd: 5547; X64: # %bb.0: # %entry 5548; X64-NEXT: kmovw %edi, %k1 5549; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5550; X64-NEXT: retq 5551entry: 5552 %0 = extractelement <2 x double> %__W, i64 0 5553 %1 = extractelement <2 x double> %__A, i64 0 5554 %2 = extractelement <2 x double> %__B, i64 0 5555 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5556 %4 = bitcast i8 %__U to <8 x i1> 5557 %5 = extractelement <8 x i1> %4, i64 0 5558 %6 = select i1 %5, double %3, double %0 5559 %7 = insertelement <2 x double> %__W, double %6, i64 0 5560 ret <2 x double> %7 5561} 5562 5563declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #1 5564 5565define <2 x double> @test_mm_maskz_fmadd_sd(i8 zeroext %__U, <2 x 
double> %__A, <2 x double> %__B, <2 x double> %__C) { 5566; X86-LABEL: test_mm_maskz_fmadd_sd: 5567; X86: # %bb.0: # %entry 5568; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5569; X86-NEXT: kmovw %eax, %k1 5570; X86-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 5571; X86-NEXT: retl 5572; 5573; X64-LABEL: test_mm_maskz_fmadd_sd: 5574; X64: # %bb.0: # %entry 5575; X64-NEXT: kmovw %edi, %k1 5576; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2 5577; X64-NEXT: retq 5578entry: 5579 %0 = extractelement <2 x double> %__A, i64 0 5580 %1 = extractelement <2 x double> %__B, i64 0 5581 %2 = extractelement <2 x double> %__C, i64 0 5582 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5583 %4 = and i8 %__U, 1 5584 %tobool.i = icmp eq i8 %4, 0 5585 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 5586 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 5587 ret <2 x double> %vecins.i 5588} 5589 5590define <2 x double> @test_mm_maskz_fmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5591; X86-LABEL: test_mm_maskz_fmadd_round_sd: 5592; X86: # %bb.0: # %entry 5593; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5594; X86-NEXT: kmovw %eax, %k1 5595; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5596; X86-NEXT: retl 5597; 5598; X64-LABEL: test_mm_maskz_fmadd_round_sd: 5599; X64: # %bb.0: # %entry 5600; X64-NEXT: kmovw %edi, %k1 5601; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5602; X64-NEXT: retq 5603entry: 5604 %0 = extractelement <2 x double> %__A, i64 0 5605 %1 = extractelement <2 x double> %__B, i64 0 5606 %2 = extractelement <2 x double> %__C, i64 0 5607 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5608 %4 = bitcast i8 %__U to <8 x i1> 5609 %5 = extractelement <8 x i1> %4, i64 0 5610 %6 = select i1 %5, double %3, double 0.000000e+00 5611 %7 = insertelement <2 x double> %__A, double %6, i64 0 5612 ret <2 x 
double> %7 5613} 5614 5615define <2 x double> @test_mm_mask3_fmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5616; X86-LABEL: test_mm_mask3_fmadd_sd: 5617; X86: # %bb.0: # %entry 5618; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5619; X86-NEXT: kmovw %eax, %k1 5620; X86-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 5621; X86-NEXT: vmovapd %xmm2, %xmm0 5622; X86-NEXT: retl 5623; 5624; X64-LABEL: test_mm_mask3_fmadd_sd: 5625; X64: # %bb.0: # %entry 5626; X64-NEXT: kmovw %edi, %k1 5627; X64-NEXT: vfmadd231sd {{.*#+}} xmm2 = (xmm0 * xmm1) + xmm2 5628; X64-NEXT: vmovapd %xmm2, %xmm0 5629; X64-NEXT: retq 5630entry: 5631 %0 = extractelement <2 x double> %__W, i64 0 5632 %1 = extractelement <2 x double> %__X, i64 0 5633 %2 = extractelement <2 x double> %__Y, i64 0 5634 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5635 %4 = and i8 %__U, 1 5636 %tobool.i = icmp eq i8 %4, 0 5637 %vecext1.i = extractelement <2 x double> %__Y, i32 0 5638 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5639 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 5640 ret <2 x double> %vecins.i 5641} 5642 5643define <2 x double> @test_mm_mask3_fmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5644; X86-LABEL: test_mm_mask3_fmadd_round_sd: 5645; X86: # %bb.0: # %entry 5646; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5647; X86-NEXT: kmovw %eax, %k1 5648; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5649; X86-NEXT: vmovapd %xmm2, %xmm0 5650; X86-NEXT: retl 5651; 5652; X64-LABEL: test_mm_mask3_fmadd_round_sd: 5653; X64: # %bb.0: # %entry 5654; X64-NEXT: kmovw %edi, %k1 5655; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5656; X64-NEXT: vmovapd %xmm2, %xmm0 5657; X64-NEXT: retq 5658entry: 5659 %0 = extractelement <2 x double> %__W, i64 0 5660 %1 = extractelement <2 x double> %__X, i64 0 5661 %2 = extractelement <2 x double> %__Y, i64 0 5662 %3 = tail 
call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5663 %4 = bitcast i8 %__U to <8 x i1> 5664 %5 = extractelement <8 x i1> %4, i64 0 5665 %6 = select i1 %5, double %3, double %2 5666 %7 = insertelement <2 x double> %__Y, double %6, i64 0 5667 ret <2 x double> %7 5668} 5669 5670define <2 x double> @test_mm_mask_fmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5671; X86-LABEL: test_mm_mask_fmsub_sd: 5672; X86: # %bb.0: # %entry 5673; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5674; X86-NEXT: kmovw %eax, %k1 5675; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5676; X86-NEXT: retl 5677; 5678; X64-LABEL: test_mm_mask_fmsub_sd: 5679; X64: # %bb.0: # %entry 5680; X64-NEXT: kmovw %edi, %k1 5681; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5682; X64-NEXT: retq 5683entry: 5684 %0 = extractelement <2 x double> %__W, i64 0 5685 %1 = extractelement <2 x double> %__A, i64 0 5686 %.rhs.i = extractelement <2 x double> %__B, i64 0 5687 %2 = fsub double -0.000000e+00, %.rhs.i 5688 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5689 %4 = and i8 %__U, 1 5690 %tobool.i = icmp eq i8 %4, 0 5691 %vecext1.i = extractelement <2 x double> %__W, i32 0 5692 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5693 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 5694 ret <2 x double> %vecins.i 5695} 5696 5697define <2 x double> @test_mm_mask_fmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5698; X86-LABEL: test_mm_mask_fmsub_round_sd: 5699; X86: # %bb.0: # %entry 5700; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5701; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm2 5702; X86-NEXT: kmovw %eax, %k1 5703; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5704; X86-NEXT: retl 5705; 5706; X64-LABEL: test_mm_mask_fmsub_round_sd: 5707; X64: # %bb.0: # %entry 5708; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm2 5709; X64-NEXT: 
kmovw %edi, %k1 5710; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5711; X64-NEXT: retq 5712entry: 5713 %0 = extractelement <2 x double> %__W, i64 0 5714 %1 = extractelement <2 x double> %__A, i64 0 5715 %.rhs = extractelement <2 x double> %__B, i64 0 5716 %2 = fsub double -0.000000e+00, %.rhs 5717 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5718 %4 = bitcast i8 %__U to <8 x i1> 5719 %5 = extractelement <8 x i1> %4, i64 0 5720 %6 = select i1 %5, double %3, double %0 5721 %7 = insertelement <2 x double> %__W, double %6, i64 0 5722 ret <2 x double> %7 5723} 5724 5725define <2 x double> @test_mm_maskz_fmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5726; X86-LABEL: test_mm_maskz_fmsub_sd: 5727; X86: # %bb.0: # %entry 5728; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5729; X86-NEXT: kmovw %eax, %k1 5730; X86-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5731; X86-NEXT: retl 5732; 5733; X64-LABEL: test_mm_maskz_fmsub_sd: 5734; X64: # %bb.0: # %entry 5735; X64-NEXT: kmovw %edi, %k1 5736; X64-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 5737; X64-NEXT: retq 5738entry: 5739 %0 = extractelement <2 x double> %__A, i64 0 5740 %1 = extractelement <2 x double> %__B, i64 0 5741 %.rhs.i = extractelement <2 x double> %__C, i64 0 5742 %2 = fsub double -0.000000e+00, %.rhs.i 5743 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5744 %4 = and i8 %__U, 1 5745 %tobool.i = icmp eq i8 %4, 0 5746 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 5747 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 5748 ret <2 x double> %vecins.i 5749} 5750 5751define <2 x double> @test_mm_maskz_fmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5752; X86-LABEL: test_mm_maskz_fmsub_round_sd: 5753; X86: # %bb.0: # %entry 5754; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5755; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, 
%xmm2 5756; X86-NEXT: kmovw %eax, %k1 5757; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5758; X86-NEXT: retl 5759; 5760; X64-LABEL: test_mm_maskz_fmsub_round_sd: 5761; X64: # %bb.0: # %entry 5762; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm2 5763; X64-NEXT: kmovw %edi, %k1 5764; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5765; X64-NEXT: retq 5766entry: 5767 %0 = extractelement <2 x double> %__A, i64 0 5768 %1 = extractelement <2 x double> %__B, i64 0 5769 %.rhs = extractelement <2 x double> %__C, i64 0 5770 %2 = fsub double -0.000000e+00, %.rhs 5771 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5772 %4 = bitcast i8 %__U to <8 x i1> 5773 %5 = extractelement <8 x i1> %4, i64 0 5774 %6 = select i1 %5, double %3, double 0.000000e+00 5775 %7 = insertelement <2 x double> %__A, double %6, i64 0 5776 ret <2 x double> %7 5777} 5778 5779define <2 x double> @test_mm_mask3_fmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5780; X86-LABEL: test_mm_mask3_fmsub_sd: 5781; X86: # %bb.0: # %entry 5782; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5783; X86-NEXT: kmovw %eax, %k1 5784; X86-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5785; X86-NEXT: vmovapd %xmm2, %xmm0 5786; X86-NEXT: retl 5787; 5788; X64-LABEL: test_mm_mask3_fmsub_sd: 5789; X64: # %bb.0: # %entry 5790; X64-NEXT: kmovw %edi, %k1 5791; X64-NEXT: vfmsub231sd {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 5792; X64-NEXT: vmovapd %xmm2, %xmm0 5793; X64-NEXT: retq 5794entry: 5795 %0 = extractelement <2 x double> %__W, i64 0 5796 %1 = extractelement <2 x double> %__X, i64 0 5797 %.rhs.i = extractelement <2 x double> %__Y, i64 0 5798 %2 = fsub double -0.000000e+00, %.rhs.i 5799 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5800 %4 = and i8 %__U, 1 5801 %tobool.i = icmp eq i8 %4, 0 5802 %vecext1.i = extractelement <2 x double> %__Y, i32 0 5803 %cond.i = select i1 %tobool.i, double %vecext1.i, 
double %3 5804 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 5805 ret <2 x double> %vecins.i 5806} 5807 5808define <2 x double> @test_mm_mask3_fmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5809; X86-LABEL: test_mm_mask3_fmsub_round_sd: 5810; X86: # %bb.0: # %entry 5811; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5812; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm2, %xmm3 5813; X86-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1 5814; X86-NEXT: kmovw %eax, %k1 5815; X86-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} 5816; X86-NEXT: vmovapd %xmm2, %xmm0 5817; X86-NEXT: retl 5818; 5819; X64-LABEL: test_mm_mask3_fmsub_round_sd: 5820; X64: # %bb.0: # %entry 5821; X64-NEXT: vxorpd {{.*}}(%rip), %xmm2, %xmm3 5822; X64-NEXT: vfmadd213sd %xmm3, %xmm0, %xmm1 5823; X64-NEXT: kmovw %edi, %k1 5824; X64-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} 5825; X64-NEXT: vmovapd %xmm2, %xmm0 5826; X64-NEXT: retq 5827entry: 5828 %0 = extractelement <2 x double> %__W, i64 0 5829 %1 = extractelement <2 x double> %__X, i64 0 5830 %.rhs = extractelement <2 x double> %__Y, i64 0 5831 %2 = fsub double -0.000000e+00, %.rhs 5832 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5833 %4 = bitcast i8 %__U to <8 x i1> 5834 %5 = extractelement <8 x i1> %4, i64 0 5835 %6 = select i1 %5, double %3, double %.rhs 5836 %7 = insertelement <2 x double> %__Y, double %6, i64 0 5837 ret <2 x double> %7 5838} 5839 5840define <2 x double> @test_mm_mask_fnmadd_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5841; X86-LABEL: test_mm_mask_fnmadd_sd: 5842; X86: # %bb.0: # %entry 5843; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5844; X86-NEXT: kmovw %eax, %k1 5845; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5846; X86-NEXT: retl 5847; 5848; X64-LABEL: test_mm_mask_fnmadd_sd: 5849; X64: # %bb.0: # %entry 5850; X64-NEXT: kmovw %edi, %k1 5851; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5852; 
X64-NEXT: retq 5853entry: 5854 %0 = extractelement <2 x double> %__W, i64 0 5855 %.rhs.i = extractelement <2 x double> %__A, i64 0 5856 %1 = fsub double -0.000000e+00, %.rhs.i 5857 %2 = extractelement <2 x double> %__B, i64 0 5858 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5859 %4 = and i8 %__U, 1 5860 %tobool.i = icmp eq i8 %4, 0 5861 %vecext1.i = extractelement <2 x double> %__W, i32 0 5862 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5863 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 5864 ret <2 x double> %vecins.i 5865} 5866 5867define <2 x double> @test_mm_mask_fnmadd_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 5868; X86-LABEL: test_mm_mask_fnmadd_round_sd: 5869; X86: # %bb.0: # %entry 5870; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5871; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1 5872; X86-NEXT: kmovw %eax, %k1 5873; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5874; X86-NEXT: retl 5875; 5876; X64-LABEL: test_mm_mask_fnmadd_round_sd: 5877; X64: # %bb.0: # %entry 5878; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 5879; X64-NEXT: kmovw %edi, %k1 5880; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 5881; X64-NEXT: retq 5882entry: 5883 %0 = extractelement <2 x double> %__W, i64 0 5884 %.rhs = extractelement <2 x double> %__A, i64 0 5885 %1 = fsub double -0.000000e+00, %.rhs 5886 %2 = extractelement <2 x double> %__B, i64 0 5887 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5888 %4 = bitcast i8 %__U to <8 x i1> 5889 %5 = extractelement <8 x i1> %4, i64 0 5890 %6 = select i1 %5, double %3, double %0 5891 %7 = insertelement <2 x double> %__W, double %6, i64 0 5892 ret <2 x double> %7 5893} 5894 5895define <2 x double> @test_mm_maskz_fnmadd_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5896; X86-LABEL: test_mm_maskz_fnmadd_sd: 5897; X86: # %bb.0: # %entry 5898; 
X86-NEXT: movb {{[0-9]+}}(%esp), %al 5899; X86-NEXT: kmovw %eax, %k1 5900; X86-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5901; X86-NEXT: retl 5902; 5903; X64-LABEL: test_mm_maskz_fnmadd_sd: 5904; X64: # %bb.0: # %entry 5905; X64-NEXT: kmovw %edi, %k1 5906; X64-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 5907; X64-NEXT: retq 5908entry: 5909 %0 = extractelement <2 x double> %__A, i64 0 5910 %.rhs.i = extractelement <2 x double> %__B, i64 0 5911 %1 = fsub double -0.000000e+00, %.rhs.i 5912 %2 = extractelement <2 x double> %__C, i64 0 5913 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5914 %4 = and i8 %__U, 1 5915 %tobool.i = icmp eq i8 %4, 0 5916 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 5917 %vecins.i = insertelement <2 x double> %__A, double %cond.i, i32 0 5918 ret <2 x double> %vecins.i 5919} 5920 5921define <2 x double> @test_mm_maskz_fnmadd_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 5922; X86-LABEL: test_mm_maskz_fnmadd_round_sd: 5923; X86: # %bb.0: # %entry 5924; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5925; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1 5926; X86-NEXT: kmovw %eax, %k1 5927; X86-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5928; X86-NEXT: retl 5929; 5930; X64-LABEL: test_mm_maskz_fnmadd_round_sd: 5931; X64: # %bb.0: # %entry 5932; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 5933; X64-NEXT: kmovw %edi, %k1 5934; X64-NEXT: vfmadd213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 5935; X64-NEXT: retq 5936entry: 5937 %0 = extractelement <2 x double> %__A, i64 0 5938 %.rhs = extractelement <2 x double> %__B, i64 0 5939 %1 = fsub double -0.000000e+00, %.rhs 5940 %2 = extractelement <2 x double> %__C, i64 0 5941 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 5942 %4 = bitcast i8 %__U to <8 x i1> 5943 %5 = extractelement <8 x i1> %4, i64 0 5944 %6 = select i1 %5, double %3, double 0.000000e+00 
5945 %7 = insertelement <2 x double> %__A, double %6, i64 0 5946 ret <2 x double> %7 5947} 5948 5949define <2 x double> @test_mm_mask3_fnmadd_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5950; X86-LABEL: test_mm_mask3_fnmadd_sd: 5951; X86: # %bb.0: # %entry 5952; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5953; X86-NEXT: kmovw %eax, %k1 5954; X86-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 5955; X86-NEXT: vmovapd %xmm2, %xmm0 5956; X86-NEXT: retl 5957; 5958; X64-LABEL: test_mm_mask3_fnmadd_sd: 5959; X64: # %bb.0: # %entry 5960; X64-NEXT: kmovw %edi, %k1 5961; X64-NEXT: vfnmadd231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) + xmm2 5962; X64-NEXT: vmovapd %xmm2, %xmm0 5963; X64-NEXT: retq 5964entry: 5965 %0 = extractelement <2 x double> %__W, i64 0 5966 %.rhs.i = extractelement <2 x double> %__X, i64 0 5967 %1 = fsub double -0.000000e+00, %.rhs.i 5968 %2 = extractelement <2 x double> %__Y, i64 0 5969 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 5970 %4 = and i8 %__U, 1 5971 %tobool.i = icmp eq i8 %4, 0 5972 %vecext1.i = extractelement <2 x double> %__Y, i32 0 5973 %cond.i = select i1 %tobool.i, double %vecext1.i, double %3 5974 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 5975 ret <2 x double> %vecins.i 5976} 5977 5978define <2 x double> @test_mm_mask3_fnmadd_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 5979; X86-LABEL: test_mm_mask3_fnmadd_round_sd: 5980; X86: # %bb.0: # %entry 5981; X86-NEXT: movb {{[0-9]+}}(%esp), %al 5982; X86-NEXT: vxorpd {{\.LCPI.*}}, %xmm1, %xmm1 5983; X86-NEXT: kmovw %eax, %k1 5984; X86-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 5985; X86-NEXT: vmovapd %xmm2, %xmm0 5986; X86-NEXT: retl 5987; 5988; X64-LABEL: test_mm_mask3_fnmadd_round_sd: 5989; X64: # %bb.0: # %entry 5990; X64-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 5991; X64-NEXT: kmovw %edi, %k1 5992; X64-NEXT: vfmadd231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 
5993; X64-NEXT: vmovapd %xmm2, %xmm0 5994; X64-NEXT: retq 5995entry: 5996 %0 = extractelement <2 x double> %__W, i64 0 5997 %.rhs = extractelement <2 x double> %__X, i64 0 5998 %1 = fsub double -0.000000e+00, %.rhs 5999 %2 = extractelement <2 x double> %__Y, i64 0 6000 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6001 %4 = bitcast i8 %__U to <8 x i1> 6002 %5 = extractelement <8 x i1> %4, i64 0 6003 %6 = select i1 %5, double %3, double %2 6004 %7 = insertelement <2 x double> %__Y, double %6, i64 0 6005 ret <2 x double> %7 6006} 6007 6008define <2 x double> @test_mm_mask_fnmsub_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 6009; X86-LABEL: test_mm_mask_fnmsub_sd: 6010; X86: # %bb.0: # %entry 6011; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6012; X86-NEXT: kmovw %eax, %k1 6013; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 6014; X86-NEXT: retl 6015; 6016; X64-LABEL: test_mm_mask_fnmsub_sd: 6017; X64: # %bb.0: # %entry 6018; X64-NEXT: kmovw %edi, %k1 6019; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 6020; X64-NEXT: retq 6021entry: 6022 %0 = extractelement <2 x double> %__W, i64 0 6023 %.rhs.i = extractelement <2 x double> %__A, i64 0 6024 %1 = fsub double -0.000000e+00, %.rhs.i 6025 %.rhs7.i = extractelement <2 x double> %__B, i64 0 6026 %2 = fsub double -0.000000e+00, %.rhs7.i 6027 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6028 %4 = and i8 %__U, 1 6029 %tobool.i = icmp eq i8 %4, 0 6030 %vecext2.i = extractelement <2 x double> %__W, i32 0 6031 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3 6032 %vecins.i = insertelement <2 x double> %__W, double %cond.i, i32 0 6033 ret <2 x double> %vecins.i 6034} 6035 6036define <2 x double> @test_mm_mask_fnmsub_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) { 6037; X86-LABEL: test_mm_mask_fnmsub_round_sd: 6038; X86: # %bb.0: # %entry 6039; X86-NEXT: 
movb {{[0-9]+}}(%esp), %al 6040; X86-NEXT: kmovw %eax, %k1 6041; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 6042; X86-NEXT: retl 6043; 6044; X64-LABEL: test_mm_mask_fnmsub_round_sd: 6045; X64: # %bb.0: # %entry 6046; X64-NEXT: kmovw %edi, %k1 6047; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} 6048; X64-NEXT: retq 6049entry: 6050 %0 = extractelement <2 x double> %__W, i64 0 6051 %.rhs = extractelement <2 x double> %__A, i64 0 6052 %1 = fsub double -0.000000e+00, %.rhs 6053 %.rhs2 = extractelement <2 x double> %__B, i64 0 6054 %2 = fsub double -0.000000e+00, %.rhs2 6055 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6056 %4 = bitcast i8 %__U to <8 x i1> 6057 %5 = extractelement <8 x i1> %4, i64 0 6058 %6 = select i1 %5, double %3, double %0 6059 %7 = insertelement <2 x double> %__W, double %6, i64 0 6060 ret <2 x double> %7 6061} 6062 6063define <2 x double> @test_mm_maskz_fnmsub_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 6064; X86-LABEL: test_mm_maskz_fnmsub_sd: 6065; X86: # %bb.0: # %entry 6066; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6067; X86-NEXT: kmovw %eax, %k1 6068; X86-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 6069; X86-NEXT: retl 6070; 6071; X64-LABEL: test_mm_maskz_fnmsub_sd: 6072; X64: # %bb.0: # %entry 6073; X64-NEXT: kmovw %edi, %k1 6074; X64-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2 6075; X64-NEXT: retq 6076entry: 6077 %0 = extractelement <2 x double> %__A, i64 0 6078 %.rhs.i = extractelement <2 x double> %__B, i64 0 6079 %1 = fsub double -0.000000e+00, %.rhs.i 6080 %.rhs5.i = extractelement <2 x double> %__C, i64 0 6081 %2 = fsub double -0.000000e+00, %.rhs5.i 6082 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6083 %4 = and i8 %__U, 1 6084 %tobool.i = icmp eq i8 %4, 0 6085 %cond.i = select i1 %tobool.i, double 0.000000e+00, double %3 6086 %vecins.i = insertelement <2 x double> %__A, double 
%cond.i, i32 0 6087 ret <2 x double> %vecins.i 6088} 6089 6090define <2 x double> @test_mm_maskz_fnmsub_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B, <2 x double> %__C) { 6091; X86-LABEL: test_mm_maskz_fnmsub_round_sd: 6092; X86: # %bb.0: # %entry 6093; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6094; X86-NEXT: kmovw %eax, %k1 6095; X86-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 6096; X86-NEXT: retl 6097; 6098; X64-LABEL: test_mm_maskz_fnmsub_round_sd: 6099; X64: # %bb.0: # %entry 6100; X64-NEXT: kmovw %edi, %k1 6101; X64-NEXT: vfnmsub213sd {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} 6102; X64-NEXT: retq 6103entry: 6104 %0 = extractelement <2 x double> %__A, i64 0 6105 %.rhs = extractelement <2 x double> %__B, i64 0 6106 %1 = fsub double -0.000000e+00, %.rhs 6107 %.rhs2 = extractelement <2 x double> %__C, i64 0 6108 %2 = fsub double -0.000000e+00, %.rhs2 6109 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6110 %4 = bitcast i8 %__U to <8 x i1> 6111 %5 = extractelement <8 x i1> %4, i64 0 6112 %6 = select i1 %5, double %3, double 0.000000e+00 6113 %7 = insertelement <2 x double> %__A, double %6, i64 0 6114 ret <2 x double> %7 6115} 6116 6117define <2 x double> @test_mm_mask3_fnmsub_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 6118; X86-LABEL: test_mm_mask3_fnmsub_sd: 6119; X86: # %bb.0: # %entry 6120; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6121; X86-NEXT: kmovw %eax, %k1 6122; X86-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 6123; X86-NEXT: vmovapd %xmm2, %xmm0 6124; X86-NEXT: retl 6125; 6126; X64-LABEL: test_mm_mask3_fnmsub_sd: 6127; X64: # %bb.0: # %entry 6128; X64-NEXT: kmovw %edi, %k1 6129; X64-NEXT: vfnmsub231sd {{.*#+}} xmm2 = -(xmm0 * xmm1) - xmm2 6130; X64-NEXT: vmovapd %xmm2, %xmm0 6131; X64-NEXT: retq 6132entry: 6133 %0 = extractelement <2 x double> %__W, i64 0 6134 %.rhs.i = extractelement <2 x double> %__X, i64 0 6135 %1 = fsub double 
-0.000000e+00, %.rhs.i 6136 %.rhs7.i = extractelement <2 x double> %__Y, i64 0 6137 %2 = fsub double -0.000000e+00, %.rhs7.i 6138 %3 = tail call double @llvm.fma.f64(double %0, double %1, double %2) #10 6139 %4 = and i8 %__U, 1 6140 %tobool.i = icmp eq i8 %4, 0 6141 %vecext2.i = extractelement <2 x double> %__Y, i32 0 6142 %cond.i = select i1 %tobool.i, double %vecext2.i, double %3 6143 %vecins.i = insertelement <2 x double> %__Y, double %cond.i, i32 0 6144 ret <2 x double> %vecins.i 6145} 6146 6147define <2 x double> @test_mm_mask3_fnmsub_round_sd(<2 x double> %__W, <2 x double> %__X, <2 x double> %__Y, i8 zeroext %__U) { 6148; X86-LABEL: test_mm_mask3_fnmsub_round_sd: 6149; X86: # %bb.0: # %entry 6150; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6151; X86-NEXT: kmovw %eax, %k1 6152; X86-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 6153; X86-NEXT: vmovapd %xmm2, %xmm0 6154; X86-NEXT: retl 6155; 6156; X64-LABEL: test_mm_mask3_fnmsub_round_sd: 6157; X64: # %bb.0: # %entry 6158; X64-NEXT: kmovw %edi, %k1 6159; X64-NEXT: vfnmsub231sd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} 6160; X64-NEXT: vmovapd %xmm2, %xmm0 6161; X64-NEXT: retq 6162entry: 6163 %0 = extractelement <2 x double> %__W, i64 0 6164 %.rhs = extractelement <2 x double> %__X, i64 0 6165 %1 = fsub double -0.000000e+00, %.rhs 6166 %.rhs1 = extractelement <2 x double> %__Y, i64 0 6167 %2 = fsub double -0.000000e+00, %.rhs1 6168 %3 = tail call double @llvm.x86.avx512.vfmadd.f64(double %0, double %1, double %2, i32 8) 6169 %4 = bitcast i8 %__U to <8 x i1> 6170 %5 = extractelement <8 x i1> %4, i64 0 6171 %6 = select i1 %5, double %3, double %.rhs1 6172 %7 = insertelement <2 x double> %__Y, double %6, i64 0 6173 ret <2 x double> %7 6174} 6175 6176define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) { 6177; X86-LABEL: test_mm512_mask_expandloadu_epi64: 6178; X86: # %bb.0: # %entry 6179; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6180; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 
6181; X86-NEXT: kmovw %ecx, %k1 6182; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} 6183; X86-NEXT: retl 6184; 6185; X64-LABEL: test_mm512_mask_expandloadu_epi64: 6186; X64: # %bb.0: # %entry 6187; X64-NEXT: kmovw %edi, %k1 6188; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} 6189; X64-NEXT: retq 6190entry: 6191 %0 = bitcast i8* %__P to i64* 6192 %1 = bitcast i8 %__U to <8 x i1> 6193 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W) 6194 ret <8 x i64> %2 6195} 6196 6197define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) { 6198; X86-LABEL: test_mm512_maskz_expandloadu_epi64: 6199; X86: # %bb.0: # %entry 6200; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6201; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6202; X86-NEXT: kmovw %ecx, %k1 6203; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z} 6204; X86-NEXT: retl 6205; 6206; X64-LABEL: test_mm512_maskz_expandloadu_epi64: 6207; X64: # %bb.0: # %entry 6208; X64-NEXT: kmovw %edi, %k1 6209; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z} 6210; X64-NEXT: retq 6211entry: 6212 %0 = bitcast i8* %__P to i64* 6213 %1 = bitcast i8 %__U to <8 x i1> 6214 %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer) 6215 ret <8 x i64> %2 6216} 6217 6218define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) { 6219; X86-LABEL: test_mm512_mask_expandloadu_pd: 6220; X86: # %bb.0: # %entry 6221; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6222; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6223; X86-NEXT: kmovw %ecx, %k1 6224; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} 6225; X86-NEXT: retl 6226; 6227; X64-LABEL: test_mm512_mask_expandloadu_pd: 6228; X64: # %bb.0: # %entry 6229; X64-NEXT: kmovw %edi, %k1 6230; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} 6231; X64-NEXT: retq 6232entry: 6233 %0 = bitcast i8* %__P to double* 6234 %1 = bitcast i8 %__U to <8 x i1> 6235 %2 = tail call <8 x double> 
@llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W) 6236 ret <8 x double> %2 6237} 6238 6239define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) { 6240; X86-LABEL: test_mm512_maskz_expandloadu_pd: 6241; X86: # %bb.0: # %entry 6242; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6243; X86-NEXT: movb {{[0-9]+}}(%esp), %cl 6244; X86-NEXT: kmovw %ecx, %k1 6245; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z} 6246; X86-NEXT: retl 6247; 6248; X64-LABEL: test_mm512_maskz_expandloadu_pd: 6249; X64: # %bb.0: # %entry 6250; X64-NEXT: kmovw %edi, %k1 6251; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z} 6252; X64-NEXT: retq 6253entry: 6254 %0 = bitcast i8* %__P to double* 6255 %1 = bitcast i8 %__U to <8 x i1> 6256 %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer) 6257 ret <8 x double> %2 6258} 6259 6260define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) { 6261; X86-LABEL: test_mm512_mask_expandloadu_epi32: 6262; X86: # %bb.0: # %entry 6263; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6264; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6265; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} 6266; X86-NEXT: retl 6267; 6268; X64-LABEL: test_mm512_mask_expandloadu_epi32: 6269; X64: # %bb.0: # %entry 6270; X64-NEXT: kmovw %edi, %k1 6271; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} 6272; X64-NEXT: retq 6273entry: 6274 %0 = bitcast <8 x i64> %__W to <16 x i32> 6275 %1 = bitcast i8* %__P to i32* 6276 %2 = bitcast i16 %__U to <16 x i1> 6277 %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11 6278 %4 = bitcast <16 x i32> %3 to <8 x i64> 6279 ret <8 x i64> %4 6280} 6281 6282define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) { 6283; X86-LABEL: test_mm512_maskz_expandloadu_epi32: 6284; X86: # %bb.0: # %entry 6285; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6286; X86-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 6287; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z} 6288; X86-NEXT: retl 6289; 6290; X64-LABEL: test_mm512_maskz_expandloadu_epi32: 6291; X64: # %bb.0: # %entry 6292; X64-NEXT: kmovw %edi, %k1 6293; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z} 6294; X64-NEXT: retq 6295entry: 6296 %0 = bitcast i8* %__P to i32* 6297 %1 = bitcast i16 %__U to <16 x i1> 6298 %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer) 6299 %3 = bitcast <16 x i32> %2 to <8 x i64> 6300 ret <8 x i64> %3 6301} 6302 6303define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) { 6304; X86-LABEL: test_mm512_mask_expandloadu_ps: 6305; X86: # %bb.0: # %entry 6306; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6307; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6308; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} 6309; X86-NEXT: retl 6310; 6311; X64-LABEL: test_mm512_mask_expandloadu_ps: 6312; X64: # %bb.0: # %entry 6313; X64-NEXT: kmovw %edi, %k1 6314; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} 6315; X64-NEXT: retq 6316entry: 6317 %0 = bitcast i8* %__P to float* 6318 %1 = bitcast i16 %__U to <16 x i1> 6319 %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11 6320 ret <16 x float> %2 6321} 6322 6323define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) { 6324; X86-LABEL: test_mm512_maskz_expandloadu_ps: 6325; X86: # %bb.0: # %entry 6326; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6327; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6328; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z} 6329; X86-NEXT: retl 6330; 6331; X64-LABEL: test_mm512_maskz_expandloadu_ps: 6332; X64: # %bb.0: # %entry 6333; X64-NEXT: kmovw %edi, %k1 6334; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z} 6335; X64-NEXT: retq 6336entry: 6337 %0 = bitcast i8* %__P to float* 6338 %1 = bitcast i16 %__U to <16 x i1> 6339 %2 = tail call <16 x float> 
@llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer) 6340 ret <16 x float> %2 6341} 6342 6343define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) { 6344; X86-LABEL: test_mm512_mask_compressstoreu_pd: 6345; X86: # %bb.0: # %entry 6346; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6347; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6348; X86-NEXT: kmovw %eax, %k1 6349; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1} 6350; X86-NEXT: vzeroupper 6351; X86-NEXT: retl 6352; 6353; X64-LABEL: test_mm512_mask_compressstoreu_pd: 6354; X64: # %bb.0: # %entry 6355; X64-NEXT: kmovw %esi, %k1 6356; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1} 6357; X64-NEXT: vzeroupper 6358; X64-NEXT: retq 6359entry: 6360 %0 = bitcast i8* %__P to double* 6361 %1 = bitcast i8 %__U to <8 x i1> 6362 tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1) 6363 ret void 6364} 6365 6366define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) { 6367; X86-LABEL: test_mm512_mask_compressstoreu_epi64: 6368; X86: # %bb.0: # %entry 6369; X86-NEXT: movb {{[0-9]+}}(%esp), %al 6370; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 6371; X86-NEXT: kmovw %eax, %k1 6372; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1} 6373; X86-NEXT: vzeroupper 6374; X86-NEXT: retl 6375; 6376; X64-LABEL: test_mm512_mask_compressstoreu_epi64: 6377; X64: # %bb.0: # %entry 6378; X64-NEXT: kmovw %esi, %k1 6379; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1} 6380; X64-NEXT: vzeroupper 6381; X64-NEXT: retq 6382entry: 6383 %0 = bitcast i8* %__P to i64* 6384 %1 = bitcast i8 %__U to <8 x i1> 6385 tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1) 6386 ret void 6387} 6388 6389define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) { 6390; X86-LABEL: test_mm512_mask_compressstoreu_ps: 6391; X86: # %bb.0: # %entry 6392; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6393; X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax 6394; X86-NEXT: vcompressps %zmm0, (%eax) {%k1} 6395; X86-NEXT: vzeroupper 6396; X86-NEXT: retl 6397; 6398; X64-LABEL: test_mm512_mask_compressstoreu_ps: 6399; X64: # %bb.0: # %entry 6400; X64-NEXT: kmovw %esi, %k1 6401; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1} 6402; X64-NEXT: vzeroupper 6403; X64-NEXT: retq 6404entry: 6405 %0 = bitcast i8* %__P to float* 6406 %1 = bitcast i16 %__U to <16 x i1> 6407 tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1) 6408 ret void 6409} 6410 6411define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) { 6412; X86-LABEL: test_mm512_mask_compressstoreu_epi32: 6413; X86: # %bb.0: # %entry 6414; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 6415; X86-NEXT: movl {{[0-9]+}}(%esp), %eax 6416; X86-NEXT: vpcompressd %zmm0, (%eax) {%k1} 6417; X86-NEXT: vzeroupper 6418; X86-NEXT: retl 6419; 6420; X64-LABEL: test_mm512_mask_compressstoreu_epi32: 6421; X64: # %bb.0: # %entry 6422; X64-NEXT: kmovw %esi, %k1 6423; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1} 6424; X64-NEXT: vzeroupper 6425; X64-NEXT: retq 6426entry: 6427 %0 = bitcast <8 x i64> %__A to <16 x i32> 6428 %1 = bitcast i8* %__P to i32* 6429 %2 = bitcast i16 %__U to <16 x i1> 6430 tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2) 6431 ret void 6432} 6433 6434define i64 @test_mm512_reduce_add_epi64(<8 x i64> %__W) { 6435; X86-LABEL: test_mm512_reduce_add_epi64: 6436; X86: # %bb.0: # %entry 6437; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6438; X86-NEXT: vpaddq %ymm1, %ymm0, %ymm0 6439; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6440; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 6441; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 6442; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 6443; X86-NEXT: vmovd %xmm0, %eax 6444; X86-NEXT: vpextrd $1, %xmm0, %edx 6445; X86-NEXT: vzeroupper 6446; X86-NEXT: retl 6447; 6448; X64-LABEL: test_mm512_reduce_add_epi64: 6449; X64: # %bb.0: # 
%entry 6450; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6451; X64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 6452; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6453; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 6454; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 6455; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 6456; X64-NEXT: vmovq %xmm0, %rax 6457; X64-NEXT: vzeroupper 6458; X64-NEXT: retq 6459entry: 6460 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 6461 %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 6462 %add.i = add <4 x i64> %shuffle.i, %shuffle1.i 6463 %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1> 6464 %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3> 6465 %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i 6466 %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef> 6467 %add7.i = add <2 x i64> %shuffle6.i, %add4.i 6468 %vecext.i = extractelement <2 x i64> %add7.i, i32 0 6469 ret i64 %vecext.i 6470} 6471 6472define i64 @test_mm512_reduce_mul_epi64(<8 x i64> %__W) { 6473; X86-LABEL: test_mm512_reduce_mul_epi64: 6474; X86: # %bb.0: # %entry 6475; X86-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6476; X86-NEXT: vpsrlq $32, %ymm0, %ymm2 6477; X86-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 6478; X86-NEXT: vpsrlq $32, %ymm1, %ymm3 6479; X86-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 6480; X86-NEXT: vpaddq %ymm2, %ymm3, %ymm2 6481; X86-NEXT: vpsllq $32, %ymm2, %ymm2 6482; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 6483; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 6484; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 6485; X86-NEXT: vpsrlq $32, %xmm0, %xmm2 6486; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 6487; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 6488; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 6489; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 6490; X86-NEXT: vpsllq $32, %xmm2, %xmm2 6491; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 6492; 
X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6493; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 6494; X86-NEXT: vpsrlq $32, %xmm0, %xmm2 6495; X86-NEXT: vpmuludq %xmm2, %xmm1, %xmm2 6496; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 6497; X86-NEXT: vpmuludq %xmm0, %xmm3, %xmm3 6498; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm2 6499; X86-NEXT: vpsllq $32, %xmm2, %xmm2 6500; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 6501; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6502; X86-NEXT: vmovd %xmm0, %eax 6503; X86-NEXT: vpextrd $1, %xmm0, %edx 6504; X86-NEXT: vzeroupper 6505; X86-NEXT: retl 6506; 6507; X64-LABEL: test_mm512_reduce_mul_epi64: 6508; X64: # %bb.0: # %entry 6509; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm1 6510; X64-NEXT: vpsrlq $32, %ymm0, %ymm2 6511; X64-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 6512; X64-NEXT: vpsrlq $32, %ymm1, %ymm3 6513; X64-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 6514; X64-NEXT: vpaddq %ymm2, %ymm3, %ymm2 6515; X64-NEXT: vpsllq $32, %ymm2, %ymm2 6516; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 6517; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 6518; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 6519; X64-NEXT: vpsrlq $32, %xmm0, %xmm2 6520; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 6521; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 6522; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 6523; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 6524; X64-NEXT: vpsllq $32, %xmm2, %xmm2 6525; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 6526; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6527; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 6528; X64-NEXT: vpsrlq $32, %xmm0, %xmm2 6529; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm2 6530; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 6531; X64-NEXT: vpmuludq %xmm0, %xmm3, %xmm3 6532; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 6533; X64-NEXT: vpsllq $32, %xmm2, %xmm2 6534; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 6535; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 6536; X64-NEXT: vmovq %xmm0, %rax 6537; X64-NEXT: vzeroupper 6538; X64-NEXT: retq 6539entry: 6540 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, 
i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
  %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
  ret i64 %vecext.i
}

; Unmasked OR-reduction of eight i64 lanes: repeated shufflevector halving
; (512 -> 256 -> 128 -> 64 bits) with an OR at each step. On X86 the i64
; result is returned in EDX:EAX (vmovd + vpextrd in the checks below).
define i64 @test_mm512_reduce_or_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_or_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_or_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %or7.i = or <2 x i64> %shuffle6.i, %or4.i
  %vecext.i = extractelement <2 x i64> %or7.i, i32 0
  ret i64 %vecext.i
}

; Same halving scheme as the OR reduction above, but combining with AND.
define i64 @test_mm512_reduce_and_epi64(<8 x i64> %__W) {
; X86-LABEL: test_mm512_reduce_and_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpand %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_and_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpand %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %and7.i = and <2 x i64> %shuffle6.i, %and4.i
  %vecext.i = extractelement <2 x i64> %and7.i, i32 0
  ret i64 %vecext.i
}

; Masked ADD reduction: lanes deselected by %__M are zeroed (the additive
; identity, via the {z} zero-masking move) before the halving reduction.
define i64 @test_mm512_mask_reduce_add_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = add <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %add.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = add <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %add4.i, <2 x i64> undef, <2 x i32> <i32
1, i32 undef>
  %add7.i = add <2 x i64> %shuffle6.i, %add4.i
  %vecext.i = extractelement <2 x i64> %add7.i, i32 0
  ret i64 %vecext.i
}

; Masked MUL reduction: inactive lanes are replaced with 1 (the
; multiplicative identity) via a merge-masked move over an all-ones-splat.
; Without AVX-512DQ there is no 64-bit vector multiply, so each mul step
; expands to the vpmuludq/vpsrlq/vpsllq/vpaddq long-multiplication sequence.
define i64 @test_mm512_mask_reduce_mul_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0]
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpsrlq $32, %ymm1, %ymm2
; X86-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
; X86-NEXT:    vpsrlq $32, %ymm0, %ymm3
; X86-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; X86-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X86-NEXT:    vpsllq $32, %ymm2, %ymm2
; X86-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
; X86-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X86-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X86-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
; X86-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X86-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
; X86-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X86-NEXT:    vpsllq $32, %xmm2, %xmm2
; X86-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1]
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpsrlq $32, %ymm1, %ymm2
; X64-NEXT:    vpmuludq %ymm0, %ymm2, %ymm2
; X64-NEXT:    vpsrlq $32, %ymm0, %ymm3
; X64-NEXT:    vpmuludq %ymm3, %ymm1, %ymm3
; X64-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
; X64-NEXT:    vpsllq $32, %ymm2, %ymm2
; X64-NEXT:    vpmuludq %ymm0, %ymm1, %ymm0
; X64-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X64-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
; X64-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpsrlq $32, %xmm0, %xmm2
; X64-NEXT:    vpmuludq %xmm2, %xmm1, %xmm2
; X64-NEXT:    vpsrlq $32, %xmm1, %xmm3
; X64-NEXT:    vpmuludq %xmm0, %xmm3, %xmm3
; X64-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
; X64-NEXT:    vpsllq $32, %xmm2, %xmm2
; X64-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = mul <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %mul.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = mul <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %mul4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %mul7.i = mul <2 x i64> %shuffle6.i, %mul4.i
  %vecext.i = extractelement <2 x i64> %mul7.i, i32 0
  ret i64 %vecext.i
}

; Masked AND reduction: inactive lanes become all-ones (the AND identity),
; materialized with vpternlogd $255 and a merge-masked move.
define i64 @test_mm512_mask_reduce_and_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and.i = and <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %and.i, <4 x i64>
undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %and.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and4.i = and <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %and4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %and7.i = and <2 x i64> %shuffle6.i, %and4.i
  %vecext.i = extractelement <2 x i64> %and7.i, i32 0
  ret i64 %vecext.i
}

; Masked OR reduction: inactive lanes are zeroed ({z} masking; zero is the
; OR identity) before the halving reduction.
define i64 @test_mm512_mask_reduce_or_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or.i = or <4 x i64> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x i64> %or.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or4.i = or <2 x i64> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x i64> %or4.i, <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
  %or7.i = or <2 x i64> %shuffle6.i, %or4.i
  %vecext.i = extractelement <2 x i64> %or7.i, i32 0
  ret i64 %vecext.i
}

; i32-element reductions: X86 and X64 codegen is identical (result fits in
; EAX on both), so a single CHECK prefix covers both RUN lines.
define i32 @test_mm512_reduce_add_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_add_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %add.i = add <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %add.i to <4 x i64>
  %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
  %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %add5.i = add <4 x i32> %3, %4
  %shuffle.i = shufflevector <4 x i32> %add5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %add6.i = add <4 x i32> %shuffle.i, %add5.i
  %shuffle7.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %add8.i = add <4 x i32> %shuffle7.i, %add6.i
  %vecext.i = extractelement <4 x i32> %add8.i, i32 0
  ret i32 %vecext.i
}

; Unmasked i32 MUL reduction (vpmulld at each halving step).
define i32 @test_mm512_reduce_mul_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_mul_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %mul.i = mul <8 x i32> %0, %1
  %2 = bitcast <8 x i32> %mul.i to <4 x i64>
  %extract3.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %3 = bitcast <2 x i64> %extract3.i to <4 x i32>
  %extract4.i = shufflevector <4 x i64> %2, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %4 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %mul5.i = mul <4 x i32> %3, %4
  %shuffle.i = shufflevector <4 x i32> %mul5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %mul6.i = mul <4 x i32> %shuffle.i, %mul5.i
  %shuffle7.i = shufflevector <4 x i32> %mul6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %mul8.i = mul <4 x i32> %shuffle7.i, %mul6.i
  %vecext.i = extractelement <4 x i32> %mul8.i, i32 0
  ret i32 %vecext.i
}

; Unmasked i32 OR reduction; the 64-bit halving steps are done as <i64>
; ops in the IR, matching the clang-generated intrinsic lowering.
define i32
@test_mm512_reduce_or_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_or_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpor %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpor %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or25.i = or <4 x i64> %extract.i, %extract2.i
  %extract3.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract4.i = shufflevector <4 x i64> %or25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or526.i = or <2 x i64> %extract3.i, %extract4.i
  %or5.i = bitcast <2 x i64> %or526.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %or5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %or6.i = or <4 x i32> %shuffle.i, %or5.i
  %shuffle7.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %or8.i = or <4 x i32> %shuffle7.i, %or6.i
  %vecext.i = extractelement <4 x i32> %or8.i, i32 0
  ret i32 %vecext.i
}

; Unmasked i32 AND reduction; same shape as the OR variant above.
define i32 @test_mm512_reduce_and_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_and_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpand %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and25.i = and <4 x i64> %extract.i, %extract2.i
  %extract3.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract4.i = shufflevector <4 x i64> %and25.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and526.i = and <2 x i64> %extract3.i, %extract4.i
  %and5.i = bitcast <2 x i64> %and526.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %and5.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %and6.i = and <4 x i32> %shuffle.i, %and5.i
  %shuffle7.i = shufflevector <4 x i32> %and6.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %and8.i = and <4 x i32> %shuffle7.i, %and6.i
  %vecext.i = extractelement <4 x i32> %and8.i, i32 0
  ret i32 %vecext.i
}

; Masked i32 ADD reduction: a 16-bit mask zeroes inactive i32 lanes ({z}
; masking) before the reduction; X86 loads the mask from the stack.
define i32 @test_mm512_mask_reduce_add_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_add_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; X86-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_add_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
; X64-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
  %add.i = add <8 x i32> %4, %5
  %6 = bitcast <8 x i32> %add.i to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %7 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %extract5.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %8 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %add6.i = add <4 x i32> %7, %8
  %shuffle.i = shufflevector <4 x i32> %add6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %add7.i = add <4 x i32> %shuffle.i, %add6.i
  %shuffle8.i = shufflevector <4 x i32> %add7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %add9.i = add <4 x i32> %shuffle8.i, %add7.i
  %vecext.i = extractelement <4 x i32> %add9.i, i32 0
  ret i32 %vecext.i
}

; Masked i32 MUL reduction: inactive lanes become 1 (multiplicative
; identity) via a merge-masked move over a broadcast of 1.
define i32 @test_mm512_mask_reduce_mul_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_mul_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_mul_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %mul.i = mul <8 x i32> %4, %5
  %6 = bitcast <8 x i32> %mul.i to <4 x i64>
  %extract5.i =
shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %7 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %extract6.i = shufflevector <4 x i64> %6, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %8 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %mul7.i = mul <4 x i32> %7, %8
  %shuffle.i = shufflevector <4 x i32> %mul7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %mul8.i = mul <4 x i32> %shuffle.i, %mul7.i
  %shuffle9.i = shufflevector <4 x i32> %mul8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %mul10.i = mul <4 x i32> %shuffle9.i, %mul8.i
  %vecext.i = extractelement <4 x i32> %mul10.i, i32 0
  ret i32 %vecext.i
}

; Masked i32 AND reduction: inactive lanes become all-ones (AND identity)
; via vpternlogd $255 plus a merge-masked move.
define i32 @test_mm512_mask_reduce_and_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_and_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_and_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpand %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpand %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT:    vpand %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %and28.i = and <4 x i64> %extract.i, %extract4.i
  %extract5.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract6.i = shufflevector <4 x i64> %and28.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %and729.i = and <2 x i64> %extract5.i, %extract6.i
  %and7.i = bitcast <2 x i64> %and729.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %and7.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %and8.i = and <4 x i32> %shuffle.i, %and7.i
  %shuffle9.i = shufflevector <4 x i32> %and8.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %and10.i = and <4 x i32> %shuffle9.i, %and8.i
  %vecext.i = extractelement <4 x i32> %and10.i, i32 0
  ret i32 %vecext.i
}

; Masked i32 OR reduction: inactive lanes zeroed ({z} masking).
define i32 @test_mm512_mask_reduce_or_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_or_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_or_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpor %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; X64-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %or27.i = or <4 x i64> %extract.i, %extract3.i
  %extract4.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %or27.i, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %or628.i = or <2 x i64> %extract4.i, %extract5.i
  %or6.i = bitcast <2 x i64> %or628.i to <4 x i32>
  %shuffle.i = shufflevector <4 x i32> %or6.i, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %or7.i = or <4 x i32> %shuffle.i, %or6.i
  %shuffle8.i = shufflevector <4 x i32> %or7.i, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
  %or9.i = or <4 x i32> %shuffle8.i, %or7.i
  %vecext.i = extractelement <4 x i32> %or9.i, i32 0
  ret i32 %vecext.i
}

; FP reductions: on X86 a double is returned on the x87 stack, so the
; checks include the frame setup to spill/reload via fldl.
define double @test_mm512_reduce_add_pd(<8 x
double> %__W) {
; X86-LABEL: test_mm512_reduce_add_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_add_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %add7.i, i32 0
  ret double %vecext.i
}

; Unmasked double MUL reduction; same halving shape as the fadd variant.
define double @test_mm512_reduce_mul_pd(<8 x double> %__W) {
; X86-LABEL: test_mm512_reduce_mul_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_mul_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i
  %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i
  %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i
  %vecext.i = extractelement <2 x double> %mul7.i, i32 0
  ret double %vecext.i
}

; Unmasked float ADD reduction over 16 lanes; the IR halves via <8 x double>
; bitcasts, and the X86 checks spill/reload through the x87 stack (flds).
define float @test_mm512_reduce_add_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_add_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_add_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %add.i = fadd <8 x float> %1, %2
  %extract3.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %add5.i = fadd <4 x float> %extract3.i, %extract4.i
  %shuffle.i = shufflevector <4 x float> %add5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %add6.i = fadd <4 x float> %add5.i, %shuffle.i
  %shuffle7.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %add8.i = fadd <4 x float> %add6.i, %shuffle7.i
  %vecext.i = extractelement <4 x float> %add8.i, i32 0
  ret float %vecext.i
}

; Unmasked float MUL reduction; same shape as the fadd variant above.
define float @test_mm512_reduce_mul_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_mul_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_mul_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4
x i32> <i32 4, i32 5, i32 6, i32 7> 7380 %2 = bitcast <4 x double> %extract2.i to <8 x float> 7381 %mul.i = fmul <8 x float> %1, %2 7382 %extract3.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7383 %extract4.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7384 %mul5.i = fmul <4 x float> %extract3.i, %extract4.i 7385 %shuffle.i = shufflevector <4 x float> %mul5.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7386 %mul6.i = fmul <4 x float> %mul5.i, %shuffle.i 7387 %shuffle7.i = shufflevector <4 x float> %mul6.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 7388 %mul8.i = fmul <4 x float> %mul6.i, %shuffle7.i 7389 %vecext.i = extractelement <4 x float> %mul8.i, i32 0 7390 ret float %vecext.i 7391} 7392 7393define double @test_mm512_mask_reduce_add_pd(i8 zeroext %__M, <8 x double> %__W) { 7394; X86-LABEL: test_mm512_mask_reduce_add_pd: 7395; X86: # %bb.0: # %entry 7396; X86-NEXT: pushl %ebp 7397; X86-NEXT: .cfi_def_cfa_offset 8 7398; X86-NEXT: .cfi_offset %ebp, -8 7399; X86-NEXT: movl %esp, %ebp 7400; X86-NEXT: .cfi_def_cfa_register %ebp 7401; X86-NEXT: andl $-8, %esp 7402; X86-NEXT: subl $8, %esp 7403; X86-NEXT: movb 8(%ebp), %al 7404; X86-NEXT: kmovw %eax, %k1 7405; X86-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} 7406; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7407; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 7408; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7409; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 7410; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7411; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 7412; X86-NEXT: vmovlpd %xmm0, (%esp) 7413; X86-NEXT: fldl (%esp) 7414; X86-NEXT: movl %ebp, %esp 7415; X86-NEXT: popl %ebp 7416; X86-NEXT: .cfi_def_cfa %esp, 4 7417; X86-NEXT: vzeroupper 7418; X86-NEXT: retl 7419; 7420; X64-LABEL: test_mm512_mask_reduce_add_pd: 7421; X64: # %bb.0: # %entry 7422; X64-NEXT: kmovw %edi, %k1 7423; X64-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} 7424; 
X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7425; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 7426; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7427; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 7428; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7429; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 7430; X64-NEXT: vzeroupper 7431; X64-NEXT: retq 7432entry: 7433 %0 = bitcast i8 %__M to <8 x i1> 7434 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> zeroinitializer 7435 %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7436 %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7437 %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i 7438 %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7439 %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7440 %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i 7441 %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7442 %add7.i = fadd <2 x double> %add4.i, %shuffle6.i 7443 %vecext.i = extractelement <2 x double> %add7.i, i32 0 7444 ret double %vecext.i 7445} 7446 7447define double @test_mm512_mask_reduce_mul_pd(i8 zeroext %__M, <8 x double> %__W) { 7448; X86-LABEL: test_mm512_mask_reduce_mul_pd: 7449; X86: # %bb.0: # %entry 7450; X86-NEXT: pushl %ebp 7451; X86-NEXT: .cfi_def_cfa_offset 8 7452; X86-NEXT: .cfi_offset %ebp, -8 7453; X86-NEXT: movl %esp, %ebp 7454; X86-NEXT: .cfi_def_cfa_register %ebp 7455; X86-NEXT: andl $-8, %esp 7456; X86-NEXT: subl $8, %esp 7457; X86-NEXT: movb 8(%ebp), %al 7458; X86-NEXT: kmovw %eax, %k1 7459; X86-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1] 7460; X86-NEXT: vmovapd %zmm0, %zmm1 {%k1} 7461; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 7462; X86-NEXT: vmulpd %ymm0, %ymm1, %ymm0 7463; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7464; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0 7465; X86-NEXT: 
vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7466; X86-NEXT: vmulpd %xmm1, %xmm0, %xmm0 7467; X86-NEXT: vmovlpd %xmm0, (%esp) 7468; X86-NEXT: fldl (%esp) 7469; X86-NEXT: movl %ebp, %esp 7470; X86-NEXT: popl %ebp 7471; X86-NEXT: .cfi_def_cfa %esp, 4 7472; X86-NEXT: vzeroupper 7473; X86-NEXT: retl 7474; 7475; X64-LABEL: test_mm512_mask_reduce_mul_pd: 7476; X64: # %bb.0: # %entry 7477; X64-NEXT: kmovw %edi, %k1 7478; X64-NEXT: vbroadcastsd {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1] 7479; X64-NEXT: vmovapd %zmm0, %zmm1 {%k1} 7480; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0 7481; X64-NEXT: vmulpd %ymm0, %ymm1, %ymm0 7482; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7483; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0 7484; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7485; X64-NEXT: vmulpd %xmm1, %xmm0, %xmm0 7486; X64-NEXT: vzeroupper 7487; X64-NEXT: retq 7488entry: 7489 %0 = bitcast i8 %__M to <8 x i1> 7490 %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00> 7491 %shuffle.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7492 %shuffle1.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7493 %mul.i = fmul <4 x double> %shuffle.i, %shuffle1.i 7494 %shuffle2.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7495 %shuffle3.i = shufflevector <4 x double> %mul.i, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7496 %mul4.i = fmul <2 x double> %shuffle2.i, %shuffle3.i 7497 %shuffle6.i = shufflevector <2 x double> %mul4.i, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7498 %mul7.i = fmul <2 x double> %mul4.i, %shuffle6.i 7499 %vecext.i = extractelement <2 x double> %mul7.i, i32 0 7500 ret double %vecext.i 7501} 7502 7503define float @test_mm512_mask_reduce_add_ps(i16 zeroext %__M, <16 x float> %__W) { 7504; X86-LABEL: 
test_mm512_mask_reduce_add_ps: 7505; X86: # %bb.0: # %entry 7506; X86-NEXT: pushl %eax 7507; X86-NEXT: .cfi_def_cfa_offset 8 7508; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 7509; X86-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} 7510; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7511; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 7512; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7513; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 7514; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7515; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 7516; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] 7517; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 7518; X86-NEXT: vmovss %xmm0, (%esp) 7519; X86-NEXT: flds (%esp) 7520; X86-NEXT: popl %eax 7521; X86-NEXT: .cfi_def_cfa_offset 4 7522; X86-NEXT: vzeroupper 7523; X86-NEXT: retl 7524; 7525; X64-LABEL: test_mm512_mask_reduce_add_ps: 7526; X64: # %bb.0: # %entry 7527; X64-NEXT: kmovw %edi, %k1 7528; X64-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} 7529; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7530; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 7531; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7532; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 7533; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7534; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 7535; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] 7536; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 7537; X64-NEXT: vzeroupper 7538; X64-NEXT: retq 7539entry: 7540 %0 = bitcast i16 %__M to <16 x i1> 7541 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> zeroinitializer 7542 %2 = bitcast <16 x float> %1 to <8 x double> 7543 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7544 %3 = bitcast <4 x double> %extract.i to <8 x float> 7545 %extract3.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7546 %4 = bitcast <4 x double> %extract3.i to <8 x float> 7547 %add.i = fadd <8 x float> %3, %4 7548 %extract4.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 
2, i32 3> 7549 %extract5.i = shufflevector <8 x float> %add.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7550 %add6.i = fadd <4 x float> %extract4.i, %extract5.i 7551 %shuffle.i = shufflevector <4 x float> %add6.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7552 %add7.i = fadd <4 x float> %add6.i, %shuffle.i 7553 %shuffle8.i = shufflevector <4 x float> %add7.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 7554 %add9.i = fadd <4 x float> %add7.i, %shuffle8.i 7555 %vecext.i = extractelement <4 x float> %add9.i, i32 0 7556 ret float %vecext.i 7557} 7558 7559define float @test_mm512_mask_reduce_mul_ps(i16 zeroext %__M, <16 x float> %__W) { 7560; X86-LABEL: test_mm512_mask_reduce_mul_ps: 7561; X86: # %bb.0: # %entry 7562; X86-NEXT: pushl %eax 7563; X86-NEXT: .cfi_def_cfa_offset 8 7564; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 7565; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 7566; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1} 7567; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 7568; X86-NEXT: vmulps %ymm0, %ymm1, %ymm0 7569; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7570; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 7571; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7572; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 7573; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] 7574; X86-NEXT: vmulps %xmm1, %xmm0, %xmm0 7575; X86-NEXT: vmovss %xmm0, (%esp) 7576; X86-NEXT: flds (%esp) 7577; X86-NEXT: popl %eax 7578; X86-NEXT: .cfi_def_cfa_offset 4 7579; X86-NEXT: vzeroupper 7580; X86-NEXT: retl 7581; 7582; X64-LABEL: test_mm512_mask_reduce_mul_ps: 7583; X64: # %bb.0: # %entry 7584; X64-NEXT: kmovw %edi, %k1 7585; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] 7586; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1} 7587; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0 7588; X64-NEXT: vmulps %ymm0, %ymm1, %ymm0 7589; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7590; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 7591; X64-NEXT: vpermilpd {{.*#+}} 
xmm1 = xmm0[1,0] 7592; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 7593; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] 7594; X64-NEXT: vmulps %xmm1, %xmm0, %xmm0 7595; X64-NEXT: vzeroupper 7596; X64-NEXT: retq 7597entry: 7598 %0 = bitcast i16 %__M to <16 x i1> 7599 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> 7600 %2 = bitcast <16 x float> %1 to <8 x double> 7601 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7602 %3 = bitcast <4 x double> %extract.i to <8 x float> 7603 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7604 %4 = bitcast <4 x double> %extract4.i to <8 x float> 7605 %mul.i = fmul <8 x float> %3, %4 7606 %extract5.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7607 %extract6.i = shufflevector <8 x float> %mul.i, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7608 %mul7.i = fmul <4 x float> %extract5.i, %extract6.i 7609 %shuffle.i = shufflevector <4 x float> %mul7.i, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 7610 %mul8.i = fmul <4 x float> %mul7.i, %shuffle.i 7611 %shuffle9.i = shufflevector <4 x float> %mul8.i, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 7612 %mul10.i = fmul <4 x float> %mul8.i, %shuffle9.i 7613 %vecext.i = extractelement <4 x float> %mul10.i, i32 0 7614 ret float %vecext.i 7615} 7616 7617define i64 @test_mm512_reduce_max_epi64(<8 x i64> %__W) { 7618; X86-LABEL: test_mm512_reduce_max_epi64: 7619; X86: # %bb.0: # %entry 7620; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7621; X86-NEXT: vpmaxsq %zmm0, 
%zmm1, %zmm0 7622; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7623; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7624; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7625; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7626; X86-NEXT: vmovd %xmm0, %eax 7627; X86-NEXT: vpextrd $1, %xmm0, %edx 7628; X86-NEXT: vzeroupper 7629; X86-NEXT: retl 7630; 7631; X64-LABEL: test_mm512_reduce_max_epi64: 7632; X64: # %bb.0: # %entry 7633; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7634; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 7635; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7636; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7637; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7638; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7639; X64-NEXT: vmovq %xmm0, %rax 7640; X64-NEXT: vzeroupper 7641; X64-NEXT: retq 7642entry: 7643 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7644 %0 = icmp slt <8 x i64> %shuffle.i, %__W 7645 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7646 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7647 %2 = icmp sgt <8 x i64> %1, %shuffle1.i 7648 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7649 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7650 %4 = icmp sgt <8 x i64> %3, %shuffle3.i 7651 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7652 %vecext.i = extractelement <8 x i64> %5, i32 0 7653 ret i64 %vecext.i 7654} 7655 7656define i64 @test_mm512_reduce_max_epu64(<8 x i64> %__W) { 7657; X86-LABEL: test_mm512_reduce_max_epu64: 7658; X86: # %bb.0: # %entry 7659; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7660; X86-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 7661; X86-NEXT: vpermq {{.*#+}} zmm1 = 
zmm0[2,3,0,1,6,7,4,5] 7662; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7663; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7664; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7665; X86-NEXT: vmovd %xmm0, %eax 7666; X86-NEXT: vpextrd $1, %xmm0, %edx 7667; X86-NEXT: vzeroupper 7668; X86-NEXT: retl 7669; 7670; X64-LABEL: test_mm512_reduce_max_epu64: 7671; X64: # %bb.0: # %entry 7672; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7673; X64-NEXT: vpmaxuq %zmm0, %zmm1, %zmm0 7674; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7675; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7676; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7677; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7678; X64-NEXT: vmovq %xmm0, %rax 7679; X64-NEXT: vzeroupper 7680; X64-NEXT: retq 7681entry: 7682 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7683 %0 = icmp ult <8 x i64> %shuffle.i, %__W 7684 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7685 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7686 %2 = icmp ugt <8 x i64> %1, %shuffle1.i 7687 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7688 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7689 %4 = icmp ugt <8 x i64> %3, %shuffle3.i 7690 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7691 %vecext.i = extractelement <8 x i64> %5, i32 0 7692 ret i64 %vecext.i 7693} 7694 7695define double @test_mm512_reduce_max_pd(<8 x double> %__W) { 7696; X86-LABEL: test_mm512_reduce_max_pd: 7697; X86: # %bb.0: # %entry 7698; X86-NEXT: pushl %ebp 7699; X86-NEXT: .cfi_def_cfa_offset 8 7700; X86-NEXT: .cfi_offset %ebp, -8 7701; X86-NEXT: movl %esp, %ebp 7702; X86-NEXT: .cfi_def_cfa_register %ebp 7703; X86-NEXT: andl $-8, %esp 7704; 
X86-NEXT: subl $8, %esp 7705; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7706; X86-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 7707; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7708; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7709; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7710; X86-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7711; X86-NEXT: vmovlpd %xmm0, (%esp) 7712; X86-NEXT: fldl (%esp) 7713; X86-NEXT: movl %ebp, %esp 7714; X86-NEXT: popl %ebp 7715; X86-NEXT: .cfi_def_cfa %esp, 4 7716; X86-NEXT: vzeroupper 7717; X86-NEXT: retl 7718; 7719; X64-LABEL: test_mm512_reduce_max_pd: 7720; X64: # %bb.0: # %entry 7721; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7722; X64-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 7723; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7724; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7725; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7726; X64-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 7727; X64-NEXT: vzeroupper 7728; X64-NEXT: retq 7729entry: 7730 %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7731 %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7732 %0 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i) 7733 %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7734 %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7735 %1 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract4.i, <2 x double> %extract5.i) 7736 %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7737 %2 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %shuffle.i) 7738 %vecext.i = extractelement <2 x double> %2, i32 0 7739 ret double %vecext.i 7740} 7741 7742define i64 @test_mm512_reduce_min_epi64(<8 x i64> %__W) { 7743; X86-LABEL: test_mm512_reduce_min_epi64: 7744; X86: # %bb.0: # %entry 7745; X86-NEXT: vshufi64x2 
{{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7746; X86-NEXT: vpminsq %zmm0, %zmm1, %zmm0 7747; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7748; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7749; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7750; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7751; X86-NEXT: vmovd %xmm0, %eax 7752; X86-NEXT: vpextrd $1, %xmm0, %edx 7753; X86-NEXT: vzeroupper 7754; X86-NEXT: retl 7755; 7756; X64-LABEL: test_mm512_reduce_min_epi64: 7757; X64: # %bb.0: # %entry 7758; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7759; X64-NEXT: vpminsq %zmm0, %zmm1, %zmm0 7760; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7761; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7762; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7763; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm0 7764; X64-NEXT: vmovq %xmm0, %rax 7765; X64-NEXT: vzeroupper 7766; X64-NEXT: retq 7767entry: 7768 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7769 %0 = icmp sgt <8 x i64> %shuffle.i, %__W 7770 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7771 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7772 %2 = icmp slt <8 x i64> %1, %shuffle1.i 7773 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7774 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7775 %4 = icmp slt <8 x i64> %3, %shuffle3.i 7776 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7777 %vecext.i = extractelement <8 x i64> %5, i32 0 7778 ret i64 %vecext.i 7779} 7780 7781define i64 @test_mm512_reduce_min_epu64(<8 x i64> %__W) { 7782; X86-LABEL: test_mm512_reduce_min_epu64: 7783; X86: # %bb.0: # %entry 7784; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7785; X86-NEXT: vpminuq 
%zmm0, %zmm1, %zmm0 7786; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7787; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7788; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7789; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7790; X86-NEXT: vmovd %xmm0, %eax 7791; X86-NEXT: vpextrd $1, %xmm0, %edx 7792; X86-NEXT: vzeroupper 7793; X86-NEXT: retl 7794; 7795; X64-LABEL: test_mm512_reduce_min_epu64: 7796; X64: # %bb.0: # %entry 7797; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7798; X64-NEXT: vpminuq %zmm0, %zmm1, %zmm0 7799; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7800; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7801; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7802; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm0 7803; X64-NEXT: vmovq %xmm0, %rax 7804; X64-NEXT: vzeroupper 7805; X64-NEXT: retq 7806entry: 7807 %shuffle.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3> 7808 %0 = icmp ugt <8 x i64> %shuffle.i, %__W 7809 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> %shuffle.i 7810 %shuffle1.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7811 %2 = icmp ult <8 x i64> %1, %shuffle1.i 7812 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle1.i 7813 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7814 %4 = icmp ult <8 x i64> %3, %shuffle3.i 7815 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7816 %vecext.i = extractelement <8 x i64> %5, i32 0 7817 ret i64 %vecext.i 7818} 7819 7820define double @test_mm512_reduce_min_pd(<8 x double> %__W) { 7821; X86-LABEL: test_mm512_reduce_min_pd: 7822; X86: # %bb.0: # %entry 7823; X86-NEXT: pushl %ebp 7824; X86-NEXT: .cfi_def_cfa_offset 8 7825; X86-NEXT: .cfi_offset %ebp, -8 7826; X86-NEXT: movl %esp, %ebp 7827; X86-NEXT: 
.cfi_def_cfa_register %ebp 7828; X86-NEXT: andl $-8, %esp 7829; X86-NEXT: subl $8, %esp 7830; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7831; X86-NEXT: vminpd %ymm1, %ymm0, %ymm0 7832; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 7833; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 7834; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7835; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 7836; X86-NEXT: vmovlpd %xmm0, (%esp) 7837; X86-NEXT: fldl (%esp) 7838; X86-NEXT: movl %ebp, %esp 7839; X86-NEXT: popl %ebp 7840; X86-NEXT: .cfi_def_cfa %esp, 4 7841; X86-NEXT: vzeroupper 7842; X86-NEXT: retl 7843; 7844; X64-LABEL: test_mm512_reduce_min_pd: 7845; X64: # %bb.0: # %entry 7846; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm1 7847; X64-NEXT: vminpd %ymm1, %ymm0, %ymm0 7848; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 7849; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0 7850; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 7851; X64-NEXT: vminpd %xmm1, %xmm0, %xmm0 7852; X64-NEXT: vzeroupper 7853; X64-NEXT: retq 7854entry: 7855 %extract.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 7856 %extract2.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 7857 %0 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract2.i) 7858 %extract4.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 1> 7859 %extract5.i = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 2, i32 3> 7860 %1 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract4.i, <2 x double> %extract5.i) 7861 %shuffle.i = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> <i32 1, i32 0> 7862 %2 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %shuffle.i) 7863 %vecext.i = extractelement <2 x double> %2, i32 0 7864 ret double %vecext.i 7865} 7866 7867define i64 @test_mm512_mask_reduce_max_epi64(i8 zeroext %__M, <8 x i64> %__W) { 7868; X86-LABEL: 
test_mm512_mask_reduce_max_epi64: 7869; X86: # %bb.0: # %entry 7870; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7871; X86-NEXT: kmovw %eax, %k1 7872; X86-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648,0,2147483648] 7873; X86-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 7874; X86-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] 7875; X86-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 7876; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7877; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7878; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7879; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7880; X86-NEXT: vmovd %xmm0, %eax 7881; X86-NEXT: vpextrd $1, %xmm0, %edx 7882; X86-NEXT: vzeroupper 7883; X86-NEXT: retl 7884; 7885; X64-LABEL: test_mm512_mask_reduce_max_epi64: 7886; X64: # %bb.0: # %entry 7887; X64-NEXT: kmovw %edi, %k1 7888; X64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] 7889; X64-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} 7890; X64-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3] 7891; X64-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 7892; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7893; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7894; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7895; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 7896; X64-NEXT: vmovq %xmm0, %rax 7897; X64-NEXT: vzeroupper 7898; X64-NEXT: retq 7899entry: 7900 %0 = bitcast i8 %__M to <8 x i1> 7901 %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808, i64 -9223372036854775808> 7902 %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, 
i32 7, i32 0, i32 1, i32 2, i32 3> 7903 %2 = icmp sgt <8 x i64> %1, %shuffle.i 7904 %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i 7905 %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> 7906 %4 = icmp sgt <8 x i64> %3, %shuffle3.i 7907 %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i 7908 %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6> 7909 %6 = icmp sgt <8 x i64> %5, %shuffle5.i 7910 %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i 7911 %vecext.i = extractelement <8 x i64> %7, i32 0 7912 ret i64 %vecext.i 7913} 7914 7915define i64 @test_mm512_mask_reduce_max_epu64(i8 zeroext %__M, <8 x i64> %__W) { 7916; X86-LABEL: test_mm512_mask_reduce_max_epu64: 7917; X86: # %bb.0: # %entry 7918; X86-NEXT: movb {{[0-9]+}}(%esp), %al 7919; X86-NEXT: kmovw %eax, %k1 7920; X86-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} 7921; X86-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7922; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7923; X86-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7924; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7925; X86-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7926; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7927; X86-NEXT: vmovd %xmm0, %eax 7928; X86-NEXT: vpextrd $1, %xmm0, %edx 7929; X86-NEXT: vzeroupper 7930; X86-NEXT: retl 7931; 7932; X64-LABEL: test_mm512_mask_reduce_max_epu64: 7933; X64: # %bb.0: # %entry 7934; X64-NEXT: kmovw %edi, %k1 7935; X64-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} 7936; X64-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,2,3] 7937; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7938; X64-NEXT: vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5] 7939; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7940; X64-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] 7941; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 7942; X64-NEXT: vmovq 
%xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> zeroinitializer
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ugt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle2.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ugt <8 x i64> %3, %shuffle2.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle2.i
  %shuffle4.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ugt <8 x i64> %5, %shuffle4.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle4.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

; Masked max-reduction of <8 x double>: masked-off lanes default to -Inf, then
; three shuffle/max steps (halves, quarters, adjacent pairs) via the AVX/SSE2
; max intrinsics; lane 0 holds the result.
define double @test_mm512_mask_reduce_max_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmaxpd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000, double 0xFFF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i) #3
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %extract6.i, <2 x double> %extract7.i) #3
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %3, <2 x double> %shuffle.i) #3
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

; Masked signed-min reduction of <8 x i64>: masked-off lanes default to
; INT64_MAX, then three shuffle/icmp-slt/select steps; lane 0 holds the result.
define i64 @test_mm512_mask_reduce_min_epi64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647,4294967295,2147483647]
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807,9223372036854775807]
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminsq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807, i64 9223372036854775807>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp slt <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp slt <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp slt <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

; Masked unsigned-min reduction of <8 x i64>: masked-off lanes default to
; UINT64_MAX (all-ones), then three shuffle/icmp-ult/select steps.
define i64 @test_mm512_mask_reduce_min_epu64(i8 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X86-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X86-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X86-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vpextrd $1, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,0,1,2,3]
; X64-NEXT:    vpminuq %zmm0, %zmm1, %zmm0
; X64-NEXT:    vpermq {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5]
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; X64-NEXT:    vpminuq %zmm1, %zmm0, %zmm0
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x i64> %__W, <8 x i64> <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
  %shuffle.i = shufflevector <8 x i64> %1, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>
  %2 = icmp ult <8 x i64> %1, %shuffle.i
  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %shuffle.i
  %shuffle3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5>
  %4 = icmp ult <8 x i64> %3, %shuffle3.i
  %5 = select <8 x i1> %4, <8 x i64> %3, <8 x i64> %shuffle3.i
  %shuffle5.i = shufflevector <8 x i64> %5, <8 x i64> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %6 = icmp ult <8 x i64> %5, %shuffle5.i
  %7 = select <8 x i1> %6, <8 x i64> %5, <8 x i64> %shuffle5.i
  %vecext.i = extractelement <8 x i64> %7, i32 0
  ret i64 %vecext.i
}

; Masked min-reduction of <8 x double>: masked-off lanes default to +Inf, then
; halves/quarters/pairs steps via the AVX/SSE2 min intrinsics.
define double @test_mm512_mask_reduce_min_pd(i8 zeroext %__M, <8 x double> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_pd:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    .cfi_def_cfa_register %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    movb 8(%ebp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X86-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vminpd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa %esp, 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_pd:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf]
; X64-NEXT:    vmovapd %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vminpd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i8 %__M to <8 x i1>
  %1 = select <8 x i1> %0, <8 x double> %__W, <8 x double> <double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000, double 0x7FF0000000000000>
  %extract.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x double> %1, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = tail call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %extract.i, <4 x double> %extract4.i)
  %extract6.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x double> %2, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %3 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %extract6.i, <2 x double> %extract7.i)
  %shuffle.i = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> <i32 1, i32 0>
  %4 = tail call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %3, <2 x double> %shuffle.i)
  %vecext.i = extractelement <2 x double> %4, i32 0
  ret double %vecext.i
}

; Unmasked signed-max reduction of 16 x i32 (viewed through <8 x i64>):
; halves/quarters, then two in-register shuffle steps with icmp-sgt/select.
define i32 @test_mm512_reduce_max_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp sgt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp sgt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp sgt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp sgt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

; Unmasked unsigned-max reduction of 16 x i32 — same shape as the signed
; variant but with icmp ugt / vpmaxud.
define i32 @test_mm512_reduce_max_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_max_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ugt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ugt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ugt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ugt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

; Unmasked max-reduction of <16 x float> via the AVX/SSE max intrinsics,
; splitting through <8 x double> bitcasts for the 256-bit halves.
define float @test_mm512_reduce_max_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}

; Unmasked signed-min reduction of 16 x i32 — mirror of reduce_max_epi32 with
; icmp slt / vpminsd.
define i32 @test_mm512_reduce_min_epi32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp slt <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp slt <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp slt <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp slt <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

; Unmasked unsigned-min reduction of 16 x i32 — icmp ult / vpminud.
define i32 @test_mm512_reduce_min_epu32(<8 x i64> %__W) {
; CHECK-LABEL: test_mm512_reduce_min_epu32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; CHECK-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovd %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %extract.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract2.i = shufflevector <8 x i64> %__W, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %0 = bitcast <4 x i64> %extract.i to <8 x i32>
  %1 = bitcast <4 x i64> %extract2.i to <8 x i32>
  %2 = icmp ult <8 x i32> %0, %1
  %3 = select <8 x i1> %2, <8 x i32> %0, <8 x i32> %1
  %4 = bitcast <8 x i32> %3 to <4 x i64>
  %extract4.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract5.i = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %5 = bitcast <2 x i64> %extract4.i to <4 x i32>
  %6 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %7 = icmp ult <4 x i32> %5, %6
  %8 = select <4 x i1> %7, <4 x i32> %5, <4 x i32> %6
  %shuffle.i = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %9 = icmp ult <4 x i32> %8, %shuffle.i
  %10 = select <4 x i1> %9, <4 x i32> %8, <4 x i32> %shuffle.i
  %shuffle8.i = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %11 = icmp ult <4 x i32> %10, %shuffle8.i
  %12 = select <4 x i1> %11, <4 x i32> %10, <4 x i32> %shuffle8.i
  %vecext.i = extractelement <4 x i32> %12, i32 0
  ret i32 %vecext.i
}

; Unmasked min-reduction of <16 x float> via the AVX/SSE min intrinsics.
define float @test_mm512_reduce_min_ps(<16 x float> %__W) {
; X86-LABEL: test_mm512_reduce_min_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_reduce_min_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vminps %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vminps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <16 x float> %__W to <8 x double>
  %extract.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %1 = bitcast <4 x double> %extract.i to <8 x float>
  %extract2.i = shufflevector <8 x double> %0, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %2 = bitcast <4 x double> %extract2.i to <8 x float>
  %3 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %1, <8 x float> %2)
  %extract4.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract5.i = shufflevector <8 x float> %3, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract4.i, <4 x float> %extract5.i)
  %shuffle.i = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %5 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %4, <4 x float> %shuffle.i)
  %shuffle8.i = shufflevector <4 x float> %5, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %5, <4 x float> %shuffle8.i)
  %vecext.i = extractelement <4 x float> %6, i32 0
  ret float %vecext.i
}

; Masked signed-max reduction of 16 x i32: masked-off lanes default to
; INT32_MIN, then the same halves/quarters/shuffle reduction as the unmasked
; variant.
define i32 @test_mm512_mask_reduce_max_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpmaxsd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp sgt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp sgt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp sgt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp sgt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

; Masked unsigned-max reduction of 16 x i32: masked-off lanes default to 0
; (zeroing move), then halves/quarters/shuffle steps with icmp ugt.
define i32 @test_mm512_mask_reduce_max_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X86-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; X64-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> zeroinitializer
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract3.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract3.i to <8 x i32>
  %6 = icmp ugt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract5.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract5.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %11 = icmp ugt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ugt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle9.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ugt <4 x i32> %14, %shuffle9.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle9.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

; Masked max-reduction of <16 x float>: masked-off lanes default to -Inf, then
; the AVX/SSE max-intrinsic reduction ladder.
define float @test_mm512_mask_reduce_max_ps(i16 zeroext %__M, <16 x float> %__W) {
; X86-LABEL: test_mm512_mask_reduce_max_ps:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %eax
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X86-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vmaxps %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_max_ps:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vbroadcastss {{.*#+}} zmm1 = [-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf,-Inf]
; X64-NEXT:    vmovaps %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vmaxps %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast i16 %__M to <16 x i1>
  %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000, float 0xFFF0000000000000>
  %2 = bitcast <16 x float> %1 to <8 x double>
  %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = bitcast <4 x double> %extract.i to <8 x float>
  %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x double> %extract4.i to <8 x float>
  %5 = tail call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %3, <8 x float> %4)
  %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %6 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %extract6.i, <4 x float> %extract7.i)
  %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %7 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %6, <4 x float> %shuffle.i)
  %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %8 = tail call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %7, <4 x float> %shuffle10.i)
  %vecext.i = extractelement <4 x float> %8, i32 0
  ret float %vecext.i
}

; Masked signed-min reduction of 16 x i32: masked-off lanes default to
; INT32_MAX, then halves/quarters/shuffle steps with icmp slt.
define i32 @test_mm512_mask_reduce_min_epi32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpminsd %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpbroadcastd {{.*#+}} zmm1 = [2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647,2147483647]
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpminsd %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp slt <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp slt <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp slt <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp slt <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

; Masked unsigned-min reduction of 16 x i32: masked-off lanes default to
; UINT32_MAX (all-ones), then halves/quarters/shuffle steps with icmp ult.
define i32 @test_mm512_mask_reduce_min_epu32(i16 zeroext %__M, <8 x i64> %__W) {
; X86-LABEL: test_mm512_mask_reduce_min_epu32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X86-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X86-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X86-NEXT:    vpminud %ymm0, %ymm1, %ymm0
; X86-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X86-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_reduce_min_epu32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1
; X64-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
; X64-NEXT:    vextracti64x4 $1, %zmm1, %ymm0
; X64-NEXT:    vpminud %ymm0, %ymm1, %ymm0
; X64-NEXT:    vextracti128 $1, %ymm0, %xmm1
; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; X64-NEXT:    vpminud %xmm1, %xmm0, %xmm0
; X64-NEXT:    vmovd %xmm0, %eax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__W to <16 x i32>
  %1 = bitcast i16 %__M to <16 x i1>
  %2 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  %extract.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %extract4.i = shufflevector <8 x i64> %3, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = bitcast <4 x i64> %extract.i to <8 x i32>
  %5 = bitcast <4 x i64> %extract4.i to <8 x i32>
  %6 = icmp ult <8 x i32> %4, %5
  %7 = select <8 x i1> %6, <8 x i32> %4, <8 x i32> %5
  %8 = bitcast <8 x i32> %7 to <4 x i64>
  %extract6.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %extract7.i = shufflevector <4 x i64> %8, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %9 = bitcast <2 x i64> %extract6.i to <4 x i32>
  %10 = bitcast <2 x i64> %extract7.i to <4 x i32>
  %11 = icmp ult <4 x i32> %9, %10
  %12 = select <4 x i1> %11, <4 x i32> %9, <4 x i32> %10
  %shuffle.i = shufflevector <4 x i32> %12, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %13 = icmp ult <4 x i32> %12, %shuffle.i
  %14 = select <4 x i1> %13, <4 x i32> %12, <4 x i32> %shuffle.i
  %shuffle10.i = shufflevector <4 x i32> %14, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %15 = icmp ult <4 x i32> %14, %shuffle10.i
  %16 = select <4 x i1> %15, <4 x i32> %14, <4 x i32> %shuffle10.i
  %vecext.i = extractelement <4 x i32> %16, i32 0
  ret i32 %vecext.i
}

define float @test_mm512_mask_reduce_min_ps(i16 zeroext %__M,
<16 x float> %__W) { 8726; X86-LABEL: test_mm512_mask_reduce_min_ps: 8727; X86: # %bb.0: # %entry 8728; X86-NEXT: pushl %eax 8729; X86-NEXT: .cfi_def_cfa_offset 8 8730; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 8731; X86-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf] 8732; X86-NEXT: vmovaps %zmm0, %zmm1 {%k1} 8733; X86-NEXT: vextractf64x4 $1, %zmm1, %ymm0 8734; X86-NEXT: vminps %ymm0, %ymm1, %ymm0 8735; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 8736; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 8737; X86-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 8738; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 8739; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] 8740; X86-NEXT: vminps %xmm1, %xmm0, %xmm0 8741; X86-NEXT: vmovss %xmm0, (%esp) 8742; X86-NEXT: flds (%esp) 8743; X86-NEXT: popl %eax 8744; X86-NEXT: .cfi_def_cfa_offset 4 8745; X86-NEXT: vzeroupper 8746; X86-NEXT: retl 8747; 8748; X64-LABEL: test_mm512_mask_reduce_min_ps: 8749; X64: # %bb.0: # %entry 8750; X64-NEXT: kmovw %edi, %k1 8751; X64-NEXT: vbroadcastss {{.*#+}} zmm1 = [+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf,+Inf] 8752; X64-NEXT: vmovaps %zmm0, %zmm1 {%k1} 8753; X64-NEXT: vextractf64x4 $1, %zmm1, %ymm0 8754; X64-NEXT: vminps %ymm0, %ymm1, %ymm0 8755; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 8756; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 8757; X64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] 8758; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 8759; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,0,3,2] 8760; X64-NEXT: vminps %xmm1, %xmm0, %xmm0 8761; X64-NEXT: vzeroupper 8762; X64-NEXT: retq 8763entry: 8764 %0 = bitcast i16 %__M to <16 x i1> 8765 %1 = select <16 x i1> %0, <16 x float> %__W, <16 x float> <float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 
0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000, float 0x7FF0000000000000> 8766 %2 = bitcast <16 x float> %1 to <8 x double> 8767 %extract.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8768 %3 = bitcast <4 x double> %extract.i to <8 x float> 8769 %extract4.i = shufflevector <8 x double> %2, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8770 %4 = bitcast <4 x double> %extract4.i to <8 x float> 8771 %5 = tail call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %3, <8 x float> %4) 8772 %extract6.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 8773 %extract7.i = shufflevector <8 x float> %5, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> 8774 %6 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %extract6.i, <4 x float> %extract7.i) 8775 %shuffle.i = shufflevector <4 x float> %6, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> 8776 %7 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %6, <4 x float> %shuffle.i) 8777 %shuffle10.i = shufflevector <4 x float> %7, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> 8778 %8 = tail call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %7, <4 x float> %shuffle10.i) 8779 %vecext.i = extractelement <4 x float> %8, i32 0 8780 ret float %vecext.i 8781} 8782 8783define <8 x double> @test_mm512_mask_max_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 8784; X86-LABEL: test_mm512_mask_max_pd: 8785; X86: # %bb.0: # %entry 8786; X86-NEXT: movb {{[0-9]+}}(%esp), %al 8787; X86-NEXT: kmovw %eax, %k1 8788; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} 8789; X86-NEXT: retl 8790; 8791; X64-LABEL: test_mm512_mask_max_pd: 8792; X64: # %bb.0: # %entry 8793; X64-NEXT: kmovw %edi, %k1 8794; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} 8795; X64-NEXT: retq 8796entry: 8797 %0 = tail 
call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 8798 %1 = bitcast i8 %__U to <8 x i1> 8799 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 8800 ret <8 x double> %2 8801} 8802 8803define <8 x double> @test_mm512_maskz_max_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 8804; X86-LABEL: test_mm512_maskz_max_pd: 8805; X86: # %bb.0: # %entry 8806; X86-NEXT: movb {{[0-9]+}}(%esp), %al 8807; X86-NEXT: kmovw %eax, %k1 8808; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} 8809; X86-NEXT: retl 8810; 8811; X64-LABEL: test_mm512_maskz_max_pd: 8812; X64: # %bb.0: # %entry 8813; X64-NEXT: kmovw %edi, %k1 8814; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} 8815; X64-NEXT: retq 8816entry: 8817 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 8818 %1 = bitcast i8 %__U to <8 x i1> 8819 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 8820 ret <8 x double> %2 8821} 8822 8823define <16 x float> @test_mm512_mask_max_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 8824; X86-LABEL: test_mm512_mask_max_ps: 8825; X86: # %bb.0: # %entry 8826; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 8827; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} 8828; X86-NEXT: retl 8829; 8830; X64-LABEL: test_mm512_mask_max_ps: 8831; X64: # %bb.0: # %entry 8832; X64-NEXT: kmovw %edi, %k1 8833; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} 8834; X64-NEXT: retq 8835entry: 8836 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 8837 %1 = bitcast i16 %__U to <16 x i1> 8838 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 8839 ret <16 x float> %2 8840} 8841 8842define <8 x double> @test_mm512_mask_max_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 8843; X86-LABEL: test_mm512_mask_max_round_pd: 8844; X86: # %bb.0: # %entry 8845; X86-NEXT: movb {{[0-9]+}}(%esp), 
%al 8846; X86-NEXT: kmovw %eax, %k1 8847; X86-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} 8848; X86-NEXT: retl 8849; 8850; X64-LABEL: test_mm512_mask_max_round_pd: 8851; X64: # %bb.0: # %entry 8852; X64-NEXT: kmovw %edi, %k1 8853; X64-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} 8854; X64-NEXT: retq 8855entry: 8856 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 8857 %1 = bitcast i8 %__U to <8 x i1> 8858 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 8859 ret <8 x double> %2 8860} 8861 8862declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) 8863 8864define <8 x double> @test_mm512_maskz_max_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 8865; X86-LABEL: test_mm512_maskz_max_round_pd: 8866; X86: # %bb.0: # %entry 8867; X86-NEXT: movb {{[0-9]+}}(%esp), %al 8868; X86-NEXT: kmovw %eax, %k1 8869; X86-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} 8870; X86-NEXT: retl 8871; 8872; X64-LABEL: test_mm512_maskz_max_round_pd: 8873; X64: # %bb.0: # %entry 8874; X64-NEXT: kmovw %edi, %k1 8875; X64-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 {%k1} {z} 8876; X64-NEXT: retq 8877entry: 8878 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 8879 %1 = bitcast i8 %__U to <8 x i1> 8880 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 8881 ret <8 x double> %2 8882} 8883 8884define <8 x double> @test_mm512_max_round_pd(<8 x double> %__A, <8 x double> %__B) { 8885; CHECK-LABEL: test_mm512_max_round_pd: 8886; CHECK: # %bb.0: # %entry 8887; CHECK-NEXT: vmaxpd %zmm1, %zmm0, %zmm0 8888; CHECK-NEXT: ret{{[l|q]}} 8889entry: 8890 %0 = tail call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 8891 ret <8 x double> %0 8892} 8893 8894define <16 x float> @test_mm512_maskz_max_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 8895; X86-LABEL: test_mm512_maskz_max_ps: 8896; X86: # %bb.0: # 
%entry 8897; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 8898; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} 8899; X86-NEXT: retl 8900; 8901; X64-LABEL: test_mm512_maskz_max_ps: 8902; X64: # %bb.0: # %entry 8903; X64-NEXT: kmovw %edi, %k1 8904; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} 8905; X64-NEXT: retq 8906entry: 8907 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 8908 %1 = bitcast i16 %__U to <16 x i1> 8909 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 8910 ret <16 x float> %2 8911} 8912 8913define <16 x float> @test_mm512_mask_max_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 8914; X86-LABEL: test_mm512_mask_max_round_ps: 8915; X86: # %bb.0: # %entry 8916; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 8917; X86-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} 8918; X86-NEXT: retl 8919; 8920; X64-LABEL: test_mm512_mask_max_round_ps: 8921; X64: # %bb.0: # %entry 8922; X64-NEXT: kmovw %edi, %k1 8923; X64-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} 8924; X64-NEXT: retq 8925entry: 8926 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 8927 %1 = bitcast i16 %__U to <16 x i1> 8928 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 8929 ret <16 x float> %2 8930} 8931 8932declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) 8933 8934define <16 x float> @test_mm512_maskz_max_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 8935; X86-LABEL: test_mm512_maskz_max_round_ps: 8936; X86: # %bb.0: # %entry 8937; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 8938; X86-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} 8939; X86-NEXT: retl 8940; 8941; X64-LABEL: test_mm512_maskz_max_round_ps: 8942; X64: # %bb.0: # %entry 8943; X64-NEXT: kmovw %edi, %k1 8944; X64-NEXT: vmaxps %zmm1, %zmm0, %zmm0 {%k1} {z} 8945; X64-NEXT: retq 8946entry: 8947 %0 = tail call <16 x float> 
@llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 8948 %1 = bitcast i16 %__U to <16 x i1> 8949 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 8950 ret <16 x float> %2 8951} 8952 8953define <16 x float> @test_mm512_max_round_ps(<16 x float> %__A, <16 x float> %__B) { 8954; CHECK-LABEL: test_mm512_max_round_ps: 8955; CHECK: # %bb.0: # %entry 8956; CHECK-NEXT: vmaxps %zmm1, %zmm0, %zmm0 8957; CHECK-NEXT: ret{{[l|q]}} 8958entry: 8959 %0 = tail call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 8960 ret <16 x float> %0 8961} 8962 8963define <8 x double> @test_mm512_mask_min_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 8964; X86-LABEL: test_mm512_mask_min_pd: 8965; X86: # %bb.0: # %entry 8966; X86-NEXT: movb {{[0-9]+}}(%esp), %al 8967; X86-NEXT: kmovw %eax, %k1 8968; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} 8969; X86-NEXT: retl 8970; 8971; X64-LABEL: test_mm512_mask_min_pd: 8972; X64: # %bb.0: # %entry 8973; X64-NEXT: kmovw %edi, %k1 8974; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} 8975; X64-NEXT: retq 8976entry: 8977 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 8978 %1 = bitcast i8 %__U to <8 x i1> 8979 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 8980 ret <8 x double> %2 8981} 8982 8983define <8 x double> @test_mm512_maskz_min_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 8984; X86-LABEL: test_mm512_maskz_min_pd: 8985; X86: # %bb.0: # %entry 8986; X86-NEXT: movb {{[0-9]+}}(%esp), %al 8987; X86-NEXT: kmovw %eax, %k1 8988; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} 8989; X86-NEXT: retl 8990; 8991; X64-LABEL: test_mm512_maskz_min_pd: 8992; X64: # %bb.0: # %entry 8993; X64-NEXT: kmovw %edi, %k1 8994; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} 8995; X64-NEXT: retq 8996entry: 8997 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x 
double> %__B, i32 4) 8998 %1 = bitcast i8 %__U to <8 x i1> 8999 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 9000 ret <8 x double> %2 9001} 9002 9003define <8 x double> @test_mm512_mask_min_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 9004; X86-LABEL: test_mm512_mask_min_round_pd: 9005; X86: # %bb.0: # %entry 9006; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9007; X86-NEXT: kmovw %eax, %k1 9008; X86-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} 9009; X86-NEXT: retl 9010; 9011; X64-LABEL: test_mm512_mask_min_round_pd: 9012; X64: # %bb.0: # %entry 9013; X64-NEXT: kmovw %edi, %k1 9014; X64-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} 9015; X64-NEXT: retq 9016entry: 9017 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 9018 %1 = bitcast i8 %__U to <8 x i1> 9019 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 9020 ret <8 x double> %2 9021} 9022 9023declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) 9024 9025define <8 x double> @test_mm512_maskz_min_round_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { 9026; X86-LABEL: test_mm512_maskz_min_round_pd: 9027; X86: # %bb.0: # %entry 9028; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9029; X86-NEXT: kmovw %eax, %k1 9030; X86-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} 9031; X86-NEXT: retl 9032; 9033; X64-LABEL: test_mm512_maskz_min_round_pd: 9034; X64: # %bb.0: # %entry 9035; X64-NEXT: kmovw %edi, %k1 9036; X64-NEXT: vminpd %zmm1, %zmm0, %zmm0 {%k1} {z} 9037; X64-NEXT: retq 9038entry: 9039 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 9040 %1 = bitcast i8 %__U to <8 x i1> 9041 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 9042 ret <8 x double> %2 9043} 9044 9045define <8 x double> @test_mm512_min_round_pd(<8 x double> %__A, <8 x double> %__B) { 9046; CHECK-LABEL: test_mm512_min_round_pd: 9047; CHECK: # 
%bb.0: # %entry 9048; CHECK-NEXT: vminpd %zmm1, %zmm0, %zmm0 9049; CHECK-NEXT: ret{{[l|q]}} 9050entry: 9051 %0 = tail call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %__A, <8 x double> %__B, i32 4) 9052 ret <8 x double> %0 9053} 9054 9055define <16 x float> @test_mm512_mask_min_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 9056; X86-LABEL: test_mm512_mask_min_ps: 9057; X86: # %bb.0: # %entry 9058; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9059; X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} 9060; X86-NEXT: retl 9061; 9062; X64-LABEL: test_mm512_mask_min_ps: 9063; X64: # %bb.0: # %entry 9064; X64-NEXT: kmovw %edi, %k1 9065; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} 9066; X64-NEXT: retq 9067entry: 9068 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9069 %1 = bitcast i16 %__U to <16 x i1> 9070 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 9071 ret <16 x float> %2 9072} 9073 9074define <16 x float> @test_mm512_maskz_min_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 9075; X86-LABEL: test_mm512_maskz_min_ps: 9076; X86: # %bb.0: # %entry 9077; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9078; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} 9079; X86-NEXT: retl 9080; 9081; X64-LABEL: test_mm512_maskz_min_ps: 9082; X64: # %bb.0: # %entry 9083; X64-NEXT: kmovw %edi, %k1 9084; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} 9085; X64-NEXT: retq 9086entry: 9087 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9088 %1 = bitcast i16 %__U to <16 x i1> 9089 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 9090 ret <16 x float> %2 9091} 9092 9093define <16 x float> @test_mm512_mask_min_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 9094; X86-LABEL: test_mm512_mask_min_round_ps: 9095; X86: # %bb.0: # %entry 9096; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9097; 
X86-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} 9098; X86-NEXT: retl 9099; 9100; X64-LABEL: test_mm512_mask_min_round_ps: 9101; X64: # %bb.0: # %entry 9102; X64-NEXT: kmovw %edi, %k1 9103; X64-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} 9104; X64-NEXT: retq 9105entry: 9106 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9107 %1 = bitcast i16 %__U to <16 x i1> 9108 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 9109 ret <16 x float> %2 9110} 9111 9112declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) 9113 9114define <16 x float> @test_mm512_maskz_min_round_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { 9115; X86-LABEL: test_mm512_maskz_min_round_ps: 9116; X86: # %bb.0: # %entry 9117; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9118; X86-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} 9119; X86-NEXT: retl 9120; 9121; X64-LABEL: test_mm512_maskz_min_round_ps: 9122; X64: # %bb.0: # %entry 9123; X64-NEXT: kmovw %edi, %k1 9124; X64-NEXT: vminps %zmm1, %zmm0, %zmm0 {%k1} {z} 9125; X64-NEXT: retq 9126entry: 9127 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9128 %1 = bitcast i16 %__U to <16 x i1> 9129 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 9130 ret <16 x float> %2 9131} 9132 9133define <16 x float> @test_mm512_min_round_ps(<16 x float> %__A, <16 x float> %__B) { 9134; CHECK-LABEL: test_mm512_min_round_ps: 9135; CHECK: # %bb.0: # %entry 9136; CHECK-NEXT: vminps %zmm1, %zmm0, %zmm0 9137; CHECK-NEXT: ret{{[l|q]}} 9138entry: 9139 %0 = tail call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %__A, <16 x float> %__B, i32 4) 9140 ret <16 x float> %0 9141} 9142 9143define <8 x double> @test_mm512_sqrt_pd(<8 x double> %a) { 9144; CHECK-LABEL: test_mm512_sqrt_pd: 9145; CHECK: # %bb.0: # %entry 9146; CHECK-NEXT: vsqrtpd %zmm0, %zmm0 9147; CHECK-NEXT: ret{{[l|q]}} 9148entry: 9149 %0 = tail call <8 x 
double> @llvm.sqrt.v8f64(<8 x double> %a) 9150 ret <8 x double> %0 9151} 9152 9153define <8 x double> @test_mm512_mask_sqrt_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) { 9154; X86-LABEL: test_mm512_mask_sqrt_pd: 9155; X86: # %bb.0: # %entry 9156; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9157; X86-NEXT: kmovw %eax, %k1 9158; X86-NEXT: vsqrtpd %zmm1, %zmm0 {%k1} 9159; X86-NEXT: retl 9160; 9161; X64-LABEL: test_mm512_mask_sqrt_pd: 9162; X64: # %bb.0: # %entry 9163; X64-NEXT: kmovw %edi, %k1 9164; X64-NEXT: vsqrtpd %zmm1, %zmm0 {%k1} 9165; X64-NEXT: retq 9166entry: 9167 %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A) 9168 %1 = bitcast i8 %__U to <8 x i1> 9169 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 9170 ret <8 x double> %2 9171} 9172 9173define <8 x double> @test_mm512_maskz_sqrt_pd(i8 zeroext %__U, <8 x double> %__A) { 9174; X86-LABEL: test_mm512_maskz_sqrt_pd: 9175; X86: # %bb.0: # %entry 9176; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9177; X86-NEXT: kmovw %eax, %k1 9178; X86-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z} 9179; X86-NEXT: retl 9180; 9181; X64-LABEL: test_mm512_maskz_sqrt_pd: 9182; X64: # %bb.0: # %entry 9183; X64-NEXT: kmovw %edi, %k1 9184; X64-NEXT: vsqrtpd %zmm0, %zmm0 {%k1} {z} 9185; X64-NEXT: retq 9186entry: 9187 %0 = tail call <8 x double> @llvm.sqrt.v8f64(<8 x double> %__A) 9188 %1 = bitcast i8 %__U to <8 x i1> 9189 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 9190 ret <8 x double> %2 9191} 9192 9193define <8 x double> @test_mm512_mask_sqrt_round_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A) { 9194; X86-LABEL: test_mm512_mask_sqrt_round_pd: 9195; X86: # %bb.0: # %entry 9196; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9197; X86-NEXT: kmovw %eax, %k1 9198; X86-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 {%k1} 9199; X86-NEXT: retl 9200; 9201; X64-LABEL: test_mm512_mask_sqrt_round_pd: 9202; X64: # %bb.0: # %entry 9203; X64-NEXT: kmovw %edi, %k1 9204; X64-NEXT: vsqrtpd {rn-sae}, %zmm1, %zmm0 
{%k1} 9205; X64-NEXT: retq 9206entry: 9207 %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8) 9208 %1 = bitcast i8 %__U to <8 x i1> 9209 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> %__W 9210 ret <8 x double> %2 9211} 9212 9213declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) 9214 9215define <8 x double> @test_mm512_maskz_sqrt_round_pd(i8 zeroext %__U, <8 x double> %__A) { 9216; X86-LABEL: test_mm512_maskz_sqrt_round_pd: 9217; X86: # %bb.0: # %entry 9218; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9219; X86-NEXT: kmovw %eax, %k1 9220; X86-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z} 9221; X86-NEXT: retl 9222; 9223; X64-LABEL: test_mm512_maskz_sqrt_round_pd: 9224; X64: # %bb.0: # %entry 9225; X64-NEXT: kmovw %edi, %k1 9226; X64-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 {%k1} {z} 9227; X64-NEXT: retq 9228entry: 9229 %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8) 9230 %1 = bitcast i8 %__U to <8 x i1> 9231 %2 = select <8 x i1> %1, <8 x double> %0, <8 x double> zeroinitializer 9232 ret <8 x double> %2 9233} 9234 9235define <8 x double> @test_mm512_sqrt_round_pd(<8 x double> %__A) { 9236; CHECK-LABEL: test_mm512_sqrt_round_pd: 9237; CHECK: # %bb.0: # %entry 9238; CHECK-NEXT: vsqrtpd {rn-sae}, %zmm0, %zmm0 9239; CHECK-NEXT: ret{{[l|q]}} 9240entry: 9241 %0 = tail call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %__A, i32 8) 9242 ret <8 x double> %0 9243} 9244 9245define <16 x float> @test_mm512_sqrt_ps(<16 x float> %a) { 9246; CHECK-LABEL: test_mm512_sqrt_ps: 9247; CHECK: # %bb.0: # %entry 9248; CHECK-NEXT: vsqrtps %zmm0, %zmm0 9249; CHECK-NEXT: ret{{[l|q]}} 9250entry: 9251 %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a) 9252 ret <16 x float> %0 9253} 9254 9255define <16 x float> @test_mm512_mask_sqrt_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) { 9256; X86-LABEL: test_mm512_mask_sqrt_ps: 9257; X86: # %bb.0: # %entry 9258; X86-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 9259; X86-NEXT: vsqrtps %zmm1, %zmm0 {%k1} 9260; X86-NEXT: retl 9261; 9262; X64-LABEL: test_mm512_mask_sqrt_ps: 9263; X64: # %bb.0: # %entry 9264; X64-NEXT: kmovw %edi, %k1 9265; X64-NEXT: vsqrtps %zmm1, %zmm0 {%k1} 9266; X64-NEXT: retq 9267entry: 9268 %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A) 9269 %1 = bitcast i16 %__U to <16 x i1> 9270 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 9271 ret <16 x float> %2 9272} 9273 9274define <16 x float> @test_mm512_maskz_sqrt_ps(i16 zeroext %__U, <16 x float> %__A) { 9275; X86-LABEL: test_mm512_maskz_sqrt_ps: 9276; X86: # %bb.0: # %entry 9277; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9278; X86-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z} 9279; X86-NEXT: retl 9280; 9281; X64-LABEL: test_mm512_maskz_sqrt_ps: 9282; X64: # %bb.0: # %entry 9283; X64-NEXT: kmovw %edi, %k1 9284; X64-NEXT: vsqrtps %zmm0, %zmm0 {%k1} {z} 9285; X64-NEXT: retq 9286entry: 9287 %0 = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %__A) 9288 %1 = bitcast i16 %__U to <16 x i1> 9289 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 9290 ret <16 x float> %2 9291} 9292 9293define <16 x float> @test_mm512_mask_sqrt_round_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A) { 9294; X86-LABEL: test_mm512_mask_sqrt_round_ps: 9295; X86: # %bb.0: # %entry 9296; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9297; X86-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1} 9298; X86-NEXT: retl 9299; 9300; X64-LABEL: test_mm512_mask_sqrt_round_ps: 9301; X64: # %bb.0: # %entry 9302; X64-NEXT: kmovw %edi, %k1 9303; X64-NEXT: vsqrtps {rn-sae}, %zmm1, %zmm0 {%k1} 9304; X64-NEXT: retq 9305entry: 9306 %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8) 9307 %1 = bitcast i16 %__U to <16 x i1> 9308 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %__W 9309 ret <16 x float> %2 9310} 9311 9312declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) 9313 9314define <16 x 
float> @test_mm512_maskz_sqrt_round_ps(i16 zeroext %__U, <16 x float> %__A) { 9315; X86-LABEL: test_mm512_maskz_sqrt_round_ps: 9316; X86: # %bb.0: # %entry 9317; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9318; X86-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z} 9319; X86-NEXT: retl 9320; 9321; X64-LABEL: test_mm512_maskz_sqrt_round_ps: 9322; X64: # %bb.0: # %entry 9323; X64-NEXT: kmovw %edi, %k1 9324; X64-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 {%k1} {z} 9325; X64-NEXT: retq 9326entry: 9327 %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8) 9328 %1 = bitcast i16 %__U to <16 x i1> 9329 %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer 9330 ret <16 x float> %2 9331} 9332 9333define <16 x float> @test_mm512_sqrt_round_ps(<16 x float> %__A) { 9334; CHECK-LABEL: test_mm512_sqrt_round_ps: 9335; CHECK: # %bb.0: # %entry 9336; CHECK-NEXT: vsqrtps {rn-sae}, %zmm0, %zmm0 9337; CHECK-NEXT: ret{{[l|q]}} 9338entry: 9339 %0 = tail call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %__A, i32 8) 9340 ret <16 x float> %0 9341} 9342 9343define <8 x i64> @test_mm512_rol_epi32(<8 x i64> %__A) local_unnamed_addr #0 { 9344; CHECK-LABEL: test_mm512_rol_epi32: 9345; CHECK: # %bb.0: # %entry 9346; CHECK-NEXT: vprold $5, %zmm0, %zmm0 9347; CHECK-NEXT: ret{{[l|q]}} 9348entry: 9349 %0 = bitcast <8 x i64> %__A to <16 x i32> 9350 %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5) 9351 %2 = bitcast <16 x i32> %1 to <8 x i64> 9352 ret <8 x i64> %2 9353} 9354 9355declare <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32>, i32) #1 9356 9357define <8 x i64> @test_mm512_mask_rol_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) { 9358; X86-LABEL: test_mm512_mask_rol_epi32: 9359; X86: # %bb.0: # %entry 9360; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9361; X86-NEXT: vprold $5, %zmm1, %zmm0 {%k1} 9362; X86-NEXT: retl 9363; 9364; X64-LABEL: test_mm512_mask_rol_epi32: 9365; X64: # %bb.0: # %entry 9366; X64-NEXT: kmovw %edi, 
%k1 9367; X64-NEXT: vprold $5, %zmm1, %zmm0 {%k1} 9368; X64-NEXT: retq 9369entry: 9370 %0 = bitcast <8 x i64> %__A to <16 x i32> 9371 %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5) 9372 %2 = bitcast <8 x i64> %__W to <16 x i32> 9373 %3 = bitcast i16 %__U to <16 x i1> 9374 %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2 9375 %5 = bitcast <16 x i32> %4 to <8 x i64> 9376 ret <8 x i64> %5 9377} 9378 9379define <8 x i64> @test_mm512_maskz_rol_epi32(i16 zeroext %__U, <8 x i64> %__A) { 9380; X86-LABEL: test_mm512_maskz_rol_epi32: 9381; X86: # %bb.0: # %entry 9382; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 9383; X86-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z} 9384; X86-NEXT: retl 9385; 9386; X64-LABEL: test_mm512_maskz_rol_epi32: 9387; X64: # %bb.0: # %entry 9388; X64-NEXT: kmovw %edi, %k1 9389; X64-NEXT: vprold $5, %zmm0, %zmm0 {%k1} {z} 9390; X64-NEXT: retq 9391entry: 9392 %0 = bitcast <8 x i64> %__A to <16 x i32> 9393 %1 = tail call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %0, i32 5) 9394 %2 = bitcast i16 %__U to <16 x i1> 9395 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer 9396 %4 = bitcast <16 x i32> %3 to <8 x i64> 9397 ret <8 x i64> %4 9398} 9399 9400define <8 x i64> @test_mm512_rol_epi64(<8 x i64> %__A) { 9401; CHECK-LABEL: test_mm512_rol_epi64: 9402; CHECK: # %bb.0: # %entry 9403; CHECK-NEXT: vprolq $5, %zmm0, %zmm0 9404; CHECK-NEXT: ret{{[l|q]}} 9405entry: 9406 %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5) 9407 ret <8 x i64> %0 9408} 9409 9410declare <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64>, i32) #1 9411 9412define <8 x i64> @test_mm512_mask_rol_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) { 9413; X86-LABEL: test_mm512_mask_rol_epi64: 9414; X86: # %bb.0: # %entry 9415; X86-NEXT: movb {{[0-9]+}}(%esp), %al 9416; X86-NEXT: kmovw %eax, %k1 9417; X86-NEXT: vprolq $5, %zmm1, %zmm0 {%k1} 9418; X86-NEXT: retl 9419; 9420; X64-LABEL: test_mm512_mask_rol_epi64: 9421; 
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

; NOTE(review): the CHECK/X86/X64 lines in this file are autogenerated by
; utils/update_llc_test_checks.py (see file header) — regenerate them with that
; script instead of hand-editing if the expected codegen changes.

define <8 x i64> @test_mm512_maskz_rol_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_rol_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rol_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

; Variable rotate-left, 32-bit lanes (vprolvd): plain, merge-masked (select
; against %__W) and zero-masked (select against zeroinitializer) forms.

define <8 x i64> @test_mm512_rolv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rolv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rolv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

; Variable rotate-left, 64-bit lanes (vprolvq).

define <8 x i64> @test_mm512_rolv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rolv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rolv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rolv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rolv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rolv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprolvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

; Rotate-right by immediate (vprord / vprorq), same plain/mask/maskz pattern.

define <8 x i64> @test_mm512_ror_epi32(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprord $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5)
  %2 = bitcast <16 x i32> %1 to <8 x i64>
  ret <8 x i64> %2
}

declare <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32>, i32) #1

define <8 x i64> @test_mm512_mask_ror_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5)
  %2 = bitcast <8 x i64> %__W to <16 x i32>
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %1, <16 x i32> %2
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_maskz_ror_epi32(i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ror_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprord $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = tail call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %0, i32 5)
  %2 = bitcast i16 %__U to <16 x i1>
  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
  %4 = bitcast <16 x i32> %3 to <8 x i64>
  ret <8 x i64> %4
}

define <8 x i64> @test_mm512_ror_epi64(<8 x i64> %__A) {
; CHECK-LABEL: test_mm512_ror_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorq $5, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5)
  ret <8 x i64> %0
}

declare <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64>, i32) #1

define <8 x i64> @test_mm512_mask_ror_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_ror_epi64(i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_maskz_ror_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_ror_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorq $5, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %__A, i32 5)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

; Variable rotate-right (vprorvd / vprorvq).

define <8 x i64> @test_mm512_rorv_epi32(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <16 x i32> %2 to <8 x i64>
  ret <8 x i64> %3
}

define <8 x i64> @test_mm512_mask_rorv_epi32(<8 x i64> %__W, i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast <8 x i64> %__W to <16 x i32>
  %4 = bitcast i16 %__U to <16 x i1>
  %5 = select <16 x i1> %4, <16 x i32> %2, <16 x i32> %3
  %6 = bitcast <16 x i32> %5 to <8 x i64>
  ret <8 x i64> %6
}

define <8 x i64> @test_mm512_maskz_rorv_epi32(i16 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi32:
; X86:       # %bb.0: # %entry
; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi32:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = bitcast <8 x i64> %__A to <16 x i32>
  %1 = bitcast <8 x i64> %__B to <16 x i32>
  %2 = tail call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %0, <16 x i32> %1)
  %3 = bitcast i16 %__U to <16 x i1>
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = bitcast <16 x i32> %4 to <8 x i64>
  ret <8 x i64> %5
}

define <8 x i64> @test_mm512_rorv_epi64(<8 x i64> %__A, <8 x i64> %__B) {
; CHECK-LABEL: test_mm512_rorv_epi64:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  ret <8 x i64> %0
}

define <8 x i64> @test_mm512_mask_rorv_epi64(<8 x i64> %__W, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_mask_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_mask_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %zmm2, %zmm1, %zmm0 {%k1}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
  ret <8 x i64> %2
}

define <8 x i64> @test_mm512_maskz_rorv_epi64(i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
; X86-LABEL: test_mm512_maskz_rorv_epi64:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-NEXT:    kmovw %eax, %k1
; X86-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X86-NEXT:    retl
;
; X64-LABEL: test_mm512_maskz_rorv_epi64:
; X64:       # %bb.0: # %entry
; X64-NEXT:    kmovw %edi, %k1
; X64-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
; X64-NEXT:    retq
entry:
  %0 = tail call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %__A, <8 x i64> %__B)
  %1 = bitcast i8 %__U to <8 x i1>
  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
  ret <8 x i64> %2
}

; Intrinsic declarations referenced by the tests in this file.

declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>)
declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>)
declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>)
declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>)
declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>)
declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>)

!0 = !{i32 1}