; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR

; Check lowering of the llvm.masked.load / llvm.masked.store intrinsics:
; AVX-512 (knl) uses a single masked move, AVX2 uses vmaskmov/vpmaskmov plus a
; blend when a non-undef passthru value must be preserved, and on plain AVX
; (corei7-avx) CodeGenPrepare scalarizes masked operations the target does not
; support.

; AVX512-LABEL: test1
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}

; AVX2-LABEL: test1
; AVX2: vpmaskmovd 32(%rdi)
; AVX2: vpmaskmovd (%rdi)
; AVX2-NOT: blend

; AVX_SCALAR-LABEL: test1
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
; AVX_SCALAR: extractelement
; AVX_SCALAR: insertelement
define <16 x i32> @test1(<16 x i32> %trigger, <16 x i32>* %addr) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1> %mask, <16 x i32> undef)
  ret <16 x i32> %res
}

; AVX512-LABEL: test2
; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}

; AVX2-LABEL: test2
; AVX2: vpmaskmovd {{.*}}(%rdi)
; AVX2: vpmaskmovd {{.*}}(%rdi)
; AVX2-NOT: blend
define <16 x i32> @test2(<16 x i32> %trigger, <16 x i32>* %addr) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x i32> @llvm.masked.load.v16i32(<16 x i32>* %addr, i32 4, <16 x i1> %mask, <16 x i32> zeroinitializer)
  ret <16 x i32> %res
}

; AVX512-LABEL: test3
; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}

; AVX_SCALAR-LABEL: test3
; AVX_SCALAR-NOT: masked
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
; AVX_SCALAR: extractelement
; AVX_SCALAR: store
define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16i32(<16 x i32> %val, <16 x i32>* %addr, i32 4, <16 x i1> %mask)
  ret void
}

; AVX512-LABEL: test4
; AVX512: vmovups (%rdi), %zmm{{.*{%k[1-7]}}}

; AVX2-LABEL: test4
; AVX2: vmaskmovps {{.*}}(%rdi)
; AVX2: vmaskmovps {{.*}}(%rdi)
; AVX2: blend
define <16 x float> @test4(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %dst) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  %res = call <16 x float> @llvm.masked.load.v16f32(<16 x float>* %addr, i32 4, <16 x i1> %mask, <16 x float> %dst)
  ret <16 x float> %res
}

; AVX512-LABEL: test5
; AVX512: vmovupd (%rdi), %zmm1 {%k1}

; AVX2-LABEL: test5
; AVX2: vmaskmovpd
; AVX2: vblendvpd
; AVX2: vmaskmovpd
; AVX2: vblendvpd
define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> %dst) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x double> @llvm.masked.load.v8f64(<8 x double>* %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
  ret <8 x double> %res
}

; AVX2-LABEL: test6
; AVX2: vmaskmovpd
; AVX2: vblendvpd
define <2 x double> @test6(<2 x i64> %trigger, <2 x double>* %addr, <2 x double> %dst) {
  %mask = icmp eq <2 x i64> %trigger, zeroinitializer
  %res = call <2 x double> @llvm.masked.load.v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
  ret <2 x double> %res
}

; AVX2-LABEL: test7
; AVX2: vmaskmovps {{.*}}(%rdi)
; AVX2: blend
define <4 x float> @test7(<4 x i32> %trigger, <4 x float>* %addr, <4 x float> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %addr, i32 4, <4 x i1> %mask, <4 x float> %dst)
  ret <4 x float> %res
}

; AVX2-LABEL: test8
; AVX2: vpmaskmovd {{.*}}(%rdi)
; AVX2: blend
define <4 x i32> @test8(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
  ret <4 x i32> %res
}

; AVX2-LABEL: test9
; AVX2: vpmaskmovd %xmm
define void @test9(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
  ret void
}

; AVX2-LABEL: test10
; AVX2: vmaskmovpd (%rdi), %ymm
; AVX2: blend
define <4 x double> @test10(<4 x i32> %trigger, <4 x double>* %addr, <4 x double> %dst) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  %res = call <4 x double> @llvm.masked.load.v4f64(<4 x double>* %addr, i32 4, <4 x i1> %mask, <4 x double> %dst)
  ret <4 x double> %res
}

; AVX2-LABEL: test11
; AVX2: vmaskmovps
; AVX2: vblendvps
define <8 x float> @test11(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %dst) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  %res = call <8 x float> @llvm.masked.load.v8f32(<8 x float>* %addr, i32 4, <8 x i1> %mask, <8 x float> %dst)
  ret <8 x float> %res
}

; AVX2-LABEL: test12
; AVX2: vpmaskmovd %ymm
define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
  %mask = icmp eq <8 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v8i32(<8 x i32> %val, <8 x i32>* %addr, i32 4, <8 x i1> %mask)
  ret void
}

; AVX512-LABEL: test13
; AVX512: vmovups %zmm1, (%rdi) {%k1}

define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val) {
  %mask = icmp eq <16 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v16f32(<16 x float> %val, <16 x float>* %addr, i32 4, <16 x i1> %mask)
  ret void
}

; AVX2-LABEL: test14
; AVX2: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX2: vmaskmovps
define void @test14(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
  ret void
}

; AVX2-LABEL: test15
; AVX2: vpmaskmovd
define void @test15(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  call void @llvm.masked.store.v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
  ret void
}

; AVX2-LABEL: test16
; AVX2: vmaskmovps
; AVX2: vblendvps
define <2 x float> @test16(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
  ret <2 x float> %res
}

; AVX2-LABEL: test17
; AVX2: vpmaskmovd
; AVX2: vblendvps
; AVX2: vpmovsxdq
define <2 x i32> @test17(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
  ret <2 x i32> %res
}

; AVX2-LABEL: test18
; AVX2: vmaskmovps
; AVX2-NOT: blend
define <2 x float> @test18(<2 x i32> %trigger, <2 x float>* %addr) {
  %mask = icmp eq <2 x i32> %trigger, zeroinitializer
  %res = call <2 x float> @llvm.masked.load.v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> undef)
  ret <2 x float> %res
}

declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
declare void @llvm.masked.store.v16i32(<16 x i32>, <16 x i32>*, i32, <16 x i1>)
declare void @llvm.masked.store.v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>)
declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)
declare <8 x float> @llvm.masked.load.v8f32(<8 x float>*, i32, <8 x i1>, <8 x float>)
declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
declare <2 x float> @llvm.masked.load.v2f32(<2 x float>*, i32, <2 x i1>, <2 x float>)
declare <8 x double> @llvm.masked.load.v8f64(<8 x double>*, i32, <8 x i1>, <8 x double>)
declare <4 x double> @llvm.masked.load.v4f64(<4 x double>*, i32, <4 x i1>, <4 x double>)
declare <2 x double> @llvm.masked.load.v2f64(<2 x double>*, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.store.v8f64(<8 x double>, <8 x double>*, i32, <8 x i1>)
declare void @llvm.masked.store.v2f64(<2 x double>, <2 x double>*, i32, <2 x i1>)
declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)