; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s

; 256-bit

define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpeq.b.256(<32 x i8>, <32 x i8>, i32)

define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16>, <16 x i16>, i16)

define i32 @test_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_256
; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 -1)
  ret i32 %res
}

define i32 @test_mask_pcmpgt_b_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_256
; CHECK: vpcmpgtb %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
  ret i32 %res
}

declare i32 @llvm.x86.avx512.mask.pcmpgt.b.256(<32 x i8>, <32 x i8>, i32)

define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_256
; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_256
; CHECK: vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)

define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_cmp_b_256
; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone

define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_256
; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_256
; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
  ret <8 x i32> %vec7
}

declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone

define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_w_256
; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone

define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_256
; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_256
; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone

; 128-bit

define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8>, <16 x i8>, i16)

define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpeq_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16>, <8 x i16>, i8)

define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: test_pcmpgt_b_128
; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
  ret i16 %res
}

define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_b_128
; CHECK: vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
  ret i16 %res
}

declare i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8>, <16 x i8>, i16)

define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: test_pcmpgt_w_128
; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
  ret i8 %res
}

define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
; CHECK-LABEL: test_mask_pcmpgt_w_128
; CHECK: vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ##
  %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
  ret i8 %res
}

declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)

define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_cmp_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_cmp_b_128
; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone

define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: test_ucmp_b_128
; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
; CHECK-LABEL: test_mask_ucmp_b_128
; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
  ret <8 x i16> %vec7
}

declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone

define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_cmp_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_cmp_w_128
; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone

define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_ucmp_w_128
; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
; CHECK-LABEL: test_mask_ucmp_w_128
; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
  ret <8 x i8> %vec7
}

declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone

declare <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone

define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd256_ps
; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa8,0xc2]
  %res = call <8 x float> @llvm.x86.fma.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
  ret <8 x float> %res
}

declare <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone

define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
; CHECK-LABEL: test_mask_vfmadd128_ps
; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2]
  %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
  ret <4 x float> %res
}

declare <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)

define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
; CHECK-LABEL: test_mask_fmadd256_pd:
; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0
{%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2] 638 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) 639 ret <4 x double> %res 640} 641 642declare <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) 643 644define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) { 645; CHECK-LABEL: test_mask_fmadd128_pd: 646; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2] 647 %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) 648 ret <2 x double> %res 649} 650 651declare <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone 652 653define <8 x float> @test_mask_vfmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { 654 ; CHECK-LABEL: test_mask_vfmsub256_ps 655 ; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xaa,0xc2] 656 %res = call <8 x float> @llvm.x86.fma.mask.vfmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind 657 ret <8 x float> %res 658} 659 660declare <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone 661 662define <4 x float> @test_mask_vfmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { 663 ; CHECK-LABEL: test_mask_vfmsub128_ps 664 ; CHECK: vfmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xaa,0xc2] 665 %res = call <4 x float> @llvm.x86.fma.mask.vfmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind 666 ret <4 x float> %res 667} 668 669declare <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone 670 671define <4 x double> @test_mask_vfmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x 
double> %a2, i8 %mask) { 672 ; CHECK-LABEL: test_mask_vfmsub256_pd 673 ; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xaa,0xc2] 674 %res = call <4 x double> @llvm.x86.fma.mask.vfmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 675 ret <4 x double> %res 676} 677 678declare <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone 679 680define <2 x double> @test_mask_vfmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { 681 ; CHECK-LABEL: test_mask_vfmsub128_pd 682 ; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xaa,0xc2] 683 %res = call <2 x double> @llvm.x86.fma.mask.vfmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 684 ret <2 x double> %res 685} 686 687declare <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone 688 689define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { 690 ; CHECK-LABEL: test_mask_vfnmadd256_ps 691 ; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xac,0xc2] 692 %res = call <8 x float> @llvm.x86.fma.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind 693 ret <8 x float> %res 694} 695 696declare <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone 697 698define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { 699 ; CHECK-LABEL: test_mask_vfnmadd128_ps 700 ; CHECK: vfnmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xac,0xc2] 701 %res = call <4 x float> @llvm.x86.fma.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind 702 ret <4 x float> %res 703} 704 705declare <4 x double> 
@llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone 706 707define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { 708 ; CHECK-LABEL: test_mask_vfnmadd256_pd 709 ; CHECK: vfnmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xac,0xc2] 710 %res = call <4 x double> @llvm.x86.fma.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 711 ret <4 x double> %res 712} 713 714declare <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone 715 716define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { 717 ; CHECK-LABEL: test_mask_vfnmadd128_pd 718 ; CHECK: vfnmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xac,0xc2] 719 %res = call <2 x double> @llvm.x86.fma.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 720 ret <2 x double> %res 721} 722 723declare <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone 724 725define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { 726 ; CHECK-LABEL: test_mask_vfnmsub256_ps 727 ; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xae,0xc2] 728 %res = call <8 x float> @llvm.x86.fma.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind 729 ret <8 x float> %res 730} 731 732declare <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone 733 734define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { 735 ; CHECK-LABEL: test_mask_vfnmsub128_ps 736 ; CHECK: vfnmsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xae,0xc2] 737 %res = 
call <4 x float> @llvm.x86.fma.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind 738 ret <4 x float> %res 739} 740 741declare <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone 742 743define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { 744 ; CHECK-LABEL: test_mask_vfnmsub256_pd 745 ; CHECK: vfnmsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xae,0xc2] 746 %res = call <4 x double> @llvm.x86.fma.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 747 ret <4 x double> %res 748} 749 750declare <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone 751 752define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { 753 ; CHECK-LABEL: test_mask_vfnmsub128_pd 754 ; CHECK: vfnmsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xae,0xc2] 755 %res = call <2 x double> @llvm.x86.fma.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 756 ret <2 x double> %res 757} 758 759declare <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone 760 761define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) { 762; CHECK-LABEL: test_mask_fmaddsub256_ps: 763; CHECK: vfmaddsub213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa6,0xc2] 764 %res = call <8 x float> @llvm.x86.fma.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) 765 ret <8 x float> %res 766} 767 768declare <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone 769 770define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x 
float> %c, i8 %mask) { 771; CHECK-LABEL: test_mask_fmaddsub128_ps: 772; CHECK: vfmaddsub213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa6,0xc2] 773 %res = call <4 x float> @llvm.x86.fma.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) 774 ret <4 x float> %res 775} 776 777declare <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone 778 779define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { 780 ; CHECK-LABEL: test_mask_vfmaddsub256_pd 781 ; CHECK: vfmaddsub213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa6,0xc2] 782 %res = call <4 x double> @llvm.x86.fma.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 783 ret <4 x double> %res 784} 785 786declare <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone 787 788define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { 789 ; CHECK-LABEL: test_mask_vfmaddsub128_pd 790 ; CHECK: vfmaddsub213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa6,0xc2] 791 %res = call <2 x double> @llvm.x86.fma.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 792 ret <2 x double> %res 793} 794 795declare <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone 796 797define <8 x float> @test_mask_vfmsubadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) { 798 ; CHECK-LABEL: test_mask_vfmsubadd256_ps 799 ; CHECK: vfmsubadd213ps %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0xa7,0xc2] 800 %res = call <8 x float> @llvm.x86.fma.mask.vfmsubadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind 801 ret <8 x float> %res 802} 803 
804declare <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone 805 806define <4 x float> @test_mask_vfmsubadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { 807 ; CHECK-LABEL: test_mask_vfmsubadd128_ps 808 ; CHECK: vfmsubadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa7,0xc2] 809 %res = call <4 x float> @llvm.x86.fma.mask.vfmsubadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind 810 ret <4 x float> %res 811} 812 813declare <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone 814 815define <4 x double> @test_mask_vfmsubadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { 816 ; CHECK-LABEL: test_mask_vfmsubadd256_pd 817 ; CHECK: vfmsubadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa7,0xc2] 818 %res = call <4 x double> @llvm.x86.fma.mask.vfmsubadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 819 ret <4 x double> %res 820} 821declare <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone 822 823define <2 x double> @test_mask_vfmsubadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { 824 ; CHECK-LABEL: test_mask_vfmsubadd128_pd 825 ; CHECK: vfmsubadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0xc2] 826 %res = call <2 x double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 827 ret <2 x double> %res 828} 829 830define <2 x double> @test_mask_vfmsubadd128rm_pd(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) { 831 ; CHECK-LABEL: test_mask_vfmsubadd128rm_pd 832 ; CHECK: vfmsubadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa7,0x07] 833 %a2 = load <2 x double>* %ptr_a2 834 %res = call <2 x 
double> @llvm.x86.fma.mask.vfmsubadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 835 ret <2 x double> %res 836} 837declare <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) nounwind readnone 838define <8 x double> @test_mask_vfmsubaddrm_pd(<8 x double> %a0, <8 x double> %a1, <8 x double>* %ptr_a2, i8 %mask) { 839 ; CHECK-LABEL: test_mask_vfmsubaddrm_pd 840 ; CHECK: vfmsubadd213pd (%rdi), %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0xa7,0x07] 841 %a2 = load <8 x double>* %ptr_a2, align 8 842 %res = call <8 x double> @llvm.x86.fma.mask.vfmsubadd.pd.512(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2, i8 %mask, i32 4) nounwind 843 ret <8 x double> %res 844} 845 846define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) { 847 ; CHECK-LABEL: test_mask_vfmadd128_ps_r 848 ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0xc2] 849 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind 850 ret <4 x float> %res 851} 852 853define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { 854 ; CHECK-LABEL: test_mask_vfmadd128_ps_rz 855 ; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0xa8,0xc2] 856 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind 857 ret <4 x float> %res 858} 859 860define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) { 861 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmk 862 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] 863 %a2 = load <4 x float>* %ptr_a2 864 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 
%mask) nounwind 865 ret <4 x float> %res 866} 867 868define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) { 869 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmka 870 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07] 871 %a2 = load <4 x float>* %ptr_a2, align 8 872 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind 873 ret <4 x float> %res 874} 875 876define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) { 877 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz 878 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] 879 %a2 = load <4 x float>* %ptr_a2 880 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind 881 ret <4 x float> %res 882} 883 884define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) { 885 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza 886 ; CHECK: vfmadd213ps (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0x07] 887 %a2 = load <4 x float>* %ptr_a2, align 4 888 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind 889 ret <4 x float> %res 890} 891 892define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) { 893 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmb 894 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] 895 %q = load float* %ptr_a2 896 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 897 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 898 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 899 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 900 %res = 
call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind 901 ret <4 x float> %res 902} 903 904define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) { 905 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmba 906 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07] 907 %q = load float* %ptr_a2, align 4 908 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 909 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 910 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 911 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 912 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind 913 ret <4 x float> %res 914} 915 916define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { 917 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz 918 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] 919 %q = load float* %ptr_a2 920 %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 921 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 922 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 923 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 924 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind 925 ret <4 x float> %res 926} 927 928define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) { 929 ; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza 930 ; CHECK: vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07] 931 %q = load float* %ptr_a2, align 4 932 %vecinit.i = insertelement <4 x float> undef, float %q, 
i32 0 933 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 934 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 935 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 936 %res = call <4 x float> @llvm.x86.fma.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind 937 ret <4 x float> %res 938} 939 940define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) { 941 ; CHECK-LABEL: test_mask_vfmadd128_pd_r 942 ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0xc2] 943 %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 944 ret <2 x double> %res 945} 946 947define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { 948 ; CHECK-LABEL: test_mask_vfmadd128_pd_rz 949 ; CHECK: vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0xa8,0xc2] 950 %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind 951 ret <2 x double> %res 952} 953 954define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) { 955 ; CHECK-LABEL: test_mask_vfmadd128_pd_rmk 956 ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07] 957 %a2 = load <2 x double>* %ptr_a2 958 %res = call <2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind 959 ret <2 x double> %res 960} 961 962define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) { 963 ; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz 964 ; CHECK: vfmadd213pd (%rdi), %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0x07] 965 %a2 = load <2 x double>* %ptr_a2 966 %res = call 
<2 x double> @llvm.x86.fma.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind 967 ret <2 x double> %res 968} 969 970define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) { 971 ; CHECK-LABEL: test_mask_vfmadd256_pd_r 972 ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0xc2] 973 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 974 ret <4 x double> %res 975} 976 977define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { 978 ; CHECK-LABEL: test_mask_vfmadd256_pd_rz 979 ; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0xa8,0xc2] 980 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind 981 ret <4 x double> %res 982} 983 984define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) { 985 ; CHECK-LABEL: test_mask_vfmadd256_pd_rmk 986 ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07] 987 %a2 = load <4 x double>* %ptr_a2 988 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind 989 ret <4 x double> %res 990} 991 992define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) { 993 ; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz 994 ; CHECK: vfmadd213pd (%rdi), %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0x07] 995 %a2 = load <4 x double>* %ptr_a2 996 %res = call <4 x double> @llvm.x86.fma.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind 997 ret <4 x double> %res 998} 999