; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

; Tests for the MVE tail-predication pass: in a low-overhead loop whose
; masked memory ops are predicated on @llvm.get.active.lane.mask, the mask
; should be rewritten to @llvm.arm.mve.vctp* with a decrementing element
; count (the CHECK lines below capture that phi/vctp/sub pattern).

; 16 x i8 case: expect a vctp8-predicated loop, elements decrement by 16.
; CHECK-LABEL: mul_v16i8
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v16i8(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count: ceil(N / 16).
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  %tmp2 = bitcast i8* %tmp to <16 x i8>*
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
  %tmp4 = bitcast i8* %tmp3 to <16 x i8>*
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; 8 x i16 case: expect vctp16, elements decrement by 8.
; CHECK-LABEL: mul_v8i16
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v8i16(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i16* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count: ceil(N / 8).
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i16, i16* %a, i32 %index
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  %tmp2 = bitcast i16* %tmp to <8 x i16>*
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index
  %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; 4 x i32 case: expect vctp32, elements decrement by 4.
; CHECK-LABEL: mul_v4i32
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v4i32(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count: ceil(N / 4).
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The loaded vectors are split into low/high halves with shufflevector and
; recombined before the store; the CHECKs verify this still gets a vctp32.
; CHECK-LABEL: split_vector
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @split_vector(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count: ceil(N / 4).
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Deinterleave each load into even/odd lanes, operate, then recombine.
  %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 0, i32 2>
  %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 1, i32 3>
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 0, i32 2>
  %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, < 2 x i32> < i32 1, i32 3>
  %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
  %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high
  %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; One of the loads now uses ult predicate.
; A hand-written icmp-ult mask (%wrong) predicates the second load; the lane
; mask is still converted to vctp32, while %wrong is left on that load.
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count: ceil(N / 4).
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ; Manually-built predicate equivalent to the lane mask, but opaque to it.
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The store now uses ult predicate.
; Same as above, but the hand-written icmp-ult mask (%wrong) predicates the
; store; both loads get the vctp32 mask, the store keeps %wrong.
; CHECK-LABEL: mismatch_store_pred
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count: ceil(N / 4).
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ; Manually-built predicate equivalent to the lane mask, but opaque to it.
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; TODO: Multiple intrinsics not yet supported.
; This is currently rejected, because if the vector body is unrolled, the step
; is not what we expect:
;
; Step value 16 doesn't match vector width 4
;
; CHECK-LABEL: interleave4
; CHECK: vector.body:
; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
;
define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  ; Iteration count: ceil(N / 16) -- the body is unrolled 4x.
  %v0 = add i32 %N, 15
  %v1 = lshr i32 %v0, 4
  %v2 = shl nuw i32 %v1, 4
  %v3 = add i32 %v2, -16
  %v4 = lshr i32 %v3, 4
  %v5 = add nuw nsw i32 %v4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup


vector.ph:
  %scevgep = getelementptr i32, i32* %A, i32 8
  %scevgep30 = getelementptr i32, i32* %C, i32 8
  %scevgep37 = getelementptr i32, i32* %B, i32 8
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5)
  br label %vector.body

vector.body:
  %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ]
  %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ]
  %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ]
  %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
  %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
  %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
  ; Four lane masks, one per unrolled 4-wide iteration.
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %v7 = add i32 %index, 4
  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
  %v8 = add i32 %v7, 4
  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
  %v9 = add i32 %v8, 4
  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
  %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1
  %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1
  %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2
  %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1
  %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1
  %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load
  %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18
  %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19
  %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20
  %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask)
  %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15)
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16)
  %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17)
  %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16
  %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16
  %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16
  %v14 = add i32 %v9, 4
  %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1)
  %v16 = icmp ne i32 %v15, 0
  br i1 %v16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; The lane-mask trip count is the constant 42, not the %N the loop was set up
; with, so the transform must be rejected (no vctp emitted).
; CHECK-LABEL: const_expected_in_set_loop
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
;
define dso_local void @const_expected_in_set_loop(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
  ; Constant element count, deliberately inconsistent with the setup above.
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  br i1 %9, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The lane-mask element count is %index, which varies inside the loop, so the
; transform must be rejected (no vctp emitted).
; CHECK-LABEL: tripcount_arg_not_invariant
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
;
define dso_local void @tripcount_arg_not_invariant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]

  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
  ; Element count is the (loop-variant) induction variable itself.
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The induction variable starts at 1, not 0, so the lane mask does not
; describe the expected {0,+,4} counting pattern; the transform is rejected.
; CHECK-LABEL: addrec_base_not_zero
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
;
define dso_local void @addrec_base_not_zero(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]

; AddRec base is not 0:
  %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]

  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}


; Intrinsic declarations used by the tests above.
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)