; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s

; Each function below is the same vectorized "apply a rounding intrinsic to a
; float array" loop (masked load -> intrinsic -> masked store, with
; llvm.get.active.lane.mask controlling the tail), differing only in which
; rounding intrinsic is called. The checks verify that the loop is converted
; to an MVE low-overhead tail-predicated loop (dlstp.32 / letp) and which
; vrint* instruction the intrinsic lowers to.

; llvm.round -> vrinta (round to nearest, ties away from zero).
define arm_aapcs_vfpcc void @round(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: round:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrinta.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Round the trip count up to a multiple of the vector width (4).
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.rint -> vrintx (round using current rounding mode, signal inexact).
define arm_aapcs_vfpcc void @rint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: rint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintx.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.trunc -> vrintz (round toward zero).
define arm_aapcs_vfpcc void @trunc(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: trunc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintz.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.ceil -> vrintp (round toward +infinity).
define arm_aapcs_vfpcc void @ceil(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: ceil:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintp.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; llvm.floor -> vrintm (round toward -infinity).
define arm_aapcs_vfpcc void @floor(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: floor:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintm.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; nearbyint shouldn't be tail predicated because it's lowered into multiple instructions
; NOTE(review): the autogenerated checks below nevertheless show a dlstp/letp
; tail-predicated loop, with llvm.nearbyint scalarized into per-lane vrintr
; instructions — this comment may predate the current codegen; confirm against
; a fresh run of update_llc_test_checks.py.
define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: nearbyint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintr.f32 s7, s3
; CHECK-NEXT:    vrintr.f32 s6, s2
; CHECK-NEXT:    vrintr.f32 s5, s1
; CHECK-NEXT:    vrintr.f32 s4, s0
; CHECK-NEXT:    vstrw.32 q1, [r1], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1

declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2

declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3

declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3

declare <4 x float> @llvm.round.v4f32(<4 x float>) #3

declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3

declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3

declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #1

declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #4